diff --git a/_toc.yml b/_toc.yml index 0ea8ff82..c425fe27 100644 --- a/_toc.yml +++ b/_toc.yml @@ -10,3 +10,4 @@ chapters: - file: part7a_bitstream.ipynb - file: part7b_deployment.ipynb - file: part7c_validation.ipynb + - file: part8_symbolic_regression.ipynb diff --git a/docker/Dockerfile b/docker/Dockerfile index 21c994c5..a4db3be3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -30,7 +30,8 @@ RUN mamba install -y -c conda-forge \ RUN pip install \ hls4ml[profiling]==0.8.0 \ qkeras==0.9.0 \ - conifer==0.2b0 + conifer==0.2b0 \ + pysr==0.16.3 RUN mamba clean --all -f -y && \ mamba list && \ fix-permissions "${CONDA_DIR}" && \ diff --git a/docker/Dockerfile.vivado b/docker/Dockerfile.vivado index 262e7822..166fa098 100644 --- a/docker/Dockerfile.vivado +++ b/docker/Dockerfile.vivado @@ -35,7 +35,8 @@ RUN mamba install -y -c conda-forge \ RUN pip install \ hls4ml[profiling]==0.8.0 \ qkeras==0.9.0 \ - conifer==0.2b0 + conifer==0.2b0 \ + pysr==0.16.3 RUN mamba clean --all -f -y && \ mamba list && \ fix-permissions "${CONDA_DIR}" && \ diff --git a/environment.yml b/environment.yml index 9b7bc2a8..406c5a43 100644 --- a/environment.yml +++ b/environment.yml @@ -19,3 +19,4 @@ dependencies: - hls4ml[profiling]==0.8.0 - qkeras==0.9.0 - conifer==0.2b0 + - pysr==0.16.3 diff --git a/part8_symbolic_regression.ipynb b/part8_symbolic_regression.ipynb new file mode 100644 index 00000000..af93c1a3 --- /dev/null +++ b/part8_symbolic_regression.ipynb @@ -0,0 +1,502 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79933ff7", + "metadata": {}, + "source": [ + "# Part 8: Symbolic Regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ede2226f", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sympy\n", + "import matplotlib.pyplot as plt\n", + "import hls4ml\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", + "from sklearn.metrics import roc_curve, auc, accuracy_score\n", + "from tensorflow.keras.utils import to_categorical\n", + "from sklearn.datasets import fetch_openml" + ] + }, + { + "cell_type": "markdown", + "id": "d9e2b159", + "metadata": {}, + "source": [ + "## Load the LHC jet tagging dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee6d96bd", + "metadata": {}, + "outputs": [], + "source": [ + "data = fetch_openml('hls4ml_lhc_jets_hlf')\n", + "X, Y = data['data'].to_numpy(), data['target'].to_numpy()\n", + "print(data['feature_names'])\n", + "print(X.shape, Y.shape)\n", + "print(Y[:10])\n", + "\n", + "LE = LabelEncoder()\n", + "Y = LE.fit_transform(Y)\n", + "Y = to_categorical(Y, 5)\n", + "\n", + "Y = 2 * Y - 1\n", + "print(Y[:10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0502aea8", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=123)\n", + "\n", + "scaler = StandardScaler().fit(X_train)\n", + "X_train = scaler.transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "# PySR (or any genetic programming based SR) not happy with too many training data\n", + "X_train = X_train[:8000]\n", + "Y_train = Y_train[:8000]\n", + "\n", + "print('X_train.shape: ' + str(X_train.shape))\n", + "print('Y_train.shape: ' + str(Y_train.shape))\n", + "print('X_test.shape: ' + str(X_test.shape))\n", + "print('Y_test.shape: ' + str(Y_test.shape))" + ] + }, + { + "cell_type": "markdown", + "id": "7ec86106", + "metadata": {}, + "source": [ + "## Perform SR with PySR (if installed)" + ] + }, + { + "cell_type": "markdown", + "id": "57e7896d", + "metadata": {}, + "source": [ + "If you want to run `PySR` (a genetic programming-based symbolic regression software), please see https://github.com/MilesCranmer/PySR for installation and intructions.\n", + "\n", + "Below is an example configuration script to run training in `PySR`, where one can specify the allowed primitive functions `unary_operators` `binary_operators` (e.g. `+`, `*`, `sin`) and constraints `complexity_of_operators` `constraints` `nested_constraints` in the equation seacrhing. The training results will be stored in a `.pkl` file that contains the final equations selected by the training strategy `model_selection`.\n", + "\n", + "We also provide an already trained PySR model `sr/example.pkl` in the following sections for demonstrating the HLS implementation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96a651dd", + "metadata": {}, + "outputs": [], + "source": [ + "from pysr import PySRRegressor\n", + "\n", + "!export JULIA_NUM_THREADS=32\n", + "\n", + "model_pysr = PySRRegressor(\n", + " model_selection='accuracy',\n", + " niterations=40,\n", + " timeout_in_seconds=60 * 60 * 1,\n", + " maxsize=40,\n", + " select_k_features=6,\n", + " binary_operators=['+', '-', '*'],\n", + " unary_operators=['sin', 'sc(x)=sin(x)*cos(x)'],\n", + " complexity_of_operators={'+': 1, '-': 1, '*': 1, 'sin': 1, 'sc': 1},\n", + " constraints={'sin': 20, 'sc': 20},\n", + " nested_constraints={'sin': {'sin': 0, 'sc': 0}, 'sc': {'sin': 0, 'sc': 0}},\n", + " extra_sympy_mappings={'sc': lambda x: sympy.sin(x) * sympy.cos(x)},\n", + " loss='L2MarginLoss()', # (1 - y*y')^2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f4d9501", + "metadata": {}, + "outputs": [], + "source": [ + "model_pysr.fit(X_train, Y_train)" + ] + }, + { + "cell_type": "markdown", + "id": "846e710b", + "metadata": {}, + "source": [ + "## Prepare symbolic expressions in strings first" + ] + }, + { + "cell_type": "markdown", + "id": "c7aaf105", + "metadata": {}, + "source": [ + "We provide a trained model for the HLS demonstration.\n", + "\n", + "**If you have `PySR` installed**, you can directly load the trained expressions from the output file `sr/example.pkl`.\n", + "`PySR` allows custom functions to be defined, such as sc(x):=sin(x)*cos(x) in this example, they need to be re-defined through `extra_sympy_mappings` and a new `sympy` class when retrieving the equations for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3d5d2cd", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from pysr import PySRRegressor\n", + "\n", + "model_pysr = PySRRegressor.from_file('sr/example.pkl')\n", + "with sympy.evaluate(True):\n", + " for i in range(5):\n", + " print('Tagger {} = '.format(i) + str(model_pysr.sympy()[i]) + '\\n------------------------------------------')\n", + "\n", + "# Re-write custom operator defined from PySR config: sc(x) = sin(x)*cos(x)\n", + "model_pysr.set_params(extra_sympy_mappings={\"sc\": lambda x: sympy.sin(x) * sympy.cos(x)})\n", + "model_pysr.refresh()\n", + "\n", + "\n", + "class sc(sympy.Function):\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "699d2e05", + "metadata": {}, + "source": [ + "There are two options for evaluating math functions in `hls4ml`, one is using the standard HLS math library (`func`), another one is using approximation with user-defined lookup tables (`func_lut`) for resources saving. We will define the lookup tables (table range and size) for `func_lut` later.\n", + "\n", + "We have the equations in the `sympy` format, now convert them into strings: `expr` for using the standard functions and `expr_lut` for using the approximation with lookup tables. We will re-parse `expr` and `expr_lut` from strings in `sympy` format for the `hls4ml` converter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7219a874", + "metadata": {}, + "outputs": [], + "source": [ + "expr = []\n", + "expr_lut = []\n", + "for i in range(5):\n", + " expr.append(str(model_pysr.sympy()[i]))\n", + " expr_lut.append(expr[i].replace(\"sin\", \"sin_lut\").replace(\"cos\", \"cos_lut\"))" + ] + }, + { + "cell_type": "markdown", + "id": "0abcba26", + "metadata": {}, + "source": [ + "**If you don't have PySR installed**, you can also write your expressions directly in strings and parse in `sympy` format, which can then be fed to `hls4ml` converter. Here again, `expr` for using standard math library, `expr_lut` for using approximation with lookup tables." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3356d1e6", + "metadata": {}, + "outputs": [], + "source": [ + "# Expressions from 'sr/example.pkl'\n", + "\n", + "# Expressions that will use Vivado math library\n", + "expr = [\n", + " '-0.1630426*(sin(-0.75052315)*cos(-0.75052315) - 0.84283006)*sin(2*x14 - 1.03665108)*cos(2*x14 - 1.03665108) - sin(x14 - (0.9237657 - 0.11933863*x3)*(-x15 + 2*x2 - 0.3817056) + 1.761264957)',\n", + " '(-(0.5822144*sin(0.83811*x14)*cos(0.83811*x14) - 0.5324657)*(sin(0.3923645*x2)*cos(0.3923645*x2) - 0.63548696) + sin(x14 - 0.3923645*x15 + x3 + 0.51168373)*cos(x14 - 0.3923645*x15 + x3 + 0.51168373))*(0.561041303633489*sin(x15) - 0.47277835) - 0.84055585',\n", + " '0.49239117*(sin(x3)*cos(x3) + sin(x15 + 0.76784414*x3)*cos(x15 + 0.76784414*x3))*(sin(-0.13417026)*cos(-0.13417026) + sin(0.5180547)*cos(0.5180547) + sin(x2)*cos(x2)) - sin(x14 + 0.25715914*x15*x3 - x2 - x3 + 0.66443527)',\n", + " '0.41071504*(0.9298677 - sin(0.59376544*x15))*(sin(x14)*cos(x14) + 5.2546763*sin(0.71913457 - x3)*cos(0.71913457 - x3))*(-sin(2*x3)*cos(2*x3) + sin(5.2546763*x14 + x3 + 0.77032656)*cos(5.2546763*x14 + x3 + 0.77032656) + 0.32492808) - 0.863786752431664',\n", + " '(1.0745832 - sin(-x14 - 0.4094719)*cos(-x14 - 0.4094719))*(-0.15737492*x15 - sin(x14 - 4.2594776)*cos(x14 - 4.2594776) + sin(3*x14 - x3*(x14 - 4.1772995) - x3 + 3.087878)*cos(3*x14 - x3*(x14 - 4.1772995) - x3 + 3.087878) - 0.690204005690814)',\n", + "]\n", + "# Expressions that will use look-up table approximated math functions\n", + "expr_lut = []\n", + "for i in range(len(expr)):\n", + " expr_lut.append(expr[i].replace(\"sin\", \"sin_lut\").replace(\"cos\", \"cos_lut\"))" + ] + }, + { + "cell_type": "markdown", + "id": "788ee608", + "metadata": {}, + "source": [ + "## Then parse the strings to sympy expressions" + ] + }, + { + "cell_type": "markdown", + "id": "03fc8284", + "metadata": {}, + "source": [ + "Define the lookup tables for approximating math functions. The table range and size can be customized for each function to be approximated, they depend on how much precision can be compromised to save more resources." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "920e2326", + "metadata": {}, + "outputs": [], + "source": [ + "from hls4ml.utils.symbolic_utils import init_pysr_lut_functions\n", + "\n", + "# For functions approximated with look-up table, define the table range and size\n", + "function_definitions = [\n", + " 'sin_lut(x) = math_lut(sin, x, N=256, range_start=-8, range_end=8)',\n", + " 'cos_lut(x) = math_lut(cos, x, N=256, range_start=-8, range_end=8)',\n", + "]\n", + "init_pysr_lut_functions(init_defaults=True, function_definitions=function_definitions)\n", + "\n", + "lut_functions = {\n", + " 'sin_lut': {'math_func': 'sin', 'range_start': -8, 'range_end': 8, 'table_size': 256},\n", + " 'cos_lut': {'math_func': 'cos', 'range_start': -8, 'range_end': 8, 'table_size': 256},\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "8be93891", + "metadata": {}, + "source": [ + "Parse `expr` and `expr_lut` to sympy expressions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96f61066", + "metadata": {}, + "outputs": [], + "source": [ + "# Use sympy to parse strings into sympy expressions\n", + "for i in range(len(expr)):\n", + " print('expr =\\n' + expr[i])\n", + " print(\"----------------------------------------\")\n", + " print('expr_LUT =\\n' + expr_lut[i])\n", + " print(\"========================================\")\n", + " expr[i] = sympy.parsing.sympy_parser.parse_expr(expr[i])\n", + " expr_lut[i] = sympy.parsing.sympy_parser.parse_expr(expr_lut[i])" + ] + }, + { + "cell_type": "markdown", + "id": "f7548c93", + "metadata": {}, + "source": [ + "Use `hls4ml.converters.convert_from_symbolic_expression` to convert sympy expressions and compile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46ff4b5e", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Use hls4ml to convert sympy expressions into HLS model\n", + "hls_model = hls4ml.converters.convert_from_symbolic_expression(\n", + " expr, n_symbols=16, output_dir='my-hls-test', precision='ap_fixed<16,6>', part='xcvu9p-flga2577-2-e'\n", + ")\n", + "hls_model.write()\n", + "hls_model.compile()\n", + "\n", + "hls_model_lut = hls4ml.converters.convert_from_symbolic_expression(\n", + " expr_lut,\n", + " n_symbols=16,\n", + " output_dir='my-hls-test-lut',\n", + " precision='ap_fixed<16,6>',\n", + " part='xcvu9p-flga2577-2-e',\n", + " lut_functions=lut_functions,\n", + ")\n", + "hls_model_lut.write()\n", + "hls_model_lut.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "08682628", + "metadata": {}, + "source": [ + "## Compare outputs: PySR vs HLS vs HLS(LUT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39269441", + "metadata": {}, + "outputs": [], + "source": [ + "test_vector = np.random.rand(1, 16) * 4 - 2\n", + "# print(model_pysr.predict(test_vector))\n", + "print(hls_model.predict(test_vector))\n", + "print(hls_model_lut.predict(test_vector))" + ] + }, + { + "cell_type": "markdown", + "id": "08795fca", + "metadata": {}, + "source": [ + "## Compare performance on the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05894f0b", + "metadata": {}, + "outputs": [], + "source": [ + "# Y_pysr = model_pysr.predict(X_test)\n", + "Y_hls = hls_model.predict(X_test)\n", + "Y_hls_lut = hls_model_lut.predict(X_test)\n", + "# auc_pysr=[]\n", + "auc_hls = []\n", + "auc_hls_lut = []\n", + "for x, label in enumerate(LE.classes_):\n", + " # fpr_pysr, tpr_pysr, _ = roc_curve(Y_test[:, x], Y_pysr[:, x])\n", + " fpr_hls, tpr_hls, _ = roc_curve(Y_test[:, x], Y_hls[:, x])\n", + " fpr_hls_lut, tpr_hls_lut, _ = roc_curve(Y_test[:, x], Y_hls_lut[:, x])\n", + " # auc_pysr.append(auc(fpr_pysr, tpr_pysr))\n", + " auc_hls.append(auc(fpr_hls, tpr_hls))\n", + " auc_hls_lut.append(auc(fpr_hls_lut, tpr_hls_lut))\n", + "\n", + "# print('PySR acc = {0:.3f}'.format(accuracy_score(np.argmax(Y_test, axis=1), np.argmax(Y_pysr, axis=1))))\n", + "# print('PySR auc = {0:.3f},{1:.3f},{2:.3f},{3:.3f},{4:.3f}'.format(auc_pysr[0],auc_pysr[1],auc_pysr[2],auc_pysr[3],auc_pysr[4]))\n", + "print('HLS acc = {0:.3f}'.format(accuracy_score(np.argmax(Y_test, axis=1), np.argmax(Y_hls, axis=1))))\n", + "print(\n", + " 'HLS auc = {0:.3f},{1:.3f},{2:.3f},{3:.3f},{4:.3f}'.format(\n", + " auc_hls[0], auc_hls[1], auc_hls[2], auc_hls[3], auc_hls[4]\n", + " )\n", + ")\n", + "print('HLS_LUT acc = {0:.3f}'.format(accuracy_score(np.argmax(Y_test, axis=1), np.argmax(Y_hls_lut, axis=1))))\n", + "print(\n", + " 'HLS_LUT auc = {0:.3f},{1:.3f},{2:.3f},{3:.3f},{4:.3f}'.format(\n", + " auc_hls_lut[0], auc_hls_lut[1], auc_hls_lut[2], auc_hls_lut[3], auc_hls_lut[4]\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "002643a3", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_roc(y_test, y_pred, labels, model):\n", + " color = ['blue', 'orange', 'green', 'red', 'purple']\n", + " for x, label in enumerate(labels):\n", + " fpr, tpr, _ = roc_curve(y_test[:, x], y_pred[:, x])\n", + " if model == 'pysr':\n", + " plt.plot(\n", + " tpr,\n", + " fpr,\n", + " label='{0}, PySR, AUC = {1:.1f}'.format(label, auc(fpr, tpr) * 100.0),\n", + " linestyle='solid',\n", + " color=color[x],\n", + " lw=1.5,\n", + " )\n", + " if model == 'hls':\n", + " plt.plot(\n", + " tpr,\n", + " fpr,\n", + " label='{0}, HLS, AUC = {1:.1f}'.format(label, auc(fpr, tpr) * 100.0),\n", + " linestyle='dotted',\n", + " color=color[x],\n", + " lw=1.5,\n", + " )\n", + " if model == 'hls_lut':\n", + " plt.plot(\n", + " tpr,\n", + " fpr,\n", + " label='{0}, HLS LUT, AUC = {1:.1f}'.format(label, auc(fpr, tpr) * 100.0),\n", + " linestyle='None',\n", + " color=color[x],\n", + " lw=1,\n", + " marker='o',\n", + " ms=1,\n", + " )\n", + " plt.semilogy()\n", + " plt.xlabel('True positive rate', size=15, loc='right')\n", + " plt.ylabel('False positive rate', size=15, loc='top')\n", + " plt.tick_params(axis='both', which='major', direction='in', length=6, width=1.2, labelsize=12, right=True, top=True)\n", + " plt.tick_params(axis='both', which='minor', direction='in', length=2, width=1, labelsize=12, right=True, top=True)\n", + " plt.xlim(0, 1)\n", + " plt.ylim(0.001, 1)\n", + " plt.grid(True)\n", + " plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12)\n", + "\n", + "\n", + "plt.figure(figsize=(15, 15))\n", + "axes = plt.subplot(2, 2, 1)\n", + "# plot_roc(Y_test, Y_pysr, LE.classes_, 'pysr')\n", + "plot_roc(Y_test, Y_hls, LE.classes_, 'hls')\n", + "plot_roc(Y_test, Y_hls_lut, LE.classes_, 'hls_lut')" + ] + }, + { + "cell_type": "markdown", + "id": "7beb92ea", + "metadata": {}, + "source": [ + "## Run synthesis from command line" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4047f52", + "metadata": {}, + "outputs": [], + "source": [ + "!source ${XILINX_VIVADO}/settings64.sh\n", + "!vivado_hls -f build_prj.tcl \"reset=1 synth=1 csim=0 cosim=0 validation=0 export=0 vsynth=0\"\n", + "!cat my-hls-test/myproject_prj/solution1/syn/report/myproject_csynth.rpt" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sr/example.pkl b/sr/example.pkl new file mode 100644 index 00000000..d3fecff7 Binary files /dev/null and b/sr/example.pkl differ