From 30d3dbdb371796a06de4b5f72bc57b2b36fa2123 Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Wed, 16 Jul 2025 14:22:11 +0200 Subject: [PATCH 01/10] Add initial work on duckdb --- source-code/README.md | 2 + .../duckdb/data/patient_experiment.csv | 63 ++++++++ source-code/duckdb/data/patient_metadata.csv | 11 ++ source-code/duckdb/patients.ipynb | 147 ++++++++++++++++++ 4 files changed, 223 insertions(+) create mode 100644 source-code/duckdb/data/patient_experiment.csv create mode 100644 source-code/duckdb/data/patient_metadata.csv create mode 100644 source-code/duckdb/patients.ipynb diff --git a/source-code/README.md b/source-code/README.md index 064c59f..4f9bfdc 100644 --- a/source-code/README.md +++ b/source-code/README.md @@ -22,6 +22,8 @@ to create it. There is some material not covered in the presentation as well. soup and graph representation using networkx. * [`xarray`](xarray): illustrates the xarray library for pandas-like operations on multi-dimensional arrays. +* [`duckdb`](duckdb): illustrates the DuckDB library for SQL-like operations + on dataframes, including integration with pandas and polars. **Note:** material on dashboards has been moved to a [dedicated repository](https://github.com/gjbex/Python-dashboards). 
diff --git a/source-code/duckdb/data/patient_experiment.csv b/source-code/duckdb/data/patient_experiment.csv new file mode 100644 index 0000000..034e2c7 --- /dev/null +++ b/source-code/duckdb/data/patient_experiment.csv @@ -0,0 +1,63 @@ +,patient,dose,date,temperature +0,1,0.0,2012-10-02 10:00:00,38.3 +1,1,2.0,2012-10-02 11:00:00,38.5 +2,1,2.0,2012-10-02 12:00:00,38.1 +3,1,2.0,2012-10-02 13:00:00,37.3 +4,1,0.0,2012-10-02 14:00:00,37.5 +5,1,0.0,2012-10-02 15:00:00,37.1 +6,1,0.0,2012-10-02 16:00:00,36.8 +7,2,0.0,2012-10-02 10:00:00,39.3 +8,2,5.0,2012-10-02 11:00:00,39.4 +9,2,5.0,2012-10-02 12:00:00,38.1 +10,2,5.0,2012-10-02 13:00:00,37.3 +11,2,0.0,2012-10-02 14:00:00,36.8 +12,2,0.0,2012-10-02 15:00:00,36.8 +13,2,0.0,2012-10-02 16:00:00,36.8 +14,3,0.0,2012-10-02 10:00:00,37.9 +15,3,2.0,2012-10-02 11:00:00,39.5 +16,3,5.0,2012-10-02 12:00:00,38.3 +17,3,2.0,2012-10-02 13:00:00, +18,3,2.0,2012-10-02 14:00:00,37.7 +19,3,2.0,2012-10-02 15:00:00,37.1 +20,3,0.0,2012-10-02 16:00:00,36.7 +21,4,0.0,2012-10-02 10:00:00,38.1 +22,4,5.0,2012-10-02 11:00:00,37.2 +23,4,5.0,2012-10-02 12:00:00,36.1 +24,4,0.0,2012-10-02 13:00:00,35.9 +25,4,,2012-10-02 14:00:00,36.3 +26,4,0.0,2012-10-02 15:00:00,36.6 +27,4,0.0,2012-10-02 16:00:00,36.7 +28,5,0.0,2012-10-02 10:00:00,37.9 +29,5,3.0,2012-10-02 11:00:00,39.5 +30,5,7.0,2012-10-02 12:00:00,38.3 +31,5,5.0,2012-10-02 13:00:00,38.5 +32,5,9.0,2012-10-02 14:00:00,39.4 +33,5,3.0,2012-10-02 15:00:00,37.9 +34,5,0.0,2012-10-02 16:00:00,37.2 +35,6,0.0,2012-10-02 10:00:00,37.5 +36,6,2.0,2012-10-02 11:00:00,38.1 +37,6,3.0,2012-10-02 12:00:00,37.9 +38,6,2.0,2012-10-02 13:00:00,37.7 +39,6,1.0,2012-10-02 14:00:00,37.2 +40,6,0.0,2012-10-02 15:00:00,36.8 +41,7,0.0,2012-10-02 10:00:00,39.5 +42,7,10.0,2012-10-02 11:00:00,40.7 +43,7,5.0,2012-10-02 12:00:00,39.8 +44,7,8.0,2012-10-02 13:00:00,40.2 +45,7,3.0,2012-10-02 14:00:00,38.3 +46,7,3.0,2012-10-02 15:00:00,37.6 +47,7,1.0,2012-10-02 16:00:00,37.3 +48,8,0.0,2012-10-02 10:00:00,37.8 +49,8,0.0,2012-10-02 
11:00:00,37.9 +50,8,0.0,2012-10-02 12:00:00,37.4 +51,8,0.0,2012-10-02 13:00:00,37.6 +52,8,0.0,2012-10-02 14:00:00,37.3 +53,8,0.0,2012-10-02 15:00:00,37.1 +54,8,0.0,2012-10-02 16:00:00,36.8 +55,9,0.0,2012-10-02 10:00:00,38.3 +56,9,10.0,2012-10-02 11:00:00,39.5 +57,9,12.0,2012-10-02 12:00:00,40.2 +58,9,4.0,2012-10-02 13:00:00,39.1 +59,9,4.0,2012-10-02 14:00:00,37.9 +60,9,0.0,2012-10-02 15:00:00,37.1 +61,9,0.0,2012-10-02 16:00:00,37.3 diff --git a/source-code/duckdb/data/patient_metadata.csv b/source-code/duckdb/data/patient_metadata.csv new file mode 100644 index 0000000..59e23ac --- /dev/null +++ b/source-code/duckdb/data/patient_metadata.csv @@ -0,0 +1,11 @@ +,patient,gender,condition +0,1,M,A +1,2,F,A +2,3,M,A +3,5,M,A +4,6,F,B +5,7,M,B +6,8,F,B +7,9,M,B +8,10,F,B +9,11,M,B diff --git a/source-code/duckdb/patients.ipynb b/source-code/duckdb/patients.ipynb new file mode 100644 index 0000000..ca19c8f --- /dev/null +++ b/source-code/duckdb/patients.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4c0d0975-71ce-4e9a-986c-ac6cbfb251bb", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "861daf66-0810-48d9-968c-18e1dabe0e4f", + "metadata": {}, + "outputs": [], + "source": [ + "conn = duckdb.connect('data/patient_experiment.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "c0f56de5-2351-4473-b768-234a89a705ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patient
05
19
22
38
43
56
64
77
81
\n", + "
" + ], + "text/plain": [ + " patient\n", + "0 5\n", + "1 9\n", + "2 2\n", + "3 8\n", + "4 3\n", + "5 6\n", + "6 4\n", + "7 7\n", + "8 1" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('SELECT DISTINCT patient FROM \"data/patient_experiment.csv\";').df()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c4e1ad8-a9ae-4808-901b-c036e4e0ee17", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 53da957572b9e3128bb88fc605c9db4c4a68ebcd Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Wed, 15 Oct 2025 09:03:12 +0200 Subject: [PATCH 02/10] Add material requirements --- docs/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/README.md b/docs/README.md index 1d2c156..24a5947 100644 --- a/docs/README.md +++ b/docs/README.md @@ -54,6 +54,14 @@ from scratch. Familiarity with numpy is not required, but would be beneficial. If you plan to do Python programming in a Linux or HPC environment you should be familiar with these as well. +For following along hands-on, you need +* laptop or desktop with internet access. +* a system set up so you can connect to an HPC system, an account on an HPC + system (e.g., VSC, CECI, ...), compute credits if that is required to run + jobs on the HPC system if you want to use an HPC system; +* a Python environment that can run Jupyter Lab if you want to use your own system; +* access to Google Colaboratory if you prefer not to install software. 
+ ## Level From 014249d12482c5ca1a5b14febf6b35378785951e Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Tue, 4 Nov 2025 11:56:50 +0100 Subject: [PATCH 03/10] Added pivot operation --- source-code/duckdb/patients.ipynb | 336 ++++++++++++++++++++++++++++-- 1 file changed, 313 insertions(+), 23 deletions(-) diff --git a/source-code/duckdb/patients.ipynb b/source-code/duckdb/patients.ipynb index ca19c8f..380233b 100644 --- a/source-code/duckdb/patients.ipynb +++ b/source-code/duckdb/patients.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 2, "id": "861daf66-0810-48d9-968c-18e1dabe0e4f", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 25, "id": "c0f56de5-2351-4473-b768-234a89a705ac", "metadata": {}, "outputs": [ @@ -54,39 +54,39 @@ " \n", " \n", " 0\n", - " 5\n", + " 1\n", " \n", " \n", " 1\n", - " 9\n", + " 3\n", " \n", " \n", " 2\n", - " 2\n", + " 6\n", " \n", " \n", " 3\n", - " 8\n", + " 7\n", " \n", " \n", " 4\n", - " 3\n", + " 8\n", " \n", " \n", " 5\n", - " 6\n", + " 5\n", " \n", " \n", " 6\n", - " 4\n", + " 9\n", " \n", " \n", " 7\n", - " 7\n", + " 2\n", " \n", " \n", " 8\n", - " 1\n", + " 4\n", " \n", " \n", "\n", @@ -94,32 +94,322 @@ ], "text/plain": [ " patient\n", - "0 5\n", - "1 9\n", - "2 2\n", - "3 8\n", - "4 3\n", - "5 6\n", - "6 4\n", - "7 7\n", - "8 1" + "0 1\n", + "1 3\n", + "2 6\n", + "3 7\n", + "4 8\n", + "5 5\n", + "6 9\n", + "7 2\n", + "8 4" ] }, - "execution_count": 37, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "conn.execute('SELECT DISTINCT patient FROM \"data/patient_experiment.csv\";').df()" + "conn.execute('SELECT DISTINCT patient FROM \"patient_experiment\";').df()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "0c4e1ad8-a9ae-4808-901b-c036e4e0ee17", "metadata": {}, "outputs": [], + "source": [ + "sql_pivot = '''\n", + "CREATE 
TABLE time_series AS\n", + " PIVOT \"data/patient_experiment.csv\"\n", + " ON patient\n", + " USING\n", + " first(temperature) AS temperature,\n", + " first(dose) AS dose\n", + " GROUP BY date;\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6ff65df2-1997-4983-b3c8-1753034c3218", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute(sql_pivot)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "95b029d4-0329-4462-8737-94f036dc8147", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
date
02012-10-02 14:00:00
12012-10-02 11:00:00
22012-10-02 10:00:00
32012-10-02 13:00:00
42012-10-02 12:00:00
52012-10-02 15:00:00
62012-10-02 16:00:00
\n", + "
" + ], + "text/plain": [ + " date\n", + "0 2012-10-02 14:00:00\n", + "1 2012-10-02 11:00:00\n", + "2 2012-10-02 10:00:00\n", + "3 2012-10-02 13:00:00\n", + "4 2012-10-02 12:00:00\n", + "5 2012-10-02 15:00:00\n", + "6 2012-10-02 16:00:00" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('SELECT date FROM time_series;').df()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "78a996cf-4bc2-40dc-8bce-27bcc6731b7b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name
0file
1patient_experiment
2time_series
\n", + "
" + ], + "text/plain": [ + " name\n", + "0 file\n", + "1 patient_experiment\n", + "2 time_series" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('show tables;').df()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ea5cf7e3-1b67-45fd-bdde-1ee294575149", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
table_schematable_nametable_type
0maintime_seriesBASE TABLE
1mainfileVIEW
2mainpatient_experimentVIEW
\n", + "
" + ], + "text/plain": [ + " table_schema table_name table_type\n", + "0 main time_series BASE TABLE\n", + "1 main file VIEW\n", + "2 main patient_experiment VIEW" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.sql(\"\"\"\n", + " SELECT table_schema, table_name, table_type\n", + " FROM information_schema.tables\n", + " WHERE table_schema NOT IN ('information_schema', 'pg_catalog')\n", + "\"\"\").df()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "843f9d0f-8384-4977-916e-49ef59e5cbad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n", + "│ column_name │ column_type │ null │ key │ default │ extra │\n", + "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n", + "├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n", + "│ column0 │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", + "│ patient │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", + "│ dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n", + "│ temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.sql('''dESCRIBE patient_experiment;''')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8570b365-09d5-4993-908e-4eaac974204d", + "metadata": {}, + "outputs": [], "source": [] } ], From f01bc969eb0187aeca9913822df91cc19b19f677 Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Thu, 20 Nov 2025 17:33:53 +0100 Subject: [PATCH 04/10] Add notebook on apply method --- source-code/pandas/README.md | 5 +- source-code/pandas/apply.ipynb | 360 +++++++++++++++++++++++++++++++++ 2 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 
source-code/pandas/apply.ipynb diff --git a/source-code/pandas/README.md b/source-code/pandas/README.md index 746a50d..313b336 100644 --- a/source-code/pandas/README.md +++ b/source-code/pandas/README.md @@ -25,7 +25,10 @@ easy to use. 1. `pipes.ipynb`: consolidating data processing using pipes. 1. `screenshots`: screenshots made for the slides. 1. `generate_csv_files.py`: script to generate CSV files in different - formats. + formats. 1. `copy_on_write.ipynb`: Jupyter notebook that illustrates how data is shared between related notebooks and the role Copy-on-Write plays in order to prevent accidental data modifications in more than one dataframe. +1. `apply.ipynb`: Jupyter notebook that illustrates the use of the `apply` method + in pandas dataframes for applying functions along rows or columns. It includes + a comparison of performance between using `apply` and vectorized operations. diff --git a/source-code/pandas/apply.ipynb b/source-code/pandas/apply.ipynb new file mode 100644 index 0000000..35fc9c7 --- /dev/null +++ b/source-code/pandas/apply.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "c507c033-f47a-40f3-9d9d-d24d23e25474", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "c973633e-eccd-4a0f-873d-faf43fa3b836", + "metadata": {}, + "source": [ + "## apply" + ] + }, + { + "cell_type": "markdown", + "id": "f1401362-5955-495e-be57-5436a7446530", + "metadata": {}, + "source": [ + "Code that uses `.apply()` looks clean, but it is rather slow when used row-wise (`axis=1`). To quantify this, you can run the example below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "af048047-df04-4c5f-8b36-d48f53d021ae", + "metadata": {}, + "outputs": [], + "source": [ + "size = 100_000\n", + "df = pd.DataFrame({\n", + " 'A': np.random.uniform(0.0, 1.0, size=size),\n", + " 'B': np.random.uniform(0.0, 1.0, size=size),\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "84b3d0d6-d9c3-4921-8561-80ef6d766f6f", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 100000 entries, 0 to 99999\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 A 100000 non-null float64\n", + " 1 B 100000 non-null float64\n", + "dtypes: float64(2)\n", + "memory usage: 1.5 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "9dfd0c4b-996d-4426-8b58-d66c78124a8f", + "metadata": {}, + "source": [ + "Note that this dataframe is fairly small." + ] + }, + { + "cell_type": "markdown", + "id": "d0b672e5-9762-496e-932f-4c5729c62061", + "metadata": {}, + "source": [ + "### Evaluating a condition" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "093ddcde-ee7f-4d66-847d-221e8181b9dc", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "551 ms ± 8.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit df.apply(lambda x: 0 if x.A + x.B < 1.0 else 1, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "6b10519f-26b5-4c74-af2f-ee34af35e96d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.17 ms ± 5.24 μs per loop (mean ± std. dev. 
of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit np.select([df.A + df.B < 1.0, df.A + df.B >= 1.0], [0, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e8b003c0-7445-475e-9ece-68a9783b1388", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "510 μs ± 4.17 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit np.where(df.A + df.B < 1.0, 0, 1)" + ] + }, + { + "cell_type": "markdown", + "id": "35ebd7e1-48bb-4d3b-860d-f0d765ffa62e", + "metadata": {}, + "source": [ + "Clearly, `.apply()` is very slow compared to `np.select()` and `np.where()`. Note that `np.where()` is faster than `np.select()` by a factor of 2." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "9bc83bfe-680e-4b3d-8017-970cf08fd956", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.array_equal(\n", + " df.apply(lambda x: 0 if x.A + x.B < 1.0 else 1, axis=1).to_numpy(),\n", + " np.where(df.A + df.B < 1.0, 0, 1),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "de5e05b5-154e-498c-a565-3116e490ae11", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.array_equal(\n", + " df.apply(lambda x: 0 if x.A + x.B < 1.0 else 1, axis=1).to_numpy(),\n", + " np.select([df.A + df.B < 1.0, df.A + df.B >= 1.0], [0, 1]),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9b46cd48-f1ce-4041-9560-6c1b09556d53", + "metadata": {}, + "source": [ + "All three approaches produce the same results." + ] + }, + { + "cell_type": "markdown", + "id": "c63e4df2-6fed-4072-aadd-3256a7c8cede", + "metadata": {}, + "source": [ + "### Adding a column" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "ef441507-f6f5-4485-b03f-36636259a848", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "563 ms ± 8.58 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit df['C'] = df.apply(lambda x: x.A + x.B, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "bd13d78b-b7fd-40c0-8b0e-3bdafdef4b33", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "176 μs ± 2.21 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit df['C'] = df.A + df.B" + ] + }, + { + "cell_type": "markdown", + "id": "3f092bfc-9f32-4636-ba95-52b2c07d2fdb", + "metadata": {}, + "source": [ + "Clearly, `.apply()` is very slow compared to a straightforward column definition. The difference is more than a factor of 1,000." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "5c8f3b66-1eea-4e58-9035-f6db4af3df3f", + "metadata": {}, + "outputs": [], + "source": [ + "assert df.apply(lambda x: x.A + x.B, axis=1).equals(df.A + df.B)" + ] + }, + { + "cell_type": "markdown", + "id": "c31c53ea-e297-4658-b55b-35ab47987237", + "metadata": {}, + "source": [ + "Both approaches yield the same result." + ] + }, + { + "cell_type": "markdown", + "id": "a32a0791-8063-40ac-83d0-93a5ab796c70", + "metadata": {}, + "source": [ + "### Aggregating columns" + ] + }, + { + "cell_type": "markdown", + "id": "8be8ec5b-878b-4452-9815-9c0a23f97d9d", + "metadata": {}, + "source": [ + "Although less dramatically so, applying `.apply()` along axis 0 is also slower than its numpy counterpart." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "47d6aca2-f52e-4746-a139-119fcdfe3030", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "303 μs ± 4.28 μs per loop (mean ± std. dev. 
of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit df.apply(np.sum, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "1e4ed799-08fd-4c14-bdf0-f6db5b829c0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "179 μs ± 10.2 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit np.sum(df.to_numpy(), axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "d0504d19-a6d8-4f3d-a4ff-73e9c04152e4", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.array_equal(df.apply(np.sum, axis=0), np.sum(df.to_numpy(), axis=0))" + ] + }, + { + "cell_type": "markdown", + "id": "ce8fb4ac-795e-43e3-ae72-fa528df86855", + "metadata": {}, + "source": [ + "Again, both produce the same result." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a841ab6918c550d8efb25fd715b00d8a1f12d5a5 Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Thu, 20 Nov 2025 18:32:09 +0100 Subject: [PATCH 05/10] Display pandas version --- source-code/pandas/copy_on_write.ipynb | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/source-code/pandas/copy_on_write.ipynb b/source-code/pandas/copy_on_write.ipynb index 744c29b..199cb6f 100644 --- a/source-code/pandas/copy_on_write.ipynb +++ b/source-code/pandas/copy_on_write.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "id": "9550bae2-79e5-4db6-b8eb-4b64dbc1b4e1", "metadata": {}, "outputs": [], @@ -37,6 +37,27 @@ "The 
answer to that question seems to depend on the version of `pandas`." ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "44918d07-7c0c-45ba-a1bc-04a0fa27c06d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.3.3'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.__version__" + ] + }, { "cell_type": "markdown", "id": "ea08f1e0-78c1-463c-994c-92066363b006", @@ -1688,7 +1709,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.12.12" } }, "nbformat": 4, From abe85d1d31b4642c219caed27459dbd7f5c3198d Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Fri, 21 Nov 2025 10:32:27 +0100 Subject: [PATCH 06/10] Fix typo --- source-code/pandas/copy_on_write.ipynb | 728 ++++++++++++------------- 1 file changed, 364 insertions(+), 364 deletions(-) diff --git a/source-code/pandas/copy_on_write.ipynb b/source-code/pandas/copy_on_write.ipynb index 199cb6f..d18c56d 100644 --- a/source-code/pandas/copy_on_write.ipynb +++ b/source-code/pandas/copy_on_write.ipynb @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "44918d07-7c0c-45ba-a1bc-04a0fa27c06d", "metadata": {}, "outputs": [ @@ -49,7 +49,7 @@ "'2.3.3'" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "id": "520bc27d-5d55-46dc-a7bf-5b9b12162b9d", "metadata": {}, "outputs": [], @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "id": "432772e9-3031-4001-81e3-c346cb7f9c76", "metadata": {}, "outputs": [ @@ -127,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "id": "eb6f4dad-6df3-493e-869f-204d06b0cc7d", "metadata": {}, "outputs": [ @@ -168,50 +168,50 @@ " \n", " \n", " mean\n", - " 4.225888\n", 
- " 1.284407\n", - " -2.830820\n", - " 0.143920\n", + " -0.666188\n", + " -0.175422\n", + " 4.077410\n", + " 2.294040\n", " \n", " \n", " std\n", - " 577.759840\n", - " 577.063194\n", - " 577.528168\n", - " 576.729627\n", + " 576.312351\n", + " 576.718069\n", + " 577.017098\n", + " 577.149466\n", " \n", " \n", " min\n", - " -999.977694\n", - " -999.908941\n", + " -999.995592\n", + " -999.992864\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -495.491551\n", - " -497.783061\n", - " -506.000000\n", + " -499.708991\n", + " -497.766770\n", + " -496.000000\n", " -499.000000\n", " \n", " \n", " 50%\n", - " 4.748510\n", - " 2.103870\n", - " -4.000000\n", - " 0.000000\n", + " -1.080819\n", + " -0.135347\n", + " 2.000000\n", + " 6.000000\n", " \n", " \n", " 75%\n", - " 505.160586\n", - " 500.502871\n", - " 497.000000\n", - " 498.000000\n", + " 498.134233\n", + " 495.339400\n", + " 504.000000\n", + " 501.000000\n", " \n", " \n", " max\n", - " 999.992940\n", - " 999.947904\n", + " 999.976901\n", + " 999.997666\n", " 999.000000\n", " 999.000000\n", " \n", @@ -222,16 +222,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 100000.000000 100000.000000 100000.000000 100000.000000\n", - "mean 4.225888 1.284407 -2.830820 0.143920\n", - "std 577.759840 577.063194 577.528168 576.729627\n", - "min -999.977694 -999.908941 -999.000000 -999.000000\n", - "25% -495.491551 -497.783061 -506.000000 -499.000000\n", - "50% 4.748510 2.103870 -4.000000 0.000000\n", - "75% 505.160586 500.502871 497.000000 498.000000\n", - "max 999.992940 999.947904 999.000000 999.000000" + "mean -0.666188 -0.175422 4.077410 2.294040\n", + "std 576.312351 576.718069 577.017098 577.149466\n", + "min -999.995592 -999.992864 -999.000000 -999.000000\n", + "25% -499.708991 -497.766770 -496.000000 -499.000000\n", + "50% -1.080819 -0.135347 2.000000 6.000000\n", + "75% 498.134233 495.339400 504.000000 501.000000\n", + "max 999.976901 999.997666 999.000000 999.000000" ] }, - 
"execution_count": 17, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -250,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 6, "id": "1a43e7eb-a7ad-4089-95ba-15879f5920ce", "metadata": {}, "outputs": [], @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 7, "id": "1af4b0bc-81eb-4dab-9a65-fb5f71d004c2", "metadata": {}, "outputs": [ @@ -296,56 +296,56 @@ " count\n", " 50000.000000\n", " 50000.000000\n", - " 50000.00000\n", + " 50000.000000\n", " 50000.000000\n", " \n", " \n", " mean\n", - " 4.828118\n", - " 0.964712\n", - " -2.11148\n", - " -3.656300\n", + " -0.119765\n", + " -2.058561\n", + " -0.764240\n", + " 3.063220\n", " \n", " \n", " std\n", - " 577.936131\n", - " 577.971021\n", - " 577.96331\n", - " 576.181292\n", + " 575.680081\n", + " 578.254495\n", + " 577.348834\n", + " 577.086857\n", " \n", " \n", " min\n", - " -999.944283\n", - " -999.897600\n", - " -999.00000\n", + " -999.933748\n", + " -999.992864\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -493.817751\n", - " -500.003775\n", - " -504.00000\n", - " -503.000000\n", + " -499.293681\n", + " -499.616524\n", + " -501.000000\n", + " -498.000000\n", " \n", " \n", " 50%\n", - " 4.462345\n", - " 3.577384\n", - " -6.00000\n", - " -2.000000\n", + " -1.213840\n", + " -4.281987\n", + " -3.000000\n", + " 10.000000\n", " \n", " \n", " 75%\n", - " 503.114598\n", - " 500.886860\n", - " 501.00000\n", - " 494.000000\n", + " 498.062126\n", + " 495.209044\n", + " 500.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.964569\n", - " 999.864196\n", - " 999.00000\n", + " 999.976901\n", + " 999.997666\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -353,18 +353,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 50000.000000 50000.000000 50000.00000 50000.000000\n", - "mean 4.828118 0.964712 -2.11148 -3.656300\n", - "std 577.936131 577.971021 577.96331 
576.181292\n", - "min -999.944283 -999.897600 -999.00000 -999.000000\n", - "25% -493.817751 -500.003775 -504.00000 -503.000000\n", - "50% 4.462345 3.577384 -6.00000 -2.000000\n", - "75% 503.114598 500.886860 501.00000 494.000000\n", - "max 999.964569 999.864196 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 50000.000000 50000.000000 50000.000000 50000.000000\n", + "mean -0.119765 -2.058561 -0.764240 3.063220\n", + "std 575.680081 578.254495 577.348834 577.086857\n", + "min -999.933748 -999.992864 -999.000000 -999.000000\n", + "25% -499.293681 -499.616524 -501.000000 -498.000000\n", + "50% -1.213840 -4.281987 -3.000000 10.000000\n", + "75% 498.062126 495.209044 500.000000 499.250000\n", + "max 999.976901 999.997666 999.000000 999.000000" ] }, - "execution_count": 19, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -383,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 8, "id": "65dfb904-0b0a-4b4a-baab-77011a840910", "metadata": {}, "outputs": [ @@ -391,7 +391,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_565/3787905307.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "/tmp/ipykernel_15868/3787905307.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", @@ -407,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "id": "cbeb2db9-7018-4cac-9dd4-2880d2f7a214", "metadata": {}, "outputs": [ @@ -443,56 +443,56 @@ " count\n", " 50000.000000\n", " 50000.000000\n", - " 50000.00000\n", + " 50000.000000\n", " 50000.000000\n", " \n", " \n", " mean\n", - " 2.605923\n", - " 0.964712\n", - " -2.11148\n", - " -3.656300\n", + " 0.147446\n", + " -2.058561\n", + " -0.764240\n", + " 3.063220\n", " \n", " \n", " std\n", - " 408.136559\n", - " 577.971021\n", - " 577.96331\n", - " 576.181292\n", + " 407.380662\n", + " 578.254495\n", + " 577.348834\n", + " 577.086857\n", " \n", " \n", " min\n", " -500.000000\n", - " -999.897600\n", - " -999.00000\n", + " -999.992864\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -493.817751\n", - " -500.003775\n", - " -504.00000\n", - " -503.000000\n", + " -499.293681\n", + " -499.616524\n", + " -501.000000\n", + " -498.000000\n", " \n", " \n", " 50%\n", - " 4.462345\n", - " 3.577384\n", - " -6.00000\n", - " -2.000000\n", + " -1.213840\n", + " -4.281987\n", + " -3.000000\n", + " 10.000000\n", " \n", " \n", " 75%\n", + " 498.062126\n", + " 495.209044\n", " 500.000000\n", - " 500.886860\n", - " 501.00000\n", - " 494.000000\n", + " 499.250000\n", " \n", " \n", " max\n", " 500.000000\n", - " 999.864196\n", - " 999.00000\n", + " 999.997666\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -500,18 +500,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 50000.000000 50000.000000 50000.00000 50000.000000\n", - "mean 2.605923 0.964712 -2.11148 -3.656300\n", - "std 408.136559 
577.971021 577.96331 576.181292\n", - "min -500.000000 -999.897600 -999.00000 -999.000000\n", - "25% -493.817751 -500.003775 -504.00000 -503.000000\n", - "50% 4.462345 3.577384 -6.00000 -2.000000\n", - "75% 500.000000 500.886860 501.00000 494.000000\n", - "max 500.000000 999.864196 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 50000.000000 50000.000000 50000.000000 50000.000000\n", + "mean 0.147446 -2.058561 -0.764240 3.063220\n", + "std 407.380662 578.254495 577.348834 577.086857\n", + "min -500.000000 -999.992864 -999.000000 -999.000000\n", + "25% -499.293681 -499.616524 -501.000000 -498.000000\n", + "50% -1.213840 -4.281987 -3.000000 10.000000\n", + "75% 498.062126 495.209044 500.000000 499.250000\n", + "max 500.000000 999.997666 999.000000 999.000000" ] }, - "execution_count": 21, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -564,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 10, "id": "b4cff949-c4aa-4f7d-b1e4-b4b78bf0284b", "metadata": {}, "outputs": [], @@ -582,7 +582,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "id": "6bb4fec9-0623-4482-8ebd-9077723956e0", "metadata": {}, "outputs": [], @@ -597,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "id": "f005ad9f-3b19-40bf-b01a-4f7fd8f4d024", "metadata": {}, "outputs": [], @@ -607,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "id": "91499993-8461-437d-b7b1-7f0caf20d7d6", "metadata": {}, "outputs": [ @@ -648,50 +648,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " 
-999.968792\n", + " -999.965920\n", + " -999.963945\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -702,16 +702,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 25, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -722,7 +722,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "id": "abf7f30b-2bd0-4b3b-94fd-f9b6f183b26b", "metadata": {}, "outputs": [ @@ -730,7 +730,7 @@ "name": "stderr", "output_type": "stream", "text": [ - 
"/tmp/ipykernel_565/3016868282.py:1: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "/tmp/ipykernel_15868/3016868282.py:1: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "When using the Copy-on-Write mode, such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object.\n", @@ -754,7 +754,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 15, "id": "34e80602-a9fa-44d2-9fbb-3b6d4bd6a3d3", "metadata": {}, "outputs": [ @@ -795,50 +795,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.968792\n", + " -999.965920\n", + " -999.963945\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 
999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -849,16 +849,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 28, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -877,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 16, "id": "acc94632-2ed2-4dc1-a067-6d9518044c7b", "metadata": {}, "outputs": [ @@ -913,56 +913,56 @@ " count\n", " 100000.000000\n", " 100000.000000\n", - " 100000.00000\n", + " 100000.000000\n", " 100000.000000\n", " \n", " \n", " mean\n", - " 1.623964\n", - " -3.764614\n", - " 1.34378\n", - " 1.478400\n", + " 0.540672\n", + " 4.948220\n", + " 1.349060\n", + " 1.572440\n", " \n", " \n", " std\n", - " 577.971329\n", - " 577.167935\n", - " 575.56337\n", - " 577.826276\n", + " 577.009925\n", + " 577.169270\n", + " 576.321436\n", + " 576.108494\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.980827\n", - " -999.00000\n", + " -999.965920\n", + " -999.988117\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -499.179172\n", - " -503.072353\n", - " -497.00000\n", - " -497.000000\n", + " -499.069860\n", 
+ " -494.820546\n", + " -496.000000\n", + " -496.000000\n", " \n", " \n", " 50%\n", - " 3.133260\n", - " -5.691740\n", - " 1.00000\n", - " 2.000000\n", + " 2.821413\n", + " 5.823958\n", + " 3.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.454035\n", - " 495.773629\n", - " 497.00000\n", + " 499.807599\n", + " 505.991898\n", " 501.000000\n", + " 500.000000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.985790\n", - " 999.00000\n", + " 999.984508\n", + " 999.948488\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -970,18 +970,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 100000.000000 100000.000000 100000.00000 100000.000000\n", - "mean 1.623964 -3.764614 1.34378 1.478400\n", - "std 577.971329 577.167935 575.56337 577.826276\n", - "min -999.983952 -999.980827 -999.00000 -999.000000\n", - "25% -499.179172 -503.072353 -497.00000 -497.000000\n", - "50% 3.133260 -5.691740 1.00000 2.000000\n", - "75% 502.454035 495.773629 497.00000 501.000000\n", - "max 999.998666 999.985790 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 100000.000000 100000.000000 100000.000000 100000.000000\n", + "mean 0.540672 4.948220 1.349060 1.572440\n", + "std 577.009925 577.169270 576.321436 576.108494\n", + "min -999.965920 -999.988117 -999.000000 -999.000000\n", + "25% -499.069860 -494.820546 -496.000000 -496.000000\n", + "50% 2.821413 5.823958 3.000000 1.000000\n", + "75% 499.807599 505.991898 501.000000 500.000000\n", + "max 999.984508 999.948488 999.000000 999.000000" ] }, - "execution_count": 29, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1000,7 +1000,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 17, "id": "1b86dd08-d8de-4b03-a363-80d18201b4da", "metadata": {}, "outputs": [], @@ -1018,7 +1018,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 18, "id": "dea886ac-e1a2-4903-b550-e73a2be70507", "metadata": {}, "outputs": 
[ @@ -1059,50 +1059,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.968792\n", + " -999.965920\n", + " -999.963945\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -1113,16 +1113,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 
999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 31, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1141,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 19, "id": "c73063c8-4e9d-42ba-9804-1286b769ad54", "metadata": {}, "outputs": [ @@ -1177,56 +1177,56 @@ " count\n", " 100000.000000\n", " 100000.000000\n", - " 100000.00000\n", + " 100000.000000\n", " 100000.000000\n", " \n", " \n", " mean\n", - " 1.174175\n", - " -3.764614\n", - " 1.34378\n", - " 1.478400\n", + " 0.653302\n", + " 4.948220\n", + " 1.349060\n", + " 1.572440\n", " \n", " \n", " std\n", - " 408.414807\n", - " 577.167935\n", - " 575.56337\n", - " 577.826276\n", + " 408.053983\n", + " 577.169270\n", + " 576.321436\n", + " 576.108494\n", " \n", " \n", " min\n", " -500.000000\n", - " -999.980827\n", - " -999.00000\n", + " -999.988117\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -499.179172\n", - " -503.072353\n", - " -497.00000\n", - " -497.000000\n", + " -499.069860\n", + " -494.820546\n", + " -496.000000\n", + " -496.000000\n", " \n", " \n", " 50%\n", - " 3.133260\n", - " -5.691740\n", - " 1.00000\n", - " 2.000000\n", + " 2.821413\n", + " 5.823958\n", + " 3.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 500.000000\n", - " 495.773629\n", - " 497.00000\n", + " 499.807599\n", + " 505.991898\n", " 501.000000\n", + " 500.000000\n", " \n", " \n", " max\n", " 500.000000\n", - " 999.985790\n", - " 999.00000\n", + " 999.948488\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -1234,18 +1234,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 100000.000000 100000.000000 100000.00000 100000.000000\n", - "mean 1.174175 -3.764614 1.34378 1.478400\n", - "std 408.414807 577.167935 575.56337 577.826276\n", - "min -500.000000 -999.980827 -999.00000 -999.000000\n", - "25% -499.179172 -503.072353 -497.00000 -497.000000\n", - "50% 3.133260 -5.691740 1.00000 2.000000\n", 
- "75% 500.000000 495.773629 497.00000 501.000000\n", - "max 500.000000 999.985790 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 100000.000000 100000.000000 100000.000000 100000.000000\n", + "mean 0.653302 4.948220 1.349060 1.572440\n", + "std 408.053983 577.169270 576.321436 576.108494\n", + "min -500.000000 -999.988117 -999.000000 -999.000000\n", + "25% -499.069860 -494.820546 -496.000000 -496.000000\n", + "50% 2.821413 5.823958 3.000000 1.000000\n", + "75% 499.807599 505.991898 501.000000 500.000000\n", + "max 500.000000 999.948488 999.000000 999.000000" ] }, - "execution_count": 32, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1264,7 +1264,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 20, "id": "1151d8a4-db49-4524-b7f3-bd38866e3d5e", "metadata": {}, "outputs": [], @@ -1274,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 21, "id": "bd8ce0d1-73cb-4982-b4fe-65e7d2144943", "metadata": {}, "outputs": [ @@ -1310,56 +1310,56 @@ " count\n", " 100000.000000\n", " 100000.000000\n", - " 100000.00000\n", + " 100000.000000\n", " 100000.000000\n", " \n", " \n", " mean\n", - " 1.174175\n", - " -2.876739\n", - " 1.34378\n", - " 1.478400\n", + " 0.653302\n", + " 3.035664\n", + " 1.349060\n", + " 1.572440\n", " \n", " \n", " std\n", - " 408.414807\n", - " 408.107880\n", - " 575.56337\n", - " 577.826276\n", + " 408.053983\n", + " 408.488775\n", + " 576.321436\n", + " 576.108494\n", " \n", " \n", " min\n", " -500.000000\n", " -500.000000\n", - " -999.00000\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -499.179172\n", - " -500.000000\n", - " -497.00000\n", - " -497.000000\n", + " -499.069860\n", + " -494.820546\n", + " -496.000000\n", + " -496.000000\n", " \n", " \n", " 50%\n", - " 3.133260\n", - " -5.691740\n", - " 1.00000\n", - " 2.000000\n", + " 2.821413\n", + " 5.823958\n", + " 3.000000\n", + " 1.000000\n", " \n", " \n", " 
75%\n", + " 499.807599\n", " 500.000000\n", - " 495.773629\n", - " 497.00000\n", " 501.000000\n", + " 500.000000\n", " \n", " \n", " max\n", " 500.000000\n", " 500.000000\n", - " 999.00000\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -1367,18 +1367,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 100000.000000 100000.000000 100000.00000 100000.000000\n", - "mean 1.174175 -2.876739 1.34378 1.478400\n", - "std 408.414807 408.107880 575.56337 577.826276\n", - "min -500.000000 -500.000000 -999.00000 -999.000000\n", - "25% -499.179172 -500.000000 -497.00000 -497.000000\n", - "50% 3.133260 -5.691740 1.00000 2.000000\n", - "75% 500.000000 495.773629 497.00000 501.000000\n", - "max 500.000000 500.000000 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 100000.000000 100000.000000 100000.000000 100000.000000\n", + "mean 0.653302 3.035664 1.349060 1.572440\n", + "std 408.053983 408.488775 576.321436 576.108494\n", + "min -500.000000 -500.000000 -999.000000 -999.000000\n", + "25% -499.069860 -494.820546 -496.000000 -496.000000\n", + "50% 2.821413 5.823958 3.000000 1.000000\n", + "75% 499.807599 500.000000 501.000000 500.000000\n", + "max 500.000000 500.000000 999.000000 999.000000" ] }, - "execution_count": 34, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1397,7 +1397,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 22, "id": "bea974f5-367b-4432-893f-a33a449dda6f", "metadata": {}, "outputs": [ @@ -1438,50 +1438,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.968792\n", + " -999.965920\n", + " -999.963945\n", " 
-999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -1492,16 +1492,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 35, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1528,7 +1528,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 23, "id": "e4aa3b34-c948-4efe-b523-37e3ab2b2522", "metadata": {}, "outputs": [], @@ -1538,7 +1538,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 24, "id": "018ed5c4-de6f-4a18-bc11-ae3917dcf481", 
"metadata": {}, "outputs": [], @@ -1553,7 +1553,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 25, "id": "08daae66-9e28-4394-8021-b64cd806a4ab", "metadata": {}, "outputs": [ @@ -1561,7 +1561,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "6.94 ms ± 638 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "7.19 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -1572,7 +1572,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 26, "id": "04dd1afc-11cb-46a6-9e15-28461c9e96b3", "metadata": {}, "outputs": [ @@ -1580,7 +1580,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "946 μs ± 49.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + "962 μs ± 137 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -1591,7 +1591,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 27, "id": "5bc546df-96b7-446c-be09-3bdc48b6e86b", "metadata": {}, "outputs": [], @@ -1601,7 +1601,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 28, "id": "e8d2d195-0c5f-48ee-956f-895c6ae45fa3", "metadata": {}, "outputs": [], @@ -1616,7 +1616,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 29, "id": "5d455f94-47f4-4582-8019-15763457198b", "metadata": {}, "outputs": [ @@ -1624,7 +1624,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "778 ms ± 78.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "971 ms ± 162 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1635,7 +1635,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 30, "id": "b8b68166-7010-41d7-b2ea-f6536dc2d147", "metadata": {}, "outputs": [ @@ -1643,7 +1643,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "49 ms ± 6.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "66.8 ms ± 13.7 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], @@ -1657,7 +1657,7 @@ "id": "ca530907-9cda-408f-816a-f6f34351e0d9", "metadata": {}, "source": [ - "As you can see, assigning the column to perform an in--place operations is significantly faster." + "As you can see, assigning the column to perform an in-place operation is significantly faster." ] }, { From fde9090824a69df62f9e8cbe2077a6f75904d973 Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Mon, 24 Nov 2025 11:00:12 +0100 Subject: [PATCH 07/10] Add numba example --- source-code/pandas/README.md | 2 + source-code/pandas/numba_and_pandas.ipynb | 327 ++++++++++++++++++++++ 2 files changed, 329 insertions(+) create mode 100644 source-code/pandas/numba_and_pandas.ipynb diff --git a/source-code/pandas/README.md b/source-code/pandas/README.md index 313b336..1bb6cf9 100644 --- a/source-code/pandas/README.md +++ b/source-code/pandas/README.md @@ -32,3 +32,5 @@ easy to use. 1. `apply.ipynb`: Jupyter notebook that illustrates the use of the `apply` method in pandas dataframes for applying functions along rows or columns. It includes a comparison of performance between using `apply` and vectorized operations. +1. `numba_and_pandas.ipynb`: Jupyter notebook that demonstrates how to use Numba + to optimize performance of operations on pandas dataframes. 
diff --git a/source-code/pandas/numba_and_pandas.ipynb b/source-code/pandas/numba_and_pandas.ipynb new file mode 100644 index 0000000..6dc82e0 --- /dev/null +++ b/source-code/pandas/numba_and_pandas.ipynb @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4be65a5f-39d9-42f6-a1ec-beee22363ce3", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2c930efe-41a4-4665-a419-7d1ad683cc82", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from numba import njit\n", + "import numpy as np\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "id": "8cfbec4d-90d1-4e45-9c9e-310475a395f0", + "metadata": {}, + "source": [ + "## Using numba" + ] + }, + { + "cell_type": "markdown", + "id": "8ec085cd-5fce-420b-8f12-46b9a9e56269", + "metadata": {}, + "source": [ + "Consider the following dataframe with 2 million rows." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "31f2457e-b51a-4e6c-91fb-852c088c6792", + "metadata": {}, + "outputs": [], + "source": [ + "size = 2_000_000\n", + "df = pd.DataFrame({\n", + " 'x': np.random.rand(size),\n", + " 'y': np.random.rand(size),\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "7203ebcd-b1ba-4877-90ef-e1b9665d1029", + "metadata": {}, + "source": [ + "You want to create a series computed as $\\sqrt{x^2 + y^2}$. You can consider three approaches:\n", + "1. pandas' `.apply()` method,\n", + "2. numpy expressions, and\n", + "3. using a numba function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "20974ffd-621b-4ba5-a168-01f27a307712", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8.93 s, sys: 288 ms, total: 9.22 s\n", + "Wall time: 9.22 s\n" + ] + } + ], + "source": [ + "%time df.apply(lambda row: np.sqrt(row['x']**2 + row['y']**2), axis=1);" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b15fabcb-22d5-487e-bd79-72d1f6924e7f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 34.4 ms, sys: 16.1 ms, total: 50.5 ms\n", + "Wall time: 48.9 ms\n" + ] + } + ], + "source": [ + "%time np.sqrt(df['x']**2 + df['y']**2);" + ] + }, + { + "cell_type": "markdown", + "id": "0f53a473-d8bc-40b3-ba63-7b4db3ec81ca", + "metadata": {}, + "source": [ + "It is clear that using numpy is much more efficient; the speedup is almost a factor of 200." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4bddc8bb-5e68-4a7c-9c45-589f797dac12", + "metadata": {}, + "outputs": [], + "source": [ + "@njit\n", + "def score_numba(x, y):\n", + " result = np.empty_like(x)\n", + " for i in range(len(x)):\n", + " result[i] = np.sqrt(x[i]**2 + y[i]**2)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bd7cc279-76d1-4b2d-a300-24142d0fafa2", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 248 ms, sys: 47.8 ms, total: 296 ms\n", + "Wall time: 433 ms\n" + ] + } + ], + "source": [ + "%time score_numba(df.x.values, df.y.values);" + ] + }, + { + "cell_type": "markdown", + "id": "a5b4731e-593d-41ee-8354-54a066992c8c", + "metadata": {}, + "source": [ + "Using numba is about 20 times faster than the pandas `.apply()` method, but almost 10 times slower than numpy, so is there a point?\n", + "\n", + "There is if you run that function multiple times." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8339e439-3d9c-4404-af22-3af502ecf941", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.7 ms, sys: 0 ns, total: 5.7 ms\n", + "Wall time: 5.77 ms\n" + ] + } + ], + "source": [ + "%time score_numba(df.x.values, df.y.values);" + ] + }, + { + "cell_type": "markdown", + "id": "d6900b2d-3dd1-4e90-aee2-a526b8dd058d", + "metadata": {}, + "source": [ + "As you can see, numba is now about 1,600 times faster than the equivalent `.apply()` method call. Once the initial compilation has been done, there is little or no overhead on subsequent calls." + ] + }, + { + "cell_type": "markdown", + "id": "7b302dd6-97e9-4294-8b3c-2236fd3c03a2", + "metadata": {}, + "source": [ + "This can be compared to a similar implementation with a non-compiled Python function." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "024672ee-7e66-4e6b-a199-b78c81489735", + "metadata": {}, + "outputs": [], + "source": [ + "def score_python(x, y):\n", + " result = np.empty_like(x)\n", + " for i in range(len(x)):\n", + " result[i] = np.sqrt(x[i]**2 + y[i]**2)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "796d265e-4c6b-4f70-ac08-dfe7d90036db", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.74 s, sys: 0 ns, total: 1.74 s\n", + "Wall time: 1.74 s\n" + ] + } + ], + "source": [ + "%time score_python(df.x.values, df.y.values);" + ] + }, + { + "cell_type": "markdown", + "id": "26476f95-fb22-4751-8ac9-80d90f210c6c", + "metadata": {}, + "source": [ + "Even this approach is faster than pandas' `.apply()`, but still more than 200 times slower than numba (once compiled)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f613ea5c-eddc-4517-a31b-980ae613d072", + "metadata": {}, + "source": [ + "## Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "15f830eb-3ea7-43b7-a31f-c2b234735df7", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sizes = [50_000, 100_000, 500_000, 1_000_000, 2_000_000]\n", + "times_apply, times_numpy, times_numba = [], [], []\n", + "\n", + "for size in sizes:\n", + " df = pd.DataFrame({\n", + " \"x\": np.random.rand(size),\n", + " \"y\": np.random.rand(size)\n", + " })\n", + " \n", + " start = time.time()\n", + " df.apply(lambda row: np.sqrt(row['x']**2 + row['y']**2), axis=1)\n", + " times_apply.append(time.time() - start)\n", + " \n", + " start = time.time()\n", + " np.sqrt(df['x']**2 + df['y']**2)\n", + " times_numpy.append(time.time() - start)\n", + " \n", + " score_numba(df['x'].values, df['y'].values) # warm-up compile\n", + " start = time.time()\n", + " score_numba(df['x'].values, df['y'].values)\n", + " times_numba.append(time.time() - start)\n", + "\n", + "plt.figure(figsize=(7,5))\n", + "plt.plot(sizes, times_apply, \"o-r\", label=\"Pandas apply()\")\n", + "plt.plot(sizes, times_numpy, \"x-g\", label=\"Numpy\")\n", + "plt.plot(sizes, times_numba, \"s-b\", label=\"Numba accelerated\")\n", + "plt.xlabel(\"Number of Rows\")\n", + "plt.ylabel(\"Runtime (seconds)\")\n", + "plt.yscale('log')\n", + "plt.legend()\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "markdown", + "id": "3e37e637-a4e5-4422-b769-4de985926a46", + "metadata": {}, + "source": [ + "It is clear that numba can significantly speed up computations that rely on functions that are called repeatedly." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 6f444ec10ff358c468f7a30c49d6b1c591700865 Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Mon, 24 Nov 2025 18:56:18 +0100 Subject: [PATCH 08/10] Add DuckDB examples --- source-code/README.md | 1 + source-code/duckdb/README.md | 14 + source-code/duckdb/patients.ipynb | 1386 +++++++++++++++++++++++++---- 3 files changed, 1250 insertions(+), 151 deletions(-) create mode 100644 source-code/duckdb/README.md diff --git a/source-code/README.md b/source-code/README.md index 4f9bfdc..177a58c 100644 --- a/source-code/README.md +++ b/source-code/README.md @@ -15,6 +15,7 @@ to create it. There is some material not covered in the presentation as well. representation and algorithms. * [`pandas`](pandas): illustrations of using pandas and seaborn. * [`polars`](polars): Kllustrations of using polars. +* [`duckdb`](duckdb): illustrations of using DuckDB for SQL queries. * [`regexes`](regexes): illustrations of using regular expressions for validation and information extraction from textual data. * [`seaborn`](seaborn): illustrations of using Seaborn to create plots. diff --git a/source-code/duckdb/README.md b/source-code/duckdb/README.md new file mode 100644 index 0000000..c4b0e68 --- /dev/null +++ b/source-code/duckdb/README.md @@ -0,0 +1,14 @@ +# DuckDB + +DuckDB is an in-process SQL OLAP database management system. It is designed to +support analytical query workloads and is optimized for fast query performance +on large datasets. 
DuckDB can be embedded directly into applications, making it +a popular choice for data analysis tasks in various programming environments. + + +## What is it? + +1. `patients.ipynb`: A Jupyter notebook that demonstrates how to use DuckDB for + analyzing patient data. It includes examples of loading data and executing + SQL queries. +1. `data/`: CSV files to use with the notebook. diff --git a/source-code/duckdb/patients.ipynb b/source-code/duckdb/patients.ipynb index 380233b..853801e 100644 --- a/source-code/duckdb/patients.ipynb +++ b/source-code/duckdb/patients.ipynb @@ -1,31 +1,946 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "9b23ccda-f524-41bc-975b-568da36a4493", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c0d0975-71ce-4e9a-986c-ac6cbfb251bb", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "d46cb701-62c6-4279-bf61-36935a0a53a9", + "metadata": {}, + "source": [ + "## Database connection" + ] + }, + { + "cell_type": "markdown", + "id": "6a0df994-ac76-4c42-904a-a6113ea419f9", + "metadata": {}, + "source": [ + "Create a connection to the database, and query metadata." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "861daf66-0810-48d9-968c-18e1dabe0e4f", + "metadata": {}, + "outputs": [], + "source": [ + "conn = duckdb.connect('data/patient_experiment.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "843f9d0f-8384-4977-916e-49ef59e5cbad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n", + "│ column_name │ column_type │ null │ key │ default │ extra │\n", + "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n", + "├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n", + "│ column0 │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", + "│ patient │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", + "│ dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n", + "│ temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.sql('''\n", + " DESCRIBE patient_experiment;\n", + "''')" + ] + }, + { + "cell_type": "markdown", + "id": "62195da0-ca79-4f65-b3f8-780783518d98", + "metadata": {}, + "source": [ + "Create a function to show the tables/views in the database." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a3effca9-7353-435b-b7b5-01a487034314", + "metadata": {}, + "outputs": [], + "source": [ + "def show_tables(conn):\n", + " conn.sql('''\n", + " SELECT table_schema, table_name, table_type\n", + " FROM information_schema.tables\n", + " WHERE table_schema NOT IN ('information_schema', 'pg_catalog');\n", + " ''').show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "66400c5a-ca95-4104-a537-7a29dbdb001b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌──────────────┬────────────────────┬────────────┐\n", + "│ table_schema │ table_name │ table_type │\n", + "│ varchar │ varchar │ varchar │\n", + "├──────────────┼────────────────────┼────────────┤\n", + "│ main │ file │ VIEW │\n", + "│ main │ patient_experiment │ VIEW │\n", + "└──────────────┴────────────────────┴────────────┘\n", + "\n" + ] + } + ], + "source": [ + "show_tables(conn)" + ] + }, + { + "cell_type": "markdown", + "id": "5bc72abf-863b-4516-bc42-14784d7a40b0", + "metadata": {}, + "source": [ + "## Queries" + ] + }, + { + "cell_type": "markdown", + "id": "bff175ad-cbe7-4a5d-91db-7d6eada3b695", + "metadata": {}, + "source": [ + "Select all the data for patient 6 and convert it to a pandas dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d4607877-270b-48c5-ba91-6aa19a8a24cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientdatetemperaturedose
062012-10-02 10:00:0037.50.0
162012-10-02 11:00:0038.12.0
262012-10-02 12:00:0037.93.0
362012-10-02 13:00:0037.72.0
462012-10-02 14:00:0037.21.0
562012-10-02 15:00:0036.80.0
\n", + "
" + ], + "text/plain": [ + " patient date temperature dose\n", + "0 6 2012-10-02 10:00:00 37.5 0.0\n", + "1 6 2012-10-02 11:00:00 38.1 2.0\n", + "2 6 2012-10-02 12:00:00 37.9 3.0\n", + "3 6 2012-10-02 13:00:00 37.7 2.0\n", + "4 6 2012-10-02 14:00:00 37.2 1.0\n", + "5 6 2012-10-02 15:00:00 36.8 0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT patient, date, temperature, dose\n", + " FROM patient_experiment\n", + " WHERE patient == 6;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "02af9a7f-84d2-4cb9-a705-441b4eff526a", + "metadata": {}, + "source": [ + "For the patients with a high fever, count the number of time points at which they had a temperature above $39.5^\\circ C$ as well as their maximum temperature." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "262b2ad4-05c0-445e-8005-f07307f88947", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patienthigh_fever_countmax_temperature
07340.7
19140.2
\n", + "
" + ], + "text/plain": [ + " patient high_fever_count max_temperature\n", + "0 7 3 40.7\n", + "1 9 1 40.2" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " COUNT(temperature) AS high_fever_count,\n", + " MAX(temperature) AS max_temperature\n", + " FROM patient_experiment\n", + " WHERE temperature > 39.5\n", + " GROUP BY patient\n", + " ORDER BY patient;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "98ef7ed5-23d1-4591-ab8a-405248f98a3a", + "metadata": {}, + "source": [ + "For each patient, compute the total dose administered, as well as the maximum temperature, and order by descending maximum temperature." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7df98e50-5ab8-4813-a3ad-d3796aa35c48", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientmax_temperaturetotal_dose
0740.730.0
1940.230.0
2539.527.0
3339.513.0
4239.415.0
5138.56.0
6638.18.0
7438.110.0
8837.90.0
\n", + "
" + ], + "text/plain": [ + " patient max_temperature total_dose\n", + "0 7 40.7 30.0\n", + "1 9 40.2 30.0\n", + "2 5 39.5 27.0\n", + "3 3 39.5 13.0\n", + "4 2 39.4 15.0\n", + "5 1 38.5 6.0\n", + "6 6 38.1 8.0\n", + "7 4 38.1 10.0\n", + "8 8 37.9 0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " MAX(temperature) AS 'max_temperature',\n", + " SUM(dose) AS 'total_dose'\n", + " FROM patient_experiment\n", + " GROUP BY patient\n", + " ORDER BY max_temperature DESC;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "d3d6fcbf-9bc5-40fb-9e4f-046def5249db", + "metadata": {}, + "source": [ + "If you want to query the result of such a query, you can create a view, `'hypothesis'` in this example." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6b8694e2-5e9f-4d15-b2c3-609ab762d7b4", + "metadata": {}, + "outputs": [], + "source": [ + "conn.execute('''\n", + " CREATE VIEW hypothesis AS SELECT\n", + " patient,\n", + " MAX(temperature) AS 'max_temperature',\n", + " SUM(dose) AS 'total_dose'\n", + " FROM patient_experiment\n", + " GROUP BY patient\n", + " ORDER BY max_temperature DESC;\n", + "''');" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7c755e2b-d781-435e-a878-9b29b6374670", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌──────────────┬────────────────────┬────────────┐\n", + "│ table_schema │ table_name │ table_type │\n", + "│ varchar │ varchar │ varchar │\n", + "├──────────────┼────────────────────┼────────────┤\n", + "│ main │ file │ VIEW │\n", + "│ main │ hypothesis │ VIEW │\n", + "│ main │ patient_experiment │ VIEW │\n", + "└──────────────┴────────────────────┴────────────┘\n", + "\n" + ] + } + ], + "source": [ + "show_tables(conn)" + ] + }, + { + "cell_type": "markdown", + "id": "96617d94-da47-4604-9c27-3375058cbd86", + "metadata": 
{}, + "source": [ + "Get the maximum dose administered to a patient." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "d1a405c8-05e4-4d27-bd1c-3d1fe61c5e43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
max(total_dose)
030.0
\n", + "
" + ], + "text/plain": [ + " max(total_dose)\n", + "0 30.0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT MAX(total_dose)\n", + " FROM hypothesis;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "2718eb2c-1e3a-4f90-a1f6-bb27e9d4a332", + "metadata": {}, + "source": [ + "Although DuckDB has an extension to perform a pivot (this is not standard SQL), it is not as elegant as the pandas counterpart, since multi-level columns are not supported by DuckDB." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0c4e1ad8-a9ae-4808-901b-c036e4e0ee17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<_duckdb.DuckDBPyConnection at 0x739eb95f72b0>" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " CREATE TABLE time_series AS\n", + " PIVOT patient_experiment\n", + " ON patient\n", + " USING\n", + " first(temperature) AS temperature,\n", + " first(dose) AS dose\n", + " GROUP BY date;\n", + "''');" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b4b38844-e6d2-4df1-8f6c-21e4daf77d48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌───────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n", + "│ column_name │ column_type │ null │ key │ default │ extra │\n", + "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n", + "├───────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n", + "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n", + "│ 1_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 1_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 2_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 2_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 3_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 3_dose │ DOUBLE
│ YES │ NULL │ NULL │ NULL │\n", + "│ 4_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 4_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 5_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 5_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 6_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 6_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 7_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 7_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 8_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 8_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 9_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 9_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "├───────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤\n", + "│ 19 rows 6 columns │\n", + "└─────────────────────────────────────────────────────────────────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('DESCRIBE time_series;').show()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "95b029d4-0329-4462-8737-94f036dc8147", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetemperaturedose
02012-10-02 10:00:0037.50.0
12012-10-02 11:00:0038.12.0
22012-10-02 12:00:0037.93.0
32012-10-02 13:00:0037.72.0
42012-10-02 14:00:0037.21.0
52012-10-02 15:00:0036.80.0
62012-10-02 16:00:00NaNNaN
\n", + "
" + ], + "text/plain": [ + " date temperature dose\n", + "0 2012-10-02 10:00:00 37.5 0.0\n", + "1 2012-10-02 11:00:00 38.1 2.0\n", + "2 2012-10-02 12:00:00 37.9 3.0\n", + "3 2012-10-02 13:00:00 37.7 2.0\n", + "4 2012-10-02 14:00:00 37.2 1.0\n", + "5 2012-10-02 15:00:00 36.8 0.0\n", + "6 2012-10-02 16:00:00 NaN NaN" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " date,\n", + " \"6_temperature\" AS temperature,\n", + " \"6_dose\" AS dose\n", + " FROM time_series\n", + " ORDER BY date;\n", + "''').df()" + ] + }, { "cell_type": "code", - "execution_count": 1, - "id": "4c0d0975-71ce-4e9a-986c-ac6cbfb251bb", + "execution_count": 27, + "id": "78a996cf-4bc2-40dc-8bce-27bcc6731b7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌────────────────────┐\n", + "│ name │\n", + "│ varchar │\n", + "├────────────────────┤\n", + "│ file │\n", + "│ hypothesis │\n", + "│ patient_experiment │\n", + "│ time_series │\n", + "└────────────────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('SHOW TABLES;').show()" + ] + }, + { + "cell_type": "markdown", + "id": "8637dcab-93f4-4d69-9fec-0bc840b03598", "metadata": {}, + "source": [ + "Create a view on a second CSV file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "8570b365-09d5-4993-908e-4eaac974204d", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "import duckdb\n", - "import pandas as pd" + "conn.execute('''\n", + " CREATE VIEW patient_metadata AS\n", + " SELECT *\n", + " FROM read_csv_auto('data/patient_metadata.csv', filename=true);\n", + "''');" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "861daf66-0810-48d9-968c-18e1dabe0e4f", + "execution_count": 41, + "id": "022db70e-64a2-40ad-920f-463f465b274c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌──────────────┬────────────────────┬────────────┐\n", + "│ table_schema │ table_name │ table_type │\n", + "│ varchar │ varchar │ varchar │\n", + "├──────────────┼────────────────────┼────────────┤\n", + "│ main │ time_series │ BASE TABLE │\n", + "│ main │ file │ VIEW │\n", + "│ main │ hypothesis │ VIEW │\n", + "│ main │ metadata │ VIEW │\n", + "│ main │ patient_experiment │ VIEW │\n", + "│ main │ patient_metadata │ VIEW │\n", + "└──────────────┴────────────────────┴────────────┘\n", + "\n" + ] + } + ], "source": [ - "conn = duckdb.connect('data/patient_experiment.csv')" + "show_tables(conn)" + ] + }, + { + "cell_type": "markdown", + "id": "8ae2a578-6849-4a22-ad2e-b770de3ff44e", + "metadata": {}, + "source": [ + "Determine the patient IDs that are either in `patient_experiment` or in `patient_metadata`, but not in both. Note that a full outer join is used to combine the information in both tables." ] }, { "cell_type": "code", - "execution_count": 25, - "id": "c0f56de5-2351-4473-b768-234a89a705ac", + "execution_count": 45, + "id": "6482c2b6-4896-4c30-9bb4-b5a278626364", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientpresent
04only in experiment
110only in metadata
211only in metadata
\n", + "
" + ], + "text/plain": [ + " patient present\n", + "0 4 only in experiment\n", + "1 10 only in metadata\n", + "2 11 only in metadata" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " DISTINCT COALESCE(exp.patient, mt.patient) AS patient,\n", + " CASE\n", + " WHEN exp.patient IS NOT NULL AND mt.patient IS NULL\n", + " THEN 'only in experiment'\n", + " WHEN exp.patient IS NULL and mt.patient IS NOT NULL\n", + " THEN 'only in metadata'\n", + " ELSE 'in both'\n", + " END AS present\n", + " FROM patient_experiment AS exp FULL OUTER JOIN patient_metadata AS mt\n", + " USING (patient)\n", + " WHERE NOT present = 'in both'\n", + " ORDER BY exp.patient, mt.patient;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "9cf58cd3-0804-4472-bfd2-72f5afd82166", "metadata": {}, + "source": [ + "You can do an inner join between the tables `patient_experiment` and `patient_metadata` to get the maximum temperature, the condition and gender for each patient that occurs in both tables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "1639aeaa-b33a-4889-869d-93500242980b", + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -49,114 +964,115 @@ " \n", " \n", " patient\n", + " max_temperature\n", + " condition\n", + " gender\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", + " 38.5\n", + " A\n", + " M\n", " \n", " \n", " 1\n", - " 3\n", + " 2\n", + " 39.4\n", + " A\n", + " F\n", " \n", " \n", " 2\n", - " 6\n", + " 3\n", + " 39.5\n", + " A\n", + " M\n", " \n", " \n", " 3\n", - " 7\n", + " 5\n", + " 39.5\n", + " A\n", + " M\n", " \n", " \n", " 4\n", - " 8\n", + " 6\n", + " 38.1\n", + " B\n", + " F\n", " \n", " \n", " 5\n", - " 5\n", + " 7\n", + " 40.7\n", + " B\n", + " M\n", " \n", " \n", " 6\n", - " 9\n", + " 8\n", + " 37.9\n", + " B\n", + " F\n", " \n", " \n", " 7\n", - " 2\n", - " \n", - " \n", - " 8\n", - " 4\n", + " 9\n", + " 40.2\n", + " B\n", + " M\n", " \n", " \n", "\n", "" ], "text/plain": [ - " patient\n", - "0 1\n", - "1 3\n", - "2 6\n", - "3 7\n", - "4 8\n", - "5 5\n", - "6 9\n", - "7 2\n", - "8 4" + " patient max_temperature condition gender\n", + "0 1 38.5 A M\n", + "1 2 39.4 A F\n", + "2 3 39.5 A M\n", + "3 5 39.5 A M\n", + "4 6 38.1 B F\n", + "5 7 40.7 B M\n", + "6 8 37.9 B F\n", + "7 9 40.2 B M" ] }, - "execution_count": 25, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "conn.execute('SELECT DISTINCT patient FROM \"patient_experiment\";').df()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "0c4e1ad8-a9ae-4808-901b-c036e4e0ee17", - "metadata": {}, - "outputs": [], - "source": [ - "sql_pivot = '''\n", - "CREATE TABLE time_series AS\n", - " PIVOT \"data/patient_experiment.csv\"\n", - " ON patient\n", - " USING\n", - " first(temperature) AS temperature,\n", - " first(dose) AS dose\n", - " GROUP BY date;\n", - "'''" + "conn.execute('''\n", + " SELECT\n", + " COALESCE(exp.patient, mt.patient) AS patient,\n", + " MAX(exp.temperature) AS 
max_temperature,\n", + " ANY_VALUE(mt.condition) AS condition,\n", + " ANY_VALUE(mt.gender) AS gender\n", + " FROM patient_experiment AS exp INNER JOIN patient_metadata AS mt\n", + " USING (patient)\n", + " GROUP BY exp.patient, mt.patient\n", + " ORDER BY exp.patient, mt.patient\n", + "''').df()" ] }, { - "cell_type": "code", - "execution_count": 16, - "id": "6ff65df2-1997-4983-b3c8-1753034c3218", + "cell_type": "markdown", + "id": "b2afc510-6340-4c59-aeec-e245d8358a0e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "conn.execute(sql_pivot)" + "## New style versus classic style" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "95b029d4-0329-4462-8737-94f036dc8147", + "execution_count": 51, + "id": "397405b0-756c-461a-81f1-e7af99b98d0c", "metadata": {}, "outputs": [ { @@ -180,66 +1096,128 @@ " \n", " \n", " \n", + " patient\n", " date\n", + " temperature\n", + " dose\n", " \n", " \n", " \n", " \n", " 0\n", - " 2012-10-02 14:00:00\n", + " 6\n", + " 2012-10-02 10:00:00\n", + " 37.5\n", + " 0.0\n", " \n", " \n", " 1\n", + " 6\n", " 2012-10-02 11:00:00\n", + " 38.1\n", + " 2.0\n", " \n", " \n", " 2\n", - " 2012-10-02 10:00:00\n", + " 6\n", + " 2012-10-02 12:00:00\n", + " 37.9\n", + " 3.0\n", " \n", " \n", " 3\n", + " 6\n", " 2012-10-02 13:00:00\n", + " 37.7\n", + " 2.0\n", " \n", " \n", " 4\n", - " 2012-10-02 12:00:00\n", + " 6\n", + " 2012-10-02 14:00:00\n", + " 37.2\n", + " 1.0\n", " \n", " \n", " 5\n", + " 6\n", " 2012-10-02 15:00:00\n", - " \n", - " \n", - " 6\n", - " 2012-10-02 16:00:00\n", + " 36.8\n", + " 0.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " date\n", - "0 2012-10-02 14:00:00\n", - "1 2012-10-02 11:00:00\n", - "2 2012-10-02 10:00:00\n", - "3 2012-10-02 13:00:00\n", - "4 2012-10-02 12:00:00\n", - "5 2012-10-02 15:00:00\n", - "6 2012-10-02 16:00:00" + " patient date temperature dose\n", + "0 6 2012-10-02 
10:00:00 37.5 0.0\n", + "1 6 2012-10-02 11:00:00 38.1 2.0\n", + "2 6 2012-10-02 12:00:00 37.9 3.0\n", + "3 6 2012-10-02 13:00:00 37.7 2.0\n", + "4 6 2012-10-02 14:00:00 37.2 1.0\n", + "5 6 2012-10-02 15:00:00 36.8 0.0" ] }, - "execution_count": 18, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "conn.execute('SELECT date FROM time_series;').df()" + "conn.execute('''\n", + " SELECT patient, date, temperature, dose\n", + " FROM patient_experiment\n", + " WHERE patient == 6;\n", + "''').df()" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "78a996cf-4bc2-40dc-8bce-27bcc6731b7b", + "execution_count": 52, + "id": "3f06bf99-ead9-4eec-8251-b834f5c2eb69", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────┬─────────────────────┬─────────────┬────────┐\n", + "│ patient │ date │ temperature │ dose │\n", + "│ int64 │ timestamp │ double │ double │\n", + "├─────────┼─────────────────────┼─────────────┼────────┤\n", + "│ 6 │ 2012-10-02 10:00:00 │ 37.5 │ 0.0 │\n", + "│ 6 │ 2012-10-02 11:00:00 │ 38.1 │ 2.0 │\n", + "│ 6 │ 2012-10-02 12:00:00 │ 37.9 │ 3.0 │\n", + "│ 6 │ 2012-10-02 13:00:00 │ 37.7 │ 2.0 │\n", + "│ 6 │ 2012-10-02 14:00:00 │ 37.2 │ 1.0 │\n", + "│ 6 │ 2012-10-02 15:00:00 │ 36.8 │ 0.0 │\n", + "└─────────┴─────────────────────┴─────────────┴────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('''\n", + " SELECT patient, date, temperature, dose\n", + " FROM patient_experiment\n", + "''').filter('patient = 6').show()" + ] + }, + { + "cell_type": "markdown", + "id": "cd9dee70-4c1c-4445-9b6c-f957791210fc", + "metadata": {}, + "source": [ + "For the patients with a high fever, count the number of time points at which they had a temperature above $39.5^\\circ C$ as well as their maximum temperature."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0f29e357-5c17-4aca-92dc-ceaecf959023", "metadata": {}, "outputs": [ { @@ -263,46 +1241,96 @@ " \n", " \n", " \n", - " name\n", + " patient\n", + " high_fever_count\n", + " max_temperature\n", " \n", " \n", " \n", " \n", " 0\n", - " file\n", + " 7\n", + " 3\n", + " 40.7\n", " \n", " \n", " 1\n", - " patient_experiment\n", - " \n", - " \n", - " 2\n", - " time_series\n", + " 9\n", + " 1\n", + " 40.2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " name\n", - "0 file\n", - "1 patient_experiment\n", - "2 time_series" + " patient high_fever_count max_temperature\n", + "0 7 3 40.7\n", + "1 9 1 40.2" ] }, - "execution_count": 23, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "conn.execute('show tables;').df()" + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " COUNT(temperature) AS high_fever_count,\n", + " MAX(temperature) AS max_temperature\n", + " FROM patient_experiment\n", + " WHERE temperature > 39.5\n", + " GROUP BY patient\n", + " ORDER BY patient;\n", + "''').df()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "d07c6fff-b72d-49c0-a0f7-f545e4013331", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────┬──────────────────┬─────────────────┐\n", + "│ patient │ high_fever_count │ max_temperature │\n", + "│ int64 │ int64 │ double │\n", + "├─────────┼──────────────────┼─────────────────┤\n", + "│ 7 │ 3 │ 40.7 │\n", + "│ 9 │ 1 │ 40.2 │\n", + "└─────────┴──────────────────┴─────────────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('SELECT patient, temperature FROM patient_experiment') \\\n", + " .filter('temperature > 39.5') \\\n", + " .aggregate(\n", + " 'patient, '\n", + " 'COUNT(temperature) AS high_fever_count, '\n", + " 'MAX(temperature) AS max_temperature',\n", + " group_expr='patient') \\\n", + " .show() " + ] + }, + { + "cell_type": "markdown", + "id": 
"65ab2e40-bbf8-4537-a41b-e2321568fb98", + "metadata": {}, + "source": [ + "For each patient, compute the total dose administered, as well as the maximum temperature, and order by descending maximum temperature." ] }, { "cell_type": "code", - "execution_count": 24, - "id": "ea5cf7e3-1b67-45fd-bdde-1ee294575149", + "execution_count": 18, + "id": "27f12e02-c92a-459a-a469-e733ba17052f", "metadata": {}, "outputs": [ { @@ -326,91 +1354,147 @@ " \n", " \n", " \n", - " table_schema\n", - " table_name\n", - " table_type\n", + " patient\n", + " max_temperature\n", + " total_dose\n", " \n", " \n", " \n", " \n", " 0\n", - " main\n", - " time_series\n", - " BASE TABLE\n", + " 7\n", + " 40.7\n", + " 30.0\n", " \n", " \n", " 1\n", - " main\n", - " file\n", - " VIEW\n", + " 9\n", + " 40.2\n", + " 30.0\n", " \n", " \n", " 2\n", - " main\n", - " patient_experiment\n", - " VIEW\n", + " 5\n", + " 39.5\n", + " 27.0\n", + " \n", + " \n", + " 3\n", + " 3\n", + " 39.5\n", + " 13.0\n", + " \n", + " \n", + " 4\n", + " 2\n", + " 39.4\n", + " 15.0\n", + " \n", + " \n", + " 5\n", + " 1\n", + " 38.5\n", + " 6.0\n", + " \n", + " \n", + " 6\n", + " 6\n", + " 38.1\n", + " 8.0\n", + " \n", + " \n", + " 7\n", + " 4\n", + " 38.1\n", + " 10.0\n", + " \n", + " \n", + " 8\n", + " 8\n", + " 37.9\n", + " 0.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " table_schema table_name table_type\n", - "0 main time_series BASE TABLE\n", - "1 main file VIEW\n", - "2 main patient_experiment VIEW" + " patient max_temperature total_dose\n", + "0 7 40.7 30.0\n", + "1 9 40.2 30.0\n", + "2 5 39.5 27.0\n", + "3 3 39.5 13.0\n", + "4 2 39.4 15.0\n", + "5 1 38.5 6.0\n", + "6 6 38.1 8.0\n", + "7 4 38.1 10.0\n", + "8 8 37.9 0.0" ] }, - "execution_count": 24, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "conn.sql(\"\"\"\n", - " SELECT table_schema, table_name, table_type\n", - " FROM information_schema.tables\n", - " WHERE table_schema NOT IN ('information_schema', 
'pg_catalog')\n", - "\"\"\").df()" + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " MAX(temperature) AS 'max_temperature',\n", + " SUM(dose) AS 'total_dose'\n", + " FROM patient_experiment\n", + " GROUP BY patient\n", + " ORDER BY max_temperature DESC;\n", + "''').df()" ] }, { "cell_type": "code", - "execution_count": 28, - "id": "843f9d0f-8384-4977-916e-49ef59e5cbad", + "execution_count": 77, + "id": "5f83cde9-129a-4f76-b199-3ae7adf3f08c", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n", - "│ column_name │ column_type │ null │ key │ default │ extra │\n", - "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n", - "├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n", - "│ column0 │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", - "│ patient │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", - "│ dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", - "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n", - "│ temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", - "└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────┬─────────────────┬────────────┐\n", + "│ patient │ max_temperature │ total_dose │\n", + "│ int64 │ double │ double │\n", + "├─────────┼─────────────────┼────────────┤\n", + "│ 7 │ 40.7 │ 30.0 │\n", + "│ 9 │ 40.2 │ 30.0 │\n", + "│ 5 │ 39.5 │ 27.0 │\n", + "│ 3 │ 39.5 │ 13.0 │\n", + "│ 2 │ 39.4 │ 15.0 │\n", + "│ 1 │ 38.5 │ 6.0 │\n", + "│ 4 │ 38.1 │ 10.0 │\n", + "│ 6 │ 38.1 │ 8.0 │\n", + "│ 8 │ 37.9 │ 0.0 │\n", + "└─────────┴─────────────────┴────────────┘\n", + "\n" + ] } ], "source": [ - "conn.sql('''dESCRIBE patient_experiment;''')" + "conn.sql('SELECT patient, temperature, dose from patient_experiment') \\\n", + " .aggregate(\n", + " 'patient, '\n", + " 'MAX(temperature) AS 
max_temperature, '\n", + " 'SUM(dose) AS total_dose',\n", + " group_expr='patient'\n", + " ) \\\n", + " .order('max_temperature DESC') \\\n", + " .show()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "8570b365-09d5-4993-908e-4eaac974204d", + "cell_type": "markdown", + "id": "ddf5f25c-b0c1-48c9-9031-5fd8a813649b", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "The new-style queries allow for lazy evaluation, while the classic-style queries are evaluated immediately." + ] } ], "metadata": { @@ -429,7 +1513,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.12" } }, "nbformat": 4, From d6fddbb3cc085b3fc48a211036b6c59132892688 Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Tue, 25 Nov 2025 08:16:36 +0100 Subject: [PATCH 09/10] Add illustration long to wide to long --- source-code/pandas/README.md | 2 + .../from_long_to_wide_and_back_again.ipynb | 1078 +++++++++++++++++ 2 files changed, 1080 insertions(+) create mode 100644 source-code/pandas/from_long_to_wide_and_back_again.ipynb diff --git a/source-code/pandas/README.md b/source-code/pandas/README.md index 1bb6cf9..80881e6 100644 --- a/source-code/pandas/README.md +++ b/source-code/pandas/README.md @@ -34,3 +34,5 @@ easy to use. a comparison of performance between using `apply` and vectorized operations. 1. `numba_and_pandas.ipynb`: Jupyter notebook that demonstrates how to use Numba to optimize performance of operations on pandas dataframes. +1. `from_long_to_wide_and_back_again.ipynb`: Jupyter notebook that illustrates + how to reshape data using `stack` and `pivot` methods in pandas. 
diff --git a/source-code/pandas/from_long_to_wide_and_back_again.ipynb b/source-code/pandas/from_long_to_wide_and_back_again.ipynb new file mode 100644 index 0000000..a3610d4 --- /dev/null +++ b/source-code/pandas/from_long_to_wide_and_back_again.ipynb @@ -0,0 +1,1078 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4773bfbd-a3d3-4683-b51e-c80a649a7caf", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d7ab15e7-4fd2-4e81-a9ae-8469b05ec45c", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "ee9056b1-0b76-4cbb-ad68-049b0095b62d", + "metadata": {}, + "source": [ + "## Original dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "2c1acc53-9064-4589-9e88-541175d3deb0", + "metadata": {}, + "outputs": [], + "source": [ + "df_orig = pd.read_excel('data/patient_experiment.xlsx',\n", + " dtype={'dose': np.float32,\n", + " 'temperature': np.float32})" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b64f834e-b044-422c-830f-112868864269", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 62 entries, 0 to 61\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 patient 62 non-null int64 \n", + " 1 dose 61 non-null float32 \n", + " 2 date 62 non-null datetime64[ns]\n", + " 3 temperature 61 non-null float32 \n", + "dtypes: datetime64[ns](1), float32(2), int64(1)\n", + "memory usage: 1.6 KB\n" + ] + } + ], + "source": [ + "df_orig.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "df59f8e2-0cb6-4dae-b2b2-71a89a5a894a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientdosedatetemperature
010.02012-10-02 10:00:0038.299999
112.02012-10-02 11:00:0038.500000
212.02012-10-02 12:00:0038.099998
312.02012-10-02 13:00:0037.299999
410.02012-10-02 14:00:0037.500000
\n", + "
" + ], + "text/plain": [ + " patient dose date temperature\n", + "0 1 0.0 2012-10-02 10:00:00 38.299999\n", + "1 1 2.0 2012-10-02 11:00:00 38.500000\n", + "2 1 2.0 2012-10-02 12:00:00 38.099998\n", + "3 1 2.0 2012-10-02 13:00:00 37.299999\n", + "4 1 0.0 2012-10-02 14:00:00 37.500000" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_orig.head()" + ] + }, + { + "cell_type": "markdown", + "id": "115ce86f-4e2f-4ea3-a240-2cc68d5721d8", + "metadata": {}, + "source": [ + "## To wide format: pivot" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "1adfa94d-dbb6-41cc-af21-9ca6d1e97226", + "metadata": {}, + "outputs": [], + "source": [ + "df_wide = df_orig.pivot(\n", + " index='date',\n", + " values=['temperature', 'dose'],\n", + " columns=['patient']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "18c3ba13-5949-4c14-95f2-1577466313b3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "DatetimeIndex: 7 entries, 2012-10-02 10:00:00 to 2012-10-02 16:00:00\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 (temperature, 1) 7 non-null float32\n", + " 1 (temperature, 2) 7 non-null float32\n", + " 2 (temperature, 3) 6 non-null float32\n", + " 3 (temperature, 4) 7 non-null float32\n", + " 4 (temperature, 5) 7 non-null float32\n", + " 5 (temperature, 6) 6 non-null float32\n", + " 6 (temperature, 7) 7 non-null float32\n", + " 7 (temperature, 8) 7 non-null float32\n", + " 8 (temperature, 9) 7 non-null float32\n", + " 9 (dose, 1) 7 non-null float32\n", + " 10 (dose, 2) 7 non-null float32\n", + " 11 (dose, 3) 7 non-null float32\n", + " 12 (dose, 4) 6 non-null float32\n", + " 13 (dose, 5) 7 non-null float32\n", + " 14 (dose, 6) 6 non-null float32\n", + " 15 (dose, 7) 7 non-null float32\n", + " 16 (dose, 8) 7 non-null 
float32\n", + " 17 (dose, 9) 7 non-null float32\n", + "dtypes: float32(18)\n", + "memory usage: 560.0 bytes\n" + ] + } + ], + "source": [ + "df_wide.info()" + ] + }, + { + "cell_type": "markdown", + "id": "ea3a9d6b-fd0a-474d-8c42-df7a1f7d7e08", + "metadata": {}, + "source": [ + "Now you have a dataframe with the date as index, and multi-level columns. The top level is the temperature and the dose, and the next level is the patient ID." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a41a59d3-1d12-4aee-a7c1-3b85a9950b07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
temperaturedose
patient123456789123456789
date
2012-10-02 10:00:0038.29999939.29999937.90000238.09999837.90000237.50000039.50000037.79999938.2999990.00.00.00.00.00.00.00.00.0
2012-10-02 11:00:0038.50000039.40000239.50000037.20000139.50000038.09999840.70000137.90000239.5000002.05.02.05.03.02.010.00.010.0
2012-10-02 12:00:0038.09999838.09999838.29999936.09999838.29999937.90000239.79999937.40000240.2000012.05.05.05.07.03.05.00.012.0
2012-10-02 13:00:0037.29999937.299999NaN35.90000238.50000037.70000140.20000137.59999839.0999982.05.02.00.05.02.08.00.04.0
2012-10-02 14:00:0037.50000036.79999937.70000136.29999939.40000237.20000138.29999937.29999937.9000020.00.02.0NaN9.01.03.00.04.0
\n", + "
" + ], + "text/plain": [ + " temperature \\\n", + "patient 1 2 3 4 5 \n", + "date \n", + "2012-10-02 10:00:00 38.299999 39.299999 37.900002 38.099998 37.900002 \n", + "2012-10-02 11:00:00 38.500000 39.400002 39.500000 37.200001 39.500000 \n", + "2012-10-02 12:00:00 38.099998 38.099998 38.299999 36.099998 38.299999 \n", + "2012-10-02 13:00:00 37.299999 37.299999 NaN 35.900002 38.500000 \n", + "2012-10-02 14:00:00 37.500000 36.799999 37.700001 36.299999 39.400002 \n", + "\n", + " dose \\\n", + "patient 6 7 8 9 1 2 \n", + "date \n", + "2012-10-02 10:00:00 37.500000 39.500000 37.799999 38.299999 0.0 0.0 \n", + "2012-10-02 11:00:00 38.099998 40.700001 37.900002 39.500000 2.0 5.0 \n", + "2012-10-02 12:00:00 37.900002 39.799999 37.400002 40.200001 2.0 5.0 \n", + "2012-10-02 13:00:00 37.700001 40.200001 37.599998 39.099998 2.0 5.0 \n", + "2012-10-02 14:00:00 37.200001 38.299999 37.299999 37.900002 0.0 0.0 \n", + "\n", + " \n", + "patient 3 4 5 6 7 8 9 \n", + "date \n", + "2012-10-02 10:00:00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2012-10-02 11:00:00 2.0 5.0 3.0 2.0 10.0 0.0 10.0 \n", + "2012-10-02 12:00:00 5.0 5.0 7.0 3.0 5.0 0.0 12.0 \n", + "2012-10-02 13:00:00 2.0 0.0 5.0 2.0 8.0 0.0 4.0 \n", + "2012-10-02 14:00:00 2.0 NaN 9.0 1.0 3.0 0.0 4.0 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_wide.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fd0c0272-d182-47c4-bf41-81c869b06932", + "metadata": {}, + "source": [ + "## And back again: stack + reset index" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "81d18cc4-0451-4fd2-ac0a-56e868a31df0", + "metadata": {}, + "outputs": [], + "source": [ + "df_long = df_wide \\\n", + " .stack('patient', future_stack=True) \\\n", + " .reset_index() " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "3a0b4e1f-560e-4639-bcc1-330bdf82c6d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "\n", + "RangeIndex: 63 entries, 0 to 62\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 date 63 non-null datetime64[ns]\n", + " 1 patient 63 non-null int64 \n", + " 2 temperature 61 non-null float32 \n", + " 3 dose 61 non-null float32 \n", + "dtypes: datetime64[ns](1), float32(2), int64(1)\n", + "memory usage: 1.6 KB\n" + ] + } + ], + "source": [ + "df_long.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "551264e3-967b-43a2-b8c3-494095f88f87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepatienttemperaturedose
02012-10-02 10:00:00138.2999990.0
12012-10-02 10:00:00239.2999990.0
22012-10-02 10:00:00337.9000020.0
32012-10-02 10:00:00438.0999980.0
42012-10-02 10:00:00537.9000020.0
\n", + "
" + ], + "text/plain": [ + " date patient temperature dose\n", + "0 2012-10-02 10:00:00 1 38.299999 0.0\n", + "1 2012-10-02 10:00:00 2 39.299999 0.0\n", + "2 2012-10-02 10:00:00 3 37.900002 0.0\n", + "3 2012-10-02 10:00:00 4 38.099998 0.0\n", + "4 2012-10-02 10:00:00 5 37.900002 0.0" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_long.head()" + ] + }, + { + "cell_type": "markdown", + "id": "00be5fc1-e40d-4091-b0d6-4019a412f1e0", + "metadata": {}, + "source": [ + "Breaking it down into two steps, first the `stack()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "507b3ec3-6780-40e7-a7b8-c2345c3efa61", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
temperaturedose
datepatient
2012-10-02 10:00:00138.2999990.0
239.2999990.0
337.9000020.0
438.0999980.0
537.9000020.0
............
2012-10-02 16:00:00537.2000010.0
6NaNNaN
737.2999991.0
836.7999990.0
937.2999990.0
\n", + "

63 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " temperature dose\n", + "date patient \n", + "2012-10-02 10:00:00 1 38.299999 0.0\n", + " 2 39.299999 0.0\n", + " 3 37.900002 0.0\n", + " 4 38.099998 0.0\n", + " 5 37.900002 0.0\n", + "... ... ...\n", + "2012-10-02 16:00:00 5 37.200001 0.0\n", + " 6 NaN NaN\n", + " 7 37.299999 1.0\n", + " 8 36.799999 0.0\n", + " 9 37.299999 0.0\n", + "\n", + "[63 rows x 2 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_wide.stack(\"patient\", future_stack=True)" + ] + }, + { + "cell_type": "markdown", + "id": "57e15671-7a85-41b0-bcb5-fe2e089efdc6", + "metadata": {}, + "source": [ + "As you can see, this has created a dataframe that has only two columns, but a multi-level index. The top-level index is the date, the sublevel is the patient." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "4e351540-2f48-4ef3-aa47-e8929cb1c4e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepatienttemperaturedose
02012-10-02 10:00:00138.2999990.0
12012-10-02 10:00:00239.2999990.0
22012-10-02 10:00:00337.9000020.0
32012-10-02 10:00:00438.0999980.0
42012-10-02 10:00:00537.9000020.0
...............
582012-10-02 16:00:00537.2000010.0
592012-10-02 16:00:006NaNNaN
602012-10-02 16:00:00737.2999991.0
612012-10-02 16:00:00836.7999990.0
622012-10-02 16:00:00937.2999990.0
\n", + "

63 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " date patient temperature dose\n", + "0 2012-10-02 10:00:00 1 38.299999 0.0\n", + "1 2012-10-02 10:00:00 2 39.299999 0.0\n", + "2 2012-10-02 10:00:00 3 37.900002 0.0\n", + "3 2012-10-02 10:00:00 4 38.099998 0.0\n", + "4 2012-10-02 10:00:00 5 37.900002 0.0\n", + ".. ... ... ... ...\n", + "58 2012-10-02 16:00:00 5 37.200001 0.0\n", + "59 2012-10-02 16:00:00 6 NaN NaN\n", + "60 2012-10-02 16:00:00 7 37.299999 1.0\n", + "61 2012-10-02 16:00:00 8 36.799999 0.0\n", + "62 2012-10-02 16:00:00 9 37.299999 0.0\n", + "\n", + "[63 rows x 4 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_long = df_wide.stack(\"patient\", future_stack=True).reset_index()\n", + "df_long" + ] + }, + { + "cell_type": "markdown", + "id": "23489994-b08f-4aed-a49e-daa0908015b4", + "metadata": {}, + "source": [ + "Resetting the index will create columns out of the multi-level index, so one for the date, a second for the patient ID." + ] + }, + { + "cell_type": "markdown", + "id": "e7030ce2-291e-45fa-a791-05b7eaa8f1b2", + "metadata": {}, + "source": [ + "If you prefer to get rid of the column name, simply set it to `None`." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "ca46e4f1-407a-4e64-ba85-837e0ec4997e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepatienttemperaturedose
02012-10-02 10:00:00138.2999990.0
12012-10-02 10:00:00239.2999990.0
22012-10-02 10:00:00337.9000020.0
32012-10-02 10:00:00438.0999980.0
42012-10-02 10:00:00537.9000020.0
...............
582012-10-02 16:00:00537.2000010.0
592012-10-02 16:00:006NaNNaN
602012-10-02 16:00:00737.2999991.0
612012-10-02 16:00:00836.7999990.0
622012-10-02 16:00:00937.2999990.0
\n", + "

63 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " date patient temperature dose\n", + "0 2012-10-02 10:00:00 1 38.299999 0.0\n", + "1 2012-10-02 10:00:00 2 39.299999 0.0\n", + "2 2012-10-02 10:00:00 3 37.900002 0.0\n", + "3 2012-10-02 10:00:00 4 38.099998 0.0\n", + "4 2012-10-02 10:00:00 5 37.900002 0.0\n", + ".. ... ... ... ...\n", + "58 2012-10-02 16:00:00 5 37.200001 0.0\n", + "59 2012-10-02 16:00:00 6 NaN NaN\n", + "60 2012-10-02 16:00:00 7 37.299999 1.0\n", + "61 2012-10-02 16:00:00 8 36.799999 0.0\n", + "62 2012-10-02 16:00:00 9 37.299999 0.0\n", + "\n", + "[63 rows x 4 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_long.columns.name = None\n", + "df_long" + ] + }, + { + "cell_type": "markdown", + "id": "7447a0bd-fe9e-4976-9c22-0e3c73674def", + "metadata": {}, + "source": [ + "Except for the order of the columns, and the sorting of the rows, you are back to the original data format." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 4579ffa065940742758b2b789c19f1c24befee63 Mon Sep 17 00:00:00 2001 From: Geert Jan Bex Date: Tue, 25 Nov 2025 08:20:33 +0100 Subject: [PATCH 10/10] Delete double entry --- source-code/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/source-code/README.md b/source-code/README.md index 177a58c..3f0175d 100644 --- a/source-code/README.md +++ b/source-code/README.md @@ -23,8 +23,6 @@ to create it. There is some material not covered in the presentation as well. soup and graph representation using networkx. 
* [`xarray`](xarray): illustrates the xarray library for pandas-like operations on multi-dimensional arrays. -* [`duckdb`](duckdb): illustrates the DuckDB library for SQL-like operations - on dataframes, including integration with pandas and polars. **Note:** material on dashboards has been moved to a [dedicated repository](https://github.com/gjbex/Python-dashboards).