diff --git a/docs/README.md b/docs/README.md
index 1d2c156..24a5947 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -54,6 +54,14 @@ from scratch. Familiarity with numpy is not required, but would be beneficial.
If you plan to do Python programming in a Linux or HPC environment you should
be familiar with these as well.
+For following along hands-on, you need
+* laptop or desktop with internet access.
+* a system set up so you can connect to an HPC system, an account on an HPC
+ system (e.g., VSC, CECI, ...), compute credits if that is required to run
+ jobs on the HPC system if you want to use an HPC system;
+* a Python environment that can run Jupyter Lab if you want to use your own system;
+* access to Google Colaboratory if you prefer not to install software.
+
## Level
diff --git a/source-code/README.md b/source-code/README.md
index 064c59f..177a58c 100644
--- a/source-code/README.md
+++ b/source-code/README.md
@@ -15,6 +15,7 @@ to create it. There is some material not covered in the presentation as well.
representation and algorithms.
* [`pandas`](pandas): illustrations of using pandas and seaborn.
* [`polars`](polars): Kllustrations of using polars.
+* [`duckdb`](duckdb): illustrations of using DuckDB for SQL queries.
* [`regexes`](regexes): illustrations of using regular expressions for
validation and information extraction from textual data.
* [`seaborn`](seaborn): illustrations of using Seaborn to create plots.
@@ -22,6 +23,8 @@ to create it. There is some material not covered in the presentation as well.
soup and graph representation using networkx.
* [`xarray`](xarray): illustrates the xarray library for pandas-like operations
on multi-dimensional arrays.
+* [`duckdb`](duckdb): illustrates the DuckDB library for SQL-like operations
+ on dataframes, including integration with pandas and polars.
**Note:** material on dashboards has been moved to a [dedicated
repository](https://github.com/gjbex/Python-dashboards).
diff --git a/source-code/duckdb/README.md b/source-code/duckdb/README.md
new file mode 100644
index 0000000..c4b0e68
--- /dev/null
+++ b/source-code/duckdb/README.md
@@ -0,0 +1,14 @@
+# DuckDB
+
+DuckDB is an in-process SQL OLAP database management system. It is designed to
+support analytical query workloads and is optimized for fast query performance
+on large datasets. DuckDB can be embedded directly into applications, making it
+a popular choice for data analysis tasks in various programming environments.
+
+
+## What is it?
+
+1. `patients.ipynb`: A Jupyter notebook that demonstrates how to use DuckDB for
+ analyzing patient data. It includes examples of loading data and executing
+ SQL queries.
+1. `data/`: CSV files to use with the notebook.
diff --git a/source-code/duckdb/data/patient_experiment.csv b/source-code/duckdb/data/patient_experiment.csv
new file mode 100644
index 0000000..034e2c7
--- /dev/null
+++ b/source-code/duckdb/data/patient_experiment.csv
@@ -0,0 +1,63 @@
+,patient,dose,date,temperature
+0,1,0.0,2012-10-02 10:00:00,38.3
+1,1,2.0,2012-10-02 11:00:00,38.5
+2,1,2.0,2012-10-02 12:00:00,38.1
+3,1,2.0,2012-10-02 13:00:00,37.3
+4,1,0.0,2012-10-02 14:00:00,37.5
+5,1,0.0,2012-10-02 15:00:00,37.1
+6,1,0.0,2012-10-02 16:00:00,36.8
+7,2,0.0,2012-10-02 10:00:00,39.3
+8,2,5.0,2012-10-02 11:00:00,39.4
+9,2,5.0,2012-10-02 12:00:00,38.1
+10,2,5.0,2012-10-02 13:00:00,37.3
+11,2,0.0,2012-10-02 14:00:00,36.8
+12,2,0.0,2012-10-02 15:00:00,36.8
+13,2,0.0,2012-10-02 16:00:00,36.8
+14,3,0.0,2012-10-02 10:00:00,37.9
+15,3,2.0,2012-10-02 11:00:00,39.5
+16,3,5.0,2012-10-02 12:00:00,38.3
+17,3,2.0,2012-10-02 13:00:00,
+18,3,2.0,2012-10-02 14:00:00,37.7
+19,3,2.0,2012-10-02 15:00:00,37.1
+20,3,0.0,2012-10-02 16:00:00,36.7
+21,4,0.0,2012-10-02 10:00:00,38.1
+22,4,5.0,2012-10-02 11:00:00,37.2
+23,4,5.0,2012-10-02 12:00:00,36.1
+24,4,0.0,2012-10-02 13:00:00,35.9
+25,4,,2012-10-02 14:00:00,36.3
+26,4,0.0,2012-10-02 15:00:00,36.6
+27,4,0.0,2012-10-02 16:00:00,36.7
+28,5,0.0,2012-10-02 10:00:00,37.9
+29,5,3.0,2012-10-02 11:00:00,39.5
+30,5,7.0,2012-10-02 12:00:00,38.3
+31,5,5.0,2012-10-02 13:00:00,38.5
+32,5,9.0,2012-10-02 14:00:00,39.4
+33,5,3.0,2012-10-02 15:00:00,37.9
+34,5,0.0,2012-10-02 16:00:00,37.2
+35,6,0.0,2012-10-02 10:00:00,37.5
+36,6,2.0,2012-10-02 11:00:00,38.1
+37,6,3.0,2012-10-02 12:00:00,37.9
+38,6,2.0,2012-10-02 13:00:00,37.7
+39,6,1.0,2012-10-02 14:00:00,37.2
+40,6,0.0,2012-10-02 15:00:00,36.8
+41,7,0.0,2012-10-02 10:00:00,39.5
+42,7,10.0,2012-10-02 11:00:00,40.7
+43,7,5.0,2012-10-02 12:00:00,39.8
+44,7,8.0,2012-10-02 13:00:00,40.2
+45,7,3.0,2012-10-02 14:00:00,38.3
+46,7,3.0,2012-10-02 15:00:00,37.6
+47,7,1.0,2012-10-02 16:00:00,37.3
+48,8,0.0,2012-10-02 10:00:00,37.8
+49,8,0.0,2012-10-02 11:00:00,37.9
+50,8,0.0,2012-10-02 12:00:00,37.4
+51,8,0.0,2012-10-02 13:00:00,37.6
+52,8,0.0,2012-10-02 14:00:00,37.3
+53,8,0.0,2012-10-02 15:00:00,37.1
+54,8,0.0,2012-10-02 16:00:00,36.8
+55,9,0.0,2012-10-02 10:00:00,38.3
+56,9,10.0,2012-10-02 11:00:00,39.5
+57,9,12.0,2012-10-02 12:00:00,40.2
+58,9,4.0,2012-10-02 13:00:00,39.1
+59,9,4.0,2012-10-02 14:00:00,37.9
+60,9,0.0,2012-10-02 15:00:00,37.1
+61,9,0.0,2012-10-02 16:00:00,37.3
diff --git a/source-code/duckdb/data/patient_metadata.csv b/source-code/duckdb/data/patient_metadata.csv
new file mode 100644
index 0000000..59e23ac
--- /dev/null
+++ b/source-code/duckdb/data/patient_metadata.csv
@@ -0,0 +1,11 @@
+,patient,gender,condition
+0,1,M,A
+1,2,F,A
+2,3,M,A
+3,5,M,A
+4,6,F,B
+5,7,M,B
+6,8,F,B
+7,9,M,B
+8,10,F,B
+9,11,M,B
diff --git a/source-code/duckdb/patients.ipynb b/source-code/duckdb/patients.ipynb
new file mode 100644
index 0000000..853801e
--- /dev/null
+++ b/source-code/duckdb/patients.ipynb
@@ -0,0 +1,1521 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9b23ccda-f524-41bc-975b-568da36a4493",
+ "metadata": {},
+ "source": [
+ "## Requirements"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4c0d0975-71ce-4e9a-986c-ac6cbfb251bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import duckdb\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d46cb701-62c6-4279-bf61-36935a0a53a9",
+ "metadata": {},
+ "source": [
+ "## Database connection"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6a0df994-ac76-4c42-904a-a6113ea419f9",
+ "metadata": {},
+ "source": [
+ "Create a connection to the database, and query metadata."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "861daf66-0810-48d9-968c-18e1dabe0e4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "conn = duckdb.connect('data/patient_experiment.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "843f9d0f-8384-4977-916e-49ef59e5cbad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n",
+ "│ column_name │ column_type │ null │ key │ default │ extra │\n",
+ "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n",
+ "├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n",
+ "│ column0 │ BIGINT │ YES │ NULL │ NULL │ NULL │\n",
+ "│ patient │ BIGINT │ YES │ NULL │ NULL │ NULL │\n",
+ "│ dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n",
+ "│ temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.sql('''\n",
+ " DESCRIBE patient_experiment;\n",
+ "''')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "62195da0-ca79-4f65-b3f8-780783518d98",
+ "metadata": {},
+ "source": [
+ "Create a function to show the tables/views in the database."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "a3effca9-7353-435b-b7b5-01a487034314",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def show_tables(conn):\n",
+ " conn.sql('''\n",
+ " SELECT table_schema, table_name, table_type\n",
+ " FROM information_schema.tables\n",
+ " WHERE table_schema NOT IN ('information_schema', 'pg_catalog');\n",
+ " ''').show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "66400c5a-ca95-4104-a537-7a29dbdb001b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌──────────────┬────────────────────┬────────────┐\n",
+ "│ table_schema │ table_name │ table_type │\n",
+ "│ varchar │ varchar │ varchar │\n",
+ "├──────────────┼────────────────────┼────────────┤\n",
+ "│ main │ file │ VIEW │\n",
+ "│ main │ patient_experiment │ VIEW │\n",
+ "└──────────────┴────────────────────┴────────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "show_tables(conn)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bc72abf-863b-4516-bc42-14784d7a40b0",
+ "metadata": {},
+ "source": [
+ "## Queries"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bff175ad-cbe7-4a5d-91db-7d6eada3b695",
+ "metadata": {},
+ "source": [
+ "Select all the data for patient 6 and convert it to a pandas dataframe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "d4607877-270b-48c5-ba91-6aa19a8a24cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " date | \n",
+ " temperature | \n",
+ " dose | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 6 | \n",
+ " 2012-10-02 10:00:00 | \n",
+ " 37.5 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 6 | \n",
+ " 2012-10-02 11:00:00 | \n",
+ " 38.1 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 6 | \n",
+ " 2012-10-02 12:00:00 | \n",
+ " 37.9 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 6 | \n",
+ " 2012-10-02 13:00:00 | \n",
+ " 37.7 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 6 | \n",
+ " 2012-10-02 14:00:00 | \n",
+ " 37.2 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " 2012-10-02 15:00:00 | \n",
+ " 36.8 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient date temperature dose\n",
+ "0 6 2012-10-02 10:00:00 37.5 0.0\n",
+ "1 6 2012-10-02 11:00:00 38.1 2.0\n",
+ "2 6 2012-10-02 12:00:00 37.9 3.0\n",
+ "3 6 2012-10-02 13:00:00 37.7 2.0\n",
+ "4 6 2012-10-02 14:00:00 37.2 1.0\n",
+ "5 6 2012-10-02 15:00:00 36.8 0.0"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT patient, date, temperature, dose\n",
+ " FROM patient_experiment\n",
+ " WHERE patient == 6;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02af9a7f-84d2-4cb9-a705-441b4eff526a",
+ "metadata": {},
+ "source": [
+ "For the patients with a high fever, count the number of timepoints they had a temperature above $39.5\\textdegree C$ as well as their maximum temperature."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "262b2ad4-05c0-445e-8005-f07307f88947",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " high_fever_count | \n",
+ " max_temperature | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 7 | \n",
+ " 3 | \n",
+ " 40.7 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 40.2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient high_fever_count max_temperature\n",
+ "0 7 3 40.7\n",
+ "1 9 1 40.2"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT\n",
+ " patient,\n",
+ " COUNT(temperature) AS high_fever_count,\n",
+ " MAX(temperature) AS max_temperature\n",
+ " FROM patient_experiment\n",
+ " WHERE temperature > 39.5\n",
+ " GROUP BY patient\n",
+ " ORDER BY patient;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98ef7ed5-23d1-4591-ab8a-405248f98a3a",
+ "metadata": {},
+ "source": [
+ "For each patient, compute the total dose administered, as well as the maximum temperature, and order by descending maximum temperature."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "7df98e50-5ab8-4813-a3ad-d3796aa35c48",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " max_temperature | \n",
+ " total_dose | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 7 | \n",
+ " 40.7 | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 9 | \n",
+ " 40.2 | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 39.5 | \n",
+ " 27.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 39.5 | \n",
+ " 13.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2 | \n",
+ " 39.4 | \n",
+ " 15.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1 | \n",
+ " 38.5 | \n",
+ " 6.0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " 38.1 | \n",
+ " 8.0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 4 | \n",
+ " 38.1 | \n",
+ " 10.0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " 37.9 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient max_temperature total_dose\n",
+ "0 7 40.7 30.0\n",
+ "1 9 40.2 30.0\n",
+ "2 5 39.5 27.0\n",
+ "3 3 39.5 13.0\n",
+ "4 2 39.4 15.0\n",
+ "5 1 38.5 6.0\n",
+ "6 6 38.1 8.0\n",
+ "7 4 38.1 10.0\n",
+ "8 8 37.9 0.0"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT\n",
+ " patient,\n",
+ " MAX(temperature) AS 'max_temperature',\n",
+ " SUM(dose) AS 'total_dose'\n",
+ " FROM patient_experiment\n",
+ " GROUP BY patient\n",
+ " ORDER BY max_temperature DESC;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d3d6fcbf-9bc5-40fb-9e4f-046def5249db",
+ "metadata": {},
+ "source": [
+ "If you want to query the result of such a query, you can create a view, `'hypothesis'` in this example."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "6b8694e2-5e9f-4d15-b2c3-609ab762d7b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "conn.execute('''\n",
+ " CREATE VIEW hypothesis AS SELECT\n",
+ " patient,\n",
+ " MAX(temperature) AS 'max_temperature',\n",
+ " SUM(dose) AS 'total_dose'\n",
+ " FROM patient_experiment\n",
+ " GROUP BY patient\n",
+ " ORDER BY max_temperature DESC;\n",
+ "''');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "7c755e2b-d781-435e-a878-9b29b6374670",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌──────────────┬────────────────────┬────────────┐\n",
+ "│ table_schema │ table_name │ table_type │\n",
+ "│ varchar │ varchar │ varchar │\n",
+ "├──────────────┼────────────────────┼────────────┤\n",
+ "│ main │ file │ VIEW │\n",
+ "│ main │ hypothesis │ VIEW │\n",
+ "│ main │ patient_experiment │ VIEW │\n",
+ "└──────────────┴────────────────────┴────────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "show_tables(conn)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "96617d94-da47-4604-9c27-3375058cbd86",
+ "metadata": {},
+ "source": [
+ "Get the maximum dose administered to a patient."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "d1a405c8-05e4-4d27-bd1c-3d1fe61c5e43",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " max(total_dose) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " max(total_dose)\n",
+ "0 30.0"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT MAX(total_dose)\n",
+ " FROM hypothesis;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2718eb2c-1e3a-4f90-a1f6-bb27e9d4a332",
+ "metadata": {},
+ "source": [
+ "Although DuckDB has an extension to perform a pivot (this is not standard SQL), it is not as elegant as the pandas counterpart as multi-level columns are not suppported by DuckDB."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "0c4e1ad8-a9ae-4808-901b-c036e4e0ee17",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<_duckdb.DuckDBPyConnection at 0x739eb95f72b0>"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " CREATE TABLE time_series AS\n",
+ " PIVOT patient_experiment\n",
+ " ON patient\n",
+ " USING\n",
+ " first(temperature) AS temperature,\n",
+ " first(dose) AS dose\n",
+ " GROUP BY date;\n",
+ "''');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "b4b38844-e6d2-4df1-8f6c-21e4daf77d48",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌───────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n",
+ "│ column_name │ column_type │ null │ key │ default │ extra │\n",
+ "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n",
+ "├───────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n",
+ "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 1_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 1_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 2_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 2_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 3_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 3_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 4_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 4_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 5_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 5_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 6_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 6_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 7_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 7_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 8_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 8_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 9_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "│ 9_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n",
+ "├───────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤\n",
+ "│ 19 rows 6 columns │\n",
+ "└─────────────────────────────────────────────────────────────────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "conn.sql('DESCRIBE time_series;').show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "95b029d4-0329-4462-8737-94f036dc8147",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " temperature | \n",
+ " dose | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2012-10-02 10:00:00 | \n",
+ " 37.5 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2012-10-02 11:00:00 | \n",
+ " 38.1 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2012-10-02 12:00:00 | \n",
+ " 37.9 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2012-10-02 13:00:00 | \n",
+ " 37.7 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2012-10-02 14:00:00 | \n",
+ " 37.2 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2012-10-02 15:00:00 | \n",
+ " 36.8 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 2012-10-02 16:00:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date temperature dose\n",
+ "0 2012-10-02 10:00:00 37.5 0.0\n",
+ "1 2012-10-02 11:00:00 38.1 2.0\n",
+ "2 2012-10-02 12:00:00 37.9 3.0\n",
+ "3 2012-10-02 13:00:00 37.7 2.0\n",
+ "4 2012-10-02 14:00:00 37.2 1.0\n",
+ "5 2012-10-02 15:00:00 36.8 0.0\n",
+ "6 2012-10-02 16:00:00 NaN NaN"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT\n",
+ " date,\n",
+ " \"6_temperature\" AS temperature,\n",
+ " \"6_dose\" AS dose\n",
+ " FROM time_series\n",
+ " ORDER BY date;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "78a996cf-4bc2-40dc-8bce-27bcc6731b7b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌────────────────────┐\n",
+ "│ name │\n",
+ "│ varchar │\n",
+ "├────────────────────┤\n",
+ "│ file │\n",
+ "│ hypothesis │\n",
+ "│ patient_experiment │\n",
+ "│ time_series │\n",
+ "└────────────────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "conn.sql('SHOW TABLES;').show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8637dcab-93f4-4d69-9fec-0bc840b03598",
+ "metadata": {},
+ "source": [
+ "Create a view on a second CSV file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "8570b365-09d5-4993-908e-4eaac974204d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "conn.execute('''\n",
+ " CREATE VIEW patient_metadata AS\n",
+ " SELECT *\n",
+ " FROM read_csv_auto('data/patient_metadata.csv', filename=true);\n",
+ "''');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "022db70e-64a2-40ad-920f-463f465b274c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌──────────────┬────────────────────┬────────────┐\n",
+ "│ table_schema │ table_name │ table_type │\n",
+ "│ varchar │ varchar │ varchar │\n",
+ "├──────────────┼────────────────────┼────────────┤\n",
+ "│ main │ time_series │ BASE TABLE │\n",
+ "│ main │ file │ VIEW │\n",
+ "│ main │ hypothesis │ VIEW │\n",
+ "│ main │ metadata │ VIEW │\n",
+ "│ main │ patient_experiment │ VIEW │\n",
+ "│ main │ patient_metadata │ VIEW │\n",
+ "└──────────────┴────────────────────┴────────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "show_tables(conn)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8ae2a578-6849-4a22-ad2e-b770de3ff44e",
+ "metadata": {},
+ "source": [
+ "Determine the patient IDs that are either in `patient_experiment`, or in `patient_metadata`, but not in both. Note that a full outer join is used to combine the informantion in both tables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "6482c2b6-4896-4c30-9bb4-b5a278626364",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " present | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 4 | \n",
+ " only in experiment | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 10 | \n",
+ " only in metadata | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 11 | \n",
+ " only in metadata | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient present\n",
+ "0 4 only in experiment\n",
+ "1 10 only in metadata\n",
+ "2 11 only in metadata"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT\n",
+ " DISTINCT COALESCE(exp.patient, mt.patient) AS patient,\n",
+ " CASE\n",
+ " WHEN exp.patient IS NOT NULL AND mt.patient IS NULL\n",
+ " THEN 'only in experiment'\n",
+ " WHEN exp.patient IS NULL and mt.patient IS NOT NULL\n",
+ " THEN 'only in metadata'\n",
+ " ELSE 'in both'\n",
+ " END AS present\n",
+ " FROM patient_experiment AS exp FULL OUTER JOIN patient_metadata AS mt\n",
+ " USING (patient)\n",
+ " WHERE NOT present = 'in both'\n",
+ " ORDER BY exp.patient, mt.patient;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9cf58cd3-0804-4472-bfd2-72f5afd82166",
+ "metadata": {},
+ "source": [
+ "You can do an inner join between the tables `patient_experiment` and `patient_metadata` to get the maximum temperature, the condition and gender for each patient that occurs in both tables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "1639aeaa-b33a-4889-869d-93500242980b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " max_temperature | \n",
+ " condition | \n",
+ " gender | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 38.5 | \n",
+ " A | \n",
+ " M | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 39.4 | \n",
+ " A | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 39.5 | \n",
+ " A | \n",
+ " M | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 5 | \n",
+ " 39.5 | \n",
+ " A | \n",
+ " M | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 6 | \n",
+ " 38.1 | \n",
+ " B | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 7 | \n",
+ " 40.7 | \n",
+ " B | \n",
+ " M | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 8 | \n",
+ " 37.9 | \n",
+ " B | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 9 | \n",
+ " 40.2 | \n",
+ " B | \n",
+ " M | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient max_temperature condition gender\n",
+ "0 1 38.5 A M\n",
+ "1 2 39.4 A F\n",
+ "2 3 39.5 A M\n",
+ "3 5 39.5 A M\n",
+ "4 6 38.1 B F\n",
+ "5 7 40.7 B M\n",
+ "6 8 37.9 B F\n",
+ "7 9 40.2 B M"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT\n",
+ " COALESCE(exp.patient, mt.patient) AS patient,\n",
+ " MAX(exp.temperature) AS max_temperature,\n",
+ " ANY_VALUE(mt.condition) AS condition,\n",
+ " ANY_VALUE(mt.gender) AS gender\n",
+ " FROM patient_experiment AS exp INNER JOIN patient_metadata AS mt\n",
+ " USING (patient)\n",
+ " GROUP BY exp.patient, mt.patient\n",
+ " ORDER BY exp.patient, mt.patient\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b2afc510-6340-4c59-aeec-e245d8358a0e",
+ "metadata": {},
+ "source": [
+ "## New style versus classic style"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "397405b0-756c-461a-81f1-e7af99b98d0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " date | \n",
+ " temperature | \n",
+ " dose | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 6 | \n",
+ " 2012-10-02 10:00:00 | \n",
+ " 37.5 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 6 | \n",
+ " 2012-10-02 11:00:00 | \n",
+ " 38.1 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 6 | \n",
+ " 2012-10-02 12:00:00 | \n",
+ " 37.9 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 6 | \n",
+ " 2012-10-02 13:00:00 | \n",
+ " 37.7 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 6 | \n",
+ " 2012-10-02 14:00:00 | \n",
+ " 37.2 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " 2012-10-02 15:00:00 | \n",
+ " 36.8 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient date temperature dose\n",
+ "0 6 2012-10-02 10:00:00 37.5 0.0\n",
+ "1 6 2012-10-02 11:00:00 38.1 2.0\n",
+ "2 6 2012-10-02 12:00:00 37.9 3.0\n",
+ "3 6 2012-10-02 13:00:00 37.7 2.0\n",
+ "4 6 2012-10-02 14:00:00 37.2 1.0\n",
+ "5 6 2012-10-02 15:00:00 36.8 0.0"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT patient, date, temperature, dose\n",
+ " FROM patient_experiment\n",
+ " WHERE patient == 6;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "3f06bf99-ead9-4eec-8251-b834f5c2eb69",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌─────────┬─────────────────────┬─────────────┬────────┐\n",
+ "│ patient │ date │ temperature │ dose │\n",
+ "│ int64 │ timestamp │ double │ double │\n",
+ "├─────────┼─────────────────────┼─────────────┼────────┤\n",
+ "│ 6 │ 2012-10-02 10:00:00 │ 37.5 │ 0.0 │\n",
+ "│ 6 │ 2012-10-02 11:00:00 │ 38.1 │ 2.0 │\n",
+ "│ 6 │ 2012-10-02 12:00:00 │ 37.9 │ 3.0 │\n",
+ "│ 6 │ 2012-10-02 13:00:00 │ 37.7 │ 2.0 │\n",
+ "│ 6 │ 2012-10-02 14:00:00 │ 37.2 │ 1.0 │\n",
+ "│ 6 │ 2012-10-02 15:00:00 │ 36.8 │ 0.0 │\n",
+ "└─────────┴─────────────────────┴─────────────┴────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "conn.sql('''\n",
+ " SELECT patient, date, temperature, dose\n",
+ " FROM patient_experiment\n",
+ "''').filter('patient = 6').show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd9dee70-4c1c-4445-9b6c-f957791210fc",
+ "metadata": {},
+ "source": [
+ "For the patients with a high fever, count the number of timepoints they had a temperature above $39.5\\textdegree C$ as well as their maximum temperature."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "0f29e357-5c17-4aca-92dc-ceaecf959023",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " high_fever_count | \n",
+ " max_temperature | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 7 | \n",
+ " 3 | \n",
+ " 40.7 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 40.2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient high_fever_count max_temperature\n",
+ "0 7 3 40.7\n",
+ "1 9 1 40.2"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT\n",
+ " patient,\n",
+ " COUNT(temperature) AS high_fever_count,\n",
+ " MAX(temperature) AS max_temperature\n",
+ " FROM patient_experiment\n",
+ " WHERE temperature > 39.5\n",
+ " GROUP BY patient\n",
+ " ORDER BY patient;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "d07c6fff-b72d-49c0-a0f7-f545e4013331",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌─────────┬──────────────────┬─────────────────┐\n",
+ "│ patient │ high_fever_count │ max_temperature │\n",
+ "│ int64 │ int64 │ double │\n",
+ "├─────────┼──────────────────┼─────────────────┤\n",
+ "│ 7 │ 3 │ 40.7 │\n",
+ "│ 9 │ 1 │ 40.2 │\n",
+ "└─────────┴──────────────────┴─────────────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "conn.sql('SELECT patient, temperature FROM patient_experiment') \\\n",
+ " .filter('temperature > 39.5') \\\n",
+ " .aggregate(\n",
+ " 'patient, '\n",
+ " 'COUNT(temperature) AS high_fever_count, '\n",
+ " 'MAX(temperature) AS max_temperature',\n",
+ " group_expr='patient') \\\n",
+ " .show() "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65ab2e40-bbf8-4537-a41b-e2321568fb98",
+ "metadata": {},
+ "source": [
+ "For each patient, compute the total dose administered, as well as the maximum temperature, and order by descending maximum temperature."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "27f12e02-c92a-459a-a469-e733ba17052f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " patient | \n",
+ " max_temperature | \n",
+ " total_dose | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 7 | \n",
+ " 40.7 | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 9 | \n",
+ " 40.2 | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 39.5 | \n",
+ " 27.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 39.5 | \n",
+ " 13.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2 | \n",
+ " 39.4 | \n",
+ " 15.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1 | \n",
+ " 38.5 | \n",
+ " 6.0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " 38.1 | \n",
+ " 8.0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 4 | \n",
+ " 38.1 | \n",
+ " 10.0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " 37.9 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " patient max_temperature total_dose\n",
+ "0 7 40.7 30.0\n",
+ "1 9 40.2 30.0\n",
+ "2 5 39.5 27.0\n",
+ "3 3 39.5 13.0\n",
+ "4 2 39.4 15.0\n",
+ "5 1 38.5 6.0\n",
+ "6 6 38.1 8.0\n",
+ "7 4 38.1 10.0\n",
+ "8 8 37.9 0.0"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "conn.execute('''\n",
+ " SELECT\n",
+ " patient,\n",
+ " MAX(temperature) AS 'max_temperature',\n",
+ " SUM(dose) AS 'total_dose'\n",
+ " FROM patient_experiment\n",
+ " GROUP BY patient\n",
+ " ORDER BY max_temperature DESC;\n",
+ "''').df()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "5f83cde9-129a-4f76-b199-3ae7adf3f08c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "┌─────────┬─────────────────┬────────────┐\n",
+ "│ patient │ max_temperature │ total_dose │\n",
+ "│ int64 │ double │ double │\n",
+ "├─────────┼─────────────────┼────────────┤\n",
+ "│ 7 │ 40.7 │ 30.0 │\n",
+ "│ 9 │ 40.2 │ 30.0 │\n",
+ "│ 5 │ 39.5 │ 27.0 │\n",
+ "│ 3 │ 39.5 │ 13.0 │\n",
+ "│ 2 │ 39.4 │ 15.0 │\n",
+ "│ 1 │ 38.5 │ 6.0 │\n",
+ "│ 4 │ 38.1 │ 10.0 │\n",
+ "│ 6 │ 38.1 │ 8.0 │\n",
+ "│ 8 │ 37.9 │ 0.0 │\n",
+ "└─────────┴─────────────────┴────────────┘\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "conn.sql('SELECT patient, temperature, dose from patient_experiment') \\\n",
+ " .aggregate(\n",
+ " 'patient, '\n",
+ " 'MAX(temperature) AS max_temperature, '\n",
+ " 'SUM(dose) AS total_dose',\n",
+ " group_expr='patient'\n",
+ " ) \\\n",
+ " .order('max_temperature DESC') \\\n",
+ " .show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ddf5f25c-b0c1-48c9-9031-5fd8a813649b",
+ "metadata": {},
+ "source": [
+ "The new-style queries allow for lazy evaluation, while the classic-style queries are evaluated immediately."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}