diff --git a/source-code/README.md b/source-code/README.md index 064c59f..3f0175d 100644 --- a/source-code/README.md +++ b/source-code/README.md @@ -15,6 +15,7 @@ to create it. There is some material not covered in the presentation as well. representation and algorithms. * [`pandas`](pandas): illustrations of using pandas and seaborn. * [`polars`](polars): Illustrations of using polars. +* [`duckdb`](duckdb): illustrations of using DuckDB for SQL queries. * [`regexes`](regexes): illustrations of using regular expressions for validation and information extraction from textual data. * [`seaborn`](seaborn): illustrations of using Seaborn to create plots. diff --git a/source-code/duckdb/README.md b/source-code/duckdb/README.md new file mode 100644 index 0000000..c4b0e68 --- /dev/null +++ b/source-code/duckdb/README.md @@ -0,0 +1,14 @@ +# DuckDB + +DuckDB is an in-process SQL OLAP database management system. It is designed to +support analytical query workloads and is optimized for fast query performance +on large datasets. DuckDB can be embedded directly into applications, making it +a popular choice for data analysis tasks in various programming environments. + + +## What is it? + +1. `patients.ipynb`: A Jupyter notebook that demonstrates how to use DuckDB for + analyzing patient data. It includes examples of loading data and executing + SQL queries. +1. `data/`: CSV files to use with the notebook. 
diff --git a/source-code/duckdb/data/patient_experiment.csv b/source-code/duckdb/data/patient_experiment.csv new file mode 100644 index 0000000..034e2c7 --- /dev/null +++ b/source-code/duckdb/data/patient_experiment.csv @@ -0,0 +1,63 @@ +,patient,dose,date,temperature +0,1,0.0,2012-10-02 10:00:00,38.3 +1,1,2.0,2012-10-02 11:00:00,38.5 +2,1,2.0,2012-10-02 12:00:00,38.1 +3,1,2.0,2012-10-02 13:00:00,37.3 +4,1,0.0,2012-10-02 14:00:00,37.5 +5,1,0.0,2012-10-02 15:00:00,37.1 +6,1,0.0,2012-10-02 16:00:00,36.8 +7,2,0.0,2012-10-02 10:00:00,39.3 +8,2,5.0,2012-10-02 11:00:00,39.4 +9,2,5.0,2012-10-02 12:00:00,38.1 +10,2,5.0,2012-10-02 13:00:00,37.3 +11,2,0.0,2012-10-02 14:00:00,36.8 +12,2,0.0,2012-10-02 15:00:00,36.8 +13,2,0.0,2012-10-02 16:00:00,36.8 +14,3,0.0,2012-10-02 10:00:00,37.9 +15,3,2.0,2012-10-02 11:00:00,39.5 +16,3,5.0,2012-10-02 12:00:00,38.3 +17,3,2.0,2012-10-02 13:00:00, +18,3,2.0,2012-10-02 14:00:00,37.7 +19,3,2.0,2012-10-02 15:00:00,37.1 +20,3,0.0,2012-10-02 16:00:00,36.7 +21,4,0.0,2012-10-02 10:00:00,38.1 +22,4,5.0,2012-10-02 11:00:00,37.2 +23,4,5.0,2012-10-02 12:00:00,36.1 +24,4,0.0,2012-10-02 13:00:00,35.9 +25,4,,2012-10-02 14:00:00,36.3 +26,4,0.0,2012-10-02 15:00:00,36.6 +27,4,0.0,2012-10-02 16:00:00,36.7 +28,5,0.0,2012-10-02 10:00:00,37.9 +29,5,3.0,2012-10-02 11:00:00,39.5 +30,5,7.0,2012-10-02 12:00:00,38.3 +31,5,5.0,2012-10-02 13:00:00,38.5 +32,5,9.0,2012-10-02 14:00:00,39.4 +33,5,3.0,2012-10-02 15:00:00,37.9 +34,5,0.0,2012-10-02 16:00:00,37.2 +35,6,0.0,2012-10-02 10:00:00,37.5 +36,6,2.0,2012-10-02 11:00:00,38.1 +37,6,3.0,2012-10-02 12:00:00,37.9 +38,6,2.0,2012-10-02 13:00:00,37.7 +39,6,1.0,2012-10-02 14:00:00,37.2 +40,6,0.0,2012-10-02 15:00:00,36.8 +41,7,0.0,2012-10-02 10:00:00,39.5 +42,7,10.0,2012-10-02 11:00:00,40.7 +43,7,5.0,2012-10-02 12:00:00,39.8 +44,7,8.0,2012-10-02 13:00:00,40.2 +45,7,3.0,2012-10-02 14:00:00,38.3 +46,7,3.0,2012-10-02 15:00:00,37.6 +47,7,1.0,2012-10-02 16:00:00,37.3 +48,8,0.0,2012-10-02 10:00:00,37.8 +49,8,0.0,2012-10-02 
11:00:00,37.9 +50,8,0.0,2012-10-02 12:00:00,37.4 +51,8,0.0,2012-10-02 13:00:00,37.6 +52,8,0.0,2012-10-02 14:00:00,37.3 +53,8,0.0,2012-10-02 15:00:00,37.1 +54,8,0.0,2012-10-02 16:00:00,36.8 +55,9,0.0,2012-10-02 10:00:00,38.3 +56,9,10.0,2012-10-02 11:00:00,39.5 +57,9,12.0,2012-10-02 12:00:00,40.2 +58,9,4.0,2012-10-02 13:00:00,39.1 +59,9,4.0,2012-10-02 14:00:00,37.9 +60,9,0.0,2012-10-02 15:00:00,37.1 +61,9,0.0,2012-10-02 16:00:00,37.3 diff --git a/source-code/duckdb/data/patient_metadata.csv b/source-code/duckdb/data/patient_metadata.csv new file mode 100644 index 0000000..59e23ac --- /dev/null +++ b/source-code/duckdb/data/patient_metadata.csv @@ -0,0 +1,11 @@ +,patient,gender,condition +0,1,M,A +1,2,F,A +2,3,M,A +3,5,M,A +4,6,F,B +5,7,M,B +6,8,F,B +7,9,M,B +8,10,F,B +9,11,M,B diff --git a/source-code/duckdb/patients.ipynb b/source-code/duckdb/patients.ipynb new file mode 100644 index 0000000..853801e --- /dev/null +++ b/source-code/duckdb/patients.ipynb @@ -0,0 +1,1521 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b23ccda-f524-41bc-975b-568da36a4493", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c0d0975-71ce-4e9a-986c-ac6cbfb251bb", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "d46cb701-62c6-4279-bf61-36935a0a53a9", + "metadata": {}, + "source": [ + "## Database connection" + ] + }, + { + "cell_type": "markdown", + "id": "6a0df994-ac76-4c42-904a-a6113ea419f9", + "metadata": {}, + "source": [ + "Create a connection to the database, and query metadata." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "861daf66-0810-48d9-968c-18e1dabe0e4f", + "metadata": {}, + "outputs": [], + "source": [ + "conn = duckdb.connect('data/patient_experiment.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "843f9d0f-8384-4977-916e-49ef59e5cbad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n", + "│ column_name │ column_type │ null │ key │ default │ extra │\n", + "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n", + "├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n", + "│ column0 │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", + "│ patient │ BIGINT │ YES │ NULL │ NULL │ NULL │\n", + "│ dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n", + "│ temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.sql('''\n", + " DESCRIBE patient_experiment;\n", + "''')" + ] + }, + { + "cell_type": "markdown", + "id": "62195da0-ca79-4f65-b3f8-780783518d98", + "metadata": {}, + "source": [ + "Create a function to show the tables/views in the database." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a3effca9-7353-435b-b7b5-01a487034314", + "metadata": {}, + "outputs": [], + "source": [ + "def show_tables(conn):\n", + " conn.sql('''\n", + " SELECT table_schema, table_name, table_type\n", + " FROM information_schema.tables\n", + " WHERE table_schema NOT IN ('information_schema', 'pg_catalog');\n", + " ''').show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "66400c5a-ca95-4104-a537-7a29dbdb001b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌──────────────┬────────────────────┬────────────┐\n", + "│ table_schema │ table_name │ table_type │\n", + "│ varchar │ varchar │ varchar │\n", + "├──────────────┼────────────────────┼────────────┤\n", + "│ main │ file │ VIEW │\n", + "│ main │ patient_experiment │ VIEW │\n", + "└──────────────┴────────────────────┴────────────┘\n", + "\n" + ] + } + ], + "source": [ + "show_tables(conn)" + ] + }, + { + "cell_type": "markdown", + "id": "5bc72abf-863b-4516-bc42-14784d7a40b0", + "metadata": {}, + "source": [ + "## Queries" + ] + }, + { + "cell_type": "markdown", + "id": "bff175ad-cbe7-4a5d-91db-7d6eada3b695", + "metadata": {}, + "source": [ + "Select all the data for patient 6 and convert it to a pandas dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d4607877-270b-48c5-ba91-6aa19a8a24cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientdatetemperaturedose
062012-10-02 10:00:0037.50.0
162012-10-02 11:00:0038.12.0
262012-10-02 12:00:0037.93.0
362012-10-02 13:00:0037.72.0
462012-10-02 14:00:0037.21.0
562012-10-02 15:00:0036.80.0
\n", + "
" + ], + "text/plain": [ + " patient date temperature dose\n", + "0 6 2012-10-02 10:00:00 37.5 0.0\n", + "1 6 2012-10-02 11:00:00 38.1 2.0\n", + "2 6 2012-10-02 12:00:00 37.9 3.0\n", + "3 6 2012-10-02 13:00:00 37.7 2.0\n", + "4 6 2012-10-02 14:00:00 37.2 1.0\n", + "5 6 2012-10-02 15:00:00 36.8 0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT patient, date, temperature, dose\n", + " FROM patient_experiment\n", + " WHERE patient == 6;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "02af9a7f-84d2-4cb9-a705-441b4eff526a", + "metadata": {}, + "source": [ + "For the patients with a high fever, count the number of timepoints they had a temperature above 39.5 °C as well as their maximum temperature." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "262b2ad4-05c0-445e-8005-f07307f88947", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patienthigh_fever_countmax_temperature
07340.7
19140.2
\n", + "
" + ], + "text/plain": [ + " patient high_fever_count max_temperature\n", + "0 7 3 40.7\n", + "1 9 1 40.2" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " COUNT(temperature) AS high_fever_count,\n", + " MAX(temperature) AS max_temperature\n", + " FROM patient_experiment\n", + " WHERE temperature > 39.5\n", + " GROUP BY patient\n", + " ORDER BY patient;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "98ef7ed5-23d1-4591-ab8a-405248f98a3a", + "metadata": {}, + "source": [ + "For each patient, compute the total dose administered, as well as the maximum temperature, and order by descending maximum temperature." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7df98e50-5ab8-4813-a3ad-d3796aa35c48", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientmax_temperaturetotal_dose
0740.730.0
1940.230.0
2539.527.0
3339.513.0
4239.415.0
5138.56.0
6638.18.0
7438.110.0
8837.90.0
\n", + "
" + ], + "text/plain": [ + " patient max_temperature total_dose\n", + "0 7 40.7 30.0\n", + "1 9 40.2 30.0\n", + "2 5 39.5 27.0\n", + "3 3 39.5 13.0\n", + "4 2 39.4 15.0\n", + "5 1 38.5 6.0\n", + "6 6 38.1 8.0\n", + "7 4 38.1 10.0\n", + "8 8 37.9 0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " MAX(temperature) AS 'max_temperature',\n", + " SUM(dose) AS 'total_dose'\n", + " FROM patient_experiment\n", + " GROUP BY patient\n", + " ORDER BY max_temperature DESC;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "d3d6fcbf-9bc5-40fb-9e4f-046def5249db", + "metadata": {}, + "source": [ + "If you want to query the result of such a query, you can create a view, `'hypothesis'` in this example." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6b8694e2-5e9f-4d15-b2c3-609ab762d7b4", + "metadata": {}, + "outputs": [], + "source": [ + "conn.execute('''\n", + " CREATE VIEW hypothesis AS SELECT\n", + " patient,\n", + " MAX(temperature) AS 'max_temperature',\n", + " SUM(dose) AS 'total_dose'\n", + " FROM patient_experiment\n", + " GROUP BY patient\n", + " ORDER BY max_temperature DESC;\n", + "''');" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7c755e2b-d781-435e-a878-9b29b6374670", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌──────────────┬────────────────────┬────────────┐\n", + "│ table_schema │ table_name │ table_type │\n", + "│ varchar │ varchar │ varchar │\n", + "├──────────────┼────────────────────┼────────────┤\n", + "│ main │ file │ VIEW │\n", + "│ main │ hypothesis │ VIEW │\n", + "│ main │ patient_experiment │ VIEW │\n", + "└──────────────┴────────────────────┴────────────┘\n", + "\n" + ] + } + ], + "source": [ + "show_tables(conn)" + ] + }, + { + "cell_type": "markdown", + "id": "96617d94-da47-4604-9c27-3375058cbd86", + "metadata": 
{}, + "source": [ + "Get the maximum dose administered to a patient." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "d1a405c8-05e4-4d27-bd1c-3d1fe61c5e43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
max(total_dose)
030.0
\n", + "
" + ], + "text/plain": [ + " max(total_dose)\n", + "0 30.0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT MAX(total_dose)\n", + " FROM hypothesis;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "2718eb2c-1e3a-4f90-a1f6-bb27e9d4a332", + "metadata": {}, + "source": [ + "Although DuckDB has an extension to perform a pivot (this is not standard SQL), it is not as elegant as the pandas counterpart as multi-level columns are not supported by DuckDB." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0c4e1ad8-a9ae-4808-901b-c036e4e0ee17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<_duckdb.DuckDBPyConnection at 0x739eb95f72b0>" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " CREATE TABLE time_series AS\n", + " PIVOT patient_experiment\n", + " ON patient\n", + " USING\n", + " first(temperature) AS temperature,\n", + " first(dose) AS dose\n", + " GROUP BY date;\n", + "''');" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b4b38844-e6d2-4df1-8f6c-21e4daf77d48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌───────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐\n", + "│ column_name │ column_type │ null │ key │ default │ extra │\n", + "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n", + "├───────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤\n", + "│ date │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n", + "│ 1_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 1_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 2_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 2_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 3_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 3_dose │ DOUBLE 
│ YES │ NULL │ NULL │ NULL │\n", + "│ 4_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 4_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 5_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 5_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 6_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 6_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 7_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 7_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 8_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 8_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 9_temperature │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "│ 9_dose │ DOUBLE │ YES │ NULL │ NULL │ NULL │\n", + "├───────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤\n", + "│ 19 rows 6 columns │\n", + "└─────────────────────────────────────────────────────────────────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('DESCRIBE time_series;').show()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "95b029d4-0329-4462-8737-94f036dc8147", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetemperaturedose
02012-10-02 10:00:0037.50.0
12012-10-02 11:00:0038.12.0
22012-10-02 12:00:0037.93.0
32012-10-02 13:00:0037.72.0
42012-10-02 14:00:0037.21.0
52012-10-02 15:00:0036.80.0
62012-10-02 16:00:00NaNNaN
\n", + "
" + ], + "text/plain": [ + " date temperature dose\n", + "0 2012-10-02 10:00:00 37.5 0.0\n", + "1 2012-10-02 11:00:00 38.1 2.0\n", + "2 2012-10-02 12:00:00 37.9 3.0\n", + "3 2012-10-02 13:00:00 37.7 2.0\n", + "4 2012-10-02 14:00:00 37.2 1.0\n", + "5 2012-10-02 15:00:00 36.8 0.0\n", + "6 2012-10-02 16:00:00 NaN NaN" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " date,\n", + " \"6_temperature\" AS temperature,\n", + " \"6_dose\" AS dose\n", + " FROM time_series\n", + " ORDER BY date;\n", + "''').df()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "78a996cf-4bc2-40dc-8bce-27bcc6731b7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌────────────────────┐\n", + "│ name │\n", + "│ varchar │\n", + "├────────────────────┤\n", + "│ file │\n", + "│ hypothesis │\n", + "│ patient_experiment │\n", + "│ time_series │\n", + "└────────────────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('SHOW TABLES;').show()" + ] + }, + { + "cell_type": "markdown", + "id": "8637dcab-93f4-4d69-9fec-0bc840b03598", + "metadata": {}, + "source": [ + "Create a view on a second CSV file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "8570b365-09d5-4993-908e-4eaac974204d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "conn.execute('''\n", + " CREATE VIEW patient_metadata AS\n", + " SELECT *\n", + " FROM read_csv_auto('data/patient_metadata.csv', filename=true);\n", + "''');" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "022db70e-64a2-40ad-920f-463f465b274c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌──────────────┬────────────────────┬────────────┐\n", + "│ table_schema │ table_name │ table_type │\n", + "│ varchar │ varchar │ varchar │\n", + "├──────────────┼────────────────────┼────────────┤\n", + "│ main │ time_series │ BASE TABLE │\n", + "│ main │ file │ VIEW │\n", + "│ main │ hypothesis │ VIEW │\n", + "│ main │ metadata │ VIEW │\n", + "│ main │ patient_experiment │ VIEW │\n", + "│ main │ patient_metadata │ VIEW │\n", + "└──────────────┴────────────────────┴────────────┘\n", + "\n" + ] + } + ], + "source": [ + "show_tables(conn)" + ] + }, + { + "cell_type": "markdown", + "id": "8ae2a578-6849-4a22-ad2e-b770de3ff44e", + "metadata": {}, + "source": [ + "Determine the patient IDs that are either in `patient_experiment`, or in `patient_metadata`, but not in both. Note that a full outer join is used to combine the information in both tables." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "6482c2b6-4896-4c30-9bb4-b5a278626364", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientpresent
04only in experiment
110only in metadata
211only in metadata
\n", + "
" + ], + "text/plain": [ + " patient present\n", + "0 4 only in experiment\n", + "1 10 only in metadata\n", + "2 11 only in metadata" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " DISTINCT COALESCE(exp.patient, mt.patient) AS patient,\n", + " CASE\n", + " WHEN exp.patient IS NOT NULL AND mt.patient IS NULL\n", + " THEN 'only in experiment'\n", + " WHEN exp.patient IS NULL and mt.patient IS NOT NULL\n", + " THEN 'only in metadata'\n", + " ELSE 'in both'\n", + " END AS present\n", + " FROM patient_experiment AS exp FULL OUTER JOIN patient_metadata AS mt\n", + " USING (patient)\n", + " WHERE NOT present = 'in both'\n", + " ORDER BY exp.patient, mt.patient;\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "9cf58cd3-0804-4472-bfd2-72f5afd82166", + "metadata": {}, + "source": [ + "You can do an inner join between the tables `patient_experiment` and `patient_metadata` to get the maximum temperature, the condition and gender for each patient that occurs in both tables." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "1639aeaa-b33a-4889-869d-93500242980b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientmax_temperatureconditiongender
0138.5AM
1239.4AF
2339.5AM
3539.5AM
4638.1BF
5740.7BM
6837.9BF
7940.2BM
\n", + "
" + ], + "text/plain": [ + " patient max_temperature condition gender\n", + "0 1 38.5 A M\n", + "1 2 39.4 A F\n", + "2 3 39.5 A M\n", + "3 5 39.5 A M\n", + "4 6 38.1 B F\n", + "5 7 40.7 B M\n", + "6 8 37.9 B F\n", + "7 9 40.2 B M" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " COALESCE(exp.patient, mt.patient) AS patient,\n", + " MAX(exp.temperature) AS max_temperature,\n", + " ANY_VALUE(mt.condition) AS condition,\n", + " ANY_VALUE(mt.gender) AS gender\n", + " FROM patient_experiment AS exp INNER JOIN patient_metadata AS mt\n", + " USING (patient)\n", + " GROUP BY exp.patient, mt.patient\n", + " ORDER BY exp.patient, mt.patient\n", + "''').df()" + ] + }, + { + "cell_type": "markdown", + "id": "b2afc510-6340-4c59-aeec-e245d8358a0e", + "metadata": {}, + "source": [ + "## New style versus classic style" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "397405b0-756c-461a-81f1-e7af99b98d0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientdatetemperaturedose
062012-10-02 10:00:0037.50.0
162012-10-02 11:00:0038.12.0
262012-10-02 12:00:0037.93.0
362012-10-02 13:00:0037.72.0
462012-10-02 14:00:0037.21.0
562012-10-02 15:00:0036.80.0
\n", + "
" + ], + "text/plain": [ + " patient date temperature dose\n", + "0 6 2012-10-02 10:00:00 37.5 0.0\n", + "1 6 2012-10-02 11:00:00 38.1 2.0\n", + "2 6 2012-10-02 12:00:00 37.9 3.0\n", + "3 6 2012-10-02 13:00:00 37.7 2.0\n", + "4 6 2012-10-02 14:00:00 37.2 1.0\n", + "5 6 2012-10-02 15:00:00 36.8 0.0" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT patient, date, temperature, dose\n", + " FROM patient_experiment\n", + " WHERE patient == 6;\n", + "''').df()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "3f06bf99-ead9-4eec-8251-b834f5c2eb69", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────┬─────────────────────┬─────────────┬────────┐\n", + "│ patient │ date │ temperature │ dose │\n", + "│ int64 │ timestamp │ double │ double │\n", + "├─────────┼─────────────────────┼─────────────┼────────┤\n", + "│ 6 │ 2012-10-02 10:00:00 │ 37.5 │ 0.0 │\n", + "│ 6 │ 2012-10-02 11:00:00 │ 38.1 │ 2.0 │\n", + "│ 6 │ 2012-10-02 12:00:00 │ 37.9 │ 3.0 │\n", + "│ 6 │ 2012-10-02 13:00:00 │ 37.7 │ 2.0 │\n", + "│ 6 │ 2012-10-02 14:00:00 │ 37.2 │ 1.0 │\n", + "│ 6 │ 2012-10-02 15:00:00 │ 36.8 │ 0.0 │\n", + "└─────────┴─────────────────────┴─────────────┴────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('''\n", + " SELECT patient, date, temperature, dose\n", + " FROM patient_experiment\n", + "''').filter('patient = 6').show()" + ] + }, + { + "cell_type": "markdown", + "id": "cd9dee70-4c1c-4445-9b6c-f957791210fc", + "metadata": {}, + "source": [ + "For the patients with a high fever, count the number of timepoints they had a temperature above 39.5 °C as well as their maximum temperature." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0f29e357-5c17-4aca-92dc-ceaecf959023", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patienthigh_fever_countmax_temperature
07340.7
19140.2
\n", + "
" + ], + "text/plain": [ + " patient high_fever_count max_temperature\n", + "0 7 3 40.7\n", + "1 9 1 40.2" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " COUNT(temperature) AS high_fever_count,\n", + " MAX(temperature) AS max_temperature\n", + " FROM patient_experiment\n", + " WHERE temperature > 39.5\n", + " GROUP BY patient\n", + " ORDER BY patient;\n", + "''').df()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "d07c6fff-b72d-49c0-a0f7-f545e4013331", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────┬──────────────────┬─────────────────┐\n", + "│ patient │ high_fever_count │ max_temperature │\n", + "│ int64 │ int64 │ double │\n", + "├─────────┼──────────────────┼─────────────────┤\n", + "│ 7 │ 3 │ 40.7 │\n", + "│ 9 │ 1 │ 40.2 │\n", + "└─────────┴──────────────────┴─────────────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('SELECT patient, temperature FROM patient_experiment') \\\n", + " .filter('temperature > 39.5') \\\n", + " .aggregate(\n", + " 'patient, '\n", + " 'COUNT(temperature) AS high_fever_count, '\n", + " 'MAX(temperature) AS max_temperature',\n", + " group_expr='patient') \\\n", + " .show() " + ] + }, + { + "cell_type": "markdown", + "id": "65ab2e40-bbf8-4537-a41b-e2321568fb98", + "metadata": {}, + "source": [ + "For each patient, compute the total dose administered, as well as the maximum temperature, and order by descending maximum temperature." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "27f12e02-c92a-459a-a469-e733ba17052f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientmax_temperaturetotal_dose
0740.730.0
1940.230.0
2539.527.0
3339.513.0
4239.415.0
5138.56.0
6638.18.0
7438.110.0
8837.90.0
\n", + "
" + ], + "text/plain": [ + " patient max_temperature total_dose\n", + "0 7 40.7 30.0\n", + "1 9 40.2 30.0\n", + "2 5 39.5 27.0\n", + "3 3 39.5 13.0\n", + "4 2 39.4 15.0\n", + "5 1 38.5 6.0\n", + "6 6 38.1 8.0\n", + "7 4 38.1 10.0\n", + "8 8 37.9 0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute('''\n", + " SELECT\n", + " patient,\n", + " MAX(temperature) AS 'max_temperature',\n", + " SUM(dose) AS 'total_dose'\n", + " FROM patient_experiment\n", + " GROUP BY patient\n", + " ORDER BY max_temperature DESC;\n", + "''').df()" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "5f83cde9-129a-4f76-b199-3ae7adf3f08c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────┬─────────────────┬────────────┐\n", + "│ patient │ max_temperature │ total_dose │\n", + "│ int64 │ double │ double │\n", + "├─────────┼─────────────────┼────────────┤\n", + "│ 7 │ 40.7 │ 30.0 │\n", + "│ 9 │ 40.2 │ 30.0 │\n", + "│ 5 │ 39.5 │ 27.0 │\n", + "│ 3 │ 39.5 │ 13.0 │\n", + "│ 2 │ 39.4 │ 15.0 │\n", + "│ 1 │ 38.5 │ 6.0 │\n", + "│ 4 │ 38.1 │ 10.0 │\n", + "│ 6 │ 38.1 │ 8.0 │\n", + "│ 8 │ 37.9 │ 0.0 │\n", + "└─────────┴─────────────────┴────────────┘\n", + "\n" + ] + } + ], + "source": [ + "conn.sql('SELECT patient, temperature, dose from patient_experiment') \\\n", + " .aggregate(\n", + " 'patient, '\n", + " 'MAX(temperature) AS max_temperature, '\n", + " 'SUM(dose) AS total_dose',\n", + " group_expr='patient'\n", + " ) \\\n", + " .order('max_temperature DESC') \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "id": "ddf5f25c-b0c1-48c9-9031-5fd8a813649b", + "metadata": {}, + "source": [ + "The new-style queries allow for lazy evaluation, while the classic-style queries are evaluated immediately." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/source-code/pandas/README.md b/source-code/pandas/README.md index 746a50d..80881e6 100644 --- a/source-code/pandas/README.md +++ b/source-code/pandas/README.md @@ -25,7 +25,14 @@ easy to use. 1. `pipes.ipynb`: consolidating data processing using pipes. 1. `screenshots`: screenshots made for the slides. 1. `generate_csv_files.py`: script to generate CSV files in different - formats. + formats. 1. `copy_on_write.ipynb`: Jupyter notebook that illustrates how data is shared between related notebooks and the role Copy-on-Write plays in order to prevent accidental data modifications in more than one dataframe. +1. `apply.ipynb`: Jupyter notebook that illustrates the use of the `apply` method + in pandas dataframes for applying functions along rows or columns. It includes + a comparison of performance between using `apply` and vectorized operations. +1. `numba_and_pandas.ipynb`: Jupyter notebook that demonstrates how to use Numba + to optimize performance of operations on pandas dataframes. +1. `from_long_to_wide_and_back_again.ipynb`: Jupyter notebook that illustrates + how to reshape data using `stack` and `pivot` methods in pandas. 
diff --git a/source-code/pandas/apply.ipynb b/source-code/pandas/apply.ipynb new file mode 100644 index 0000000..35fc9c7 --- /dev/null +++ b/source-code/pandas/apply.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "c507c033-f47a-40f3-9d9d-d24d23e25474", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "c973633e-eccd-4a0f-873d-faf43fa3b836", + "metadata": {}, + "source": [ + "## apply" + ] + }, + { + "cell_type": "markdown", + "id": "f1401362-5955-495e-be57-5436a7446530", + "metadata": {}, + "source": [ + "Code that uses `.apply()` looks clean, but it is rather slow when used row-wise (`axis=1`). To quantify this, you can run the example below." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "af048047-df04-4c5f-8b36-d48f53d021ae", + "metadata": {}, + "outputs": [], + "source": [ + "size = 100_000\n", + "df = pd.DataFrame({\n", + " 'A': np.random.uniform(0.0, 1.0, size=size),\n", + " 'B': np.random.uniform(0.0, 1.0, size=size),\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "84b3d0d6-d9c3-4921-8561-80ef6d766f6f", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 100000 entries, 0 to 99999\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 A 100000 non-null float64\n", + " 1 B 100000 non-null float64\n", + "dtypes: float64(2)\n", + "memory usage: 1.5 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "9dfd0c4b-996d-4426-8b58-d66c78124a8f", + "metadata": {}, + "source": [ + "Note that this dataframe is fairly small." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d0b672e5-9762-496e-932f-4c5729c62061", + "metadata": {}, + "source": [ + "### Evaluating a condition" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "093ddcde-ee7f-4d66-847d-221e8181b9dc", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "551 ms ± 8.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit df.apply(lambda x: 0 if x.A + x.B < 1.0 else 1, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "6b10519f-26b5-4c74-af2f-ee34af35e96d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.17 ms ± 5.24 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit np.select([df.A + df.B < 1.0, df.A + df.B >= 1.0], [0, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e8b003c0-7445-475e-9ece-68a9783b1388", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "510 μs ± 4.17 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit np.where(df.A + df.B < 1.0, 0, 1)" + ] + }, + { + "cell_type": "markdown", + "id": "35ebd7e1-48bb-4d3b-860d-f0d765ffa62e", + "metadata": {}, + "source": [ + "Clearly, `.apply()` is very slow compared to `np.select()` and `np.where()`. Note that `np.where()` is faster than `np.select()` by a factor of 2."
+ ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "9bc83bfe-680e-4b3d-8017-970cf08fd956", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.array_equal(\n", + " df.apply(lambda x: 0 if x.A + x.B < 1.0 else 1, axis=1).to_numpy(),\n", + " np.where(df.A + df.B < 1.0, 0, 1),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "de5e05b5-154e-498c-a565-3116e490ae11", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.array_equal(\n", + " df.apply(lambda x: 0 if x.A + x.B < 1.0 else 1, axis=1).to_numpy(),\n", + " np.select([df.A + df.B < 1.0, df.A + df.B >= 1.0], [0, 1]),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9b46cd48-f1ce-4041-9560-6c1b09556d53", + "metadata": {}, + "source": [ + "All three approaches produce the same results." + ] + }, + { + "cell_type": "markdown", + "id": "c63e4df2-6fed-4072-aadd-3256a7c8cede", + "metadata": {}, + "source": [ + "### Adding a column" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "ef441507-f6f5-4485-b03f-36636259a848", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "563 ms ± 8.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit df['C'] = df.apply(lambda x: x.A + x.B, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "bd13d78b-b7fd-40c0-8b0e-3bdafdef4b33", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "176 μs ± 2.21 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit df['C'] = df.A + df.B" + ] + }, + { + "cell_type": "markdown", + "id": "3f092bfc-9f32-4636-ba95-52b2c07d2fdb", + "metadata": {}, + "source": [ + "Clearly, `.apply()` is very slow compared to a straightforward column definition. The difference is a factor of 1,000."
+ ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "5c8f3b66-1eea-4e58-9035-f6db4af3df3f", + "metadata": {}, + "outputs": [], + "source": [ + "assert df.apply(lambda x: x.A + x.B, axis=1).equals(df.A + df.B)" + ] + }, + { + "cell_type": "markdown", + "id": "c31c53ea-e297-4658-b55b-35ab47987237", + "metadata": {}, + "source": [ + "Both approaches yield the same result." + ] + }, + { + "cell_type": "markdown", + "id": "a32a0791-8063-40ac-83d0-93a5ab796c70", + "metadata": {}, + "source": [ + "### Aggregating columns" + ] + }, + { + "cell_type": "markdown", + "id": "8be8ec5b-878b-4452-9815-9c0a23f97d9d", + "metadata": {}, + "source": [ + "Although less dramatically so, applying `.apply()` along axis 0 is also slower than its numpy counterpart." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "47d6aca2-f52e-4746-a139-119fcdfe3030", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "303 μs ± 4.28 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit df.apply(np.sum, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "1e4ed799-08fd-4c14-bdf0-f6db5b829c0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "179 μs ± 10.2 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit np.sum(df.to_numpy(), axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "d0504d19-a6d8-4f3d-a4ff-73e9c04152e4", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.array_equal(df.apply(np.sum, axis=0), np.sum(df.to_numpy(), axis=0))" + ] + }, + { + "cell_type": "markdown", + "id": "ce8fb4ac-795e-43e3-ae72-fa528df86855", + "metadata": {}, + "source": [ + "Again, both produce the same result." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/source-code/pandas/copy_on_write.ipynb b/source-code/pandas/copy_on_write.ipynb index 744c29b..d18c56d 100644 --- a/source-code/pandas/copy_on_write.ipynb +++ b/source-code/pandas/copy_on_write.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "id": "9550bae2-79e5-4db6-b8eb-4b64dbc1b4e1", "metadata": {}, "outputs": [], @@ -37,6 +37,27 @@ "The answer to that question seems to depend on the version of `pandas`." ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "44918d07-7c0c-45ba-a1bc-04a0fa27c06d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.3.3'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.__version__" + ] + }, { "cell_type": "markdown", "id": "ea08f1e0-78c1-463c-994c-92066363b006", @@ -55,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "id": "520bc27d-5d55-46dc-a7bf-5b9b12162b9d", "metadata": {}, "outputs": [], @@ -70,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "id": "432772e9-3031-4001-81e3-c346cb7f9c76", "metadata": {}, "outputs": [ @@ -106,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "id": "eb6f4dad-6df3-493e-869f-204d06b0cc7d", "metadata": {}, "outputs": [ @@ -147,50 +168,50 @@ " \n", " \n", " mean\n", - " 4.225888\n", - " 1.284407\n", - " -2.830820\n", - " 0.143920\n", + " -0.666188\n", + " -0.175422\n", + " 4.077410\n", + " 
2.294040\n", " \n", " \n", " std\n", - " 577.759840\n", - " 577.063194\n", - " 577.528168\n", - " 576.729627\n", + " 576.312351\n", + " 576.718069\n", + " 577.017098\n", + " 577.149466\n", " \n", " \n", " min\n", - " -999.977694\n", - " -999.908941\n", + " -999.995592\n", + " -999.992864\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -495.491551\n", - " -497.783061\n", - " -506.000000\n", + " -499.708991\n", + " -497.766770\n", + " -496.000000\n", " -499.000000\n", " \n", " \n", " 50%\n", - " 4.748510\n", - " 2.103870\n", - " -4.000000\n", - " 0.000000\n", + " -1.080819\n", + " -0.135347\n", + " 2.000000\n", + " 6.000000\n", " \n", " \n", " 75%\n", - " 505.160586\n", - " 500.502871\n", - " 497.000000\n", - " 498.000000\n", + " 498.134233\n", + " 495.339400\n", + " 504.000000\n", + " 501.000000\n", " \n", " \n", " max\n", - " 999.992940\n", - " 999.947904\n", + " 999.976901\n", + " 999.997666\n", " 999.000000\n", " 999.000000\n", " \n", @@ -201,16 +222,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 100000.000000 100000.000000 100000.000000 100000.000000\n", - "mean 4.225888 1.284407 -2.830820 0.143920\n", - "std 577.759840 577.063194 577.528168 576.729627\n", - "min -999.977694 -999.908941 -999.000000 -999.000000\n", - "25% -495.491551 -497.783061 -506.000000 -499.000000\n", - "50% 4.748510 2.103870 -4.000000 0.000000\n", - "75% 505.160586 500.502871 497.000000 498.000000\n", - "max 999.992940 999.947904 999.000000 999.000000" + "mean -0.666188 -0.175422 4.077410 2.294040\n", + "std 576.312351 576.718069 577.017098 577.149466\n", + "min -999.995592 -999.992864 -999.000000 -999.000000\n", + "25% -499.708991 -497.766770 -496.000000 -499.000000\n", + "50% -1.080819 -0.135347 2.000000 6.000000\n", + "75% 498.134233 495.339400 504.000000 501.000000\n", + "max 999.976901 999.997666 999.000000 999.000000" ] }, - "execution_count": 17, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -229,7 +250,7 @@ }, { 
"cell_type": "code", - "execution_count": 18, + "execution_count": 6, "id": "1a43e7eb-a7ad-4089-95ba-15879f5920ce", "metadata": {}, "outputs": [], @@ -239,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 7, "id": "1af4b0bc-81eb-4dab-9a65-fb5f71d004c2", "metadata": {}, "outputs": [ @@ -275,56 +296,56 @@ " count\n", " 50000.000000\n", " 50000.000000\n", - " 50000.00000\n", + " 50000.000000\n", " 50000.000000\n", " \n", " \n", " mean\n", - " 4.828118\n", - " 0.964712\n", - " -2.11148\n", - " -3.656300\n", + " -0.119765\n", + " -2.058561\n", + " -0.764240\n", + " 3.063220\n", " \n", " \n", " std\n", - " 577.936131\n", - " 577.971021\n", - " 577.96331\n", - " 576.181292\n", + " 575.680081\n", + " 578.254495\n", + " 577.348834\n", + " 577.086857\n", " \n", " \n", " min\n", - " -999.944283\n", - " -999.897600\n", - " -999.00000\n", + " -999.933748\n", + " -999.992864\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -493.817751\n", - " -500.003775\n", - " -504.00000\n", - " -503.000000\n", + " -499.293681\n", + " -499.616524\n", + " -501.000000\n", + " -498.000000\n", " \n", " \n", " 50%\n", - " 4.462345\n", - " 3.577384\n", - " -6.00000\n", - " -2.000000\n", + " -1.213840\n", + " -4.281987\n", + " -3.000000\n", + " 10.000000\n", " \n", " \n", " 75%\n", - " 503.114598\n", - " 500.886860\n", - " 501.00000\n", - " 494.000000\n", + " 498.062126\n", + " 495.209044\n", + " 500.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.964569\n", - " 999.864196\n", - " 999.00000\n", + " 999.976901\n", + " 999.997666\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -332,18 +353,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 50000.000000 50000.000000 50000.00000 50000.000000\n", - "mean 4.828118 0.964712 -2.11148 -3.656300\n", - "std 577.936131 577.971021 577.96331 576.181292\n", - "min -999.944283 -999.897600 -999.00000 -999.000000\n", - "25% -493.817751 -500.003775 -504.00000 
-503.000000\n", - "50% 4.462345 3.577384 -6.00000 -2.000000\n", - "75% 503.114598 500.886860 501.00000 494.000000\n", - "max 999.964569 999.864196 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 50000.000000 50000.000000 50000.000000 50000.000000\n", + "mean -0.119765 -2.058561 -0.764240 3.063220\n", + "std 575.680081 578.254495 577.348834 577.086857\n", + "min -999.933748 -999.992864 -999.000000 -999.000000\n", + "25% -499.293681 -499.616524 -501.000000 -498.000000\n", + "50% -1.213840 -4.281987 -3.000000 10.000000\n", + "75% 498.062126 495.209044 500.000000 499.250000\n", + "max 999.976901 999.997666 999.000000 999.000000" ] }, - "execution_count": 19, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -362,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 8, "id": "65dfb904-0b0a-4b4a-baab-77011a840910", "metadata": {}, "outputs": [ @@ -370,7 +391,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_565/3787905307.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "/tmp/ipykernel_15868/3787905307.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", @@ -386,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "id": "cbeb2db9-7018-4cac-9dd4-2880d2f7a214", "metadata": {}, "outputs": [ @@ -422,56 +443,56 @@ " count\n", " 50000.000000\n", " 50000.000000\n", - " 50000.00000\n", + " 50000.000000\n", " 50000.000000\n", " \n", " \n", " mean\n", - " 2.605923\n", - " 0.964712\n", - " -2.11148\n", - " -3.656300\n", + " 0.147446\n", + " -2.058561\n", + " -0.764240\n", + " 3.063220\n", " \n", " \n", " std\n", - " 408.136559\n", - " 577.971021\n", - " 577.96331\n", - " 576.181292\n", + " 407.380662\n", + " 578.254495\n", + " 577.348834\n", + " 577.086857\n", " \n", " \n", " min\n", " -500.000000\n", - " -999.897600\n", - " -999.00000\n", + " -999.992864\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -493.817751\n", - " -500.003775\n", - " -504.00000\n", - " -503.000000\n", + " -499.293681\n", + " -499.616524\n", + " -501.000000\n", + " -498.000000\n", " \n", " \n", " 50%\n", - " 4.462345\n", - " 3.577384\n", - " -6.00000\n", - " -2.000000\n", + " -1.213840\n", + " -4.281987\n", + " -3.000000\n", + " 10.000000\n", " \n", " \n", " 75%\n", + " 498.062126\n", + " 495.209044\n", " 500.000000\n", - " 500.886860\n", - " 501.00000\n", - " 494.000000\n", + " 499.250000\n", " \n", " \n", " max\n", " 500.000000\n", - " 999.864196\n", - " 999.00000\n", + " 999.997666\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -479,18 +500,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 50000.000000 50000.000000 50000.00000 50000.000000\n", - "mean 2.605923 0.964712 -2.11148 -3.656300\n", - "std 408.136559 
577.971021 577.96331 576.181292\n", - "min -500.000000 -999.897600 -999.00000 -999.000000\n", - "25% -493.817751 -500.003775 -504.00000 -503.000000\n", - "50% 4.462345 3.577384 -6.00000 -2.000000\n", - "75% 500.000000 500.886860 501.00000 494.000000\n", - "max 500.000000 999.864196 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 50000.000000 50000.000000 50000.000000 50000.000000\n", + "mean 0.147446 -2.058561 -0.764240 3.063220\n", + "std 407.380662 578.254495 577.348834 577.086857\n", + "min -500.000000 -999.992864 -999.000000 -999.000000\n", + "25% -499.293681 -499.616524 -501.000000 -498.000000\n", + "50% -1.213840 -4.281987 -3.000000 10.000000\n", + "75% 498.062126 495.209044 500.000000 499.250000\n", + "max 500.000000 999.997666 999.000000 999.000000" ] }, - "execution_count": 21, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -543,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 10, "id": "b4cff949-c4aa-4f7d-b1e4-b4b78bf0284b", "metadata": {}, "outputs": [], @@ -561,7 +582,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "id": "6bb4fec9-0623-4482-8ebd-9077723956e0", "metadata": {}, "outputs": [], @@ -576,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "id": "f005ad9f-3b19-40bf-b01a-4f7fd8f4d024", "metadata": {}, "outputs": [], @@ -586,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "id": "91499993-8461-437d-b7b1-7f0caf20d7d6", "metadata": {}, "outputs": [ @@ -627,50 +648,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " 
-999.968792\n", + " -999.965920\n", + " -999.963945\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -681,16 +702,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 25, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -701,7 +722,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "id": "abf7f30b-2bd0-4b3b-94fd-f9b6f183b26b", "metadata": {}, "outputs": [ @@ -709,7 +730,7 @@ "name": "stderr", "output_type": "stream", "text": [ - 
"/tmp/ipykernel_565/3016868282.py:1: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "/tmp/ipykernel_15868/3016868282.py:1: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "When using the Copy-on-Write mode, such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object.\n", @@ -733,7 +754,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 15, "id": "34e80602-a9fa-44d2-9fbb-3b6d4bd6a3d3", "metadata": {}, "outputs": [ @@ -774,50 +795,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.968792\n", + " -999.965920\n", + " -999.963945\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 
999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -828,16 +849,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 28, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -856,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 16, "id": "acc94632-2ed2-4dc1-a067-6d9518044c7b", "metadata": {}, "outputs": [ @@ -892,56 +913,56 @@ " count\n", " 100000.000000\n", " 100000.000000\n", - " 100000.00000\n", + " 100000.000000\n", " 100000.000000\n", " \n", " \n", " mean\n", - " 1.623964\n", - " -3.764614\n", - " 1.34378\n", - " 1.478400\n", + " 0.540672\n", + " 4.948220\n", + " 1.349060\n", + " 1.572440\n", " \n", " \n", " std\n", - " 577.971329\n", - " 577.167935\n", - " 575.56337\n", - " 577.826276\n", + " 577.009925\n", + " 577.169270\n", + " 576.321436\n", + " 576.108494\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.980827\n", - " -999.00000\n", + " -999.965920\n", + " -999.988117\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -499.179172\n", - " -503.072353\n", - " -497.00000\n", - " -497.000000\n", + " -499.069860\n", 
+ " -494.820546\n", + " -496.000000\n", + " -496.000000\n", " \n", " \n", " 50%\n", - " 3.133260\n", - " -5.691740\n", - " 1.00000\n", - " 2.000000\n", + " 2.821413\n", + " 5.823958\n", + " 3.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.454035\n", - " 495.773629\n", - " 497.00000\n", + " 499.807599\n", + " 505.991898\n", " 501.000000\n", + " 500.000000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.985790\n", - " 999.00000\n", + " 999.984508\n", + " 999.948488\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -949,18 +970,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 100000.000000 100000.000000 100000.00000 100000.000000\n", - "mean 1.623964 -3.764614 1.34378 1.478400\n", - "std 577.971329 577.167935 575.56337 577.826276\n", - "min -999.983952 -999.980827 -999.00000 -999.000000\n", - "25% -499.179172 -503.072353 -497.00000 -497.000000\n", - "50% 3.133260 -5.691740 1.00000 2.000000\n", - "75% 502.454035 495.773629 497.00000 501.000000\n", - "max 999.998666 999.985790 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 100000.000000 100000.000000 100000.000000 100000.000000\n", + "mean 0.540672 4.948220 1.349060 1.572440\n", + "std 577.009925 577.169270 576.321436 576.108494\n", + "min -999.965920 -999.988117 -999.000000 -999.000000\n", + "25% -499.069860 -494.820546 -496.000000 -496.000000\n", + "50% 2.821413 5.823958 3.000000 1.000000\n", + "75% 499.807599 505.991898 501.000000 500.000000\n", + "max 999.984508 999.948488 999.000000 999.000000" ] }, - "execution_count": 29, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -979,7 +1000,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 17, "id": "1b86dd08-d8de-4b03-a363-80d18201b4da", "metadata": {}, "outputs": [], @@ -997,7 +1018,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 18, "id": "dea886ac-e1a2-4903-b550-e73a2be70507", "metadata": {}, "outputs": [ 
@@ -1038,50 +1059,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.968792\n", + " -999.965920\n", + " -999.963945\n", " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -1092,16 +1113,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 
999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 31, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1120,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 19, "id": "c73063c8-4e9d-42ba-9804-1286b769ad54", "metadata": {}, "outputs": [ @@ -1156,56 +1177,56 @@ " count\n", " 100000.000000\n", " 100000.000000\n", - " 100000.00000\n", + " 100000.000000\n", " 100000.000000\n", " \n", " \n", " mean\n", - " 1.174175\n", - " -3.764614\n", - " 1.34378\n", - " 1.478400\n", + " 0.653302\n", + " 4.948220\n", + " 1.349060\n", + " 1.572440\n", " \n", " \n", " std\n", - " 408.414807\n", - " 577.167935\n", - " 575.56337\n", - " 577.826276\n", + " 408.053983\n", + " 577.169270\n", + " 576.321436\n", + " 576.108494\n", " \n", " \n", " min\n", " -500.000000\n", - " -999.980827\n", - " -999.00000\n", + " -999.988117\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -499.179172\n", - " -503.072353\n", - " -497.00000\n", - " -497.000000\n", + " -499.069860\n", + " -494.820546\n", + " -496.000000\n", + " -496.000000\n", " \n", " \n", " 50%\n", - " 3.133260\n", - " -5.691740\n", - " 1.00000\n", - " 2.000000\n", + " 2.821413\n", + " 5.823958\n", + " 3.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 500.000000\n", - " 495.773629\n", - " 497.00000\n", + " 499.807599\n", + " 505.991898\n", " 501.000000\n", + " 500.000000\n", " \n", " \n", " max\n", " 500.000000\n", - " 999.985790\n", - " 999.00000\n", + " 999.948488\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -1213,18 +1234,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 100000.000000 100000.000000 100000.00000 100000.000000\n", - "mean 1.174175 -3.764614 1.34378 1.478400\n", - "std 408.414807 577.167935 575.56337 577.826276\n", - "min -500.000000 -999.980827 -999.00000 -999.000000\n", - "25% -499.179172 -503.072353 -497.00000 -497.000000\n", - "50% 3.133260 -5.691740 1.00000 2.000000\n", 
- "75% 500.000000 495.773629 497.00000 501.000000\n", - "max 500.000000 999.985790 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 100000.000000 100000.000000 100000.000000 100000.000000\n", + "mean 0.653302 4.948220 1.349060 1.572440\n", + "std 408.053983 577.169270 576.321436 576.108494\n", + "min -500.000000 -999.988117 -999.000000 -999.000000\n", + "25% -499.069860 -494.820546 -496.000000 -496.000000\n", + "50% 2.821413 5.823958 3.000000 1.000000\n", + "75% 499.807599 505.991898 501.000000 500.000000\n", + "max 500.000000 999.948488 999.000000 999.000000" ] }, - "execution_count": 32, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1243,7 +1264,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 20, "id": "1151d8a4-db49-4524-b7f3-bd38866e3d5e", "metadata": {}, "outputs": [], @@ -1253,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 21, "id": "bd8ce0d1-73cb-4982-b4fe-65e7d2144943", "metadata": {}, "outputs": [ @@ -1289,56 +1310,56 @@ " count\n", " 100000.000000\n", " 100000.000000\n", - " 100000.00000\n", + " 100000.000000\n", " 100000.000000\n", " \n", " \n", " mean\n", - " 1.174175\n", - " -2.876739\n", - " 1.34378\n", - " 1.478400\n", + " 0.653302\n", + " 3.035664\n", + " 1.349060\n", + " 1.572440\n", " \n", " \n", " std\n", - " 408.414807\n", - " 408.107880\n", - " 575.56337\n", - " 577.826276\n", + " 408.053983\n", + " 408.488775\n", + " 576.321436\n", + " 576.108494\n", " \n", " \n", " min\n", " -500.000000\n", " -500.000000\n", - " -999.00000\n", + " -999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -499.179172\n", - " -500.000000\n", - " -497.00000\n", - " -497.000000\n", + " -499.069860\n", + " -494.820546\n", + " -496.000000\n", + " -496.000000\n", " \n", " \n", " 50%\n", - " 3.133260\n", - " -5.691740\n", - " 1.00000\n", - " 2.000000\n", + " 2.821413\n", + " 5.823958\n", + " 3.000000\n", + " 1.000000\n", " \n", " \n", " 
75%\n", + " 499.807599\n", " 500.000000\n", - " 495.773629\n", - " 497.00000\n", " 501.000000\n", + " 500.000000\n", " \n", " \n", " max\n", " 500.000000\n", " 500.000000\n", - " 999.00000\n", + " 999.000000\n", " 999.000000\n", " \n", " \n", @@ -1346,18 +1367,18 @@ "" ], "text/plain": [ - " column1 column2 column3 column4\n", - "count 100000.000000 100000.000000 100000.00000 100000.000000\n", - "mean 1.174175 -2.876739 1.34378 1.478400\n", - "std 408.414807 408.107880 575.56337 577.826276\n", - "min -500.000000 -500.000000 -999.00000 -999.000000\n", - "25% -499.179172 -500.000000 -497.00000 -497.000000\n", - "50% 3.133260 -5.691740 1.00000 2.000000\n", - "75% 500.000000 495.773629 497.00000 501.000000\n", - "max 500.000000 500.000000 999.00000 999.000000" + " column1 column2 column3 column4\n", + "count 100000.000000 100000.000000 100000.000000 100000.000000\n", + "mean 0.653302 3.035664 1.349060 1.572440\n", + "std 408.053983 408.488775 576.321436 576.108494\n", + "min -500.000000 -500.000000 -999.000000 -999.000000\n", + "25% -499.069860 -494.820546 -496.000000 -496.000000\n", + "50% 2.821413 5.823958 3.000000 1.000000\n", + "75% 499.807599 500.000000 501.000000 500.000000\n", + "max 500.000000 500.000000 999.000000 999.000000" ] }, - "execution_count": 34, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1376,7 +1397,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 22, "id": "bea974f5-367b-4432-893f-a33a449dda6f", "metadata": {}, "outputs": [ @@ -1417,50 +1438,50 @@ " \n", " \n", " mean\n", - " 1.695724\n", - " -3.040382\n", - " 4.001460\n", - " 0.511100\n", + " 0.455395\n", + " 5.588258\n", + " 1.329300\n", + " 1.763540\n", " \n", " \n", " std\n", - " 578.627560\n", - " 576.453798\n", - " 576.244217\n", - " 578.301376\n", + " 577.070175\n", + " 577.342945\n", + " 577.130306\n", + " 575.554149\n", " \n", " \n", " min\n", - " -999.983952\n", - " -999.968792\n", + " -999.965920\n", + " -999.963945\n", " 
-999.000000\n", " -999.000000\n", " \n", " \n", " 25%\n", - " -502.160012\n", - " -500.333306\n", - " -492.250000\n", - " -498.000000\n", + " -500.420461\n", + " -495.503431\n", + " -497.000000\n", + " -493.000000\n", " \n", " \n", " 50%\n", - " 4.057258\n", - " -4.377464\n", + " 3.895165\n", + " 9.582978\n", " 3.000000\n", - " 0.000000\n", + " 1.000000\n", " \n", " \n", " 75%\n", - " 502.159214\n", - " 494.614704\n", - " 499.000000\n", - " 500.250000\n", + " 496.851678\n", + " 505.826690\n", + " 501.000000\n", + " 499.250000\n", " \n", " \n", " max\n", - " 999.998666\n", - " 999.913716\n", + " 999.979256\n", + " 999.948488\n", " 999.000000\n", " 999.000000\n", " \n", @@ -1471,16 +1492,16 @@ "text/plain": [ " column1 column2 column3 column4\n", "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 1.695724 -3.040382 4.001460 0.511100\n", - "std 578.627560 576.453798 576.244217 578.301376\n", - "min -999.983952 -999.968792 -999.000000 -999.000000\n", - "25% -502.160012 -500.333306 -492.250000 -498.000000\n", - "50% 4.057258 -4.377464 3.000000 0.000000\n", - "75% 502.159214 494.614704 499.000000 500.250000\n", - "max 999.998666 999.913716 999.000000 999.000000" + "mean 0.455395 5.588258 1.329300 1.763540\n", + "std 577.070175 577.342945 577.130306 575.554149\n", + "min -999.965920 -999.963945 -999.000000 -999.000000\n", + "25% -500.420461 -495.503431 -497.000000 -493.000000\n", + "50% 3.895165 9.582978 3.000000 1.000000\n", + "75% 496.851678 505.826690 501.000000 499.250000\n", + "max 999.979256 999.948488 999.000000 999.000000" ] }, - "execution_count": 35, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1507,7 +1528,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 23, "id": "e4aa3b34-c948-4efe-b523-37e3ab2b2522", "metadata": {}, "outputs": [], @@ -1517,7 +1538,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 24, "id": "018ed5c4-de6f-4a18-bc11-ae3917dcf481", 
"metadata": {}, "outputs": [], @@ -1532,7 +1553,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 25, "id": "08daae66-9e28-4394-8021-b64cd806a4ab", "metadata": {}, "outputs": [ @@ -1540,7 +1561,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "6.94 ms ± 638 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "7.19 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -1551,7 +1572,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 26, "id": "04dd1afc-11cb-46a6-9e15-28461c9e96b3", "metadata": {}, "outputs": [ @@ -1559,7 +1580,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "946 μs ± 49.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + "962 μs ± 137 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -1570,7 +1591,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 27, "id": "5bc546df-96b7-446c-be09-3bdc48b6e86b", "metadata": {}, "outputs": [], @@ -1580,7 +1601,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 28, "id": "e8d2d195-0c5f-48ee-956f-895c6ae45fa3", "metadata": {}, "outputs": [], @@ -1595,7 +1616,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 29, "id": "5d455f94-47f4-4582-8019-15763457198b", "metadata": {}, "outputs": [ @@ -1603,7 +1624,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "778 ms ± 78.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "971 ms ± 162 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1614,7 +1635,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 30, "id": "b8b68166-7010-41d7-b2ea-f6536dc2d147", "metadata": {}, "outputs": [ @@ -1622,7 +1643,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "49 ms ± 6.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "66.8 ms ± 13.7 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], @@ -1636,7 +1657,7 @@ "id": "ca530907-9cda-408f-816a-f6f34351e0d9", "metadata": {}, "source": [ - "As you can see, assigning the column to perform an in--place operations is significantly faster." + "As you can see, assigning the column to perform an in-place operation is significantly faster." ] }, { @@ -1688,7 +1709,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.12.12" } }, "nbformat": 4, diff --git a/source-code/pandas/from_long_to_wide_and_back_again.ipynb b/source-code/pandas/from_long_to_wide_and_back_again.ipynb new file mode 100644 index 0000000..a3610d4 --- /dev/null +++ b/source-code/pandas/from_long_to_wide_and_back_again.ipynb @@ -0,0 +1,1078 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4773bfbd-a3d3-4683-b51e-c80a649a7caf", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d7ab15e7-4fd2-4e81-a9ae-8469b05ec45c", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "ee9056b1-0b76-4cbb-ad68-049b0095b62d", + "metadata": {}, + "source": [ + "## Original dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "2c1acc53-9064-4589-9e88-541175d3deb0", + "metadata": {}, + "outputs": [], + "source": [ + "df_orig = pd.read_excel('data/patient_experiment.xlsx',\n", + " dtype={'dose': np.float32,\n", + " 'temperature': np.float32})" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b64f834e-b044-422c-830f-112868864269", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 62 entries, 0 to 61\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 patient 62 non-null int64 \n", + 
" 1 dose 61 non-null float32 \n", + " 2 date 62 non-null datetime64[ns]\n", + " 3 temperature 61 non-null float32 \n", + "dtypes: datetime64[ns](1), float32(2), int64(1)\n", + "memory usage: 1.6 KB\n" + ] + } + ], + "source": [ + "df_orig.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "df59f8e2-0cb6-4dae-b2b2-71a89a5a894a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientdosedatetemperature
010.02012-10-02 10:00:0038.299999
112.02012-10-02 11:00:0038.500000
212.02012-10-02 12:00:0038.099998
312.02012-10-02 13:00:0037.299999
410.02012-10-02 14:00:0037.500000
\n", + "
" + ], + "text/plain": [ + " patient dose date temperature\n", + "0 1 0.0 2012-10-02 10:00:00 38.299999\n", + "1 1 2.0 2012-10-02 11:00:00 38.500000\n", + "2 1 2.0 2012-10-02 12:00:00 38.099998\n", + "3 1 2.0 2012-10-02 13:00:00 37.299999\n", + "4 1 0.0 2012-10-02 14:00:00 37.500000" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_orig.head()" + ] + }, + { + "cell_type": "markdown", + "id": "115ce86f-4e2f-4ea3-a240-2cc68d5721d8", + "metadata": {}, + "source": [ + "## To wide format: pivot" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "1adfa94d-dbb6-41cc-af21-9ca6d1e97226", + "metadata": {}, + "outputs": [], + "source": [ + "df_wide = df_orig.pivot(\n", + " index='date',\n", + " values=['temperature', 'dose'],\n", + " columns=['patient']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "18c3ba13-5949-4c14-95f2-1577466313b3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "DatetimeIndex: 7 entries, 2012-10-02 10:00:00 to 2012-10-02 16:00:00\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 (temperature, 1) 7 non-null float32\n", + " 1 (temperature, 2) 7 non-null float32\n", + " 2 (temperature, 3) 6 non-null float32\n", + " 3 (temperature, 4) 7 non-null float32\n", + " 4 (temperature, 5) 7 non-null float32\n", + " 5 (temperature, 6) 6 non-null float32\n", + " 6 (temperature, 7) 7 non-null float32\n", + " 7 (temperature, 8) 7 non-null float32\n", + " 8 (temperature, 9) 7 non-null float32\n", + " 9 (dose, 1) 7 non-null float32\n", + " 10 (dose, 2) 7 non-null float32\n", + " 11 (dose, 3) 7 non-null float32\n", + " 12 (dose, 4) 6 non-null float32\n", + " 13 (dose, 5) 7 non-null float32\n", + " 14 (dose, 6) 6 non-null float32\n", + " 15 (dose, 7) 7 non-null float32\n", + " 16 (dose, 8) 7 non-null 
float32\n", + " 17 (dose, 9) 7 non-null float32\n", + "dtypes: float32(18)\n", + "memory usage: 560.0 bytes\n" + ] + } + ], + "source": [ + "df_wide.info()" + ] + }, + { + "cell_type": "markdown", + "id": "ea3a9d6b-fd0a-474d-8c42-df7a1f7d7e08", + "metadata": {}, + "source": [ + "Now you have a dataframe with the date as index, and multi-level columns. The top level is the temperature and the dose, and the next level is the patient ID." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a41a59d3-1d12-4aee-a7c1-3b85a9950b07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
temperaturedose
patient123456789123456789
date
2012-10-02 10:00:0038.29999939.29999937.90000238.09999837.90000237.50000039.50000037.79999938.2999990.00.00.00.00.00.00.00.00.0
2012-10-02 11:00:0038.50000039.40000239.50000037.20000139.50000038.09999840.70000137.90000239.5000002.05.02.05.03.02.010.00.010.0
2012-10-02 12:00:0038.09999838.09999838.29999936.09999838.29999937.90000239.79999937.40000240.2000012.05.05.05.07.03.05.00.012.0
2012-10-02 13:00:0037.29999937.299999NaN35.90000238.50000037.70000140.20000137.59999839.0999982.05.02.00.05.02.08.00.04.0
2012-10-02 14:00:0037.50000036.79999937.70000136.29999939.40000237.20000138.29999937.29999937.9000020.00.02.0NaN9.01.03.00.04.0
\n", + "
" + ], + "text/plain": [ + " temperature \\\n", + "patient 1 2 3 4 5 \n", + "date \n", + "2012-10-02 10:00:00 38.299999 39.299999 37.900002 38.099998 37.900002 \n", + "2012-10-02 11:00:00 38.500000 39.400002 39.500000 37.200001 39.500000 \n", + "2012-10-02 12:00:00 38.099998 38.099998 38.299999 36.099998 38.299999 \n", + "2012-10-02 13:00:00 37.299999 37.299999 NaN 35.900002 38.500000 \n", + "2012-10-02 14:00:00 37.500000 36.799999 37.700001 36.299999 39.400002 \n", + "\n", + " dose \\\n", + "patient 6 7 8 9 1 2 \n", + "date \n", + "2012-10-02 10:00:00 37.500000 39.500000 37.799999 38.299999 0.0 0.0 \n", + "2012-10-02 11:00:00 38.099998 40.700001 37.900002 39.500000 2.0 5.0 \n", + "2012-10-02 12:00:00 37.900002 39.799999 37.400002 40.200001 2.0 5.0 \n", + "2012-10-02 13:00:00 37.700001 40.200001 37.599998 39.099998 2.0 5.0 \n", + "2012-10-02 14:00:00 37.200001 38.299999 37.299999 37.900002 0.0 0.0 \n", + "\n", + " \n", + "patient 3 4 5 6 7 8 9 \n", + "date \n", + "2012-10-02 10:00:00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2012-10-02 11:00:00 2.0 5.0 3.0 2.0 10.0 0.0 10.0 \n", + "2012-10-02 12:00:00 5.0 5.0 7.0 3.0 5.0 0.0 12.0 \n", + "2012-10-02 13:00:00 2.0 0.0 5.0 2.0 8.0 0.0 4.0 \n", + "2012-10-02 14:00:00 2.0 NaN 9.0 1.0 3.0 0.0 4.0 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_wide.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fd0c0272-d182-47c4-bf41-81c869b06932", + "metadata": {}, + "source": [ + "## And back again: stack + reset index" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "81d18cc4-0451-4fd2-ac0a-56e868a31df0", + "metadata": {}, + "outputs": [], + "source": [ + "df_long = df_wide \\\n", + " .stack('patient', future_stack=True) \\\n", + " .reset_index() " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "3a0b4e1f-560e-4639-bcc1-330bdf82c6d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "\n", + "RangeIndex: 63 entries, 0 to 62\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 date 63 non-null datetime64[ns]\n", + " 1 patient 63 non-null int64 \n", + " 2 temperature 61 non-null float32 \n", + " 3 dose 61 non-null float32 \n", + "dtypes: datetime64[ns](1), float32(2), int64(1)\n", + "memory usage: 1.6 KB\n" + ] + } + ], + "source": [ + "df_long.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "551264e3-967b-43a2-b8c3-494095f88f87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepatienttemperaturedose
02012-10-02 10:00:00138.2999990.0
12012-10-02 10:00:00239.2999990.0
22012-10-02 10:00:00337.9000020.0
32012-10-02 10:00:00438.0999980.0
42012-10-02 10:00:00537.9000020.0
\n", + "
" + ], + "text/plain": [ + " date patient temperature dose\n", + "0 2012-10-02 10:00:00 1 38.299999 0.0\n", + "1 2012-10-02 10:00:00 2 39.299999 0.0\n", + "2 2012-10-02 10:00:00 3 37.900002 0.0\n", + "3 2012-10-02 10:00:00 4 38.099998 0.0\n", + "4 2012-10-02 10:00:00 5 37.900002 0.0" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_long.head()" + ] + }, + { + "cell_type": "markdown", + "id": "00be5fc1-e40d-4091-b0d6-4019a412f1e0", + "metadata": {}, + "source": [ + "Breaking it down into two steps, first the `stack()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "507b3ec3-6780-40e7-a7b8-c2345c3efa61", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
temperaturedose
datepatient
2012-10-02 10:00:00138.2999990.0
239.2999990.0
337.9000020.0
438.0999980.0
537.9000020.0
............
2012-10-02 16:00:00537.2000010.0
6NaNNaN
737.2999991.0
836.7999990.0
937.2999990.0
\n", + "

63 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " temperature dose\n", + "date patient \n", + "2012-10-02 10:00:00 1 38.299999 0.0\n", + " 2 39.299999 0.0\n", + " 3 37.900002 0.0\n", + " 4 38.099998 0.0\n", + " 5 37.900002 0.0\n", + "... ... ...\n", + "2012-10-02 16:00:00 5 37.200001 0.0\n", + " 6 NaN NaN\n", + " 7 37.299999 1.0\n", + " 8 36.799999 0.0\n", + " 9 37.299999 0.0\n", + "\n", + "[63 rows x 2 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_wide.stack(\"patient\", future_stack=True)" + ] + }, + { + "cell_type": "markdown", + "id": "57e15671-7a85-41b0-bcb5-fe2e089efdc6", + "metadata": {}, + "source": [ + "As you can see, this has created a dataframe that has only two columns, but a multi-level index. The top-level index is the date, the sublevel is the patient." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "4e351540-2f48-4ef3-aa47-e8929cb1c4e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepatienttemperaturedose
02012-10-02 10:00:00138.2999990.0
12012-10-02 10:00:00239.2999990.0
22012-10-02 10:00:00337.9000020.0
32012-10-02 10:00:00438.0999980.0
42012-10-02 10:00:00537.9000020.0
...............
582012-10-02 16:00:00537.2000010.0
592012-10-02 16:00:006NaNNaN
602012-10-02 16:00:00737.2999991.0
612012-10-02 16:00:00836.7999990.0
622012-10-02 16:00:00937.2999990.0
\n", + "

63 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " date patient temperature dose\n", + "0 2012-10-02 10:00:00 1 38.299999 0.0\n", + "1 2012-10-02 10:00:00 2 39.299999 0.0\n", + "2 2012-10-02 10:00:00 3 37.900002 0.0\n", + "3 2012-10-02 10:00:00 4 38.099998 0.0\n", + "4 2012-10-02 10:00:00 5 37.900002 0.0\n", + ".. ... ... ... ...\n", + "58 2012-10-02 16:00:00 5 37.200001 0.0\n", + "59 2012-10-02 16:00:00 6 NaN NaN\n", + "60 2012-10-02 16:00:00 7 37.299999 1.0\n", + "61 2012-10-02 16:00:00 8 36.799999 0.0\n", + "62 2012-10-02 16:00:00 9 37.299999 0.0\n", + "\n", + "[63 rows x 4 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_long = df_wide.stack(\"patient\", future_stack=True).reset_index()\n", + "df_long" + ] + }, + { + "cell_type": "markdown", + "id": "23489994-b08f-4aed-a49e-daa0908015b4", + "metadata": {}, + "source": [ + "Resetting the index will create columns out of the multi-level index, so one for the date, a second for the patient ID." + ] + }, + { + "cell_type": "markdown", + "id": "e7030ce2-291e-45fa-a791-05b7eaa8f1b2", + "metadata": {}, + "source": [ + "If you prefer to get rid of the column name, simply set it to `None`." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "ca46e4f1-407a-4e64-ba85-837e0ec4997e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepatienttemperaturedose
02012-10-02 10:00:00138.2999990.0
12012-10-02 10:00:00239.2999990.0
22012-10-02 10:00:00337.9000020.0
32012-10-02 10:00:00438.0999980.0
42012-10-02 10:00:00537.9000020.0
...............
582012-10-02 16:00:00537.2000010.0
592012-10-02 16:00:006NaNNaN
602012-10-02 16:00:00737.2999991.0
612012-10-02 16:00:00836.7999990.0
622012-10-02 16:00:00937.2999990.0
\n", + "

63 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " date patient temperature dose\n", + "0 2012-10-02 10:00:00 1 38.299999 0.0\n", + "1 2012-10-02 10:00:00 2 39.299999 0.0\n", + "2 2012-10-02 10:00:00 3 37.900002 0.0\n", + "3 2012-10-02 10:00:00 4 38.099998 0.0\n", + "4 2012-10-02 10:00:00 5 37.900002 0.0\n", + ".. ... ... ... ...\n", + "58 2012-10-02 16:00:00 5 37.200001 0.0\n", + "59 2012-10-02 16:00:00 6 NaN NaN\n", + "60 2012-10-02 16:00:00 7 37.299999 1.0\n", + "61 2012-10-02 16:00:00 8 36.799999 0.0\n", + "62 2012-10-02 16:00:00 9 37.299999 0.0\n", + "\n", + "[63 rows x 4 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_long.columns.name = None\n", + "df_long" + ] + }, + { + "cell_type": "markdown", + "id": "7447a0bd-fe9e-4976-9c22-0e3c73674def", + "metadata": {}, + "source": [ + "Except for the order of the columns, and the sorting of the rows, you are back to the original data format." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/source-code/pandas/numba_and_pandas.ipynb b/source-code/pandas/numba_and_pandas.ipynb new file mode 100644 index 0000000..6dc82e0 --- /dev/null +++ b/source-code/pandas/numba_and_pandas.ipynb @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4be65a5f-39d9-42f6-a1ec-beee22363ce3", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2c930efe-41a4-4665-a419-7d1ad683cc82", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from numba import njit\n", + 
"import numpy as np\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "id": "8cfbec4d-90d1-4e45-9c9e-310475a395f0", + "metadata": {}, + "source": [ + "## Using numba" + ] + }, + { + "cell_type": "markdown", + "id": "8ec085cd-5fce-420b-8f12-46b9a9e56269", + "metadata": {}, + "source": [ + "Consider the following dataframe with 2 million rows." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "31f2457e-b51a-4e6c-91fb-852c088c6792", + "metadata": {}, + "outputs": [], + "source": [ + "size = 2_000_000\n", + "df = pd.DataFrame({\n", + " 'x': np.random.rand(size),\n", + " 'y': np.random.rand(size),\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "7203ebcd-b1ba-4877-90ef-e1b9665d1029", + "metadata": {}, + "source": [ + "You want to create a series computed as $\\sqrt{x^2 + y^2}$. You can consider three approaches:\n", + "1. pandas' `.apply()` method,\n", + "2. numpy expressions, and\n", + "3. using a numba function." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "20974ffd-621b-4ba5-a168-01f27a307712", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8.93 s, sys: 288 ms, total: 9.22 s\n", + "Wall time: 9.22 s\n" + ] + } + ], + "source": [ + "%time df.apply(lambda row: np.sqrt(row['x']**2 + row['y']**2), axis=1);" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b15fabcb-22d5-487e-bd79-72d1f6924e7f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 34.4 ms, sys: 16.1 ms, total: 50.5 ms\n", + "Wall time: 48.9 ms\n" + ] + } + ], + "source": [ + "%time np.sqrt(df['x']**2 + df['y']**2);" + ] + }, + { + "cell_type": "markdown", + "id": "0f53a473-d8bc-40b3-ba63-7b4db3ec81ca", + "metadata": {}, + "source": [ + "It is clear that using numpy is much more efficient: the speedup is a factor of about 250." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4bddc8bb-5e68-4a7c-9c45-589f797dac12", + "metadata": {}, + "outputs": [], + "source": [ + "@njit\n", + "def score_numba(x, y):\n", + " result = np.empty_like(x)\n", + " for i in range(len(x)):\n", + " result[i] = np.sqrt(x[i]**2 + y[i]**2)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bd7cc279-76d1-4b2d-a300-24142d0fafa2", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 248 ms, sys: 47.8 ms, total: 296 ms\n", + "Wall time: 433 ms\n" + ] + } + ], + "source": [ + "%time score_numba(df.x.values, df.y.values);" + ] + }, + { + "cell_type": "markdown", + "id": "a5b4731e-593d-41ee-8354-54a066992c8c", + "metadata": {}, + "source": [ + "Using numba is about 20 times faster than the pandas `.apply()` method, but almost 10 times slower than numpy, so is there a point?\n", + "\n", + "There is if you run that function multiple times." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8339e439-3d9c-4404-af22-3af502ecf941", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.7 ms, sys: 0 ns, total: 5.7 ms\n", + "Wall time: 5.77 ms\n" + ] + } + ], + "source": [ + "%time score_numba(df.x.values, df.y.values);" + ] + }, + { + "cell_type": "markdown", + "id": "d6900b2d-3dd1-4e90-aee2-a526b8dd058d", + "metadata": {}, + "source": [ + "As you can see, numba is now roughly 1,600 times faster than the equivalent `.apply()` method call. Once the initial compilation has been done, there is little or no overhead on subsequent calls." + ] + }, + { + "cell_type": "markdown", + "id": "7b302dd6-97e9-4294-8b3c-2236fd3c03a2", + "metadata": {}, + "source": [ + "This can be compared to a similar implementation with a non-compiled Python function."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "024672ee-7e66-4e6b-a199-b78c81489735", + "metadata": {}, + "outputs": [], + "source": [ + "def score_python(x, y):\n", + " result = np.empty_like(x)\n", + " for i in range(len(x)):\n", + " result[i] = np.sqrt(x[i]**2 + y[i]**2)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "796d265e-4c6b-4f70-ac08-dfe7d90036db", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.74 s, sys: 0 ns, total: 1.74 s\n", + "Wall time: 1.74 s\n" + ] + } + ], + "source": [ + "%time score_python(df.x.values, df.y.values);" + ] + }, + { + "cell_type": "markdown", + "id": "26476f95-fb22-4751-8ac9-80d90f210c6c", + "metadata": {}, + "source": [ + "Even this approach is faster than pandas' `.apply()`, but still more than 200 times slower than numba (once compiled)." + ] + }, + { + "cell_type": "markdown", + "id": "f613ea5c-eddc-4517-a31b-980ae613d072", + "metadata": {}, + "source": [ + "## Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "15f830eb-3ea7-43b7-a31f-c2b234735df7", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sizes = [50_000, 100_000, 500_000, 1_000_000, 2_000_000]\n", + "times_apply, times_numpy, times_numba = [], [], []\n", + "\n", + "for size in sizes:\n", + " df = pd.DataFrame({\n", + " \"x\": np.random.rand(size),\n", + " \"y\": np.random.rand(size)\n", + " })\n", + " \n", + " start = time.time()\n", + " df.apply(lambda row: np.sqrt(row['x']**2 + row['y']**2), axis=1)\n", + " times_apply.append(time.time() - start)\n", + " \n", + " start = time.time()\n", + " np.sqrt(df['x']**2 + df['y']**2)\n", + " times_numpy.append(time.time() - start)\n", + " \n", + " score_numba(df['x'].values, df['y'].values) # warm-up compile\n", + " start = time.time()\n", + " score_numba(df['x'].values, df['y'].values)\n", + " times_numba.append(time.time() - start)\n", + "\n", + "plt.figure(figsize=(7,5))\n", + "plt.plot(sizes, times_apply, \"o-r\", label=\"Pandas apply()\")\n", + "plt.plot(sizes, times_numpy, \"x-g\", label=\"Numpy\")\n", + "plt.plot(sizes, times_numba, \"s-b\", label=\"Numba accelerated\")\n", + "plt.xlabel(\"Number of Rows\")\n", + "plt.ylabel(\"Runtime (seconds)\")\n", + "plt.yscale('log')\n", + "plt.legend()\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "markdown", + "id": "3e37e637-a4e5-4422-b769-4de985926a46", + "metadata": {}, + "source": [ + "It is clear that numba can significantly speed up computations that rely on functions that are called repeatedly." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}