diff --git a/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb b/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb new file mode 100644 index 0000000000..abfca6f256 --- /dev/null +++ b/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BlazingSQL + Graphistry: Netflow analysis\n", + "\n", + "This tutorial shows running BlazingSQL (GPU-accelerated SQL) on raw parquet files and visually analyzing the result with Graphistry" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_0_0.parquet\n", + "!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_1_0.parquet \n", + "!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_2_0.parquet\n", + "!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_3_0.parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 434M\r\n", + "drwxr-xr-x 2 graphistry graphistry 4.0K Nov 19 23:26 .\r\n", + "drwxr-xr-x 1 graphistry root 4.0K Nov 19 23:37 ..\r\n", + "-rw-r--r-- 1 graphistry graphistry 108M Jul 30 22:02 1_0_0.parquet\r\n", + "-rw-r--r-- 1 graphistry graphistry 135M Jul 30 22:01 1_1_0.parquet\r\n", + "-rw-r--r-- 1 graphistry graphistry 85M Jul 30 22:01 1_2_0.parquet\r\n", + "-rw-r--r-- 1 graphistry graphistry 108M Jul 30 22:01 1_3_0.parquet\r\n" + ] + } + ], + "source": [ + "!ls -alh data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data into table" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BlazingContext ready\n" + ] + } + ], + "source": [ + "from blazingsql import BlazingContext \n", + "\n", + "bc = BlazingContext()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/home/graphistry']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "local_path = !pwd\n", + "local_path" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bc.create_table('netflow', local_path[0] + '/data/*_0.parquet')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute IP<>IP flow summaries" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 311 ms, sys: 257 ms, total: 568 ms\n", + "Wall time: 4.75 s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sourcedestinationtargetPortsbytesOutbytesIndurationSecondsfirstFlowDatelastFlowDateattemptCount
010.0.0.1346932850644113602013-04-01 08:57:312013-04-06 08:29:308
110.0.0.122605201266010035202013-04-01 09:29:152013-04-07 06:51:0020
210.0.0.7297911063205016302013-04-01 10:54:312013-04-06 07:03:5210
\n", + "
" + ], + "text/plain": [ + " source destination targetPorts bytesOut bytesIn durationSeconds \\\n", + "0 10.0.0.13 46932 8 5064 41 1360 \n", + "1 10.0.0.12 2605 20 12660 100 3520 \n", + "2 10.0.0.7 29791 10 6320 50 1630 \n", + "\n", + " firstFlowDate lastFlowDate attemptCount \n", + "0 2013-04-01 08:57:31 2013-04-06 08:29:30 8 \n", + "1 2013-04-01 09:29:15 2013-04-07 06:51:00 20 \n", + "2 2013-04-01 10:54:31 2013-04-06 07:03:52 10 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "result = bc.sql('''\n", + " SELECT\n", + " a.firstSeenSrcIp as source,\n", + " a.firstSeenDestIp as destination,\n", + " count(a.firstSeenDestPort) as targetPorts,\n", + " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n", + " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n", + " SUM(a.durationSeconds) as durationSeconds,\n", + " MIN(parsedDate) as firstFlowDate,\n", + " MAX(parsedDate) as lastFlowDate,\n", + " COUNT(*) as attemptCount\n", + " FROM\n", + " netflow a\n", + " GROUP BY\n", + " a.firstSeenSrcIp,\n", + " a.firstSeenDestIp\n", + " ''').get()\n", + "\n", + "gdf = result.columns\n", + "\n", + "gdf.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize network" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import graphistry\n", + "\n", + "#upload protobuf instead of json\n", + "graphistry.register(api=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1303786" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(gdf.to_pandas())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Large graph: |nodes| + |edges| = 1369523. Layout/rendering might be slow.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uploading 19425 kB. This may take a while...\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graphistry.bind(source='source', destination='destination').plot(gdf.to_pandas())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7 (RAPIDS)", + "language": "python", + "name": "rapids" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}