Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
367 changes: 367 additions & 0 deletions demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,367 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BlazingSQL + Graphistry: Netflow analysis\n",
"\n",
"This tutorial shows running BlazingSQL (GPU-accelerated SQL) on raw parquet files and visually analyzing the result with Graphistry"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_0_0.parquet\n",
"!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_1_0.parquet \n",
"!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_2_0.parquet\n",
"!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_3_0.parquet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 434M\r\n",
"drwxr-xr-x 2 graphistry graphistry 4.0K Nov 19 23:26 .\r\n",
"drwxr-xr-x 1 graphistry root 4.0K Nov 19 23:37 ..\r\n",
"-rw-r--r-- 1 graphistry graphistry 108M Jul 30 22:02 1_0_0.parquet\r\n",
"-rw-r--r-- 1 graphistry graphistry 135M Jul 30 22:01 1_1_0.parquet\r\n",
"-rw-r--r-- 1 graphistry graphistry 85M Jul 30 22:01 1_2_0.parquet\r\n",
"-rw-r--r-- 1 graphistry graphistry 108M Jul 30 22:01 1_3_0.parquet\r\n"
]
}
],
"source": [
"!ls -alh data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data into table"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BlazingContext ready\n"
]
}
],
"source": [
"from blazingsql import BlazingContext \n",
"\n",
"bc = BlazingContext()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/graphistry']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"local_path = !pwd\n",
"local_path"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pyblazing.apiv2.sql.Table at 0x7fd13cfa7cc0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bc.create_table('netflow', local_path[0] + '/data/*_0.parquet')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute IP<>IP flow summaries"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 311 ms, sys: 257 ms, total: 568 ms\n",
"Wall time: 4.75 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>source</th>\n",
" <th>destination</th>\n",
" <th>targetPorts</th>\n",
" <th>bytesOut</th>\n",
" <th>bytesIn</th>\n",
" <th>durationSeconds</th>\n",
" <th>firstFlowDate</th>\n",
" <th>lastFlowDate</th>\n",
" <th>attemptCount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10.0.0.13</td>\n",
" <td>46932</td>\n",
" <td>8</td>\n",
" <td>5064</td>\n",
" <td>41</td>\n",
" <td>1360</td>\n",
" <td>2013-04-01 08:57:31</td>\n",
" <td>2013-04-06 08:29:30</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10.0.0.12</td>\n",
" <td>2605</td>\n",
" <td>20</td>\n",
" <td>12660</td>\n",
" <td>100</td>\n",
" <td>3520</td>\n",
" <td>2013-04-01 09:29:15</td>\n",
" <td>2013-04-07 06:51:00</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>10.0.0.7</td>\n",
" <td>29791</td>\n",
" <td>10</td>\n",
" <td>6320</td>\n",
" <td>50</td>\n",
" <td>1630</td>\n",
" <td>2013-04-01 10:54:31</td>\n",
" <td>2013-04-06 07:03:52</td>\n",
" <td>10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" source destination targetPorts bytesOut bytesIn durationSeconds \\\n",
"0 10.0.0.13 46932 8 5064 41 1360 \n",
"1 10.0.0.12 2605 20 12660 100 3520 \n",
"2 10.0.0.7 29791 10 6320 50 1630 \n",
"\n",
" firstFlowDate lastFlowDate attemptCount \n",
"0 2013-04-01 08:57:31 2013-04-06 08:29:30 8 \n",
"1 2013-04-01 09:29:15 2013-04-07 06:51:00 20 \n",
"2 2013-04-01 10:54:31 2013-04-06 07:03:52 10 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"result = bc.sql('''\n",
" SELECT\n",
" a.firstSeenSrcIp as source,\n",
" a.firstSeenDestIp as destination,\n",
" count(a.firstSeenDestPort) as targetPorts,\n",
" SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
" SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
" SUM(a.durationSeconds) as durationSeconds,\n",
" MIN(parsedDate) as firstFlowDate,\n",
" MAX(parsedDate) as lastFlowDate,\n",
" COUNT(*) as attemptCount\n",
" FROM\n",
" netflow a\n",
" GROUP BY\n",
" a.firstSeenSrcIp,\n",
" a.firstSeenDestIp\n",
" ''').get()\n",
"\n",
"gdf = result.columns\n",
"\n",
"gdf.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualize network"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import graphistry\n",
"\n",
"#upload protobuf instead of json\n",
"graphistry.register(api=2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1303786"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(gdf.to_pandas())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Large graph: |nodes| + |edges| = 1369523. Layout/rendering might be slow.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Uploading 19425 kB. This may take a while...\n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe id=\"f58748f2-17c5-45bc-b5d6-d1d352639007\" src=\"/graph/graph.html?dataset=d615fd18441c53f2bfbf099c8b86d5c5&type=jsonMeta&viztoken=d6ee78fd457c4aee8e06c02a3c3c3f59&usertag=1849d922-pygraphistry-0.9.69&splashAfter=1574207267&info=true\"\n",
" allowfullscreen=\"true\" webkitallowfullscreen=\"true\" mozallowfullscreen=\"true\"\n",
" oallowfullscreen=\"true\" msallowfullscreen=\"true\"\n",
" style=\"width:100%; height:500px; border: 1px solid #DDD\">\n",
" </iframe>\n",
" \n",
" <script>\n",
" $(\"#f58748f2-17c5-45bc-b5d6-d1d352639007\").bind('mousewheel', function(e) {\n",
" e.preventDefault();\n",
" });\n",
" </script>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphistry.bind(source='source', destination='destination').plot(gdf.to_pandas())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7 (RAPIDS)",
"language": "python",
"name": "rapids"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}