In [1]:
import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine, text

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Set up plotly's offline notebook rendering; connected=True asks plotly to load
# plotly.js from the CDN instead of embedding it in the notebook.
init_notebook_mode(connected=True)

In [3]:
# SQLAlchemy URL of the PostgreSQL database used throughout this notebook.
# No host or credentials are given, so the driver's local defaults apply;
# change the database name here to point the notebook at another database.
con_str: str = "postgresql:///etrabelsi_thesis"

## Identifying performance bottlenecks in a single query

In [4]:
# Start from a known state: make sure crew_index does not exist, so the first
# measurement below reflects the plan PostgreSQL chooses without the index.
#
# - text() wraps the statement because plain strings passed to
#   Connection.execute() are deprecated in SQLAlchemy 1.4 and rejected in 2.x.
# - Engine.begin() opens a transaction and commits it on exit, so the DDL is
#   persisted explicitly instead of relying on implicit autocommit.
# - The DDL result holds no rows, so it is not bound to a name.
engine = create_engine(con_str)
try:
    with engine.begin() as con:
        con.execute(text("DROP INDEX if exists crew_index"))
finally:
    # Release the connection pool owned by this throw-away engine.
    engine.dispose()

In [5]:
# Single visualizer instance reused by every cell below; it is configured with
# the PostgreSQL parser from query_flow.
query_renderer = QueryVizualizer(
    parser=PostgresParser(),
)

In [6]:
# Query under investigation: title ids from `titles`, inner-joined to `crew`
# (on title_id) and to `people` (on person_id), restricted to rows whose
# `genres` contains "Comedy" and whose `name` is one of three listed people.
# The same text is analysed before and after creating an index on
# crew(person_id).
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [7]:
# Build the flow dataframe for the query against the configured database
# (no crew_index at this point).
flow_df = query_renderer.get_flow_df(query, con_str=con_str)

# Render the flow using the "actual_rows" metric; open_=False is passed
# through unchanged (presumably keeps the figure inline - confirm in
# QueryVizualizer).
query_renderer.vizualize(
    flow_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_rows"],
    open_=False,
)



In [8]:
# Inspect the dataframe behind the diagram (state before crew_index exists).
flow_df

Unnamed: 0,source,target,operation_type,node_hash,shared_hit_blocks,plan_width,shared_dirtied_blocks,temp_written_blocks,shared_written_blocks,shared_read_blocks,...,label,label_metadata,query_hash,estimated_cost,redundent_operation,actual_startup_duration,actual_duration,estimated_cost_pct,actual_duration_pct,actual_plan_rows_ratio
0,0,1,Seq Scan,7b2ff103133a83b22b41108280f7924a25804f91dc2035...,576,10,0,0,0,81450,...,People,\nDescription: Finds relevant records by seque...,92b91691e897f24d13ca0d9ed35a9df27209676f829eb5...,141258.61,False,147.91,679.258,100.0,100.0,689252.2
1,1,3,Where,d8b31b60823399b14a499d111a44012b81e010f4b11a0a...,576,10,0,0,0,81450,...,People*,\nDescription: Filter relation to hold only re...,92b91691e897f24d13ca0d9ed35a9df27209676f829eb5...,0.0,False,-141110.7,0.0,0.0,0.0,1.666667
2,2,3,Seq Scan,b5f8d79f2d02c6cb19e25b0db815b3b3e5b6a1b3134476...,1077,20,0,0,0,394874,...,Crew,\nDescription: Finds relevant records by seque...,92b91691e897f24d13ca0d9ed35a9df27209676f829eb5...,566549.52,False,0.095,2205.973,100.0,100.0,1.249632
3,3,6,Hash Join,0938bcd8c01c6800ab4264b706fb8b5db5e057ff85258c...,1653,10,0,0,0,476324,...,People* ⋈ Crew,\nDescription: Joins to record sets by hashing...,92b91691e897f24d13ca0d9ed35a9df27209676f829eb5...,205233.19,False,-565868.99,3071.006,26.592095,58.19629,26.904762
4,4,5,Seq Scan,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,2174,10,0,0,0,121319,...,Titles,\nDescription: Finds relevant records by seque...,92b91691e897f24d13ca0d9ed35a9df27209676f829eb5...,160668.53,False,0.074,986.66,100.0,100.0,3.85993
5,5,6,Where,8a19d9e25a08eb9387a16ac2acc2b2449c55997644e5ae...,2174,10,0,0,0,121319,...,Titles*,\nDescription: Filter relation to hold only re...,92b91691e897f24d13ca0d9ed35a9df27209676f829eb5...,0.0,False,-160668.456,0.0,0.0,0.0,1.260322
6,6,7,Hash Join,12cdfcecd6fc80cc75de2d14a0fae4eb71cb1925a03c03...,3998,10,0,0,0,597643,...,People* ⋈ Crew ⋈ Titles*,\nDescription: Joins to record sets by hashing...,92b91691e897f24d13ca0d9ed35a9df27209676f829eb5...,162980.28,False,-766473.231,1092.088,17.435466,17.14675,37.2


In [9]:
# Same flow dataframe as above, now rendered with the "actual_duration" metric.
query_renderer.vizualize(
    flow_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [10]:
# Create the index on crew(person_id) whose effect on the query is measured in
# the following cells.
#
# - text() wraps the statement because plain strings passed to
#   Connection.execute() are deprecated in SQLAlchemy 1.4 and rejected in 2.x.
# - Engine.begin() opens a transaction and commits it on exit, so the DDL is
#   persisted explicitly instead of relying on implicit autocommit.
# - The DDL result holds no rows, so it is not bound to a name.
engine = create_engine(con_str)
try:
    with engine.begin() as con:
        con.execute(text("CREATE INDEX if not exists crew_index ON crew(person_id)"))
finally:
    # Release the connection pool owned by this throw-away engine.
    engine.dispose()

In [11]:
# Rebuild the flow dataframe for the same query now that crew_index exists;
# this rebinds flow_df, replacing the pre-index result.
flow_df = query_renderer.get_flow_df(query, con_str=con_str)

# Render with the "actual_duration" metric so it can be compared with the
# pre-index diagram.
query_renderer.vizualize(
    flow_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [12]:
# Inspect the dataframe behind the diagram (state after crew_index was created).
flow_df

Unnamed: 0,source,target,operation_type,node_hash,shared_hit_blocks,plan_width,shared_dirtied_blocks,temp_written_blocks,shared_written_blocks,shared_read_blocks,...,label,label_metadata,query_hash,estimated_cost,redundent_operation,actual_startup_duration,actual_duration,estimated_cost_pct,actual_duration_pct,actual_plan_rows_ratio
0,0,3,Index Scan,3f6c1b7fcc178617b549e05e7f0c03b0804ad6de7c27a6...,41,20,0,0,0,1231,...,Crew_Index,\nDescription: Finds relevant records based on...,d2310644453e6fc097822c87df574cf0277a882421cd18...,1355.27,False,0.258,3.207,100.0,100.0,1.861702
1,1,2,Seq Scan,7b2ff103133a83b22b41108280f7924a25804f91dc2035...,864,10,0,0,0,81162,...,People,\nDescription: Finds relevant records by seque...,d2310644453e6fc097822c87df574cf0277a882421cd18...,141258.61,False,350.537,982.185,100.0,100.0,689252.2
2,2,3,Where,d8b31b60823399b14a499d111a44012b81e010f4b11a0a...,864,10,0,0,0,81162,...,People*,\nDescription: Filter relation to hold only re...,d2310644453e6fc097822c87df574cf0277a882421cd18...,0.0,False,-140908.073,0.0,0.0,0.0,1.666667
3,3,6,Nested Loop,eafbc976e3c7445b3ddd6059680875898b6c82a690317f...,905,10,0,0,0,82393,...,Crew_Index ⋈ People*,\nDescription: Merges two record sets by loopi...,d2310644453e6fc097822c87df574cf0277a882421cd18...,6793.82,False,-140907.838,9.752,4.588793,0.983127,26.904762
4,4,5,Seq Scan,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,3019,10,0,0,0,120474,...,Titles,\nDescription: Finds relevant records by seque...,d2310644453e6fc097822c87df574cf0277a882421cd18...,160668.53,False,0.131,844.0,100.0,100.0,3.85993
5,5,6,Where,8a19d9e25a08eb9387a16ac2acc2b2449c55997644e5ae...,3019,10,0,0,0,120474,...,Titles*,\nDescription: Filter relation to hold only re...,d2310644453e6fc097822c87df574cf0277a882421cd18...,0.0,False,-160668.399,0.0,0.0,0.0,1.260322
6,6,7,Hash Join,12cdfcecd6fc80cc75de2d14a0fae4eb71cb1925a03c03...,4015,10,0,0,0,202867,...,Crew_Index ⋈ People* ⋈ Titles*,\nDescription: Joins to record sets by hashing...,d2310644453e6fc097822c87df574cf0277a882421cd18...,150364.19,False,-159672.616,965.926,48.343528,49.33573,37.2


In [None]:
# Clean up: drop crew_index again so the database is left as it was before the
# index was created and the notebook can be re-run from the top.
#
# - text() wraps the statement because plain strings passed to
#   Connection.execute() are deprecated in SQLAlchemy 1.4 and rejected in 2.x.
# - Engine.begin() opens a transaction and commits it on exit, so the DDL is
#   persisted explicitly instead of relying on implicit autocommit.
# - The DDL result holds no rows, so it is not bound to a name.
engine = create_engine(con_str)
try:
    with engine.begin() as con:
        con.execute(text("DROP INDEX if exists crew_index"))
finally:
    # Release the connection pool owned by this throw-away engine.
    engine.dispose()