In [1]:
import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine, text

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer

In [2]:
# Load IPython's autoreload extension and set it to mode 2, so edits made to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Initialise plotly's offline notebook mode before any figure is drawn.
# NOTE(review): `connected=True` presumably loads plotly.js from the network
# rather than embedding it in the notebook — confirm for the installed plotly.
init_notebook_mode(connected=True)

In [3]:
# SQLAlchemy connection URL used by every cell below. It names only the
# database (`etrabelsi_thesis`); no user, password, host or port is given.
# NOTE(review): this presumably targets a local PostgreSQL server with default
# credentials — adjust the URL when running elsewhere.
con_str = 'postgresql:///etrabelsi_thesis'

## Identifying performance bottlenecks in a single query

In [4]:
# Baseline: make sure `crew_index` is absent so the first run of the query
# shows the plan without the index.
#
# `engine.begin()` opens a transaction that is committed when the block exits
# without error, and the statement is wrapped in `text()`. SQLAlchemy 2.0
# rejects raw SQL strings and no longer autocommits DDL on a plain
# `connect()`, so this form works on both 1.x and 2.x.
# The DDL result carries no rows, so it is not bound to a name.
with create_engine(con_str).begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))

In [5]:
# Visualizer used for every figure in this notebook, configured with the
# PostgreSQL plan parser.
query_renderer = QueryVizualizer(
    parser=PostgresParser(),
)

In [6]:
# Query under investigation: joins titles -> crew -> people and returns the
# title ids of titles whose `genres` contains "Comedy" and whose crew includes
# one of the three named people. The text is kept exactly as written because
# it is passed verbatim to `get_flow_df`.
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [7]:
# Build the flow frame for the query (one row per plan operation, see the
# table displayed in the next cell), then render it using the `actual_rows`
# metric. The figure is the cell's output.
flow_df = query_renderer.get_flow_df(query, con_str=con_str)
query_renderer.vizualize(
    flow_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_rows"],
    open_=False,
)

In [8]:
# Inspect the raw flow frame behind the figure: each row is one plan operation
# linking a `source` node to a `target` node, with its timing/cost columns.
flow_df

Unnamed: 0,source,target,operation_type,shared_dirtied_blocks,shared_written_blocks,actual_startup_time,actual_total_time,local_written_blocks,actual_loops,temp_written_blocks,...,label,label_metadata,query_hash,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
0,0,1,Seq Scan,0,0,242.185,564.576,0,3,0,...,People,,b35a608ede9f78d5acc015306606b2d993f64d303bbdc9...,564.576,141258.61,242.185,False,100.0,100.0,True
1,1,3,Where,0,0,242.185,564.576,0,3,0,...,People*,"Filter condition: (people.name = ANY ('{""Owen ...",b35a608ede9f78d5acc015306606b2d993f64d303bbdc9...,0.0,0.0,-141016.425,False,0.0,0.0,False
2,2,3,Seq Scan,0,0,0.114,1552.245,0,3,0,...,Crew,,b35a608ede9f78d5acc015306606b2d993f64d303bbdc9...,1552.245,566508.77,0.114,False,100.0,100.0,False
3,3,6,Hash Join,0,0,642.49,3758.55,0,3,0,...,People* ⋈ Crew,"Hash Cond ('Inner', '(crew.person_id = people....",b35a608ede9f78d5acc015306606b2d993f64d303bbdc9...,2206.305,205252.03,-565866.28,False,58.700962,26.595291,True
4,4,5,Seq Scan,0,0,0.676,500.972,0,3,0,...,Titles,,b35a608ede9f78d5acc015306606b2d993f64d303bbdc9...,500.972,160656.55,0.676,False,100.0,100.0,False
5,5,6,Where,0,0,0.676,500.972,0,3,0,...,Titles*,Filter condition: (titles.genres ~~ '%Comedy%'...,b35a608ede9f78d5acc015306606b2d993f64d303bbdc9...,0.0,0.0,-160655.874,False,0.0,0.0,False
6,6,7,Hash Join,0,0,3766.063,4327.007,0,3,0,...,People* ⋈ Crew ⋈ Titles*,"Hash Cond ('Inner', '(titles.title_id = crew.t...",b35a608ede9f78d5acc015306606b2d993f64d303bbdc9...,568.457,162967.56,-767994.737,False,13.137418,17.434751,True


In [9]:
# Same flow frame as above, now rendered by `actual_duration` instead of by
# row counts.
query_renderer.vizualize(
    flow_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [10]:
# Add an index on crew(person_id), the column used to join `crew` to `people`,
# so the query can be re-planned with it available.
#
# `engine.begin()` opens a transaction that is committed when the block exits
# without error, and the statement is wrapped in `text()`. SQLAlchemy 2.0
# rejects raw SQL strings and no longer autocommits DDL on a plain
# `connect()`, so this form works on both 1.x and 2.x.
# The DDL result carries no rows, so it is not bound to a name.
with create_engine(con_str).begin() as con:
    con.execute(text("CREATE INDEX if not exists crew_index ON crew(person_id)"))

In [11]:
# Rebuild the flow frame for the same query and render it by
# `actual_duration` again. `flow_df` is rebound here, so from this point on it
# refers to the new frame rather than the one displayed earlier.
flow_df = query_renderer.get_flow_df(query, con_str=con_str)
query_renderer.vizualize(
    flow_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)


divide by zero encountered in long_scalars



In [12]:
# Inspect the rebuilt flow frame: each row is one plan operation linking a
# `source` node to a `target` node, with its timing/cost columns.
flow_df

Unnamed: 0,source,target,operation_type,shared_dirtied_blocks,shared_written_blocks,actual_startup_time,actual_total_time,local_written_blocks,actual_loops,temp_written_blocks,...,label,label_metadata,query_hash,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
0,0,5,Index Only Scan,0,0,0.006,0.006,0,1695,0,...,Titles_Index,,9be4186f5b750affa50d9f2f649a8bf09276c492fb1259...,0.006,1.57,0.006,False,100.0,100.0,True
1,1,4,Index Scan,0,0,0.601,2.162,0,9,0,...,Crew_Index,,9be4186f5b750affa50d9f2f649a8bf09276c492fb1259...,2.162,1246.82,0.601,False,100.0,100.0,False
2,2,3,Seq Scan,0,0,14.049,647.119,0,3,0,...,People,,9be4186f5b750affa50d9f2f649a8bf09276c492fb1259...,647.119,141258.61,14.049,False,100.0,100.0,True
3,3,4,Where,0,0,14.049,647.119,0,3,0,...,People*,"Filter condition: (people.name = ANY ('{""Owen ...",9be4186f5b750affa50d9f2f649a8bf09276c492fb1259...,0.0,0.0,-141244.561,False,0.0,0.0,False
4,4,5,Nested Loop,0,0,14.886,653.747,0,3,0,...,Crew_Index ⋈ People*,,9be4186f5b750affa50d9f2f649a8bf09276c492fb1259...,6.628,6250.77,-141243.724,False,1.013848,4.237541,True
5,5,6,Nested Loop,0,0,14.956,657.504,0,3,0,...,Titles_Index ⋈ Crew_Index ⋈ People*,,9be4186f5b750affa50d9f2f649a8bf09276c492fb1259...,3.757,33.25,-147494.424,False,0.571403,0.022536,True


In [13]:
# Cleanup: remove `crew_index` again so the database is left without the index
# this notebook created.
#
# `engine.begin()` opens a transaction that is committed when the block exits
# without error, and the statement is wrapped in `text()`. SQLAlchemy 2.0
# rejects raw SQL strings and no longer autocommits DDL on a plain
# `connect()`, so this form works on both 1.x and 2.x.
# The DDL result carries no rows, so it is not bound to a name.
with create_engine(con_str).begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))