In [15]:
import os

import pandas as pd
from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine, text

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer

In [16]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. query_flow) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Initialise plotly's offline notebook mode so figures can render inline.
# NOTE(review): per plotly docs, connected=True loads plotly.js from the CDN
# instead of embedding it in the notebook -- confirm this is intended offline.
init_notebook_mode(connected=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Database URL passed to create_engine() and to the query renderer below.
# Export QUERY_FLOW_CON_STR before starting the kernel to point the notebook
# at another database; when unset, the original local database is used.
con_str = os.environ.get('QUERY_FLOW_CON_STR', 'postgresql:///etrabelsi_thesis')

## Identifying performance bottlenecks in a single query

In [18]:
# Start from a known state: remove crew_index if an earlier run left it behind.
# engine.begin() commits the DDL when the block exits, and text() is required
# because SQLAlchemy 2.0 no longer accepts plain SQL strings in execute().
with create_engine(con_str).begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))

In [19]:
# Visualizer used by every cell below, configured with the Postgres parser.
query_renderer = QueryVizualizer(
    parser=PostgresParser(),
)

In [20]:
# Query under study: title ids of titles whose genres contain 'Comedy' and
# that have a crew entry for one of three named people.
# Joins titles -> crew (on title_id) -> people (on person_id).
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [21]:
# Collect the per-operation metrics for `query`, then visualize them using the
# actual_rows metric. NOTE(review): the actual_* columns suggest the query is
# really executed (EXPLAIN ANALYZE style) -- confirm in get_cardinality_df.
cardinality_df = query_renderer.get_cardinality_df(query, con_str=con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_rows"],
    open_=False,
)

In [22]:
# Display the metrics frame behind the visualization (one row per plan operation).
cardinality_df

Unnamed: 0,source,target,operation_type,total_cost,actual_total_time,plan_rows,actual_startup_time,shared_hit_blocks,local_read_blocks,plan_width,...,shared_read_blocks,label,label_metadata,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
0,0,1,Seq Scan,141258.61,962.687,5,202.031,3743,0,10,...,78283,People,,962.687,141258.61,202.031,False,100.0,100.0,True
1,1,3,Where,141258.61,962.687,5,202.031,3743,0,10,...,78283,People*,"Filter condition: (people.name = ANY ('{""Owen ...",0.0,0.0,-141056.579,False,0.0,0.0,False
2,2,3,Seq Scan,566508.77,3085.546,17064877,0.072,4290,0,20,...,391570,Crew,,3085.546,566508.77,0.072,False,100.0,100.0,False
3,3,6,Hash Join,771760.8,7249.204,21,1078.668,8033,0,10,...,469853,People* ⋈ Crew,"Hash Cond ('Inner', '(crew.person_id = people....",4163.658,205252.03,-565430.102,False,57.436072,26.595291,True
4,4,5,Seq Scan,160668.53,1053.739,616393,0.021,7922,0,10,...,115571,Titles,,1053.739,160668.53,0.021,False,100.0,100.0,False
5,5,6,Where,160668.53,1053.739,616393,0.021,7922,0,10,...,115571,Titles*,Filter condition: (titles.genres ~~ '%Comedy%'...,0.0,0.0,-160668.509,False,0.0,0.0,False
6,6,7,Hash Join,934741.09,8420.476,5,7271.636,16126,0,10,...,585424,People* ⋈ Crew ⋈ Titles*,"Hash Cond ('Inner', '(titles.title_id = crew.t...",1171.272,162980.29,-764489.164,False,13.909807,17.435875,True


In [23]:
# Same metrics frame, now visualized by the actual_duration metric.
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [24]:
# Add an index on crew(person_id) so the next run can show whether the planner
# stops scanning crew sequentially.
# engine.begin() commits the DDL when the block exits, and text() is required
# because SQLAlchemy 2.0 no longer accepts plain SQL strings in execute().
with create_engine(con_str).begin() as con:
    con.execute(text("CREATE INDEX if not exists crew_index ON crew(person_id)"))

In [25]:
# Re-collect the metrics for the same query (overwriting cardinality_df) and
# visualize them by the actual_duration metric.
cardinality_df = query_renderer.get_cardinality_df(query, con_str=con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [26]:
# Display the re-collected metrics frame (one row per plan operation).
cardinality_df

Unnamed: 0,source,target,operation_type,total_cost,actual_total_time,plan_rows,actual_startup_time,shared_hit_blocks,local_read_blocks,plan_width,...,shared_read_blocks,label,label_metadata,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
0,0,3,Index Scan,1246.82,3.441,333,0.551,130,0,20,...,1127,Crew_Index,,3.441,1246.82,0.551,False,100.0,100.0,False
1,1,2,Seq Scan,141258.61,1098.464,5,78.553,2989,0,10,...,79037,People,,1098.464,141258.61,78.553,False,100.0,100.0,True
2,2,3,Where,141258.61,1098.464,5,78.553,2989,0,10,...,79037,People*,"Filter condition: (people.name = ANY ('{""Owen ...",0.0,0.0,-141180.057,False,0.0,0.0,False
3,3,6,Nested Loop,147509.38,1108.926,21,134.493,3119,0,10,...,80164,Crew_Index ⋈ People*,,10.462,6250.77,-141124.117,False,0.943435,4.237541,True
4,4,5,Seq Scan,160668.53,1050.58,616393,0.826,8267,0,10,...,115226,Titles,,1050.58,160668.53,0.826,False,100.0,100.0,False
5,5,6,Where,160668.53,1050.58,616393,0.826,8267,0,10,...,115226,Titles*,Filter condition: (titles.genres ~~ '%Comedy%'...,0.0,0.0,-160667.704,False,0.0,0.0,False
6,6,7,Hash Join,310489.66,2312.044,5,1115.795,11477,0,10,...,195390,Crew_Index ⋈ People* ⋈ Titles*,"Hash Cond ('Inner', '(titles.title_id = crew.t...",1203.118,149821.13,-159552.735,False,52.036985,48.253179,True


In [27]:
# Clean up: drop crew_index again so the database is left without it.
# engine.begin() commits the DDL when the block exits, and text() is required
# because SQLAlchemy 2.0 no longer accepts plain SQL strings in execute().
with create_engine(con_str).begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))