In [1]:
import os

import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine, text

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer

In [2]:
# Load IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell runs (handy while editing query_flow itself).
%load_ext autoreload
%autoreload 2

# Enable plotly's offline notebook rendering. NOTE(review): connected=True is
# understood to load plotly.js from the CDN instead of embedding it - confirm
# against the plotly docs if the notebook must render without network access.
init_notebook_mode(connected=True)

In [3]:
# SQLAlchemy connection URL for the database used throughout this notebook.
# Set the QUERY_FLOW_CON_STR environment variable to point the notebook at
# another database; when it is unset the original local database is used.
con_str = os.environ.get("QUERY_FLOW_CON_STR", 'postgresql:///etrabelsi_thesis')

## Identifying performance bottlenecks in a single query

In [4]:
# Start from a known state: make sure crew_index does not exist before the
# first measurement.
# engine.begin() opens a transaction that is committed when the block exits
# without error, and text() wraps the raw SQL string; a bare string passed to
# Connection.execute() is rejected by SQLAlchemy 2.x.
with create_engine(con_str).begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))

In [5]:
# Visualizer used by every plotting cell below, configured with a
# PostgresParser as its parser.
query_renderer = QueryVizualizer(
    parser=PostgresParser(),
)

In [6]:
# Query under study: ids of titles whose genres contain 'Comedy' and whose
# crew includes one of three named people (titles joined to crew joined to
# people).
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [7]:
# Collect the metrics DataFrame for the query against the configured database,
# then visualize it using the "actual_rows" metric. The vizualize() call stays
# the last expression so its result is rendered as the cell output.
cardinality_df = query_renderer.get_cardinality_df(query, con_str=con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_rows"],
    open_=False,
)

In [8]:
# Display the metrics table itself so the numbers behind the plot can be read.
cardinality_df

Unnamed: 0,source,target,operation_type,actual_rows,actual_total_time,plan_rows,plan_width,total_cost,actual_startup_time,actual_loops,...,temp_read_blocks,label,label_metadata,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
6,0,1,Seq Scan,3446261,506.91,5,10,141258.61,38.819,3,...,0,People,,506.91,141258.61,38.819,False,100.0,100.0,True
5,1,3,Where,3,506.91,5,10,141258.61,38.819,3,...,0,People*,"Filter condition: (people.name = ANY ('{""Owen ...",0.0,0.0,-141219.791,False,0.0,0.0,False
4,2,3,Seq Scan,13651901,1461.272,17064877,20,566508.77,0.053,3,...,0,Crew,,1461.272,566508.77,0.053,False,100.0,100.0,False
3,3,6,Hash Join,565,3544.129,21,10,771760.8,532.453,3,...,0,People* ⋈ Crew,"Hash Cond ('Inner', '(crew.person_id = people....",2082.857,205252.03,-565976.317,False,58.769221,26.595291,True
2,4,5,Seq Scan,2379234,470.227,616393,10,160668.53,0.058,3,...,0,Titles,,470.227,160668.53,0.058,False,100.0,100.0,False
1,5,6,Where,489076,470.227,616393,10,160668.53,0.058,3,...,0,Titles*,Filter condition: (titles.genres ~~ '%Comedy%'...,0.0,0.0,-160668.472,False,0.0,0.0,False
0,6,7,Hash Join,186,4079.429,5,10,934741.09,3549.232,3,...,0,People* ⋈ Crew ⋈ Titles*,"Hash Cond ('Inner', '(titles.title_id = crew.t...",535.3,162980.29,-768211.568,False,13.121934,17.435875,True


In [9]:
# Same metrics DataFrame as before, now visualized by the "actual_duration"
# metric instead of row counts.
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [None]:
# Add an index on crew(person_id) before re-measuring the same query.
# engine.begin() opens a transaction that is committed when the block exits
# without error, and text() wraps the raw SQL string; a bare string passed to
# Connection.execute() is rejected by SQLAlchemy 2.x.
with create_engine(con_str).begin() as con:
    con.execute(text("CREATE INDEX if not exists crew_index ON crew(person_id)"))

In [None]:
# Re-collect the metrics for the same query (this rebinds cardinality_df) and
# visualize the "actual_duration" metric again for comparison with the earlier
# run.
cardinality_df = query_renderer.get_cardinality_df(query, con_str=con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [None]:
# Display the freshly collected metrics table for the second run.
cardinality_df

In [None]:
# Clean up: remove crew_index again so the database is left as it was found
# and a re-run of the notebook starts from the same state.
# engine.begin() opens a transaction that is committed when the block exits
# without error, and text() wraps the raw SQL string; a bare string passed to
# Connection.execute() is rejected by SQLAlchemy 2.x.
with create_engine(con_str).begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))