In [19]:
import os

import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine, text

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer

In [20]:
%load_ext autoreload
%autoreload 2

init_notebook_mode(connected=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Database URL used by every cell below. Set the QUERY_FLOW_CON_STR environment
# variable to point the notebook at another database; when it is unset the
# original hard-coded URL is used.
con_str = os.environ.get("QUERY_FLOW_CON_STR", 'postgresql:///etrabelsi_thesis')

## Identifying performance bottlenecks in a single query

In [22]:
# Make sure `crew_index` does not exist before the first measurement, so the
# cells that follow run against the un-indexed `crew` table.
engine = create_engine(con_str)
try:
    # `begin()` opens a transaction and commits it on exit, so the DDL is
    # persisted without relying on implicit autocommit. `text()` wraps the raw
    # SQL, which SQLAlchemy 2.0 no longer accepts as a plain string.
    with engine.begin() as con:
        con.execute(text("DROP INDEX if exists crew_index"))
finally:
    # The engine is only needed for this one statement; close its pool.
    engine.dispose()

In [23]:
# Visualizer used by all plotting cells below, configured with a PostgresParser.
query_renderer = QueryVizualizer(
    parser=PostgresParser(),
)

In [24]:
# Query under study: ids of titles whose `genres` contains "Comedy" and that
# have a crew entry for one of three named people. It joins titles -> crew ->
# people; `genres` and `name` are unqualified (presumably titles.genres and
# people.name -- confirm against the schema).
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [25]:
# Build the per-operation statistics frame for `query`, then chart it using the
# "actual_rows" metric. The vizualize call stays last so its return value (if
# any) is rendered as the cell output.
cardinality_df = query_renderer.get_cardinality_df(query, con_str=con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_rows"],
    open_=False,
)


divide by zero encountered in long_scalars



In [26]:
# Display the statistics frame computed above (one row per plan operation).
cardinality_df

Unnamed: 0,source,target,operation_type,local_hit_blocks,actual_total_time,plan_rows,temp_read_blocks,local_read_blocks,plan_width,shared_hit_blocks,...,actual_loops,label,label_metadata,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
0,0,5,Index Only Scan,0,0.008,1,0,0,10,5645,...,1695,Titles_Index,,0.008,1.57,0.008,False,100.0,100.0,True
1,1,2,Seq Scan,0,577.74,5,0,0,10,768,...,3,People,,577.74,141258.61,96.556,False,100.0,100.0,True
2,2,4,Where,0,577.74,5,0,0,10,768,...,3,People*,"Filter condition: (people.name = ANY ('{""Owen ...",0.0,0.0,-141162.054,False,0.0,0.0,False
3,3,4,Seq Scan,0,1538.434,17064877,0,0,20,1916,...,3,Crew,,1538.434,566508.77,0.061,False,100.0,100.0,False
4,4,5,Hash Join,0,3760.301,21,0,0,10,2775,...,3,People* ⋈ Crew,"Hash Cond ('Inner', '(crew.person_id = people....",2221.867,205252.03,-565921.403,False,59.087477,26.595291,True
5,5,6,Nested Loop,0,3765.805,5,0,0,10,8420,...,3,Titles_Index ⋈ People* ⋈ Crew,,5.504,33.25,-771173.368,False,0.146157,0.004308,True


In [27]:
# Same frame as before, charted by the "actual_duration" metric instead of
# row counts.
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [None]:
# Add an index on crew(person_id) so the query can be measured again with the
# index in place.
engine = create_engine(con_str)
try:
    # `begin()` opens a transaction and commits it on exit, so the DDL is
    # persisted without relying on implicit autocommit. `text()` wraps the raw
    # SQL, which SQLAlchemy 2.0 no longer accepts as a plain string.
    with engine.begin() as con:
        con.execute(text("CREATE INDEX if not exists crew_index ON crew(person_id)"))
finally:
    # The engine is only needed for this one statement; close its pool.
    engine.dispose()

In [None]:
# Recompute the statistics frame for the same query (this overwrites the
# earlier `cardinality_df`) and chart it by "actual_duration".
cardinality_df = query_renderer.get_cardinality_df(query, con_str=con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [None]:
# Display the most recently computed statistics frame.
cardinality_df

In [None]:
# Clean up: remove `crew_index` again so the database is left without it.
engine = create_engine(con_str)
try:
    # `begin()` opens a transaction and commits it on exit, so the DDL is
    # persisted without relying on implicit autocommit. `text()` wraps the raw
    # SQL, which SQLAlchemy 2.0 no longer accepts as a plain string.
    with engine.begin() as con:
        con.execute(text("DROP INDEX if exists crew_index"))
finally:
    # The engine is only needed for this one statement; close its pool.
    engine.dispose()