In [1]:
import os

import pandas as pd
from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine
from sqlalchemy import text

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer

In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Prepare plotly's offline mode for rendering figures inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook file — confirm against the plotly documentation.
init_notebook_mode(connected=True)

In [3]:
# SQLAlchemy URL of the PostgreSQL database queried throughout this notebook
# (it must contain the titles, crew and people tables used by the query below).
# Set the QUERY_FLOW_CON_STR environment variable to point the notebook at a
# different database; when it is unset, the original local database is used.
# The default URL carries no host or credentials.
con_str = os.environ.get("QUERY_FLOW_CON_STR", 'postgresql:///etrabelsi_thesis')

## Identifying performance bottlenecks in a single query

In [4]:
# Reset to the un-indexed baseline so the first measurements run without crew_index.
# The statement is wrapped in text() and executed inside engine.begin() so the cell
# works on SQLAlchemy 1.4 and 2.x alike: 2.x no longer accepts plain SQL strings,
# and begin() commits the DDL when the block exits instead of relying on autocommit.
engine = create_engine(con_str)
with engine.begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))
engine.dispose()  # the engine is not reused, so release its pooled connection

In [5]:
# Visualizer used for every diagram in this notebook, configured with a PostgresParser.
postgres_parser = PostgresParser()
query_renderer = QueryVizualizer(parser=postgres_parser)

In [6]:
# Query under investigation: joins titles -> crew -> people and returns the title ids
# of rows whose genres contain "Comedy" and whose name is one of three listed actors.
# The name filter is applied to the people table; genres is unqualified in the SQL
# (presumably a titles column — verify against the schema).
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [7]:
# Gather the per-operation metrics for the query, then render the diagram using
# the actual_rows metric (open_=False is passed through to the visualizer).
cardinality_df = query_renderer.get_cardinality_df(
    query,
    con_str=con_str,
)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_rows"],
    open_=False,
)


divide by zero encountered in long_scalars



In [8]:
# Display the raw per-operation table behind the diagram: one row per plan
# operation (source -> target) with its row counts, block counters and durations.
cardinality_df

Unnamed: 0,source,target,operation_type,shared_hit_blocks,local_hit_blocks,local_dirtied_blocks,actual_rows,local_written_blocks,shared_dirtied_blocks,shared_read_blocks,...,temp_written_blocks,label,label_metadata,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
0,0,5,Index Only Scan,5644,0,0,0,0,0,0,...,0,Titles_Index,,0.044,1.57,0.044,False,100.0,100.0,True
1,1,2,Seq Scan,36718,0,0,5169390,0,0,45308,...,0,People,,8936.956,141258.61,679.943,False,100.0,100.0,True
2,2,4,Where,36718,0,0,4,0,0,45308,...,0,People*,"Filter condition: (people.name = ANY ('{""Owen ...",0.0,0.0,-140578.667,False,0.0,0.0,False
3,3,4,Seq Scan,395322,0,0,20477852,0,0,538,...,0,Crew,,38009.881,566508.77,0.021,False,100.0,100.0,False
4,4,5,Hash Join,432087,0,0,848,0,0,45846,...,0,People* ⋈ Crew,"Hash Cond ('Inner', '(crew.person_id = people....",16612.165,205252.03,-557464.568,False,30.412931,26.595291,True
5,5,6,Nested Loop,437731,0,0,279,0,0,45846,...,0,Titles_Index ⋈ People* ⋈ Crew,,39.835,33.25,-762716.489,False,0.072875,0.004308,True


In [9]:
# Same metrics table as before, now rendered with the actual_duration metric.
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [None]:
# Add an index on crew(person_id) before measuring the same query again.
# The statement is wrapped in text() and executed inside engine.begin() so the cell
# works on SQLAlchemy 1.4 and 2.x alike: 2.x no longer accepts plain SQL strings,
# and begin() commits the DDL when the block exits instead of relying on autocommit.
engine = create_engine(con_str)
with engine.begin() as con:
    con.execute(text("CREATE INDEX if not exists crew_index ON crew(person_id)"))
engine.dispose()  # the engine is not reused, so release its pooled connection

In [None]:
# Re-collect the metrics now that crew_index exists and render the
# actual_duration diagram again for comparison with the earlier run.
cardinality_df = query_renderer.get_cardinality_df(
    query,
    con_str=con_str,
)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying performance bottlenecks in a single query",
    metrics=["actual_duration"],
    open_=False,
)

In [None]:
# Display the metrics table collected after crew_index was created, for
# comparison with the table shown for the un-indexed run.
cardinality_df

In [None]:
# Clean up: drop crew_index again so the database is left as it was before the experiment.
# The statement is wrapped in text() and executed inside engine.begin() so the cell
# works on SQLAlchemy 1.4 and 2.x alike: 2.x no longer accepts plain SQL strings,
# and begin() commits the DDL when the block exits instead of relying on autocommit.
engine = create_engine(con_str)
with engine.begin() as con:
    con.execute(text("DROP INDEX if exists crew_index"))
engine.dispose()  # the engine is not reused, so release its pooled connection