In [1]:
import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer
from plotly.offline import plot, iplot


In [2]:
%load_ext autoreload
%autoreload 2

init_notebook_mode(connected=True)

In [3]:
con_str = 'postgresql:///etrabelsi_thesis'

## Identifying performance bottlenecks in a multiple queries

In [4]:
query_renderer = QueryVizualizer(parser=PostgresParser())

In [5]:
36451.822 - 27376.277

9075.545000000002

In [6]:
#          0         1          2      3         4              5                  6              7                           8
label = ["Title" , "Title*", "Crew", "People", "People*", "Title* U Title*", "Crew ⋈ People*", "Crew ⋈ People* ⋈ Titles*", "Finish"]
source = [0      , 0       , 0       , 1      , 1       , 2         , 3    , 1               , 6                    , 4               , 7    , 5]
target = [1      , 1       , 1       , 5      , 5       , 6         , 4    , 7               , 7                    , 6               , 8    , 8]
value =  [975.287, 677.225 , 8071.176, 975.287, 677.225 , 6108.962  , 6108 , 8071.176        , 36451.822 - 27376.277, 6108            , 44683- 36451, 1885-975.287]

color =  ["#B0C4DE", "#B0C4DE" , "#FFE4E1"  , "#B0C4DE", "#B0C4DE" , "#FFE4E1"    ,"#FFE4E1", "#FFE4E1"          , "#FFE4E1"               , "#FFE4E1"          , "#FFE4E1"      , "#B0C4DE"]
color_node = ["black"] * len(color)
layout = dict(font=dict(size=10), height=750)
data_trace = dict(type="sankey",
                  orientation="h",
                  valueformat=",",
                  node=dict(pad=200, label=label, color=color_node),
                  link=dict(source=source,target=target, value=value, color=color))

iplot(dict(data=[data_trace], layout=layout),
      validate=False)

In [7]:

# query_renderer.vizualize(
#         pd.read_csv("/Users/etrabelsi/IdeaProjects/thesis/crap.csv"), title='Missing Records in Where Clause', metrics=['actual_rows'],
#         open_=False,
#     )

In [8]:
with create_engine(con_str).connect() as con:
    execution_plan = con.execute("CREATE INDEX if not exists titles_index ON titles(title_id,genres) WHERE genres like '%%Comedy%%'")

In [9]:
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""
query1 = query_renderer.get_cardinality_df(query, con_str=con_str)

query = """
SELECT titles.title_id
FROM titles
WHERE genres = 'Comedy' 
UNION 
SELECT titles.title_id
FROM titles
WHERE genres = 'Action' 
"""
query2 = query_renderer.get_cardinality_df(query, con_str=con_str)

cardinality_df = pd.concat([query1, query2])
cardinality_df


divide by zero encountered in long_scalars



Unnamed: 0,source,target,operation_type,actual_rows,plan_width,actual_loops,actual_total_time,shared_dirtied_blocks,shared_written_blocks,local_hit_blocks,...,shared_read_blocks,label,label_metadata,actual_duration,estimated_cost,actual_startup_duration,redundent_operation,actual_duration_pct,estimated_cost_pct,actual_plan_rows_ratio
0,0,5,Index Only Scan,0,10,1695,0.027,0,0,0,...,0,Titles_Index,,0.027,1.57,0.026,False,100.0,100.0,True
1,1,2,Seq Scan,3446261,10,3,8091.177,0,0,0,...,81118,People,,8091.177,141258.61,1291.856,False,100.0,100.0,True
2,2,4,Where,3,10,3,8091.177,0,0,0,...,81118,People*,"Filter condition: (people.name = ANY ('{""Owen ...",0.0,0.0,-139966.754,False,0.0,0.0,False
3,3,4,Seq Scan,13651901,20,3,39547.711,0,0,0,...,154988,Crew,,39547.711,566508.77,0.022,False,100.0,100.0,False
4,4,5,Hash Join,565,10,3,53349.033,0,0,0,...,236106,People* ⋈ Crew,"Hash Cond ('Inner', '(crew.person_id = people....",13801.322,205252.03,-558390.934,False,25.869863,26.595291,True
5,5,6,Nested Loop,186,10,3,53365.808,0,0,0,...,236106,Titles_Index ⋈ People* ⋈ Crew,,16.775,33.25,-763562.447,False,0.031434,0.004308,True
0,0,1,Seq Scan,3568850,10,2,1554.263,0,0,0,...,122505,Titles,,1554.263,160656.55,0.088,False,100.0,100.0,True
1,1,4,Where,15538,10,2,1554.263,0,0,0,...,122505,Titles*,Filter condition: (titles_1.genres = 'Action':...,0.0,0.0,-160656.462,False,0.0,0.0,False
2,2,3,Seq Scan,2379234,10,3,963.58,0,0,0,...,122380,Titles,,963.58,160656.55,0.029,False,100.0,100.0,True
3,3,4,Where,171294,10,3,963.58,0,0,0,...,122380,Titles*,Filter condition: (titles.genres = 'Comedy'::t...,0.0,0.0,-160656.521,False,0.0,0.0,False


In [10]:
cardinality_df[["label", "source", "target", "actual_total_time", "label_metadata"]]

Unnamed: 0,label,source,target,actual_total_time,label_metadata
0,Titles_Index,0,5,0.027,
1,People,1,2,8091.177,
2,People*,2,4,8091.177,"Filter condition: (people.name = ANY ('{""Owen ..."
3,Crew,3,4,39547.711,
4,People* ⋈ Crew,4,5,53349.033,"Hash Cond ('Inner', '(crew.person_id = people...."
5,Titles_Index ⋈ People* ⋈ Crew,5,6,53365.808,
0,Titles,0,1,1554.263,
1,Titles*,1,4,1554.263,Filter condition: (titles_1.genres = 'Action':...
2,Titles,2,3,963.58,
3,Titles*,3,4,963.58,Filter condition: (titles.genres = 'Comedy'::t...


In [11]:
#          0         1          2      3         4              5                  6              7                           8
label = ["Title" , "Title*", "Crew", "People", "People*", "Title* U Title*", "Crew ⋈ People*", "Crew ⋈ People* ⋈ Titles*", "Finish"]
source = [0      , 0       , 0       , 1      , 1       , 2         , 3    , 1               , 6                    , 4               , 7    , 5]
target = [1      , 1       , 1       , 5      , 5       , 6         , 4    , 7               , 7                    , 6               , 8    , 8]
value =  [675.287, 10.225  , 1       , 975.287, 677.225 , 6108.962  , 6108 , 1               , 36451.822 - 27376.277, 6108            , 44683- 36451, 1885-975.287]

color =  ["#B0C4DE", "#B0C4DE" , "#FFE4E1"  , "#B0C4DE", "#B0C4DE" , "#FFE4E1"    ,"#FFE4E1", "#FFE4E1"          , "#FFE4E1"               , "#FFE4E1"          , "#FFE4E1"      , "#B0C4DE"]
color_node = ["black"] * len(color)
layout = dict(font=dict(size=10), height=750)
data_trace = dict(type="sankey",
                  orientation="h",
                  valueformat=",",
                  node=dict(pad=200, label=label, color=color_node),
                  link=dict(source=source,target=target, value=value, color=color))

iplot(dict(data=[data_trace], layout=layout),
      validate=False)

In [12]:
with create_engine(con_str).connect() as con:
    execution_plan = con.execute("DROP INDEX if exists titles_index")