In [1]:
# Extend the module search path (note: this does NOT change the working
# directory): the parent directory of the first sys.path entry is appended.
# Presumably this makes the `query_flow` package importable from the
# repository root -- TODO confirm the notebook's location relative to it.

from sys import path
from os.path import dirname

path.append(dirname(path[0]))

In [15]:
import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer
from plotly.offline import plot, iplot


In [3]:
# Load the autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2

# Initialise plotly's offline notebook mode so figures passed to `iplot`
# render inline in the notebook.
init_notebook_mode(connected=True)

In [4]:
# Database URL handed to `create_engine` and to the query visualizer below.
# No host or credentials are embedded in the URL itself.
con_str = "postgresql:///etrabelsi_thesis"

## Identifying performance bottlenecks across multiple queries

In [8]:
# Visualizer wired to the PostgreSQL parser; the cells below call its
# `get_cardinality_df` method for each query.
query_renderer = QueryVizualizer(
    parser=PostgresParser(),
)

In [27]:
# Scratch check: the same difference is used as the value of the link from
# node 6 to node 7 in the hand-built Sankey diagrams below.
36451.822 - 27376.277

9075.545000000002

In [41]:
# Hand-built Sankey diagram that merges two queries into one flow.
# The link values are entered by hand -- presumably timings taken from an
# earlier run of the queries; TODO confirm their origin and units.
#
# Node indices (positions in `label`) referenced by the links below:
#   0 Title
#   1 Title*
#   2 Crew
#   3 People
#   4 People*
#   5 Title* U Title*
#   6 Crew ⋈ People*
#   7 Crew ⋈ People* ⋈ Titles*
#   8 Finish
label = [
    "Title",
    "Title*",
    "Crew",
    "People",
    "People*",
    "Title* U Title*",
    "Crew ⋈ People*",
    "Crew ⋈ People* ⋈ Titles*",
    "Finish",
]

# Two link colours are used; judging by the paths they follow, blue marks the
# UNION query and rose the JOIN query -- TODO confirm.
LIGHT_STEEL_BLUE = "#B0C4DE"
MISTY_ROSE = "#FFE4E1"

# One record per link: (source node, target node, value, colour).
# Keeping each link in a single record prevents the four parallel lists that
# plotly expects from drifting out of alignment when a link is edited.
links = [
    (0, 1, 975.287, LIGHT_STEEL_BLUE),
    (0, 1, 677.225, LIGHT_STEEL_BLUE),
    (0, 1, 8071.176, MISTY_ROSE),
    (1, 5, 975.287, LIGHT_STEEL_BLUE),
    (1, 5, 677.225, LIGHT_STEEL_BLUE),
    (2, 6, 6108.962, MISTY_ROSE),
    (3, 4, 6108, MISTY_ROSE),
    (1, 7, 8071.176, MISTY_ROSE),
    (6, 7, 36451.822 - 27376.277, MISTY_ROSE),
    (4, 6, 6108, MISTY_ROSE),
    (7, 8, 44683 - 36451, MISTY_ROSE),
    (5, 8, 1885 - 975.287, LIGHT_STEEL_BLUE),
]
source, target, value, color = (list(column) for column in zip(*links))

# Every link endpoint must be a valid position in `label`.
assert min(source + target) >= 0
assert max(source + target) < len(label)

# Node colours: one entry per node (sized by `label`, not by the link list).
color_node = ["black"] * len(label)
layout = dict(font=dict(size=10), height=750)
data_trace = dict(type="sankey",
                  orientation="h",
                  valueformat=",",
                  node=dict(pad=200, label=label, color=color_node),
                  link=dict(source=source, target=target, value=value, color=color))

iplot(dict(data=[data_trace], layout=layout),
      validate=False)

In [42]:
# Create (if missing) a partial index on titles(title_id, genres), limited to
# rows whose genres match the Comedy pattern.
# The percent signs are doubled in the statement text -- presumably so the
# driver's percent-style parameter handling yields a literal '%'; TODO confirm.
create_index_sql = "CREATE INDEX if not exists titles_index ON titles(title_id,genres) WHERE genres like '%%Comedy%%'"
engine = create_engine(con_str)
with engine.connect() as con:
    execution_plan = con.execute(create_index_sql)

In [43]:
# Query 1: join titles -> crew -> people, filtered on genre pattern and names.
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""
query1 = query_renderer.get_cardinality_df(query, con_str=con_str)

# Query 2: union of the Comedy and Action title ids.
query = """
SELECT titles.title_id
FROM titles
WHERE genres = 'Comedy' 
UNION 
SELECT titles.title_id
FROM titles
WHERE genres = 'Action' 
"""
query2 = query_renderer.get_cardinality_df(query, con_str=con_str)

# Stack both per-query frames into one. `ignore_index=True` gives the result
# a fresh 0..n-1 index; without it each frame keeps its own 0-based index and
# the combined frame carries duplicate labels, so label-based lookups such as
# `.loc[0]` would return one row from each query.
cardinality_df = pd.concat([query1, query2], ignore_index=True)
cardinality_df

Unnamed: 0,source,target,operation_type,actual_rows,actual_total_time,plan_rows,plan_width,total_cost,actual_startup_time,actual_loops,...,shared_dirtied_blocks,shared_written_blocks,local_hit_blocks,local_dirtied_blocks,local_read_blocks,local_written_blocks,temp_written_blocks,temp_read_blocks,label,label_metadata
0,9999,10000,Nested Loop,186,35047.266,5,10,771794.05,6564.872,3,...,0,0,0,0,0,0,0,0,JOIN,
1,9998,9999,Hash Join,565,34964.147,21,10,771760.8,6564.435,3,...,0,0,0,0,0,0,0,0,JOIN,"Hash Cond ('Inner', '(crew.person_id = people...."
2,9997,9998,Seq Scan,13651901,26170.753,17064877,20,566508.77,0.677,3,...,0,0,0,0,0,0,0,0,Crew,
3,9996,9998,Where,3,5891.085,5,10,141258.61,429.656,3,...,0,0,0,0,0,0,0,0,People*,"Filter condition: (people.name = ANY ('{""Owen ..."
4,9995,9996,Seq Scan,3446261,5891.085,5,10,141258.61,429.656,3,...,0,0,0,0,0,0,0,0,People,
5,9994,9999,Index Only Scan,0,0.144,1,10,1.57,0.144,1695,...,0,0,0,0,0,0,0,0,Titles_Index,
0,9993,9994,Unique,544958,1875.919,535566,32,450382.96,1616.676,1,...,0,0,0,0,0,0,1357,1348,Unique,
1,9992,9993,Append,181653,1302.344,535566,32,329370.54,0.335,3,...,0,0,0,0,0,0,0,0,UNION ALL,
2,9991,9992,Where,171294,635.596,210166,10,160668.53,0.045,3,...,0,0,0,0,0,0,0,0,Titles*,Filter condition: (titles.genres = 'Comedy'::t...
3,9990,9991,Seq Scan,2379234,635.596,210166,10,160668.53,0.045,3,...,0,0,0,0,0,0,0,0,Titles,


In [45]:
# Show only the columns needed to read the plan: the operator label, its
# source/target node ids, the measured total time and the label metadata.
summary_columns = ["label", "source", "target", "actual_total_time", "label_metadata"]
cardinality_df[summary_columns]

Unnamed: 0,label,source,target,actual_total_time,label_metadata
0,JOIN,9999,10000,35047.266,
1,JOIN,9998,9999,34964.147,"Hash Cond ('Inner', '(crew.person_id = people...."
2,Crew,9997,9998,26170.753,
3,People*,9996,9998,5891.085,"Filter condition: (people.name = ANY ('{""Owen ..."
4,People,9995,9996,5891.085,
5,Titles_Index,9994,9999,0.144,
0,Unique,9993,9994,1875.919,
1,UNION ALL,9992,9993,1302.344,
2,Titles*,9991,9992,635.596,Filter condition: (titles.genres = 'Comedy'::t...
3,Titles,9990,9991,635.596,


In [47]:
# Hand-built Sankey diagram with the same nodes and link structure as the
# earlier Sankey cell; only some link values differ. Presumably this reflects
# the plan after `titles_index` was created -- TODO confirm. The values are
# entered by hand; their origin and units should also be confirmed.
#
# Node indices (positions in `label`) referenced by the links below:
#   0 Title
#   1 Title*
#   2 Crew
#   3 People
#   4 People*
#   5 Title* U Title*
#   6 Crew ⋈ People*
#   7 Crew ⋈ People* ⋈ Titles*
#   8 Finish
label = [
    "Title",
    "Title*",
    "Crew",
    "People",
    "People*",
    "Title* U Title*",
    "Crew ⋈ People*",
    "Crew ⋈ People* ⋈ Titles*",
    "Finish",
]

# Two link colours are used; judging by the paths they follow, blue marks the
# UNION query and rose the JOIN query -- TODO confirm.
LIGHT_STEEL_BLUE = "#B0C4DE"
MISTY_ROSE = "#FFE4E1"

# One record per link: (source node, target node, value, colour).
# Keeping each link in a single record prevents the four parallel lists that
# plotly expects from drifting out of alignment when a link is edited.
links = [
    (0, 1, 675.287, LIGHT_STEEL_BLUE),
    (0, 1, 10.225, LIGHT_STEEL_BLUE),
    (0, 1, 1, MISTY_ROSE),
    (1, 5, 975.287, LIGHT_STEEL_BLUE),
    (1, 5, 677.225, LIGHT_STEEL_BLUE),
    (2, 6, 6108.962, MISTY_ROSE),
    (3, 4, 6108, MISTY_ROSE),
    (1, 7, 1, MISTY_ROSE),
    (6, 7, 36451.822 - 27376.277, MISTY_ROSE),
    (4, 6, 6108, MISTY_ROSE),
    (7, 8, 44683 - 36451, MISTY_ROSE),
    (5, 8, 1885 - 975.287, LIGHT_STEEL_BLUE),
]
source, target, value, color = (list(column) for column in zip(*links))

# Every link endpoint must be a valid position in `label`.
assert min(source + target) >= 0
assert max(source + target) < len(label)

# Node colours: one entry per node (sized by `label`, not by the link list).
color_node = ["black"] * len(label)
layout = dict(font=dict(size=10), height=750)
data_trace = dict(type="sankey",
                  orientation="h",
                  valueformat=",",
                  node=dict(pad=200, label=label, color=color_node),
                  link=dict(source=source, target=target, value=value, color=color))

iplot(dict(data=[data_trace], layout=layout),
      validate=False)

In [None]:
# Remove the `titles_index` index again (no error if it does not exist).
drop_index_sql = "DROP INDEX if exists titles_index"
engine = create_engine(con_str)
with engine.connect() as con:
    execution_plan = con.execute(drop_index_sql)

In [None]:
# The union query gets its own name: it used to be assigned to `query` and
# was then overwritten by the very next statement, so it was never usable.
union_query = """
SELECT titles.title_id
FROM titles
WHERE genres = 'Comedy' 
UNION 
SELECT titles.title_id
FROM titles
WHERE genres = 'Action' 
"""

# `query` ends up holding the join query, exactly as before.
query ="""
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""