In [1]:
import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer
from plotly.offline import plot, iplot


In [2]:
%load_ext autoreload
%autoreload 2

init_notebook_mode(connected=True)

In [3]:
con_str = 'postgresql:///etrabelsi_thesis'

## Identifying performance bottlenecks in a multiple queries

In [4]:
query_renderer = QueryVizualizer(parser=PostgresParser())

In [5]:
36451.822 - 27376.277

9075.545000000002

In [6]:
a = set(['Titles*', 'Titles* U Titles*', 'Unique', 'People', 'People*', 'Crew', 'People* ⋈ Crew', 'Titles', 'Titles','Titles', 'Titles*', 'Titles*', 'People* ⋈ Crew ⋈ Titles*'])
res = []
for i in a:
    if i not in res:
        res.append(i)
res

['People* ⋈ Crew',
 'Titles*',
 'People*',
 'Titles* U Titles*',
 'People',
 'Unique',
 'People* ⋈ Crew ⋈ Titles*',
 'Titles',
 'Crew']

In [7]:
#          0         1          2      3         4              5                  6              7                           8
label = ["Title" , "Title*", "Crew", "People", "People*", "Title* U Title*", "Crew ⋈ People*", "Crew ⋈ People* ⋈ Titles*", "Finish"]
source = [0      , 0       , 0       , 1      , 1       , 2         , 3    , 1               , 6                    , 4               , 7    , 5]
target = [1      , 1       , 1       , 5      , 5       , 6         , 4    , 7               , 7                    , 6               , 8    , 8]
value =  [975.287, 677.225 , 8071.176, 975.287, 677.225 , 6108.962  , 6108 , 8071.176        , 36451.822 - 27376.277, 6108            , 44683- 36451, 1885-975.287]


#          0                 1                 2      3         4         5            6              7        8        9            10
# label  = ['Titles*', 'Titles* U Titles*', 'Unique', 'People', 'People*', 'Crew', 'People* ⋈ Crew', 'Titles', 'Titles','Titles', 'Titles*', 'Titles*', 'People* ⋈ Crew ⋈ Titles*']
# source = [0        , 1                  , 2       , 4       , 5        , 6     , 7               , 8       , 8       , 8      , 9        , 9        , 10]

label  = ['Titles*', 'Titles* U Titles*', 'Unique', 'People', 'People*', 'Crew', 'People* ⋈ Crew', 'Titles', 'Titles','Titles', 'Titles*', 'Titles*', 'People* ⋈ Crew ⋈ Titles*']
source = [0        , 1                  , 2       , 4       , 5        , 6     , 7               , 7       , 7       , 8      , 9        , 9        , 10]
target = [1        , 2                  , 3       , 5       , 7        , 7     , 10              , 9       , 9       , 0      , 10       , 1        , 11]
value  = [15538    , 181653             , 544958  , 3446261, 3, 13651901, 565, 2379234, 2379234, 3568850, 171294, 171294, 15]


layout = dict(font=dict(size=10), height=750)
data_trace = dict(type="sankey",
                  orientation="h",
                  valueformat=",",
                  node=dict(pad=200, label=label),
                  link=dict(source=source,target=target, value=value))

iplot(dict(data=[data_trace], layout=layout),
      validate=False)

In [8]:
#          0         1          2      3         4              5                  6              7                           8
label = ["Title" , "Title*", "Crew", "People", "People*", "Title* U Title*", "Crew ⋈ People*", "Crew ⋈ People* ⋈ Titles*", "Finish"]
source = [0      , 0       , 0       , 1      , 1       , 2         , 3    , 1               , 6                    , 4               , 7    , 5]
target = [1      , 1       , 1       , 5      , 5       , 6         , 4    , 7               , 7                    , 6               , 8    , 8]
value =  [975.287, 677.225 , 8071.176, 975.287, 677.225 , 6108.962  , 6108 , 8071.176        , 36451.822 - 27376.277, 6108            , 44683- 36451, 1885-975.287]

color =  ["#B0C4DE", "#B0C4DE" , "#FFE4E1"  , "#B0C4DE", "#B0C4DE" , "#FFE4E1"    ,"#FFE4E1", "#FFE4E1"          , "#FFE4E1"               , "#FFE4E1"          , "#FFE4E1"      , "#B0C4DE"]
color_node = ["black"] * len(color)
layout = dict(font=dict(size=10), height=750)
data_trace = dict(type="sankey",
                  orientation="h",
                  valueformat=",",
                  node=dict(pad=200, label=label, color=color_node),
                  link=dict(source=source,target=target, value=value, color=color))

iplot(dict(data=[data_trace], layout=layout),
      validate=False)

In [10]:
with create_engine(con_str).connect() as con:
    execution_plan = con.execute("CREATE INDEX if not exists titles_index ON titles(title_id,genres) WHERE genres like '%%Comedy%%'")

In [None]:
query1 = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres = 'Comedy' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""


query2 = """
SELECT titles.title_id
FROM titles
WHERE genres = 'Comedy' 
UNION 
SELECT titles.title_id
FROM titles
WHERE genres = 'Action' 
"""

queries = [query1, query2]
flow_df = query_renderer.get_flow_df(queries, con_str=con_str)
flow_df

In [None]:

flow_df = query_renderer.get_flow_df(queries, con_str=con_str)
flow_df[["label", "source", "target", "actual_total_time", "label_metadata", "query_hash"]]

In [None]:
query_renderer.vizualize(flow_df, metrics=["actual_rows"], title="crap",open_=False)

In [15]:
#          0         1          2      3         4              5                  6              7                           8
label = ["Title" , "Title*", "Crew", "People", "People*", "Title* U Title*", "Crew ⋈ People*", "Crew ⋈ People* ⋈ Titles*", "Finish"]
source = [0      , 0       , 0       , 1      , 1       , 2         , 3    , 1               , 6                    , 4               , 7    , 5]
target = [1      , 1       , 1       , 5      , 5       , 6         , 4    , 7               , 7                    , 6               , 8    , 8]
value =  [675.287, 10.225  , 1       , 975.287, 677.225 , 6108.962  , 6108 , 1               , 36451.822 - 27376.277, 6108            , 44683- 36451, 1885-975.287]

color =  ["#B0C4DE", "#B0C4DE" , "#FFE4E1"  , "#B0C4DE", "#B0C4DE" , "#FFE4E1"    ,"#FFE4E1", "#FFE4E1"          , "#FFE4E1"               , "#FFE4E1"          , "#FFE4E1"      , "#B0C4DE"]
color_node = ["black"] * len(color)
layout = dict(font=dict(size=10), height=750)
data_trace = dict(type="sankey",
                  orientation="h",
                  valueformat=",",
                  node=dict(pad=200, label=label, color=color_node),
                  link=dict(source=source,target=target, value=value, color=color))

iplot(dict(data=[data_trace], layout=layout),
      validate=False)

In [16]:
with create_engine(con_str).connect() as con:
    execution_plan = con.execute("DROP INDEX if exists titles_index")