In [1]:
import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer
from plotly.offline import plot, iplot


In [2]:
%load_ext autoreload
%autoreload 2

init_notebook_mode(connected=True)

In [3]:
con_str = 'postgresql:///etrabelsi_thesis'

## Identifying performance bottlenecks in a multiple queries

In [13]:
query_renderer = QueryVizualizer(parser=PostgresParser(is_compact=True))

In [54]:
with create_engine(con_str).connect() as con:
    con.execute("DROP INDEX if exists crew_index")
    con.execute("DROP INDEX if exists titles_index")

In [55]:
query1 = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres = 'Comedy' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""


query2 = """
SELECT titles.title_id
FROM titles
WHERE genres = 'Comedy' 
UNION ALL
SELECT titles.title_id
FROM titles
WHERE genres = 'Action' 
"""

queries = [query1, query2]


In [56]:
%%time
with create_engine(con_str).connect() as con:
    for query in queries:
        con.execute(query)

CPU times: user 34.9 ms, sys: 22.3 ms, total: 57.2 ms
Wall time: 7.74 s


In [57]:
flow_df = query_renderer.get_flow_df(queries, con_str=con_str)
query_renderer.vizualize(flow_df, metrics=["actual_duration"], title="multiple queries optimization",open_=False)
flow_df[["source","target", "label", "actual_duration", "actual_rows"]]



Unnamed: 0,source,target,label,actual_duration,actual_rows
0,0,1,Titles*,0.0,10483
1,1,10,Titles* U Titles*,981.526,562164
2,2,3,People,856.387,3560478
3,3,5,People*,0.0,3
4,4,5,Crew,2971.245,14269149
5,5,8,People* ⋈ Crew,3843.184,572
6,6,7,Titles,834.174,2522032
7,6,7,Titles,2135.406,7566096
8,6,0,Titles,789.677,2522032
9,7,8,Titles*,0.0,176905


In [58]:
query_renderer.vizualize(flow_df, metrics=["actual_rows"], title="multiple queries optimization",open_=False)

In [59]:
with create_engine(con_str).connect() as con:
    con.execute("CREATE INDEX if not exists crew_index ON crew(person_id)")

In [60]:
%%time
with create_engine(con_str).connect() as con:
    for query in queries:
        con.execute(query)

CPU times: user 32.3 ms, sys: 17.5 ms, total: 49.8 ms
Wall time: 4.33 s


In [61]:
flow_df = query_renderer.get_flow_df(queries, con_str=con_str)
query_renderer.vizualize(flow_df, metrics=["actual_duration"], title="multiple queries optimization",open_=False)
flow_df[["source","target", "label", "actual_duration", "actual_rows"]]



Unnamed: 0,source,target,label,actual_duration,actual_rows
0,0,1,Titles*,0.0,10483
1,1,10,Titles* U Titles*,839.371,562164
2,2,5,Crew_Index,0.665,191
3,3,4,People,865.543,3560478
4,4,5,People*,0.0,3
5,5,8,Crew_Index ⋈ People*,2.125,572
6,6,7,Titles,722.86,2522032
7,6,7,Titles,1831.504,7566096
8,6,0,Titles,715.499,2522032
9,7,8,Titles*,0.0,176905


In [62]:
with create_engine(con_str).connect() as con:
    con.execute("CREATE INDEX if not exists titles_index ON titles (title_id,genres)")

In [63]:
fake_flow_df = query_renderer.get_flow_df(queries, con_str=con_str)
# 2,6
query_renderer.vizualize(flow_df, metrics=["actual_duration"], title="multiple queries optimization",open_=False)
flow_df[["source","target", "label", "actual_duration", "actual_rows"]]



Unnamed: 0,source,target,label,actual_duration,actual_rows
0,0,1,Titles*,0.0,10483
1,1,10,Titles* U Titles*,839.371,562164
2,2,5,Crew_Index,0.665,191
3,3,4,People,865.543,3560478
4,4,5,People*,0.0,3
5,5,8,Crew_Index ⋈ People*,2.125,572
6,6,7,Titles,722.86,2522032
7,6,7,Titles,1831.504,7566096
8,6,0,Titles,715.499,2522032
9,7,8,Titles*,0.0,176905


In [64]:
%%time
with create_engine(con_str).connect() as con:
    for query in queries:
        con.execute(query)

CPU times: user 35.3 ms, sys: 23.9 ms, total: 59.2 ms
Wall time: 3.69 s


In [65]:
with create_engine(con_str).connect() as con:
    con.execute("DROP INDEX if exists crew_index")
    con.execute("DROP INDEX if exists titles_index")