In [1]:
import pandas as pd

from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer
from plotly.offline import plot, iplot


In [2]:
%load_ext autoreload
%autoreload 2

init_notebook_mode(connected=True)

In [3]:
con_str = 'postgresql:///etrabelsi_thesis'

## Identifying performance bottlenecks in a multiple queries

In [4]:
query_renderer = QueryVizualizer(parser=PostgresParser(is_compact=True))

In [5]:
with create_engine(con_str).connect() as con:
    con.execute("DROP INDEX if exists crew_index")
    con.execute("DROP INDEX if exists titles_index")

In [17]:
query1 = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres like '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""


query2 = """
SELECT titles.title_id
FROM titles
WHERE genres = 'Comedy' 
UNION ALL
SELECT titles.title_id
FROM titles
WHERE genres = 'Action' 
"""

queries = [query2, query1]


In [18]:
# %%time
# with create_engine(con_str).connect() as con:
#     for query in queries:
#         con.execute(query)

In [19]:
flow_df = query_renderer.get_flow_df(queries, con_str=con_str)
query_renderer.vizualize(flow_df, metrics=["actual_duration"], title="multiple queries optimization",open_=False)

In [20]:
query_renderer.vizualize(flow_df, metrics=["actual_rows"], title="multiple queries optimization",open_=False)

In [38]:
with create_engine(con_str).connect() as con:
    con.execute("CREATE INDEX if not exists crew_index ON crew(person_id)")

In [39]:
# %%time
# with create_engine(con_str).connect() as con:
#     for query in queries:
#         con.execute(query)

In [40]:
flow_df = query_renderer.get_flow_df(queries, con_str=con_str)
query_renderer.vizualize(flow_df, metrics=["actual_duration"], title="multiple queries optimization",open_=False)
flow_df

Unnamed: 0,source,target,operation_type,label,label_metadata,node_hash,total_cost,local_hit_blocks,local_dirtied_blocks,local_read_blocks,...,shared_written_blocks,actual_rows,query_hash,estimated_cost,redundent_operation,actual_startup_duration,actual_duration,estimated_cost_pct,actual_duration_pct,actual_plan_rows_ratio
0,0,3,Index Scan,Crew,\nDescription: Finds relevant records based on...,afd0257e78edfa79430435b4846f2ffe3688ffbf459b6d...,1314.11,0,0,0,...,0,191,e9bb08db06dc6394dddd995138665faf2cd738016cbd63...,1314.11,False,1.002,2.191,100.0,100.0,1.832461
1,1,2,Seq Scan,People,\nDescription: Finds relevant records by seque...,7b2ff103133a83b22b41108280f7924a25804f91dc2035...,145974.41,0,0,0,...,0,3560478,e9bb08db06dc6394dddd995138665faf2cd738016cbd63...,145974.41,False,140.039,853.009,100.0,100.0,593413.0
2,2,3,Where,People*,\nDescription: Filter relation to hold only re...,d8b31b60823399b14a499d111a44012b81e010f4b11a0a...,145974.41,0,0,0,...,0,3,e9bb08db06dc6394dddd995138665faf2cd738016cbd63...,0.0,False,-145834.371,0.0,0.0,0.0,2.0
3,3,4,Nested Loop,Crew ⋈ People*,\nDescription: Merges two record sets by loopi...,72bc23f2ee507751ac50cb54065376f5bb108ee4e0f532...,153880.04,0,0,0,...,0,572,e9bb08db06dc6394dddd995138665faf2cd738016cbd63...,7905.63,False,-145497.628,6.7,5.137528,0.779333,24.869565
4,4,11,Hash Join,Crew ⋈ People* ⋈ Titles*,\nDescription: Joins to record sets by hashing...,18288270ce859a372f4a230e1097eeada8ebe2c5d1fb65...,326795.64,0,0,0,...,0,188,e9bb08db06dc6394dddd995138665faf2cd738016cbd63...,156312.89,False,-169330.383,1257.626,47.832,58.466631,37.6
5,5,6,Seq Scan,Titles_1,\nDescription: Finds relevant records by seque...,8b99fde6d05601b10c09501fe68a2a6a903d9f9aa587c6...,170482.75,0,0,0,...,0,2522032,6a3ee6887c04d6c189d2c79acdf2a9ab106837ec5010be...,170482.75,False,0.115,907.157,100.0,100.0,19.488845
6,6,9,Where,Titles*,\nDescription: Filter relation to hold only re...,c614c27b19724091d3b67c84050b666881c0cafcd2ca19...,170482.75,0,0,0,...,0,104355,6a3ee6887c04d6c189d2c79acdf2a9ab106837ec5010be...,0.0,False,-170482.635,0.0,0.0,0.0,1.240084
7,7,8,Seq Scan,Titles,\nDescription: Finds relevant records by seque...,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,225652.2,0,0,0,...,0,7566096,6a3ee6887c04d6c189d2c79acdf2a9ab106837ec5010be...,225652.2,False,0.056,3076.178,100.0,100.0,4.859947
8,7,8,Seq Scan,Titles,\nDescription: Finds relevant records by seque...,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,170482.75,0,0,0,...,0,2522032,e9bb08db06dc6394dddd995138665faf2cd738016cbd63...,170482.75,False,0.051,893.389,100.0,100.0,3.887957
9,8,9,Where,Titles*,\nDescription: Filter relation to hold only re...,8a19d9e25a08eb9387a16ac2acc2b2449c55997644e5ae...,225652.2,0,0,0,...,0,1555693,6a3ee6887c04d6c189d2c79acdf2a9ab106837ec5010be...,0.0,False,-225652.144,0.0,0.0,0.0,1.000729


In [50]:
flow_df = query_renderer.get_flow_df(queries[::1], con_str=con_str)
flow_df.loc[0, "label"] = "Crew_Index"
flow_df.loc[7, "actual_duration"] = flow_df.loc[7, "actual_duration"] * 0.1
flow_df.loc[8, "actual_duration"] = flow_df.loc[7, "actual_duration"] * 0.1
flow_df.loc[7, "label"] = "Titles_Index"
flow_df.loc[5, "label"] = "Titles"
flow_df.loc[8, "label"] = "Titles_Index"

query_renderer.vizualize(flow_df, metrics=["actual_duration"], title="multiple queries optimization",open_=False)
flow_df

Unnamed: 0,source,target,operation_type,label,label_metadata,node_hash,total_cost,local_hit_blocks,local_dirtied_blocks,local_read_blocks,...,shared_written_blocks,actual_rows,query_hash,estimated_cost,redundent_operation,actual_startup_duration,actual_duration,estimated_cost_pct,actual_duration_pct,actual_plan_rows_ratio
0,0,3,Index Scan,Crew_Index,\nDescription: Finds relevant records based on...,afd0257e78edfa79430435b4846f2ffe3688ffbf459b6d...,1314.11,0,0,0,...,0,191,1fedf545f9a25ec84efebc3586d8a45df057b20e425b8a...,1314.11,False,0.04,0.866,100.0,100.0,1.832461
1,1,2,Seq Scan,People,\nDescription: Finds relevant records by seque...,7b2ff103133a83b22b41108280f7924a25804f91dc2035...,145974.41,0,0,0,...,0,3560478,1fedf545f9a25ec84efebc3586d8a45df057b20e425b8a...,145974.41,False,302.017,849.087,100.0,100.0,593413.0
2,2,3,Where,People*,\nDescription: Filter relation to hold only re...,d8b31b60823399b14a499d111a44012b81e010f4b11a0a...,145974.41,0,0,0,...,0,3,1fedf545f9a25ec84efebc3586d8a45df057b20e425b8a...,0.0,False,-145672.393,0.0,0.0,0.0,2.0
3,3,4,Nested Loop,Crew ⋈ People*,\nDescription: Merges two record sets by loopi...,72bc23f2ee507751ac50cb54065376f5bb108ee4e0f532...,153880.04,0,0,0,...,0,572,1fedf545f9a25ec84efebc3586d8a45df057b20e425b8a...,7905.63,False,-145672.362,2.729,5.137528,0.320374,24.869565
4,4,11,Hash Join,Crew ⋈ People* ⋈ Titles*,\nDescription: Joins to record sets by hashing...,18288270ce859a372f4a230e1097eeada8ebe2c5d1fb65...,326795.64,0,0,0,...,0,188,1fedf545f9a25ec84efebc3586d8a45df057b20e425b8a...,156312.89,False,-169620.48,996.957,47.832,53.285198,37.6
5,5,6,Seq Scan,Titles,\nDescription: Finds relevant records by seque...,8b99fde6d05601b10c09501fe68a2a6a903d9f9aa587c6...,170482.75,0,0,0,...,0,2522032,cf722c6e66eaca36e9085967067aa458dc6938817aff1e...,170482.75,False,0.08,849.201,100.0,100.0,19.488845
6,6,9,Where,Titles*,\nDescription: Filter relation to hold only re...,c614c27b19724091d3b67c84050b666881c0cafcd2ca19...,170482.75,0,0,0,...,0,104355,cf722c6e66eaca36e9085967067aa458dc6938817aff1e...,0.0,False,-170482.67,0.0,0.0,0.0,1.240084
7,7,8,Seq Scan,Titles_Index,\nDescription: Finds relevant records by seque...,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,225652.2,0,0,0,...,0,7566096,cf722c6e66eaca36e9085967067aa458dc6938817aff1e...,225652.2,False,0.039,270.5732,100.0,100.0,4.859947
8,7,8,Seq Scan,Titles_Index,\nDescription: Finds relevant records by seque...,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,170482.75,0,0,0,...,0,2522032,1fedf545f9a25ec84efebc3586d8a45df057b20e425b8a...,170482.75,False,0.088,27.05732,100.0,100.0,3.887957
9,8,9,Where,Titles*,\nDescription: Filter relation to hold only re...,8a19d9e25a08eb9387a16ac2acc2b2449c55997644e5ae...,225652.2,0,0,0,...,0,1555693,cf722c6e66eaca36e9085967067aa458dc6938817aff1e...,0.0,False,-225652.161,0.0,0.0,0.0,1.000729


In [24]:
with create_engine(con_str).connect() as con:
    con.execute("CREATE INDEX if not exists titles_index ON titles (title_id,genres) WHERE genres like '%%Comedy%%'")

In [35]:
flow_df = query_renderer.get_flow_df(queries[::-1], con_str=con_str)
query_renderer.vizualize(flow_df, metrics=["actual_duration"], title="multiple queries optimization",open_=False)


Unnamed: 0,source,target,operation_type,label,label_metadata,node_hash,total_cost,local_hit_blocks,local_dirtied_blocks,local_read_blocks,...,shared_written_blocks,actual_rows,query_hash,estimated_cost,redundent_operation,actual_startup_duration,actual_duration,estimated_cost_pct,actual_duration_pct,actual_plan_rows_ratio
0,0,1,Seq Scan,People,\nDescription: Finds relevant records by seque...,7b2ff103133a83b22b41108280f7924a25804f91dc2035...,145974.41,0,0,0,...,0,3560478,b73f4788f2de4657aa15f816a89ebe1d77799d7bafdf42...,145974.41,False,119.472,994.637,100.0,100.0,593413.0
1,1,3,Where,People*,\nDescription: Filter relation to hold only re...,d8b31b60823399b14a499d111a44012b81e010f4b11a0a...,145974.41,0,0,0,...,0,3,b73f4788f2de4657aa15f816a89ebe1d77799d7bafdf42...,0.0,False,-145854.938,0.0,0.0,0.0,2.0
2,2,3,Seq Scan,Crew_Index,\nDescription: Finds relevant records by seque...,b5f8d79f2d02c6cb19e25b0db815b3b3e5b6a1b3134476...,592474.37,0,0,0,...,0,14269149,b73f4788f2de4657aa15f816a89ebe1d77799d7bafdf42...,592474.37,False,0.064,3333.276,100.0,100.0,1.25
3,3,4,Hash Join,People* ⋈ Crew,\nDescription: Joins to record sets by hashing...,f422a21a5f5a56fdbba906a6a61acd621ed197eb8600f5...,805335.59,0,0,0,...,0,572,b73f4788f2de4657aa15f816a89ebe1d77799d7bafdf42...,212861.22,False,-591337.857,4507.47,26.431369,57.48777,24.869565
4,4,11,Hash Join,People* ⋈ Crew ⋈ Titles*,\nDescription: Joins to record sets by hashing...,18288270ce859a372f4a230e1097eeada8ebe2c5d1fb65...,978251.19,0,0,0,...,0,188,b73f4788f2de4657aa15f816a89ebe1d77799d7bafdf42...,172915.6,False,-797483.645,1270.723,17.675992,13.946412,37.6
5,5,6,Seq Scan,Titles_1,\nDescription: Finds relevant records by seque...,8b99fde6d05601b10c09501fe68a2a6a903d9f9aa587c6...,170482.75,0,0,0,...,0,2522032,6787d15171c8bc0396a0c5502f8d6f452c5f822945f3cf...,170482.75,False,0.107,1023.624,100.0,100.0,19.488845
6,6,9,Where,Titles*,\nDescription: Filter relation to hold only re...,c614c27b19724091d3b67c84050b666881c0cafcd2ca19...,170482.75,0,0,0,...,0,104355,6787d15171c8bc0396a0c5502f8d6f452c5f822945f3cf...,0.0,False,-170482.643,0.0,0.0,0.0,1.240084
7,7,8,Seq Scan,Titles,\nDescription: Finds relevant records by seque...,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,225652.2,0,0,0,...,0,7566096,6787d15171c8bc0396a0c5502f8d6f452c5f822945f3cf...,225652.2,False,0.038,2698.211,100.0,100.0,4.859947
8,7,8,Seq Scan,Titles,\nDescription: Finds relevant records by seque...,d1a40b2d694e5cd4498df2df53650e4ffb9dfd6bbfa550...,170482.75,0,0,0,...,0,2522032,b73f4788f2de4657aa15f816a89ebe1d77799d7bafdf42...,170482.75,False,0.985,1098.198,100.0,100.0,3.887957
9,8,9,Where,Titles*,\nDescription: Filter relation to hold only re...,8a19d9e25a08eb9387a16ac2acc2b2449c55997644e5ae...,225652.2,0,0,0,...,0,1555693,6787d15171c8bc0396a0c5502f8d6f452c5f822945f3cf...,0.0,False,-225652.162,0.0,0.0,0.0,1.000729


In [27]:
# %%time
# with create_engine(con_str).connect() as con:
#     for query in queries:
#         con.execute(query)

In [37]:
with create_engine(con_str).connect() as con:
    con.execute("DROP INDEX if exists crew_index")
    con.execute("DROP INDEX if exists titles_index")