In [1]:
import os
from pathlib import Path

import pandas as pd
from plotly.offline import init_notebook_mode
from sqlalchemy import create_engine

from query_flow.parsers.postgres_parser import PostgresParser
from query_flow.vizualizers.query_vizualizer import QueryVizualizer

In [11]:
%load_ext autoreload
%autoreload 2

init_notebook_mode(connected=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# SQLAlchemy-style PostgreSQL URL; no host or credentials are given, only the
# database name.
con_str = "postgresql:///etrabelsi_thesis"

## Identifying missing records

In [5]:
# Visualizer shared by every cell below, backed by the PostgreSQL plan parser.
query_renderer = QueryVizualizer(
    parser=PostgresParser(),
)

In [6]:
# First attempt: join titles -> crew -> people and filter on genre and name.
# NOTE(review): `genres = 'comedy'` is an exact-match predicate; the plan
# recorded below reports 0 actual rows, and a later cell relaxes this filter
# to `ilike '%Comedy%'` — presumably this predicate is the source of the
# missing records; confirm against the data.
query = """
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres = 'comedy'
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [7]:
# Display what the parser returns for the query: per the recorded output, a
# nested dict describing the plan tree (node type, costs, actual rows/timings,
# child 'Plans'). Presumably obtained by running the query with EXPLAIN ANALYZE
# — TODO confirm in PostgresParser.
query_renderer.parser.from_query(query, con_str)

{'Node Type': 'Nested Loop',
 'Parallel Aware': False,
 'Join Type': 'Inner',
 'Startup Cost': 143258.67,
 'Total Cost': 935485.66,
 'Plan Rows': 1,
 'Plan Width': 10,
 'Actual Startup Time': 377.606,
 'Actual Total Time': 377.606,
 'Actual Rows': 0,
 'Actual Loops': 1,
 'Output': ['titles.title_id'],
 'Inner Unique': False,
 'Join Filter': '(crew.title_id = titles.title_id)',
 'Rows Removed by Join Filter': 0,
 'Shared Hit Blocks': 4668,
 'Shared Read Blocks': 118825,
 'Shared Dirtied Blocks': 0,
 'Shared Written Blocks': 0,
 'Local Hit Blocks': 0,
 'Local Read Blocks': 0,
 'Local Dirtied Blocks': 0,
 'Local Written Blocks': 0,
 'Temp Read Blocks': 0,
 'Temp Written Blocks': 0,
 'Plans': [{'Node Type': 'Gather',
   'Parent Relationship': 'Outer',
   'Parallel Aware': False,
   'Startup Cost': 1000.0,
   'Total Cost': 161779.45,
   'Plan Rows': 1229,
   'Plan Width': 10,
   'Actual Startup Time': 377.606,
   'Actual Total Time': 379.456,
   'Actual Rows': 0,
   'Actual Loops': 1,
   'O

In [8]:
# Export the cardinality table for the current query as a CSV file
# (judging by the target path, presumably a fixture for the parser tests).
#
# The project root is read from the THESIS_ROOT environment variable so the
# cell can run on other machines; when the variable is unset, the original
# location is used and behaviour is unchanged.
project_root = Path(os.environ.get("THESIS_ROOT", "/Users/etrabelsi/IdeaProjects/thesis"))
fixture_path = project_root / "tests" / "parsers" / "data" / "missing_records" / "cardinality.csv"

# Only these columns are written to the CSV.
fixture_columns = [
    "source",
    "target",
    "operation_type",
    "actual_rows",
    "label",
    "label_metadata",
]

cardinality_df = query_renderer.get_cardinality_df(query, con_str)[fixture_columns]
cardinality_df.to_csv(fixture_path, index=False)



divide by zero encountered in long_scalars



In [22]:
# Recompute the full cardinality table for the current query and render it,
# using the actual row counts as the displayed metric.
cardinality_df = query_renderer.get_cardinality_df(query, con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Identifying missing records",
    metrics=["actual_rows"],
    open_=False,
)

In [29]:
# Second attempt: same joins and name filter as the earlier query, but the
# genre predicate is relaxed from an exact match to a case-insensitive
# substring match (`ilike '%Comedy%'`).
query ="""
SELECT titles.title_id
FROM titles
INNER JOIN crew ON crew.title_id = titles.title_id
INNER JOIN people ON people.person_id = crew.person_id
WHERE genres ilike '%Comedy%' 
  AND name in ('Owen Wilson', 'Adam Sandler', 'Jason Segel')
"""

In [30]:
# Build the cardinality table for the relaxed query and render it with the
# actual row counts as the displayed metric.
cardinality_df = query_renderer.get_cardinality_df(query, con_str)
query_renderer.vizualize(
    cardinality_df,
    title="Missing Records in Where Clause",
    metrics=["actual_rows"],
    open_=False,
)
