In [None]:
import random
import datetime
from hashlib import md5
import sqlite3

import pandas as pd

In [None]:
# ! test_cases_for_sql_data_connector.db

In [None]:
# SQLite file populated by this notebook; sqlite3 creates it on connect if it does not exist.
filename = "test_cases_for_sql_data_connector_new.db"
# One connection shared by every cell that writes tables or views.
db = sqlite3.connect(database=filename)

In [None]:
def generate_ascending_list_of_datetimes(
    k,
    start_date=datetime.date(2020, 1, 1),
    end_date=datetime.date(2020, 12, 31)
):
    """Return ``k`` random datetimes in ``[start_date, end_date)``, sorted ascending.

    Each value is midnight of ``start_date`` plus a whole number of seconds
    drawn with ``random.randrange``, so the output is reproducible after
    calling ``random.seed``.

    Args:
        k: Number of datetimes to generate.
        start_date: ``datetime.date`` giving the inclusive lower bound.
        end_date: ``datetime.date`` giving the exclusive upper bound.

    Returns:
        A list of ``k`` ``datetime.datetime`` objects in ascending order.

    Raises:
        ValueError: Propagated from ``random.randrange`` when ``k > 0`` and
            ``end_date`` is not later than ``start_date``.
    """
    start_time = datetime.datetime(start_date.year, start_date.month, start_date.day)
    # total_seconds() returns a float, but randrange() needs an int
    # (float arguments raise TypeError on Python 3.12+).
    seconds_between_dates = int((end_date - start_date).total_seconds())

    datetime_list = [
        start_time + datetime.timedelta(seconds=random.randrange(seconds_between_dates))
        for _ in range(k)
    ]
    datetime_list.sort()
    return datetime_list

generate_ascending_list_of_datetimes(10)

In [None]:
# Build a reproducible fake "events" table with k rows.
# The order of the random.* calls below determines the generated values,
# so it must stay as-is for the seeded output to remain the same.
k = 120
random.seed(1)

# Ascending timestamps between 2020-01-01 and 2020-01-31, plus their calendar dates.
timestamp_list = generate_ascending_list_of_datetimes(
    k,
    end_date=datetime.date(2020, 1, 31),
)
date_list = [ts.date() for ts in timestamp_list]

# Non-decreasing batch ids drawn from 0..10.
batch_ids = sorted(random.randint(0, 10) for _ in range(k))

# Session ids: sorted draws from 2..60, each then lowered by 0-2,
# so the final sequence is only roughly ascending.
session_ids = sorted(random.randint(2, 60) for _ in range(k))
session_ids = [session_id - random.randint(0, 2) for session_id in session_ids]

event_types = [random.choice(["start", "stop", "continue"]) for _ in range(k)]

# Random "#RRGGBB" strings built from upper-case hex digits.
hex_digits = list("0123456789ABCDEF")
favorite_colors = [
    "#" + "".join(random.choice(hex_digits) for _ in range(6))
    for _ in range(k)
]

events_df = pd.DataFrame({
    "id": range(k),
    "batch_id": batch_ids,
    "date": date_list,
    "y": [day.year for day in date_list],
    "m": [day.month for day in date_list],
    "d": [day.day for day in date_list],
    "timestamp": timestamp_list,
    "session_id": session_ids,
    "event_type": event_types,
    "favorite_color": favorite_colors,
})

# NOTE: in this fake example, id, batch_id, date, and timestamp are all sorted
# in strictly the same order. That would not necessarily hold in the real world:
# events might arrive and be indexed out of order, batches might not correspond
# strictly with units of time, and so on.

# Left disabled in the original notebook:
# events_df.to_sql("events_df", db)

In [None]:
# Table A: events carrying a calendar-date column.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once the table is already there.
events_df[["id", "date", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_date_column__A",
    db,
    if_exists="replace",
)

In [None]:
# Table B: events carrying a full timestamp column.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once the table is already there.
events_df[["id", "timestamp", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_timestamp_column__B",
    db,
    if_exists="replace",
)

In [None]:
# Table C: events identified only by the incrementing id column.
# The result is intentionally not bound to a name: to_sql() writes to the database
# and does not return a DataFrame.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once the table is already there.
events_df[["id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_regularly_spaced_incrementing_id_column__C",
    db,
    if_exists="replace",
)

In [None]:
# Table D: events identified by id, with the id boundaries stored in a second table.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once a table is already there.
events_df[["id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D",
    db,
    if_exists="replace",
)
# Spacer lookup: the smallest event id seen on each date.
lookup_df = events_df.groupby("date").id.min()
lookup_df.to_sql("table_containing_id_spacers_for_D", db, if_exists="replace")

In [None]:
# Table E: events carrying the batch_id column.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once the table is already there.
events_df[["id", "batch_id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_incrementing_batch_id__E",
    db,
    if_exists="replace",
)

In [None]:
# Table F: events carrying a session_id column, plus a companion reference table.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once a table is already there.
events_df[["id", "session_id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_foreign_key__F",
    db,
    if_exists="replace",
)
# Companion table: the smallest event id seen on each date.
# NOTE(review): this groups by "date" rather than "session_id", so the table does not
# contain the session_id values that F carries -- confirm this is intended.
sessions_df = events_df.groupby("date").id.min()
sessions_df.to_sql("table_with_fk_reference_from_F", db, if_exists="replace")

In [None]:
# Table G: events carrying separate year / month / day columns.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once the table is already there.
events_df[["id", "y", "m", "d", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_multiple_columns__G",
    db,
    if_exists="replace",
)

In [None]:
# Table H: events with no dedicated partitioning column besides id.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once the table is already there.
events_df[["id", "event_type", "favorite_color"]].to_sql(
    "table_that_should_be_partitioned_by_random_hash__H",
    db,
    if_exists="replace",
)

In [None]:
# Table I: every column of events_df; the views created later all select from it.
# if_exists="replace" lets the notebook be re-run against an existing database file;
# the default ("fail") raises ValueError once the table is already there.
events_df.to_sql("table_full__I", db, if_exists="replace")

## Add Views

In [None]:
# Cursor on the shared connection; used for the CREATE VIEW and SELECT statements that follow.
db_cursor = db.cursor()

In [None]:
# View A: same columns as table A, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_by_date_column__A AS SELECT id, date, event_type, favorite_color FROM table_full__I")

In [None]:
# View B: same columns as table B, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_by_timestamp_column__B AS SELECT id, timestamp, event_type, favorite_color FROM table_full__I")

In [None]:
# View C: same columns as table C, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_by_regularly_spaced_incrementing_id_column__C AS SELECT id, event_type, favorite_color FROM table_full__I")

In [None]:
# View D: same columns as table D, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D AS SELECT id, event_type, favorite_color FROM table_full__I")

In [None]:
# Companion view for D.
# NOTE(review): the column list (id, batch_id, event_type, favorite_color) is the same as
# view_by_incrementing_batch_id__E and does not mirror table_containing_id_spacers_for_D,
# which holds the minimum id per date -- confirm which definition is intended.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_containing_id_spacers_for_D AS SELECT id, batch_id, event_type, favorite_color FROM table_full__I")

In [None]:
# View E: same columns as table E, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_by_incrementing_batch_id__E AS SELECT id, batch_id, event_type, favorite_color FROM table_full__I")

In [None]:
# View F: same columns as table F, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_partitioned_by_foreign_key__F AS SELECT id, session_id, event_type, favorite_color FROM table_full__I")

In [None]:
# Companion view for F: one row per date.
# NOTE(review): `id` is a bare (non-aggregated) column in a GROUP BY query, so which id is
# reported for each date is left to SQLite -- confirm whether MIN(id) was intended, to match
# the min-per-date values stored in table_with_fk_reference_from_F.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_with_fk_reference_from_F AS SELECT id, date FROM table_full__I GROUP BY date")

In [None]:
# View G: same columns as table G, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_by_multiple_columns__G AS SELECT id, y, m, d, event_type, favorite_color FROM table_full__I")

In [None]:
# View H: same columns as table H, selected from the full table.
# IF NOT EXISTS keeps this cell re-runnable once the view has been created.
db_cursor.execute("CREATE VIEW IF NOT EXISTS view_that_should_be_partitioned_by_random_hash__H AS SELECT id, event_type, favorite_color FROM table_full__I")

## Test if Views Work

In [None]:
# Smoke-test view A: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_by_date_column__A;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view B: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_by_timestamp_column__B;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view C: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_by_regularly_spaced_incrementing_id_column__C;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view_containing_id_spacers_for_D: fetch every row and print it.
# View C already has its own check cell; this view is the one that otherwise
# would never be queried.
rows = db_cursor.execute("SELECT * FROM view_containing_id_spacers_for_D;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view D: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view E: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_by_incrementing_batch_id__E;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view F: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_partitioned_by_foreign_key__F;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test the companion view for F: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_with_fk_reference_from_F;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view G: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_by_multiple_columns__G;").fetchall()
for row in rows:
    print(row)

In [None]:
# Smoke-test view H: fetch every row and print it.
rows = db_cursor.execute("SELECT * FROM view_that_should_be_partitioned_by_random_hash__H;").fetchall()
for row in rows:
    print(row)