In [1]:
import random
import datetime
from hashlib import md5
import sqlite3

import pandas as pd

In [2]:
# ! test_cases_for_sql_data_connector.db

In [3]:
# SQLite file that every table in this notebook is written to.
filename = "test_cases_for_sql_data_connector.db"

# Plain sqlite3 connection; it is handed to pandas' to_sql in the cells below.
db = sqlite3.connect(database=filename)

In [4]:
def generate_ascending_list_of_datetimes(
    k,
    start_date=datetime.date(2020,1,1),
    end_date=datetime.date(2020,12,31)
):
    """Return ``k`` random datetimes, sorted in ascending order.

    Each value is midnight of ``start_date`` plus a whole number of seconds
    drawn uniformly from ``[0, seconds between start_date and end_date)``, so
    results fall in the half-open interval ``[start_date, end_date)``.
    Values are drawn from the module-level ``random`` generator; call
    ``random.seed`` first for reproducible output.

    Args:
        k: Number of datetimes to generate.
        start_date: First day of the sampling window (inclusive).
        end_date: Last day of the sampling window (exclusive).

    Returns:
        A list of ``k`` ``datetime.datetime`` objects in non-decreasing order.

    Raises:
        ValueError: If ``k > 0`` and ``end_date`` is not after ``start_date``
            (raised by ``random.randrange`` for an empty range).
    """
    start_time = datetime.datetime(start_date.year, start_date.month, start_date.day)
    # total_seconds() returns a float; randrange requires an int (passing a
    # float is deprecated since Python 3.10 and a TypeError from 3.12).
    seconds_between_dates = int((end_date - start_date).total_seconds())

    datetime_list = [
        start_time + datetime.timedelta(seconds=random.randrange(seconds_between_dates))
        for _ in range(k)
    ]
    datetime_list.sort()
    return datetime_list

# Quick demo using the default 2020 date range. random.seed is not called until
# a later cell, so on a fresh top-to-bottom run these values differ each time.
generate_ascending_list_of_datetimes(10)

[datetime.datetime(2020, 1, 15, 20, 21, 27),
 datetime.datetime(2020, 2, 17, 11, 41, 59),
 datetime.datetime(2020, 2, 24, 22, 15, 34),
 datetime.datetime(2020, 3, 24, 19, 38, 13),
 datetime.datetime(2020, 4, 16, 1, 32, 54),
 datetime.datetime(2020, 5, 8, 18, 37, 28),
 datetime.datetime(2020, 6, 26, 9, 9, 1),
 datetime.datetime(2020, 8, 14, 16, 47, 23),
 datetime.datetime(2020, 10, 31, 7, 8, 30),
 datetime.datetime(2020, 11, 28, 13, 36, 31)]

In [5]:
# Fixed row count and RNG seed so the generated events are reproducible.
k = 120
random.seed(1)

# k ascending timestamps between 2020-01-01 and 2020-01-31, and the calendar
# date of each one.
timestamp_list = generate_ascending_list_of_datetimes(
    k,
    end_date=datetime.date(2020, 1, 31),
)
date_list = [ts.date() for ts in timestamp_list]

# Batch ids in 0..10, sorted so they never decrease as the row id grows.
batch_ids = sorted(random.randint(0, 10) for _ in range(k))

# Session ids in 2..60, sorted and then each lowered by 0, 1 or 2, so the
# column is only approximately increasing.
session_ids = sorted(random.randint(2, 60) for _ in range(k))
session_ids = [sid - random.randint(0, 2) for sid in session_ids]

# One row per synthetic event. The column order (and therefore the order of the
# random draws for event_type and favorite_color) is significant for the seed.
events_df = pd.DataFrame(
    {
        "id": range(k),
        "batch_id": batch_ids,
        "date": date_list,
        "y": [day.year for day in date_list],
        "m": [day.month for day in date_list],
        "d": [day.day for day in date_list],
        "timestamp": timestamp_list,
        "session_id": session_ids,
        "event_type": [
            random.choice(["start", "stop", "continue"]) for _ in range(k)
        ],
        "favorite_color": [
            "#" + "".join(random.choice("0123456789ABCDEF") for _ in range(6))
            for _ in range(k)
        ],
    }
)

# NOTE: in this fake example, id, batch_id, date, and timestamp are all sorted
# in strictly the same order. That would not necessarily hold in the real world:
# events might arrive and be indexed out of order, batches might not correspond
# strictly with units of time, etc.

# events_df.to_sql("events_df", db)

In [6]:
# Table A: id, date, event_type and favorite_color.
# if_exists="replace" makes the cell re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "date", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_date_column__A",
    db,
    if_exists="replace",
)

In [7]:
# Table B: id, timestamp, event_type and favorite_color.
# if_exists="replace" makes the cell re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "timestamp", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_timestamp_column__B",
    db,
    if_exists="replace",
)

In [8]:
# Table C: id, event_type and favorite_color only.
# The to_sql return value is no longer bound to `df`: it is not a DataFrame, and
# no other cell in this notebook captures it.
# if_exists="replace" makes the cell re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_regularly_spaced_incrementing_id_column__C",
    db,
    if_exists="replace",
)

In [9]:
# Table D: id, event_type and favorite_color only.
# if_exists="replace" makes both writes re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D",
    db,
    if_exists="replace",
)

# Companion lookup table: for each date, the smallest event id on that date.
lookup_df = events_df.groupby("date").id.min()
lookup_df.to_sql("table_containing_id_spacers_for_D", db, if_exists="replace")

In [10]:
# Table E: id, batch_id, event_type and favorite_color.
# if_exists="replace" makes the cell re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "batch_id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_incrementing_batch_id__E",
    db,
    if_exists="replace",
)

In [11]:
# Table F: id, session_id, event_type and favorite_color.
# if_exists="replace" makes both writes re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "session_id", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_foreign_key__F",
    db,
    if_exists="replace",
)

# Referenced table: one row per session_id, holding the smallest event id in
# that session. This was previously grouped by "date" (same expression as the
# lookup for table D), which produced a table with no session_id column for
# F.session_id to reference.
# NOTE(review): confirm no consumer relies on the old date-keyed contents.
sessions_df = events_df.groupby("session_id").id.min()
sessions_df.to_sql("table_with_fk_reference_from_F", db, if_exists="replace")

In [12]:
# Table G: id, the separate y / m / d date-part columns, event_type and
# favorite_color.
# if_exists="replace" makes the cell re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "y", "m", "d", "event_type", "favorite_color"]].to_sql(
    "table_partitioned_by_multiple_columns__G",
    db,
    if_exists="replace",
)

In [13]:
# Table H: id, event_type and favorite_color only.
# if_exists="replace" makes the cell re-runnable; the default ("fail") raises
# ValueError when the table is already present in the database file.
events_df[["id", "event_type", "favorite_color"]].to_sql(
    "table_that_should_be_partitioned_by_random_hash__H",
    db,
    if_exists="replace",
)