In [1]:
import duckdb
from threading import Lock
from contextlib import contextmanager
from functools import wraps
import time

# Relative path of the SQLite database file that get_duckdb_cursor() attaches into DuckDB.
db_file = "../jupyter/ils-analytics-import/circ_trans_master.db"
    
def retry(max_attempts=10, initial_delay=1, backoff_factor=2, exceptions=(Exception,)):
    """Build a decorator that re-runs a function when it raises.

    Args:
        max_attempts: Total number of calls to make before giving up (>= 1).
        initial_delay: Seconds to sleep after the first failed attempt.
        backoff_factor: Multiplier applied to the delay after every failure.
        exceptions: Exception class (or tuple of classes) that triggers a
            retry. Anything else propagates immediately. Defaults to every
            ``Exception``, which matches the previous behaviour.

    Returns:
        A decorator. The decorated function returns the value of the first
        successful call.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1. Without this check
            the wrapped function would never be called and the wrapper would
            silently return ``None``.
        Exception: The exception raised by the final attempt is re-raised
            unchanged once all attempts are used up.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = initial_delay
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    print(f"Attempt {attempt} failed with error: {e}")
                    if attempt == max_attempts:
                        # Out of attempts: surface the last error to the caller.
                        raise
                    print(f"Waiting {delay} seconds before retrying...")
                    time.sleep(delay)
                    # Exponential backoff: wait longer before each new attempt.
                    delay *= backoff_factor
        return wrapper
    return decorator


@retry(max_attempts=10)
def execute_query(cursor, query, params):
    """Run ``query`` on ``cursor`` and return its rows with their column names.

    The call is wrapped by ``retry`` with up to 10 attempts.

    Args:
        cursor: Cursor object exposing ``execute``, ``fetchall`` and
            ``description``.
        query: SQL text to execute.
        params: Values passed through to ``cursor.execute`` as the query
            parameters (callers in this notebook pass ``None`` or a list).

    Returns:
        A dict with two keys: ``'data'`` holds what ``fetchall()`` returned and
        ``'columns'`` holds the first field of every ``cursor.description``
        entry.
    """
    cursor.execute(query, params)
    rows = cursor.fetchall()
    # The first element of each description entry is the column name.
    column_names = [field[0] for field in cursor.description]
    return dict(data=rows, columns=column_names)


# One lock shared by every call. A lock created inside the function would be
# private to that single call and therefore could never block anything.
_DUCKDB_CURSOR_LOCK = Lock()


@contextmanager
def get_duckdb_cursor(db_file):
    """Yield a DuckDB cursor with the SQLite file ``db_file`` attached.

    A fresh in-memory DuckDB connection is opened, the ``sqlite`` extension is
    installed and loaded, and ``sqlite_attach`` is called on ``db_file`` before
    the cursor is handed to the ``with`` body. The cursor and the connection
    are closed when the body exits, whether or not it raised.

    Setup, use and teardown run while holding a module-level lock, so only one
    cursor obtained through this helper is live at a time. The lock is not
    reentrant: do not nest ``get_duckdb_cursor`` blocks in the same thread.

    Args:
        db_file: Path of the SQLite database file to attach.

    Yields:
        The cursor created from the in-memory connection.
    """
    # Double any single quote so the path cannot end the SQL string literal early.
    quoted_path = str(db_file).replace("'", "''")
    sqlite_attach = f"CALL sqlite_attach('{quoted_path}', overwrite=true);"

    # Pre-bind both names so the cleanup below is safe even when connect() or
    # the extension setup fails before they are assigned.
    con = None
    cursor = None
    with _DUCKDB_CURSOR_LOCK:
        try:
            con = duckdb.connect(':memory:')
            con.install_extension('sqlite')
            con.load_extension('sqlite')

            cursor = con.cursor()
            cursor.execute(sqlite_attach)
            yield cursor
        finally:
            try:
                if cursor is not None:
                    cursor.close()
            finally:
                # Close the connection even if closing the cursor raised.
                if con is not None:
                    con.close()
        

# Smoke test: run two simple counts against circ_trans, each on its own
# freshly attached cursor, and print the returned dict.
smoke_test_queries = (
    """\
    SELECT
        count(*) 
    FROM
        circ_trans
    """,
    """\
    SELECT
        count(DISTINCT patron_record_id) 
    FROM
        circ_trans
    """,
)

for sql in smoke_test_queries:
    with get_duckdb_cursor(db_file) as cursor:
        result = execute_query(cursor, sql, None)
        print(result)

{'data': [(82587794,)], 'columns': ['count_star()']}
{'data': [(367309,)], 'columns': ['count(DISTINCT patron_record_id)']}


In [2]:
# Build the list of branch names: every distinct map_location.name that is
# joined to at least one map_stat_group row, ordered by name.

sql = """\
SELECT DISTINCT 
    map_location.name
FROM
    map_stat_group
    JOIN map_location ON map_location.code = map_stat_group.location_code
ORDER BY
    map_location.name
"""

with get_duckdb_cursor(db_file) as cursor:
    result = execute_query(cursor, sql, params=None)

# The query selects a single column, so the name is the first field of each row.
branch_data = [row[0] for row in result['data']]

In [3]:
# For one branch name (the single `?` parameter) the query:
#   1. collects the patrons that have an 'o' or 'i' circ_trans row on or after
#      2022-01-01 at a stat group belonging to that branch
#      (NOTE(review): 'o'/'i' presumably mean check-out/check-in -- confirm),
#   2. counts, per location name, how many of those patrons have 'o'/'i' rows
#      on or after 2022-01-01 at that location,
#   3. leaves out the 'Distribution Center' and 'Main Library' locations,
#   4. orders the locations by the patron count, largest first.
sql = """\
SELECT
    -- map_stat_group.location_code,
    map_location.name as location_name,
    count(DISTINCT circ_trans.patron_record_id) as count_distinct_patrons
FROM
    circ_trans
    left outer join map_stat_group on map_stat_group.code = circ_trans.stat_group_code_num
    left outer join map_location on map_location.code = map_stat_group.location_code
WHERE
    circ_trans.patron_record_id IN (
        SELECT
            ct.patron_record_id
        FROM
            circ_trans as ct
        WHERE
            ct.op_code in ('o','i')
            AND ct.transaction_gmt >= '2022-01-01'
            AND ct.stat_group_code_num in (
                SELECT DISTINCT 
                    map_stat_group.code
                FROM
                    map_stat_group
                    JOIN map_location ON map_location.code = map_stat_group.location_code
                WHERE
                    map_location.name IN (?)
            )
            -- AND circ_trans.ptype_code < '196'
        GROUP BY 
            ct.patron_record_id
    )
    AND circ_trans.op_code in ('o','i')
    AND circ_trans.transaction_gmt >= '2022-01-01'
    AND map_location.name NOT IN (
        'Distribution Center',
        'Main Library'
    )
GROUP BY 1
ORDER BY 2 DESC
"""

# Maps each branch name to the list of (location_name, count_distinct_patrons)
# rows returned for it.
results = {}

# Open the cursor once for the whole sweep. Connecting, installing/loading the
# sqlite extension and attaching the file do not depend on the branch, so
# repeating them on every iteration is wasted work (and repeatedly re-opens the
# SQLite file, which may contribute to the "database is locked" retries).
with get_duckdb_cursor(db_file) as cursor:
    for branch in branch_data:
        # Progress indicator: branch name first, then '.' once its query is done.
        print(branch, end='')
        result = execute_query(cursor, sql, params=[branch])
        results[branch] = list(result['data'])
        print('.')

Anderson.
Anderson Pickup Window.
Avondale.
Blue Ash.
Bond Hill.
CheviotAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM "map_stat_group" WHERE ROWID BETWEEN 0 AND 122879": database is locked
Waiting 1 seconds before retrying...
.
Clifton.
Clifton Pickup Locker.
College Hill.
Corryville.
CovedaleAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM "map_stat_group" WHERE ROWID BETWEEN 0 AND 122879": database is locked
Waiting 1 seconds before retrying...
.
Covedale Pickup WindowAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM "map_stat_group" WHERE ROWID BETWEEN 0 AND 122879": database is locked
Waiting 1 seconds before retrying...
.
Deer Park.
Delhi Township.
Delhi Township Pickup Window.
Distribution Center.
Elmwood PlaceAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM

In [4]:
# Display the branch names that have an entry in `results`.
results.keys()

dict_keys(['Anderson', 'Anderson Pickup Window', 'Avondale', 'Blue Ash', 'Bond Hill', 'Cheviot', 'Clifton', 'Clifton Pickup Locker', 'College Hill', 'Corryville', 'Covedale', 'Covedale Pickup Window', 'Deer Park', 'Delhi Township', 'Delhi Township Pickup Window', 'Distribution Center', 'Elmwood Place', 'Forest Park', 'Green Township', 'Greenhills', 'Groesbeck', 'Groesbeck Pickup Window', 'Harrison', 'Harrison Pickup Window', 'Hyde Park', 'Loveland', 'Madeira', 'Madisonville', 'Main Library', 'Main Pickup Window', 'Mariemont', 'Miami Township', 'Monfort Heights', 'Mt Healthy', 'Mt Washington', 'North Central', 'Northside', 'Norwood', 'Oakley', 'Oakley Pickup Locker', 'Outreach Services', 'Pleasant Ridge', 'Price Hill', 'Reading', 'Reading Pickup Window', 'Sharonville', 'St Bernard', 'Symmes Township', 'Symmes Township Pickup Window', 'Walnut Hills', 'West End', 'Westwood', 'Wyoming'])

In [13]:
# Flatten the per-branch result rows into one list of
# (start_branch, end_branch, count_shared_patrons) tuples.
overlaps_columns = ['start_branch', 'end_branch', 'count_shared_patrons']

overlaps_data = [
    # row[0] is the end branch name, row[1] the shared patron count
    (start_branch, row[0], row[1])
    for start_branch, branch_rows in results.items()
    for row in branch_rows
]

In [16]:
# import pickle

# with open('patron_branch_overlaps.pickle', 'wb') as f:
#     pickle.dump(
#         {
#             'overlaps_columns': overlaps_columns,
#             'overlaps_data': overlaps_data
    
#         }, 
#         f
#     )

In [46]:
# `pickle` has to be imported here: no active `import pickle` is visible in
# the earlier cells, so on a fresh kernel the load below raised NameError.
import pickle

import altair as alt
import pandas as pd

# NOTE(review): pickle.load can execute arbitrary code while loading. Only read
# a file that was written by this notebook, never one from an untrusted source.
with open('patron_branch_overlaps.pickle', 'rb') as f:
    my_data = pickle.load(f)

# The loaded dict is read under two keys: 'overlaps_data' supplies the rows and
# 'overlaps_columns' the column names.
df = pd.DataFrame(
    data=my_data['overlaps_data'],
    columns=my_data['overlaps_columns'],
)

In [47]:
# Display the column labels of the DataFrame built from the loaded data.
df.columns

Index(['start_branch', 'end_branch', 'count_shared_patrons'], dtype='object')

In [48]:
# Heatmap of every (start_branch, end_branch) pair, coloured by the number of
# shared patrons. Hovering a cell shows the three values with readable titles.
overlap_tooltip = [
    alt.Tooltip('start_branch:O', title='Start Branch'),
    alt.Tooltip('end_branch:O', title='End Branch'),
    alt.Tooltip('count_shared_patrons:Q', title='Count of Shared Patrons'),
]

chart1 = (
    alt.Chart(df)
    .mark_rect()
    .encode(
        x='start_branch:O',
        y='end_branch:O',
        color=alt.Color('count_shared_patrons:Q'),
        tooltip=overlap_tooltip,
    )
)

chart1

In [53]:
# Keep only the pairs whose two branches differ, so a branch's overlap with
# itself is not drawn.
is_other_branch = df['start_branch'].ne(df['end_branch'])
filtered_df = df[is_other_branch]

# TODO: order the end_branch axis by each branch's total count_shared_patrons
# (an earlier attempt used groupby-sum plus a pd.Categorical with custom order).

filtered_tooltip = [
    alt.Tooltip('start_branch:O', title='Start Branch'),
    alt.Tooltip('end_branch:O', title='End Branch'),
    alt.Tooltip('count_shared_patrons:Q', title='Count of Shared Patrons'),
]

chart1 = (
    alt.Chart(filtered_df)
    .mark_rect()
    .encode(
        x='start_branch:O',
        y='end_branch:O',
        color=alt.Color('count_shared_patrons:Q'),
        tooltip=filtered_tooltip,
    )
)

chart1
