In [1]:
import duckdb
from threading import Lock
from contextlib import contextmanager
from functools import wraps
import time

db_file = "../jupyter/ils-analytics-import/circ_trans_master.db"
    
def retry(max_attempts=10, initial_delay=1, backoff_factor=2):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = initial_delay
            for attempt in range(max_attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"Attempt {attempt+1} failed with error: {e}")
                    if attempt < max_attempts - 1:  # i.e., if it's not the last attempt
                        print(f"Waiting {delay} seconds before retrying...")
                        time.sleep(delay)  # pause execution for 'delay' seconds
                        delay *= backoff_factor  # increase the delay
                        continue  # go to the next attempt
                    else:  # this was the last attempt
                        raise  # re-raise the last exception
        return wrapper
    return decorator


@retry(max_attempts=10)
def execute_query(cursor, query, params):
    cursor.execute(query, params)
    data = cursor.fetchall()
    columns = [description[0] for description in cursor.description]
    # print(branch1, '<->', branch2, [data[0] for data in data])
    
    return {'data': data, 'columns': columns}


@contextmanager
def get_duckdb_cursor(db_file):
    with Lock():
        sqlite_attach = f"""\
        CALL sqlite_attach('{db_file}', overwrite=true);
        """    
        try:
            con = duckdb.connect(':memory:')
            con.install_extension('sqlite')
            con.load_extension('sqlite')

            cursor = con.cursor()
            cursor.execute(sqlite_attach)
            yield cursor

        finally:
            cursor.close()
            del(cursor)
            con.close()
            del(con)
        

# test things out ...
with get_duckdb_cursor(db_file) as cursor:
    sql = """\
    SELECT
        count(*) 
    FROM
        circ_trans
    """
    result = execute_query(cursor, sql, None)
    print(result)


with get_duckdb_cursor(db_file) as cursor:
    sql = """\
    SELECT
        count(DISTINCT patron_record_id) 
    FROM
        circ_trans
    """
    result = execute_query(cursor, sql, None)
    print(result)

{'data': [(82587794,)], 'columns': ['count_star()']}
{'data': [(367309,)], 'columns': ['count(DISTINCT patron_record_id)']}


In [2]:
# get the branch data

sql = """\
SELECT DISTINCT 
    map_location.name
FROM
    map_stat_group
    JOIN map_location ON map_location.code = map_stat_group.location_code
ORDER BY
    map_location.name
"""

with get_duckdb_cursor(db_file) as cursor:
    result = execute_query(cursor, sql, params=None)
    # print(result)
    
    branch_data = [data[0] for data in result['data']]
    # print(branch_data)

In [3]:
sql = """\
SELECT
    -- map_stat_group.location_code,
    map_location.name as location_name,
    count(DISTINCT circ_trans.patron_record_id) as count_distinct_patrons
FROM
    circ_trans
    left outer join map_stat_group on map_stat_group.code = circ_trans.stat_group_code_num
    left outer join map_location on map_location.code = map_stat_group.location_code
WHERE
    circ_trans.patron_record_id IN (
        SELECT
            ct.patron_record_id
        FROM
            circ_trans as ct
        WHERE
            ct.op_code in ('o','i')
            AND ct.transaction_gmt >= '2022-01-01'
            AND ct.stat_group_code_num in (
                SELECT DISTINCT 
                    map_stat_group.code
                FROM
                    map_stat_group
                    JOIN map_location ON map_location.code = map_stat_group.location_code
                WHERE
                    map_location.name IN (?)
            )
            -- AND circ_trans.ptype_code < '196'
        GROUP BY 
            ct.patron_record_id
    )
    AND circ_trans.op_code in ('o','i')
    AND circ_trans.transaction_gmt >= '2022-01-01'
    AND map_location.name NOT IN (
        'Distribution Center',
        'Main Library'
    )
GROUP BY 1
ORDER BY 2 DESC
"""

results = {}
for branch in branch_data:
    print(branch, end='')
    results[branch] = []
    with get_duckdb_cursor(db_file) as cursor:
        result = execute_query(cursor, sql, params=[branch])
        results[branch].extend(result['data'])
    print('.')

Anderson.
Anderson Pickup Window.
Avondale.
Blue Ash.
Bond Hill.
CheviotAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM "map_stat_group" WHERE ROWID BETWEEN 0 AND 122879": database is locked
Waiting 1 seconds before retrying...
.
Clifton.
Clifton Pickup Locker.
College Hill.
Corryville.
CovedaleAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM "map_stat_group" WHERE ROWID BETWEEN 0 AND 122879": database is locked
Waiting 1 seconds before retrying...
.
Covedale Pickup WindowAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM "map_stat_group" WHERE ROWID BETWEEN 0 AND 122879": database is locked
Waiting 1 seconds before retrying...
.
Deer Park.
Delhi Township.
Delhi Township Pickup Window.
Distribution Center.
Elmwood PlaceAttempt 1 failed with error: Invalid Error: Failed to prepare query "SELECT "code", "location_code" FROM

In [4]:
results.keys()

dict_keys(['Anderson', 'Anderson Pickup Window', 'Avondale', 'Blue Ash', 'Bond Hill', 'Cheviot', 'Clifton', 'Clifton Pickup Locker', 'College Hill', 'Corryville', 'Covedale', 'Covedale Pickup Window', 'Deer Park', 'Delhi Township', 'Delhi Township Pickup Window', 'Distribution Center', 'Elmwood Place', 'Forest Park', 'Green Township', 'Greenhills', 'Groesbeck', 'Groesbeck Pickup Window', 'Harrison', 'Harrison Pickup Window', 'Hyde Park', 'Loveland', 'Madeira', 'Madisonville', 'Main Library', 'Main Pickup Window', 'Mariemont', 'Miami Township', 'Monfort Heights', 'Mt Healthy', 'Mt Washington', 'North Central', 'Northside', 'Norwood', 'Oakley', 'Oakley Pickup Locker', 'Outreach Services', 'Pleasant Ridge', 'Price Hill', 'Reading', 'Reading Pickup Window', 'Sharonville', 'St Bernard', 'Symmes Township', 'Symmes Township Pickup Window', 'Walnut Hills', 'West End', 'Westwood', 'Wyoming'])

In [13]:
overlaps_columns = ['start_branch', 'end_branch', 'count_shared_patrons']
overlaps_data = []

for start_branch in results.keys():
    for end_result in results[start_branch]:
        overlaps_data.append((
            start_branch,
            end_result[0], # end_branch
            end_result[1]  # count_shared_patrons
        ))
        # print(results[result])
    # break

In [16]:
# import pickle

# with open('patron_branch_overlaps.pickle', 'wb') as f:
#     pickle.dump(
#         {
#             'overlaps_columns': overlaps_columns,
#             'overlaps_data': overlaps_data
    
#         }, 
#         f
#     )

In [46]:
with open('patron_branch_overlaps.pickle', 'rb') as f:
    my_data = pickle.load(f)
    
import pandas as pd
import altair as alt

df = pd.DataFrame(
    data=my_data['overlaps_data'],
    columns=my_data['overlaps_columns'],
)

In [47]:
df.columns

Index(['start_branch', 'end_branch', 'count_shared_patrons'], dtype='object')

In [48]:
chart1 = alt.Chart(df).mark_rect().encode(
    x=alt.X('start_branch:O'),
    y=alt.Y('end_branch:O'),
    color='count_shared_patrons:Q',
    tooltip=[
        alt.Tooltip('start_branch:O', title='Start Branch'),
        alt.Tooltip('end_branch:O', title='End Branch'),
        alt.Tooltip('count_shared_patrons:Q', title='Count of Shared Patrons')
    ]
)

chart1

In [53]:
# Filter rows where 'start_branch' and 'end_branch' have different values
filtered_df = df.loc[df['start_branch'] != df['end_branch']]

# # Calculate the sum of 'count_shared_patrons' for each unique 'end_branch'
# sum_of_shared_patrons = filtered_df.groupby('end_branch')['count_shared_patrons'].sum().reset_index()

# # Create a mapping of 'end_branch' to their corresponding sum
# sum_mapping = dict(zip(sum_of_shared_patrons['end_branch'], sum_of_shared_patrons['count_shared_patrons']))

# # Convert 'end_branch' to categorical with custom sorting order
# filtered_df['end_branch'] = pd.Categorical(filtered_df['end_branch'], categories=sorted(sum_mapping, key=sum_mapping.get, reverse=False))

chart1 = alt.Chart(filtered_df).mark_rect().encode(
    x=alt.X('start_branch:O'),
    y=alt.Y('end_branch:O'),
    color='count_shared_patrons:Q',
    tooltip=[
        alt.Tooltip('start_branch:O', title='Start Branch'),
        alt.Tooltip('end_branch:O', title='End Branch'),
        alt.Tooltip('count_shared_patrons:Q', title='Count of Shared Patrons')
    ]
)

chart1


In [59]:
branches = df['start_branch'].unique()

In [122]:
charts = []

filtered_df = df[
    (df['start_branch'] != df['end_branch'])
    & (df['start_branch'] != 'Distribution Center')
]

for location in filtered_df['start_branch'].unique():
    source = filtered_df[
        (filtered_df['start_branch'] == location)
    ]
    
    source = source.sort_values('count_shared_patrons', ascending=False)
    
    chart = alt.Chart(source).mark_rect().encode(
        x=alt.X(
            'end_branch:O', 
            # title=f'{location} Patrons Also Use ...', 
            title=None,
            sort=None, # sort by the order defined in the dataframe
            axis=alt.Axis(labelAngle=40)
        ),  
        color=alt.Color(
            'count_shared_patrons:Q',
            legend=None
        ),
        tooltip=[
            alt.Tooltip('start_branch:O', title='Start Branch'),
            alt.Tooltip('end_branch:O', title='End Branch'),
            alt.Tooltip('count_shared_patrons:Q', title='Count of Shared Patrons')
        ]
    ).properties(
        title=f'{location} Patrons Also Use ...'  # Use chart title as pseudo x-axis title
    )

    charts.append(chart)

In [123]:
alt.vconcat(*charts)

# Geodesic Distances Between Branches 

In [124]:
import requests
import json
import pandas as pd

from geopy.distance import geodesic

base_url = "https://collection-analysis.cincy.pl/chpl.json"

session = requests.session()

sql = """\
select
  rowid,
  chpl_branch_location_name,
  code_num,
  address,
  latitude,
  longitude,
  global_plus_code
from
  branch_locations
order by
  rowid
"""

limit = 1000
offset = 0
result_rows = list()

while True:
  response = session.get(
      base_url,
      params={
          'sql': sql,
          'item_types': 'Juvenile Book',
          # 'language_code': 'spa'
          'language_code': 'spa',
          'limit': limit,
          'offset': offset
      }
  )

  data = json.loads(response.text)
  result_rows.extend(data['rows'])
  print('.', end='')
  offset+=limit

  if (len(data['rows']) < limit):
    result_columns = data['columns']
    print('done.')
    break
    

df = pd.DataFrame(
    data=result_rows,
    columns=result_columns
)

.done.


In [136]:
distances_columns = ['branch_start', 'branch_end', 'KM']
distances_data = []

for i, row in enumerate(result_rows):
  branch_start = (
      row[
          result_columns.index('chpl_branch_location_name')
      ].strip(),
      (
        row[
            result_columns.index('latitude')
        ],
        row[
            result_columns.index('longitude')
        ]
      )
  )
  # print(branch_start, end=': ')

  for j, row2 in enumerate(result_rows):
    branch_end = (
      row2[
        result_columns.index('chpl_branch_location_name')
      ].strip(),
      (
        row2[
          result_columns.index('latitude')
        ],
        row2[
          result_columns.index('longitude')
        ]
      )
    )

    distance = geodesic(
        branch_start[1],
        branch_end[1]   
    ).km
    # print(distance)

    distances_data.append(
        [
          branch_start[0],
          branch_end[0],
          round(
              distance,
              2
          )
        ]
    )
    
distances_df = pd.DataFrame(data=distances_data, columns=distances_columns)

filtered_distances_df = distances_df[
    (distances_df['branch_start'] != distances_df['branch_end'])
    & (distances_df['branch_start'] != 'Distribution Center')
]

In [137]:
filtered_distances_df

Unnamed: 0,branch_start,branch_end,KM
1,Main Library,Anderson,14.11
2,Main Library,Avondale,5.07
3,Main Library,Blue Ash,18.35
4,Main Library,Bond Hill,9.90
5,Main Library,Cheviot,10.78
...,...,...,...
1930,Cincinnati & Hamilton County Public Library - ...,Walnut Hills,3.90
1931,Cincinnati & Hamilton County Public Library - ...,West End,0.97
1932,Cincinnati & Hamilton County Public Library - ...,Westwood,6.86
1933,Cincinnati & Hamilton County Public Library - ...,Wyoming,13.25


In [148]:
geo_charts = []

for branch_start in sorted(
    set(
        [
            value[0] 
            for value 
            in distances_data
            if value[0] not in (
                'Cincinnati & Hamilton County Public Library - Administrative Offices (Distribution Center)',
            )
        ]
    )
):
    # print(branch_start)

    source = filtered_distances_df[
        (filtered_distances_df['branch_start'] == branch_start)
    ]
    
    source = source.sort_values('KM', ascending=True)
    
    chart = alt.Chart(source).mark_rect().encode(
        x=alt.X(
            'branch_end:O', 
            # title=f'{location} Patrons Also Use ...', 
            title=None,
            sort=None, # sort by the order defined in the dataframe
            axis=alt.Axis(labelAngle=40)
        ),  
        color=alt.Color(
            'KM:Q',
            legend=None
        ),
        tooltip=[
            alt.Tooltip('branch_start:O', title='Start Branch'),
            alt.Tooltip('branch_end:O', title='End Branch'),
            alt.Tooltip('KM:Q', title='KM to Location')
        ]
    ).properties(
        title=f'{branch_start} Geograhically Close to ...'  # Use chart title as pseudo x-axis title
    )

    geo_charts.append(chart)

Anderson
Avondale
Blue Ash
Bond Hill
Cheviot
Clifton
College Hill
Corryville
Covedale
Deer Park
Delhi Township
Elmwood Place
Forest Park
Green Township
Greenhills
Groesbeck
Harrison
Hyde Park
Loveland
Madeira
Madisonville
Main Library
Mariemont
Miami Township
Monfort Heights
Mt Healthy
Mt Washington
North Central
Northside
Norwood
Oakley
Outreach Services
Pleasant Ridge
Price Hill
Reading
Sharonville
St Bernard
Symmes Township
Virtual Library
Walnut Hills
West End
Westwood
Wyoming


In [151]:
alt.vconcat(*geo_charts)