<a href="https://colab.research.google.com/github/cincinnatilibrary/collection-analysis/blob/master/reports/hold_shelf_reports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHPL - Collection Analysis - **Hold-Shelves**
<img src="https://ilsweb.cincinnatilibrary.org/img/CHPL_Brandmark_Primary.png" alt="CHPL" title="CHPL" width="300"/>

This report runs some queries and generates some visualizations for data
related to items that have been delivered to various hold-shelf locations in
the CHPL system.

Data sources:

* https://ilsweb.cincinnatilibrary.org/collection-analysis/
* https://github.com/plch/plch-holds-shelf

In [2]:
import pandas as pd
import numpy as np
import altair as alt
from urllib.parse import urlencode

In [3]:
# engine = create_engine('sqlite:///file:current.db?mode=rw&uri=true')

# instead of the local db, we'll use the current_collection data set ...
# NOTE: the queries change slightly from the local db
# Base URL of the data set; the cells below append '.csv?' plus a url-encoded
# `sql` parameter to it and read the response with `pd.read_csv`.
chpl_collection_url = 'https://ilsweb.cincinnatilibrary.org/collection-analysis/current_collection'

In [4]:
# Find the earliest and latest `modified_epoch` in `holds_shelf`, i.e. the
# span of dates this data covers.
sql = """\
with min_max as (
    select
    min(modified_epoch) as min_modified,
    max(modified_epoch) as max_modified
    from holds_shelf
)

select
datetime(min_modified, 'unixepoch', 'localtime') as min_modified,
datetime(max_modified, 'unixepoch', 'localtime') as max_modified

from
min_max
"""

# The query is sent as a url-encoded `sql` parameter and the result is read
# back as CSV (the local-db equivalent was `pd.read_sql(sql=sql, con=engine)`).
date_range_url = chpl_collection_url + '.csv?' + urlencode(query={'sql': sql})
df = pd.read_csv(date_range_url)

# the query returns a single row holding both bounds
first_row = df.iloc[0]
min_date = first_row['min_modified']
max_date = first_row['max_modified']

print(min_date, '\n', max_date, sep='')

2018-10-12 13:50:06
2022-01-09 17:00:01


In [5]:
# Rather than the full range, limit the report to a window ending at the most
# recent modification. The window is 4 * 18 = 72 weeks long, which is a little
# short of 18 calendar months.
latest_modified = pd.Timestamp(df.iloc[0]['max_modified'])
window_length = pd.Timedelta(weeks=4*18)

min_date = (latest_modified - window_length).strftime('%Y-%m-%d')
max_date = latest_modified.strftime('%Y-%m-%d')

In [6]:
# Build the weekly time series: one timestamp per Monday between `min_date`
# and `max_date`.
# NOTE: to examine a different range, adjust the `min_date` and `max_date`
dates_series = pd.date_range(start=min_date, end=max_date, freq='W-MON')

# convert the timestamps to unix epoch values (whole seconds since 1970-01-01)
epoch_origin = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta("1s")
unix_series = (dates_series - epoch_origin) // one_second

In [7]:
# Query template filled in once per weekly window by the loop below:
# {val1} is the window start (inclusive) and {val2} the window end (exclusive),
# both as unix epoch seconds.
sql = """\
SELECT
date({val1}, 'unixepoch') as start_date,
date({val2}, 'unixepoch') as end_date,
count(*) as count_items,
count(distinct patron_record_hash) as count_distinct_patrons

FROM
holds_shelf

WHERE
modified_epoch >= {val1}
AND modified_epoch < {val2}
"""

# preview the rendered query for the first weekly window
first_window = {'val1': unix_series[0], 'val2': unix_series[1]}
target_sql = sql.format(**first_window)
print(target_sql)

SELECT
date(1598227200, 'unixepoch') as start_date,
date(1598832000, 'unixepoch') as end_date,
count(*) as count_items,
count(distinct patron_record_hash) as count_distinct_patrons

FROM
holds_shelf

WHERE
modified_epoch >= 1598227200
AND modified_epoch < 1598832000



In [8]:
# Run the weekly-counts query once per window and gather the results in `df`.
#
# Each response is a single-row frame. They are collected in a list and
# concatenated once at the end, because `DataFrame.append` was removed in
# pandas 2.0 and copied the whole accumulated frame on every iteration.
weekly_columns = ['start_date', 'end_date', 'count_items', 'count_distinct_patrons']
weekly_frames = []

print(
    'start_date             ', 'end_date               ',
    'count_items            ', 'count_distinct_patrons',
)

# pair every Monday with the following Monday -> [window_start, window_end)
for window_start, window_end in zip(unix_series[:-1], unix_series[1:]):
    # local-db equivalent: pd.read_sql(sql=sql.format(...), con=engine)
    temp_df = pd.read_csv(
        chpl_collection_url + '.csv?' + urlencode(query={'sql': sql.format(val1=window_start, val2=window_end)})
    )
    weekly_frames.append(temp_df)

    print(
        temp_df['start_date'][0], temp_df['end_date'][0],
        temp_df['count_items'][0], "\t", temp_df['count_distinct_patrons'][0],
        sep="\t"
    )

# `pd.concat` raises on an empty list, so fall back to an empty frame with the
# expected columns when the series holds fewer than two timestamps
if weekly_frames:
    df = pd.concat(weekly_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=weekly_columns)

start_date              end_date                count_items             count_distinct_patrons
2020-08-24	2020-08-31	42933			14357
2020-08-31	2020-09-07	46095			15110
2020-09-07	2020-09-14	38892			13758
2020-09-14	2020-09-21	44721			14850
2020-09-21	2020-09-28	42816			14305
2020-09-28	2020-10-05	44407			14345
2020-10-05	2020-10-12	44799			14650
2020-10-12	2020-10-19	42344			14246
2020-10-19	2020-10-26	41235			14188
2020-10-26	2020-11-02	42949			14424
2020-11-02	2020-11-09	41407			13783
2020-11-09	2020-11-16	42935			14108
2020-11-16	2020-11-23	44771			14372
2020-11-23	2020-11-30	27792			10910
2020-11-30	2020-12-07	48742			15370
2020-12-07	2020-12-14	43627			15106
2020-12-14	2020-12-21	37883			13603
2020-12-21	2020-12-28	17716			8301
2020-12-28	2021-01-04	39266			14053
2021-01-04	2021-01-11	51580			15774
2021-01-11	2021-01-18	43820			14697
2021-01-18	2021-01-25	39789			13230
2021-01-25	2021-02-01	45913			14272
2021-02-01	2021-02-08	43427			13808
2021-02-08	2021-02-15	37078			12367
2021-0

In [9]:
# peek at the first few weekly rows collected by the loop
df.head()

Unnamed: 0,start_date,end_date,count_items,count_distinct_patrons
0,2020-08-24,2020-08-31,42933,14357
1,2020-08-31,2020-09-07,46095,15110
2,2020-09-07,2020-09-14,38892,13758
3,2020-09-14,2020-09-21,44721,14850
4,2020-09-21,2020-09-28,42816,14305


In [10]:
# Compute items-per-patron for every weekly window.
#
# A window with zero distinct patrons would otherwise divide by zero, so the
# zero is swapped for np.inf -- but only in a temporary divisor, which makes
# the ratio 0.0 for such a window while the stored `count_distinct_patrons`
# column is never modified. `pd.to_numeric` makes sure the division runs on
# numeric columns even if the frame was assembled with object dtype.
item_counts = pd.to_numeric(df['count_items'])
patron_counts = pd.to_numeric(df['count_distinct_patrons'])
df['items_per_patron'] = item_counts / patron_counts.replace(to_replace=0, value=np.inf)

# just for example ...
# (prints an empty frame when this date is not one of the window start dates)
print(df[df['start_date'] == '2020-03-23'])

Empty DataFrame
Columns: [start_date, end_date, count_items, count_distinct_patrons, items_per_patron]
Index: []


In [11]:
# Layered chart: weekly item counts (blue area) and weekly distinct patrons
# (red line), each on its own y scale.

# NOTE: `source` is the same object as `df`, so the rounding below is also
# visible through `df` in later cells.
source = df

# knock some precision off so it's not so crazy to look at
# (column-wise map instead of a row-wise DataFrame.apply: same two-decimal
# values, no per-row Series construction, and it also works on an empty frame)
source['items_per_patron'] = source['items_per_patron'].map(
    lambda ratio: float(f"{ratio:.2f}")
)

# encodings shared by both layers: x position and hover tooltip
base = alt.Chart(source).encode(
    x=alt.X(
        'end_date:T',
        axis=alt.Axis(title='snapshot date', labelAngle=-70),
    ),
    tooltip=['start_date:T', 'end_date:T', 'count_items', 'count_distinct_patrons', 'items_per_patron' ]
)

area = base.mark_area(color="#0090bd").encode(
    y=alt.Y(
        'count_items:Q',
        title = 'Count Items Placed on Hold Shelves (blue area)'
    )
)

line = base.mark_line(color='firebrick').encode(
    y = alt.Y(
        'count_distinct_patrons:Q',
        title = 'Count Distinct Patrons (red line)'
    )
)

# independent y scales so that the two series each get their own axis
(area + line).resolve_scale(y='independent').properties(
    title="CHPL Items Placed on Hold Shelves Weekly",
    width=900
)

In [12]:
# Area chart of items-per-patron for each weekly window.
source = df

# one point per weekly window, positioned at the window's end date
snapshot_axis = alt.X(
    'end_date:T',
    axis=alt.Axis(title='snapshot date', labelAngle=-70),
)
ratio_axis = alt.Y(
    'items_per_patron:Q',
    title='Count Items Per Patron Placed on Hold Shelves',
)
hover_fields = ['start_date:T', 'end_date:T', 'count_items', 'count_distinct_patrons', 'items_per_patron']

items_per_patron_area = alt.Chart(source).mark_area(color="#0090bd").encode(
    x=snapshot_axis,
    y=ratio_axis,
    tooltip=hover_fields,
)

# last expression of the cell, so the chart is displayed
items_per_patron_area.properties(
    title="CHPL Items Per Patron Placed on Hold Shelves Weekly",
    width=900,
)

In [13]:
# Query template for the "days to hold shelf" series, filled in once per
# weekly window: {val1} is the window start (inclusive) and {val2} the window
# end (exclusive), both as unix epoch seconds. `avg_days_diff` is the average,
# over the rows in the window, of the julian-day difference between
# `modified_epoch` and `placed_epoch`.
# NOTE: this rebinds `sql`, replacing the weekly-counts template.
sql = """\
SELECT
date({val1}, 'unixepoch') as start_date,
date({val2}, 'unixepoch') as end_date,
count(*) as count_items,
--date(modified_epoch, 'unixepoch') as modified_date,
--date(placed_epoch, 'unixepoch') as placed_date,
avg(julianday(modified_epoch, 'unixepoch') - julianday(placed_epoch, 'unixepoch')) as avg_days_diff
FROM
holds_shelf
WHERE
modified_epoch >= {val1}
AND modified_epoch < {val2}
"""

In [14]:
# Try the template on the first weekly window only, to check the result shape
# (local-db equivalent: pd.read_sql(sql=sql.format(...), con=engine)).
first_window_sql = sql.format(val1=unix_series[0], val2=unix_series[1])
first_window_url = chpl_collection_url + '.csv?' + urlencode(query={'sql': first_window_sql})
df = pd.read_csv(first_window_url)

df.head()

Unnamed: 0,start_date,end_date,count_items,avg_days_diff
0,2020-08-24,2020-08-31,42933,48.182858


In [15]:
# Run the avg-days query once per weekly window and gather the results in `df`.
#
# Each response is a single-row frame. They are collected in a list and
# concatenated once at the end, because `DataFrame.append` was removed in
# pandas 2.0 and copied the whole accumulated frame on every iteration.
avg_days_columns = ['start_date', 'end_date', 'count_items', 'avg_days_diff']
avg_days_frames = []

print(
    'start_date             ', 'end_date               ',
    'count_items            ', 'avg_days_diff',
)

# pair every Monday with the following Monday -> [window_start, window_end)
for window_start, window_end in zip(unix_series[:-1], unix_series[1:]):
    # local-db equivalent: pd.read_sql(sql=sql.format(...), con=engine)
    temp_df = pd.read_csv(
        chpl_collection_url + '.csv?' + urlencode(query={'sql': sql.format(val1=window_start, val2=window_end)})
    )
    avg_days_frames.append(temp_df)

    print(
        temp_df['start_date'][0], temp_df['end_date'][0],
        temp_df['count_items'][0], "\t", temp_df['avg_days_diff'][0],
        sep="\t"
    )

# `pd.concat` raises on an empty list, so fall back to an empty frame with the
# expected columns when the series holds fewer than two timestamps
if avg_days_frames:
    df = pd.concat(avg_days_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=avg_days_columns)

start_date              end_date                count_items             avg_days_diff
2020-08-24	2020-08-31	42933			48.18285757748278
2020-08-31	2020-09-07	46095			47.83427803086539
2020-09-07	2020-09-14	38892			46.37615275271217
2020-09-14	2020-09-21	44721			46.73728844787757
2020-09-21	2020-09-28	42816			43.86182033751381
2020-09-28	2020-10-05	44407			43.81403988821979
2020-10-05	2020-10-12	44799			43.65324686349702
2020-10-12	2020-10-19	42344			42.58450556009277
2020-10-19	2020-10-26	41235			44.04600180703471
2020-10-26	2020-11-02	42949			41.72997240966989
2020-11-02	2020-11-09	41407			41.35348115593865
2020-11-09	2020-11-16	42935			36.57610393320231
2020-11-16	2020-11-23	44771			34.46501698843747
2020-11-23	2020-11-30	27792			36.644117963684394
2020-11-30	2020-12-07	48742			34.580439641696444
2020-12-07	2020-12-14	43627			39.67345032652201
2020-12-14	2020-12-21	37883			37.357370958968794
2020-12-21	2020-12-28	17716			40.039896246231955
2020-12-28	2021-01-04	39266			38.0657955276084

In [16]:
# Area chart of the average days between hold placement and arrival on the
# hold shelf, one point per weekly window. Kept in `chart` so later cells can
# display and save it.
source = df

snapshot_axis = alt.X(
    'end_date:T',
    axis=alt.Axis(title='snapshot date', labelAngle=-70),
)
days_axis = alt.Y(
    'avg_days_diff:Q',
    title='Days to Holdshelf',
)
hover_fields = ['start_date:T', 'end_date:T', 'count_items', 'avg_days_diff']

days_area = alt.Chart(source).mark_area(color="#0090bd").encode(
    x=snapshot_axis,
    y=days_axis,
    tooltip=hover_fields,
)
chart = days_area.properties(
    title="CHPL Average Days Between Hold-Placed to Item-on-Hold-Shelf",
    width=900,
)

In [17]:
# bare expression: display the chart held in `chart`
chart

In [19]:
# write the chart to an HTML file whose name is prefixed with `max_date`
output_html = '{}_chpl-days-to-holdshelf.html'.format(max_date)
chart.save(output_html)