<a href="https://colab.research.google.com/github/cincinnatilibrary/collection-analysis/blob/master/reports/hold_shelf_reports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHPL - Collection Analysis - **Hold-Shelves**
<img src="https://ilsweb.cincinnatilibrary.org/img/CHPL_Brandmark_Primary.png" alt="CHPL" title="CHPL" width="300"/>

This report provides some queries and generates some visualizations for data
related to items that have been delivered to various hold-shelf locations in
the CHPL system.

Data sources:

* https://ilsweb.cincinnatilibrary.org/collection-analysis/
* https://github.com/plch/plch-holds-shelf

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from urllib.parse import urlencode

In [2]:
# engine = create_engine('sqlite:///file:current.db?mode=rw&uri=true')

# Instead of the local db, query the `current_collection` data set served at
# the URL below.
# NOTE: the queries change slightly from the ones used with the local db
chpl_collection_url = (
    'https://ilsweb.cincinnatilibrary.org'
    '/collection-analysis/current_collection'
)

In [7]:
# Find the earliest and latest `modified_epoch` in `holds_shelf`, i.e. the
# range of dates that this data covers.
sql = """\
with min_max as (
    select
    min(modified_epoch) as min_modified,
    max(modified_epoch) as max_modified
    from holds_shelf
)

select
datetime(min_modified, 'unixepoch', 'localtime') as min_modified,
datetime(max_modified, 'unixepoch', 'localtime') as max_modified

from
min_max
"""

# df = pd.read_sql(sql=sql, con=engine)
# The query is URL-encoded and passed as the `sql` parameter of the `.csv`
# endpoint; pandas reads the CSV response directly.
query_string = urlencode(query={'sql': sql})
df = pd.read_csv(f"{chpl_collection_url}.csv?{query_string}")

# The aggregate query returns one row holding both bounds.
first_row = df.iloc[0]
min_date = first_row['min_modified']
max_date = first_row['max_modified']

print(min_date, max_date, sep='\n')

2018-10-12 13:50:06
2021-11-28 17:00:02


In [10]:
# Build the time series: one timestamp per week, anchored on Mondays
# (`freq='W-MON'`), between `min_date` and `max_date`.
# NOTE: to examine a different range, adjust the `min_date` and `max_date`
dates_series = pd.date_range(start=min_date, end=max_date, freq='W-MON')

# Convert the timestamps to unix epoch seconds: offset from 1970-01-01,
# floor-divided by one second.
epoch_origin = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta("1s")
unix_series = (dates_series - epoch_origin) // one_second

In [11]:
# Query template: `{val1}` / `{val2}` are filled in with the start (inclusive)
# and end (exclusive) epoch of each window inside the loop further down.
sql = """\
SELECT
date({val1}, 'unixepoch') as start_date,
date({val2}, 'unixepoch') as end_date,
count(*) as count_items,
count(distinct patron_record_hash) as count_distinct_patrons

FROM
holds_shelf

WHERE
modified_epoch >= {val1}
AND modified_epoch < {val2}
"""

# Preview the rendered query for the first window.
first_start = unix_series[0]
first_end = unix_series[1]
target_sql = sql.format(val1=first_start, val2=first_end)
print(target_sql)

SELECT
date(1539611406, 'unixepoch') as start_date,
date(1540216206, 'unixepoch') as end_date,
count(*) as count_items,
count(distinct patron_record_hash) as count_distinct_patrons

FROM
holds_shelf

WHERE
modified_epoch >= 1539611406
AND modified_epoch < 1540216206



In [13]:
# Run the weekly query once per pair of consecutive timestamps in
# `unix_series` and stack the one-row results into `df`.
#
# The per-window frames are collected in a list and concatenated once:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by row
# is quadratic. Concatenating the query results also keeps the numeric dtypes
# that `read_csv` inferred instead of degrading the columns to `object`.
result_columns = ['start_date', 'end_date', 'count_items', 'count_distinct_patrons']

print(
    'start_date             ', 'end_date               ',
    'count_items            ', 'count_distinct_patrons',
)

weekly_frames = []
# pair every timestamp with its successor: [t0, t1), [t1, t2), ...
for window_start, window_end in zip(unix_series[:-1], unix_series[1:]):
    # temp_df = pd.read_sql(sql=sql.format(val1=window_start, val2=window_end), con=engine)
    temp_df = pd.read_csv(
        chpl_collection_url + '.csv?' + urlencode(query={'sql': sql.format(val1=window_start, val2=window_end)})
    )

    weekly_frames.append(temp_df)
    print(
        temp_df['start_date'][0], temp_df['end_date'][0],
        temp_df['count_items'][0], "\t", temp_df['count_distinct_patrons'][0],
        sep="\t"
    )

# `pd.concat` raises on an empty list, so fall back to an empty frame with the
# expected columns when the series holds fewer than two timestamps.
if weekly_frames:
    df = pd.concat(weekly_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=result_columns)

start_date              end_date                count_items             count_distinct_patrons
2018-10-15	2018-10-22	49165			16789
2018-10-22	2018-10-29	49370			16689
2018-10-29	2018-11-05	47529			16025
2018-11-05	2018-11-12	51411			16889
2018-11-12	2018-11-19	54190			17762
2018-11-19	2018-11-26	41782			15308
2018-11-26	2018-12-03	55041			17378
2018-12-03	2018-12-10	50159			17110
2018-12-10	2018-12-17	48932			16855
2018-12-17	2018-12-24	46410			16023
2018-12-24	2018-12-31	31675			12505
2018-12-31	2019-01-07	39613			14975
2019-01-07	2019-01-14	61224			19157
2019-01-14	2019-01-21	60049			18843
2019-01-21	2019-01-28	57286			17680
2019-01-28	2019-02-04	52620			16929
2019-02-04	2019-02-11	55391			17603
2019-02-11	2019-02-18	50424			16762
2019-02-18	2019-02-25	52418			17384
2019-02-25	2019-03-04	52504			17176
2019-03-04	2019-03-11	56645			18021
2019-03-11	2019-03-18	49362			17132
2019-03-18	2019-03-25	53597			17963
2019-03-25	2019-04-01	52651			17693
2019-04-01	2019-04-08	53078			17522
2019-

In [14]:
# peek at the first five rows of the weekly results
df.head(n=5)

Unnamed: 0,start_date,end_date,count_items,count_distinct_patrons
0,2018-10-15,2018-10-22,49165,16789
1,2018-10-22,2018-10-29,49370,16689
2,2018-10-29,2018-11-05,47529,16025
3,2018-11-05,2018-11-12,51411,16889
4,2018-11-12,2018-11-19,54190,17762


In [15]:
# Items per patron for each weekly window.
#
# Some windows have zero distinct patrons (see the 2020-03-23 example below),
# so zero divisors are masked to NaN before dividing and the resulting gaps
# are reported as 0. The count columns themselves are left untouched; the
# earlier 0 -> inf -> 0 replacement rewrote `count_distinct_patrons` and left
# it as a float column.
items = pd.to_numeric(df['count_items'])
patrons = pd.to_numeric(df['count_distinct_patrons'])
df['items_per_patron'] = items.div(patrons.mask(patrons == 0)).fillna(0)

# just for example ...
df.loc[df['start_date'] == '2020-03-23']

    start_date    end_date count_items  count_distinct_patrons items_per_patron
75  2020-03-23  2020-03-30           0                     0.0                0


In [None]:
# Layered chart: items placed on hold shelves per week (blue area) and the
# number of distinct patrons per week (red line), each on its own y scale.

# NOTE: `source` is the same object as `df`, so the rounded column written
# below is also what later cells see.
source = df
# knock some precision off so it's not so crazy to look at
# (vectorized round instead of a row-wise apply + string-format round trip)
source['items_per_patron'] = pd.to_numeric(source['items_per_patron']).round(2)

# x encoding and tooltip shared by both layers
base = alt.Chart(source).encode(
    x=alt.X(
        'end_date:T',
        axis=alt.Axis(title='snapshot date', labelAngle=-70),
    ),
    tooltip=['start_date:T', 'end_date:T', 'count_items', 'count_distinct_patrons', 'items_per_patron']
)

area = base.mark_area(color="#0090bd").encode(
    y=alt.Y(
        'count_items:Q',
        title='Count Items Placed on Hold Shelves (blue area)'
    )
)

line = base.mark_line(color='firebrick').encode(
    y=alt.Y(
        'count_distinct_patrons:Q',
        title='Count Distinct Patrons (red line)'
    )
)

# independent y scales so the patron line is not flattened by the item counts
(area + line).resolve_scale(y='independent').properties(
    title="CHPL Items Placed on Hold Shelves Weekly",
    width=900
)

In [None]:
source = df

# x: end date of each weekly window; y: items per patron for that window
snapshot_x = alt.X(
    'end_date:T',
    axis=alt.Axis(title='snapshot date', labelAngle=-70),
)
per_patron_y = alt.Y(
    'items_per_patron:Q',
    title='Count Items Per Patron Placed on Hold Shelves'
)
hover_fields = ['start_date:T', 'end_date:T', 'count_items', 'count_distinct_patrons', 'items_per_patron']

per_patron_area = alt.Chart(source).mark_area(color="#0090bd").encode(
    x=snapshot_x,
    y=per_patron_y,
    tooltip=hover_fields
)

# last expression of the cell, so the chart is rendered as its output
per_patron_area.properties(
    title="CHPL Items Per Patron Placed on Hold Shelves Weekly",
    width=900
)

In [None]:
# Query template for the "days to hold shelf" series. `{val1}` / `{val2}` are
# the start (inclusive) and end (exclusive) epoch of one window; for the rows
# modified inside that window it returns the row count and the average number
# of days between `placed_epoch` and `modified_epoch`.
# NOTE: this rebinds the name `sql` that the weekly-counts cells above also use.
sql = """\
SELECT
date({val1}, 'unixepoch') as start_date,
date({val2}, 'unixepoch') as end_date,
count(*) as count_items,
--date(modified_epoch, 'unixepoch') as modified_date,
--date(placed_epoch, 'unixepoch') as placed_date,
avg(julianday(modified_epoch, 'unixepoch') - julianday(placed_epoch, 'unixepoch')) as avg_days_diff
FROM
holds_shelf
WHERE
modified_epoch >= {val1}
AND modified_epoch < {val2}
"""

In [None]:
# df = pd.read_sql(sql=sql.format(val1=unix_series[0], val2=unix_series[1]), con=engine)
# Try the query on the first window only before looping over all of them.
first_window_sql = sql.format(val1=unix_series[0], val2=unix_series[1])
query_string = urlencode(query={'sql': first_window_sql})
df = pd.read_csv(f"{chpl_collection_url}.csv?{query_string}")

df.head()

Unnamed: 0,start_date,end_date,count_items,avg_days_diff
0,2018-12-31,2019-01-07,40136,26.204387


In [None]:
# Run the days-to-hold-shelf query once per pair of consecutive timestamps in
# `unix_series` and stack the one-row results into `df`.
#
# The per-window frames are collected in a list and concatenated once:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by row
# is quadratic. Concatenating the query results also keeps the numeric dtypes
# that `read_csv` inferred instead of degrading the columns to `object`.
result_columns = ['start_date', 'end_date', 'count_items', 'avg_days_diff']

print(
    'start_date             ', 'end_date               ',
    'count_items            ', 'avg_days_diff',
)

weekly_frames = []
# pair every timestamp with its successor: [t0, t1), [t1, t2), ...
for window_start, window_end in zip(unix_series[:-1], unix_series[1:]):
    # temp_df = pd.read_sql(sql=sql.format(val1=window_start, val2=window_end), con=engine)
    temp_df = pd.read_csv(
        chpl_collection_url + '.csv?' + urlencode(query={'sql': sql.format(val1=window_start, val2=window_end)})
    )

    weekly_frames.append(temp_df)
    print(
        temp_df['start_date'][0], temp_df['end_date'][0],
        temp_df['count_items'][0], "\t", temp_df['avg_days_diff'][0],
        sep="\t"
    )

# `pd.concat` raises on an empty list, so fall back to an empty frame with the
# expected columns when the series holds fewer than two timestamps.
if weekly_frames:
    df = pd.concat(weekly_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=result_columns)

start_date              end_date                count_items             avg_days_diff
2018-12-31	2019-01-07	40136			26.204387037746645
2019-01-07	2019-01-14	61468			22.103181577701463
2019-01-14	2019-01-21	59766			21.450523656047935
2019-01-21	2019-01-28	57222			20.55424374584137
2019-01-28	2019-02-04	52687			19.537421691162084
2019-02-04	2019-02-11	54138			21.117479494255747
2019-02-11	2019-02-18	51798			20.778974739644656
2019-02-18	2019-02-25	51787			21.62282821494604
2019-02-25	2019-03-04	52517			20.732983290793523
2019-03-04	2019-03-11	53766			20.465701095983743
2019-03-11	2019-03-18	50721			22.287916058051422
2019-03-18	2019-03-25	51714			22.289914874108145
2019-03-25	2019-04-01	53396			24.729965282519995
2019-04-01	2019-04-08	52806			23.788198620910816
2019-04-08	2019-04-15	52597			24.61392225029764
2019-04-15	2019-04-22	54498			26.911294044230843
2019-04-22	2019-04-29	48677			25.71655843096248
2019-04-29	2019-05-06	49750			26.597220342665125
2019-05-06	2019-05-13	50041			30.549

In [None]:
source = df

# x: end date of each weekly window; y: average days from hold placed to
# item on the hold shelf for that window
snapshot_x = alt.X(
    'end_date:T',
    axis=alt.Axis(title='snapshot date', labelAngle=-70),
)
days_y = alt.Y(
    'avg_days_diff:Q',
    title='Days to Holdshelf'
)
hover_fields = ['start_date:T', 'end_date:T', 'count_items', 'avg_days_diff']

# kept in `chart` so the following cells can display and save it
chart = (
    alt.Chart(source)
    .mark_area(color="#0090bd")
    .encode(x=snapshot_x, y=days_y, tooltip=hover_fields)
    .properties(
        title="CHPL Average Days Between Hold-Placed to Item-on-Hold-Shelf",
        width=900
    )
)

In [None]:
# bare expression: the notebook renders `chart` as this cell's output
chart

In [None]:
# save the chart to an HTML file (path is relative to the working directory)
output_html = '2021-11-01_chpl-days-to-holdshelf.html'
chart.save(output_html)