In [1]:
# IMPORTANT!!!
# make sure that you run the backup first!
# /home/plchuser/.backup/plch-ilsaux2-collection-analysis.sh

## note 
make sure to run the backup script first ...

`/home/plchuser/.backup/plch-ilsaux2-collection-analysis.sh`

```bash
#!/bin/bash

# backup for collection analysis files for local and collectionHQ

unset HISTFILE
export B2_ACCOUNT_ID=""
export B2_ACCOUNT_KEY=""
export RESTIC_REPOSITORY="b2:plch-collection-analysis"
export RESTIC_PASSWORD=""
export RESTIC_CACHE_DIR="/home/plchuser/.cache/restic"

# we should only need to do this on an empty repo
# /home/plchuser/.backup/restic/restic init

cd /home/plchuser/output/collection-analysis
/usr/bin/find *.csv -print0 | /usr/bin/xargs -0 xz -9 -T0

# add --verbose after restic command for more info
/home/plchuser/.backup/restic/restic \
        backup \
        /home/plchuser/output/collection-analysis
```


In [2]:
# Sanity check: print the shell's working directory and the path of the
# `python` executable that is first on PATH.
!pwd
!which python

/home/plchuser/output/jupyter/collection-analysis
/home/plchuser/output/jupyter/collection-analysis/venv/bin/python


In [3]:
!pip install -U pip
!pip install -U pandas
!pip install -U sqlalchemy
!pip install -U psycopg2-binary
!pip install -U sqlite-utils

Collecting sqlalchemy
  Downloading SQLAlchemy-1.4.27-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
     |████████████████████████████████| 1.6 MB 2.6 MB/s            
Installing collected packages: sqlalchemy
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 1.4.26
    Uninstalling SQLAlchemy-1.4.26:
      Successfully uninstalled SQLAlchemy-1.4.26
Successfully installed sqlalchemy-1.4.27
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
     |████████████████████████████████| 3.0 MB 1.9 MB/s            
[?25hInstalling collected packages: psycopg2-binary
  Attempting uninstall: psycopg2-binary
    Found existing installation: psycopg2-binary 2.9.1
    Uninstalling psycopg2-binary-2.9.1:
      Successfully uninstalled psycopg2-binary-2.9.1
Successfully installed psycopg2-binary-2.9.2


In [4]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, BigInteger, Numeric
import numpy as np
import os
import re
from decimal import Decimal
import sqlite_utils

import vars

# Local SQLite database that this notebook builds (path is relative to the
# working directory).
engine = create_engine('sqlite:///current_collection.db', echo=False)

# Source PostgreSQL database; the credentials are read from the local `vars` module.
sierra_engine = create_engine('postgresql://{}:{}@sierra-db.plch.net:1032/iii'.format(vars.pg_username, vars.pg_password))

# Directory holding the compressed snapshot exports.  This is an absolute
# path, so joining it onto os.getcwd() (as was done before) had no effect.
collection_file_path = '/home/plchuser/output/collection-analysis/'

# Snapshot file names look like YYYY-MM-DD-plch-item.csv.xz / ...-bib.csv.xz.
# The patterns are anchored at the end (`$`) so that a leftover such as
# "2021-11-08-plch-item.csv.xz.bak" -- which would sort *ahead* of the real
# file in a reverse-sorted listing -- is not mistaken for a snapshot.
item_re = re.compile(r"^[0-9]{4}\-[0-9]{2}\-[0-9]{2}\-plch\-item\.csv\.xz$")
bib_re = re.compile(r"^[0-9]{4}\-[0-9]{2}\-[0-9]{2}\-plch\-bib\.csv\.xz$")

In [5]:
# this is our working directory; relative paths used in this notebook
# (e.g. 'current_collection.db') resolve against it
print(os.getcwd())

/home/plchuser/output/jupyter/collection-analysis


In [6]:
# REMOVE and refresh the sqlite database file in the local directory.
# Only "file does not exist" is tolerated; any other failure (for example a
# permission problem) is raised instead of being silently ignored, because
# continuing would write into a stale database.
try:
    os.remove('current_collection.db')
except FileNotFoundError:
    pass

# create the (empty) database file and immediately release the descriptor
os.close(os.open('current_collection.db', os.O_CREAT))

In [7]:
# Directory listing in reverse lexical order; for names that begin with a
# YYYY-MM-DD date this puts the most recent file first.
collection_files = sorted(os.listdir(collection_file_path), reverse=True)

In [8]:
# convert price to integer values
numbers_only_re = re.compile(r'[^0-9]')


def price_to_int(price):
    """Return the digits of ``price`` as an int, or None if it has no digits.

    Every character that is not 0-9 (currency symbol, thousands separator,
    decimal point, sign) is dropped before conversion, so "$1,234.50"
    becomes 123450.  Because the decimal point is simply removed, the result
    is a whole number of cents only when the input carries exactly two
    decimal places.

    Parameters
    ----------
    price : str
        Raw text of the price cell.

    Returns
    -------
    int or None
        The concatenated digits as an integer; None for a blank cell or any
        text without digits (previously this raised ValueError and aborted
        the whole CSV load).
    """
    digits = numbers_only_re.sub('', price)
    if not digits:
        return None
    return int(digits)

In [9]:
# Pick the newest item snapshot: collection_files is sorted in reverse order,
# so the first name matching item_re is the most recent one.
item_file = next((file for file in collection_files if item_re.match(file)), None)
if item_file is None:
    # fail with a message that says what is missing, rather than a bare IndexError
    raise FileNotFoundError(
        'no item snapshot (*-plch-item.csv.xz) found in {}'.format(collection_file_path)
    )
print(item_file)

2021-11-08-plch-item.csv.xz


In [10]:
# this was here to generate the database for the first snapshot of the year
# item_file = '2020-01-06-plch-item.csv.xz'

In [11]:
# Load the item snapshot: an xz-compressed, pipe-separated file.  The `price`
# column is passed through price_to_int while parsing.
df = pd.read_csv(
    os.path.join(collection_file_path, item_file),
    sep='|',
    compression='xz',
    converters={'price': price_to_int},
    # nrows=100  # uncomment to load only a small sample
)

In [12]:
# the converted price column holds integers, so name it accordingly
df = df.rename({'price': 'price_cents'}, axis='columns')

In [13]:
# (re)create the `item` table in the SQLite database from the loaded snapshot,
# writing 10000 rows per batch
df.to_sql(
    'item',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
)

In [14]:
# Pick the newest bib snapshot: collection_files is sorted in reverse order,
# so the first name matching bib_re is the most recent one.
bib_file = next((file for file in collection_files if bib_re.match(file)), None)
if bib_file is None:
    # fail with a message that says what is missing, rather than a bare IndexError
    raise FileNotFoundError(
        'no bib snapshot (*-plch-bib.csv.xz) found in {}'.format(collection_file_path)
    )
print(bib_file)

2021-11-08-plch-bib.csv.xz


In [15]:
# this was here to generate the database for the first snapshot of the year
# bib_file = '2020-01-06-plch-bib.csv.xz'

In [16]:
# Load the bib snapshot (xz-compressed, pipe-separated) ...
df = pd.read_csv(
    os.path.join(collection_file_path, bib_file),
    sep='|',
    compression='xz',
)

# ... and (re)create the `bib` table from it, with publish_year declared as
# an INTEGER column.
df.to_sql(
    'bib',
    con=engine,
    if_exists='replace',
    index=False,
    dtype={'publish_year': Integer()},
    chunksize=10000,
)

In [17]:
# indexes to create on the `bib` and `item` tables
sql = """\
CREATE INDEX IF NOT EXISTS "idx_bib_bib_record_num" ON "bib" (
    "bib_record_num"
);
CREATE INDEX IF NOT EXISTS "idx_bib_indexed_subjects" on bib (
    "indexed_subjects"
);
CREATE INDEX IF NOT EXISTS "idx_item_item_format_item_status_location_code_item_callnumber" ON "item" (
    "location_code",
    "item_format",
    "item_status_code",
    "item_callnumber"
);
CREATE INDEX IF NOT EXISTS "idx_item_bib_record_num" ON "item" (
    "bib_record_num"
);
CREATE INDEX IF NOT EXISTS "idx_item_item_record_num" ON "item" (
    "item_record_num"
);
CREATE INDEX IF NOT EXISTS "idx_item_agency_code_num_location_code" ON "item" (
    "agency_code_num",
    "location_code"
);
CREATE INDEX IF NOT EXISTS "idx_item_barcode" ON "item" (
    "barcode"
);
CREATE INDEX IF NOT EXISTS "idx_item_item_format" ON "item" (
    "item_format"
);
"""
# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [18]:
# bib_record
#
# Page through sierra_view.bib_record (restricted to rows whose
# record_metadata.campus_code is '') and append each page to the SQLite
# table `bib_record`, then index it.

sql = """\
SELECT
b.id,
b.record_id,
b.language_code,
b.bcode1,
b.bcode2,
b.bcode3,
b.country_code,
b.index_change_count,
b.is_on_course_reserve,
b.is_right_result_exact,
b.allocation_rule_code,
b.skip_num,
b.cataloging_date_gmt,
b.marc_type_code,
b.is_suppressed

FROM
sierra_view.bib_record as b

JOIN
sierra_view.record_metadata as r on r.id = b.record_id

WHERE
r.campus_code = ''

ORDER BY
b.id

LIMIT 1000000 OFFSET {}
"""

# The pages below are written with if_exists='append', so re-running this cell
# would duplicate every row.  Start from an empty table.
with engine.begin() as con:
    con.exec_driver_sql('DROP TABLE IF EXISTS bib_record')

# start the offset at 0, then add 1000000 (the LIMIT in the query) per page
offset = 0
count = 0  # number of pages written so far
while (True):
    df = pd.read_sql(sql=sql.format(offset), con=sierra_engine)
    print(offset, df.shape[0], sep=' ', end=', ')

    # an empty page means we have read past the last row
    if df.shape[0] == 0:
        break

    df.to_sql(
        name='bib_record',
        con=engine,
        index=False,
        if_exists='append',
        chunksize=10000,
        dtype={
            'id': Integer(),
            'record_id': Integer(),
            'index_change_count': Integer(),
            'skip_num': Integer(),
        },
    )

    count += 1
    offset += 1000000


sql = """\
CREATE INDEX bib_record_bcode3_idx ON bib_record (bcode3);
CREATE INDEX bib_record_bib_level_idx ON bib_record (bcode1);
CREATE INDEX bib_record_country_idx ON bib_record (country_code);
CREATE INDEX bib_record_lang_idx ON bib_record (language_code);
CREATE INDEX bib_record_material_type_idx ON bib_record (bcode2);
CREATE UNIQUE INDEX bib_record_record_key ON bib_record (record_id);
CREATE INDEX idx_bib_record_cataloging_date ON bib_record (cataloging_date_gmt);
CREATE UNIQUE INDEX pk_bib_record ON bib_record (id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

0 1000000, 1000000 1000000, 2000000 95570, 3000000 0, 

In [19]:
# language_property
#
# Copy sierra_view.language_property (joined to its name table) into the
# SQLite table `language_property`, then index it.

sql = """\
select
p.id,
p.code,
p.display_order,
n.name

from
sierra_view.language_property as p

join
sierra_view.language_property_name as n
on n.language_property_id = p.id

order by id
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# 'replace' (rather than 'append') so that re-running the cell rebuilds the
# table instead of duplicating its rows and then failing on the unique indexes
df.to_sql(
    name='language_property',
    con=engine,
    index=False,
    if_exists='replace',
    chunksize=10000,
    dtype={
        'id': Integer(),
        'display_order': Integer(),
    },
)


sql = """\
CREATE UNIQUE INDEX language_property_code_key ON language_property (code);
CREATE UNIQUE INDEX pk_language_property ON language_property (id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [20]:
# sierra_view.record_metadata
#
# Copy bib / item / volume rows of sierra_view.record_metadata (campus_code
# '') into the SQLite table `record_metadata`, then index it.
#
# The data is fetched in two passes that differ only in the deletion-date
# predicate -- live records first, deleted records second -- since pulling
# everything at once may exceed our memory limits.  The `{}` placeholder is
# filled with NULL / NOT NULL.

sql = """\
select
r.id,
r.record_type_code,
r.record_num,
date(r.creation_date_gmt) as creation_date_gmt,
date(r.deletion_date_gmt) as deletion_date_gmt,
r.campus_code,
r.agency_code_num,
r.record_last_updated_gmt

FROM
sierra_view.record_metadata as r

WHERE
r.campus_code = ''
-- started grabbing the deleted record data 2021-03-15
AND r.deletion_date_gmt IS {}
AND r.record_type_code in ('b', 'i', 'j' ) -- bibliographic, item, volume
"""

# output to the sqlite db
# NOTE: the first pass must "replace" the table ... the second must "append"
for null_test, if_exists in (('NULL', 'replace'), ('NOT NULL', 'append')):
    df = pd.read_sql(sql=sql.format(null_test), con=sierra_engine)
    df.to_sql(name='record_metadata', index=False, if_exists=if_exists, con=engine, chunksize=10000)


sql = """\
CREATE INDEX idx_record_metadata_id_record_last_updated ON record_metadata (id, record_last_updated_gmt);
CREATE INDEX idx_record_metadata_record_creation_date_gmt ON record_metadata (creation_date_gmt);
CREATE INDEX idx_record_metadata_record_num ON record_metadata (record_num);
CREATE INDEX pk_record_id ON record_metadata (id);
CREATE INDEX record_id_unique_constraint ON record_metadata (record_type_code, record_num, campus_code);
CREATE INDEX record_metadata_last_modified ON record_metadata (record_last_updated_gmt, record_type_code, id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [21]:
# bib_record_item_record_link
#
# Copy the bib <-> item link table, adding the record numbers of both sides
# (looked up in record_metadata), into SQLite and index it.
sql = """
select 
l.id,
l.bib_record_id,
r.record_num as bib_record_num,
l.item_record_id,
ir.record_num as item_record_num,
l.items_display_order,
l.bibs_display_order

from
sierra_view.bib_record_item_record_link as l

join sierra_view.record_metadata as r on r.id = l.bib_record_id

join sierra_view.record_metadata as ir on ir.id = l.item_record_id
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db (in batches, like the other large tables in this
# notebook, rather than as one giant insert)
df.to_sql(
    'bib_record_item_record_link',
    con=engine,
    index=False,
    if_exists='replace',
    chunksize=10000,
    dtype={
        'id': Integer(),
        'bib_record_id': BigInteger(),
        'bib_record_num': Integer(),
        'item_record_id': BigInteger(),
        'item_record_num': Integer(),
        'items_display_order': Integer(),
        'bibs_display_order': Integer(),
    }
)

sql = """\
CREATE INDEX idx_bib_record_item_record_link_bib_record_id ON bib_record_item_record_link (bib_record_id);
CREATE INDEX idx_bib_record_item_record_link_bib_record_num ON bib_record_item_record_link (bib_record_num);
CREATE INDEX item_record_id_index ON bib_record_item_record_link (item_record_id);
CREATE INDEX item_record_num_index ON bib_record_item_record_link (item_record_num);
CREATE UNIQUE INDEX pk_bib_record_item_record_link ON bib_record_item_record_link (id);
CREATE UNIQUE INDEX uc_bib_record_item_record_link ON bib_record_item_record_link (bib_record_id, item_record_id);
CREATE INDEX ucn_bib_record_item_record_link ON bib_record_item_record_link (bib_record_num, item_record_num);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

print(df.shape[0])
print(df.head())

6190467
         id  bib_record_id  bib_record_num  item_record_id  item_record_num  \
0   2300207   420908943241         2148233    450973395907          1829827   
1   6966841   420909339758         2544750    450978062541          6496461   
2   3077027   420908297409         1502401    450974172727          2606647   
3   9821551   420910008622         3213614    450980647069          9080989   
4  10644296   420910124993         3329985    450981374700          9808620   

   items_display_order  bibs_display_order  
0                  0.0                   0  
1                  4.0                   0  
2                  0.0                   0  
3                  3.0                   0  
4                 13.0                   0  


In [22]:
# volume_record_item_record_link
#
# One row per (volume record, linked item); volumes without any linked item
# still produce a row, with NULL link / item columns.
#
# NOTE: an earlier version of this query selected FROM
# sierra_view.volume_record_item_record_link with inner joins to
# record_metadata, so it wouldn't pull in volume info if there were no items
# (which happens a lot).  The query below starts from the volume records
# ('j') and LEFT JOINs the link table instead.

sql = """\
SELECT
l.id,
r.id AS volume_record_id,
r.record_num AS volume_record_num,
ri.id AS item_record_id,
ri.record_num AS item_record_num,
l.items_display_order,
(
    SELECT
    string_agg(v.field_content, ', ' ORDER BY occ_num)
    FROM
    sierra_view.varfield AS v
    WHERE
    v.record_id = r.id
    AND v.varfield_type_code = 'v'
) AS volume_statement
FROM
sierra_view.record_metadata AS r
LEFT OUTER JOIN sierra_view.volume_record_item_record_link AS l ON
l.volume_record_id = r.id
LEFT OUTER JOIN sierra_view.record_metadata ri ON
ri.id = l.item_record_id
WHERE
r.record_type_code = 'j'
AND r.campus_code = ''
"""


df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
# The values of `dtype` must be SQLAlchemy column types; `id` was previously
# given pd.Int64Dtype(), which is a pandas dtype.  It is declared Integer()
# here, the same way the nullable columns of the bib/item link table are.
df.to_sql(
    'volume_record_item_record_link',
    con=engine,
    index=False,
    if_exists='replace',
    chunksize=10000,
    dtype={
        'id': Integer(),
        'volume_record_id': BigInteger(),
        'volume_record_num': Integer(),
        'item_record_id': BigInteger(),
        'item_record_num': Integer(),
        'items_display_order': Integer(),
    }
)

sql = """\
CREATE INDEX idx_volume_record_item_record_link_volume_record_id ON volume_record_item_record_link (volume_record_id);
CREATE INDEX idx_volume_record_item_record_link_volume_record_num ON volume_record_item_record_link (volume_record_num);
CREATE UNIQUE INDEX pk_volume_record_item_record_link ON volume_record_item_record_link (id);
CREATE UNIQUE INDEX uc_volume_record_item_record_link ON volume_record_item_record_link (item_record_id);
CREATE INDEX volume_record_item_record_link_item_id_volume_id ON volume_record_item_record_link (item_record_id, volume_record_id);
CREATE INDEX volume_record_item_record_link_item_num_volume_num ON volume_record_item_record_link (item_record_num, volume_record_num);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

print(df.shape[0])
print(df.head())

869947
          id  volume_record_id  volume_record_num  item_record_id  \
0  1814429.0      455268052693            1519317    4.509830e+11   
1  1331478.0      455267996430            1463054    4.509806e+11   
2  1331480.0      455267996430            1463054    4.509806e+11   
3  1662112.0      455267996430            1463054    4.509822e+11   
4  1662111.0      455267996430            1463054    4.509822e+11   

   item_record_num  items_display_order       volume_statement  
0       11413961.0                  NaN  V. 273 NO. 2 Sep 2021  
1        9064937.0                  NaN                   v.04  
2        9064939.0                  NaN                   v.04  
3       10605788.0                  NaN                   v.04  
4       10605789.0                  NaN                   v.04  


In [None]:
# phrase_entry
#
# Page through sierra_view.phrase_entry for the selected index tags and
# append each page to the SQLite table `phrase_entry`, then index it.

# TODO
# we're going to skip this for now, since there doesn't seem to be a big benefit to this

# target these index_tag values:
# "d": "subject"
# "a": "author",
# "t": "title",
# "o": "ocolc",
# "c": "callnumber",
# "i": "isbn",

sql = """\
SELECT 
e.id,
e.record_id,
e.index_tag,
e.varfield_type_code,
e.occurrence,
e.is_permuted,
e.type2,
e.type3,
e.index_entry,
e.insert_title,
e.phrase_rule_rule_num,
e.phrase_rule_operation,
e.phrase_rule_subfield_list,
e.original_content,
e.parent_record_id,
e.insert_title_tag,
e.insert_title_occ

FROM sierra_view.phrase_entry as e

JOIN
sierra_view.record_metadata as r
ON
  r.id = e.record_id
  
WHERE
e.index_tag in (
    'd'
    -- add these back later maybe
    -- , 'a', 't', 'o', 'c', 'i'
)
AND r.campus_code = ''
AND r.deletion_date_gmt IS NULL
AND r.record_type_code in ('b', 'i', 'j') -- bibliographic, item, volume

ORDER BY
id

LIMIT 1000000 OFFSET {}
"""

# The pages below are written with if_exists='append'; after an interrupted or
# repeated run that would duplicate rows, so start from an empty table.
with engine.begin() as con:
    con.exec_driver_sql('DROP TABLE IF EXISTS phrase_entry')

# start the offset at 0, then add 1000000 (the LIMIT in the query) per page
offset = 0
count = 0  # number of pages written so far
while (True):
    df = pd.read_sql(sql=sql.format(offset), con=sierra_engine)
    print(offset, df.shape[0], sep=' ', end=', ')

    # an empty page means we have read past the last row
    if df.shape[0] == 0:
        break

    df.to_sql(
        name='phrase_entry',
        con=engine,
        index=False,
        if_exists='append',
        chunksize=10000,
        dtype={
            'id': Integer(),
            'record_id': Integer(),
            'occurrence': Integer(),
            'type2': Integer(),
            'phrase_rule_rule_num': Integer(),
            'parent_record_id': Integer(),
            'insert_title_occ': Integer(),
        },
    )

    count += 1
    offset += 1000000

sql = """\
CREATE INDEX idx_phrase_entry ON phrase_entry (((index_tag || index_entry)), type2, insert_title, record_id);
CREATE INDEX idx_phrase_entry_parent_record_id ON phrase_entry (parent_record_id);
CREATE INDEX idx_phrase_entry_record ON phrase_entry (record_id);
CREATE INDEX idx_phrase_entry_record_key ON phrase_entry (record_id);
CREATE UNIQUE INDEX pk_phrase_entry ON phrase_entry (id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

0 1000000, 1000000 1000000, 2000000 1000000, 3000000 1000000, 4000000 1000000, 5000000 1000000, 6000000 1000000, 

In [None]:
# location
#
# Copy sierra_view.location into the SQLite table `location` and index it.

sql = """\
SELECT
id,
code,
branch_code_num,
parent_location_code,
is_public,
is_requestable
FROM
sierra_view.location;
;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
# NOTE(review): `code` and `parent_location_code` are declared Integer() here
# -- confirm that these codes are really numeric (TODO confirm).
df.to_sql(
    'location',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'id': Integer(),
        'code': Integer(),
        'branch_code_num': Integer(),
        'parent_location_code': Integer()
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE INDEX fk9ff58fb55804fddb ON location (branch_code_num);
CREATE UNIQUE INDEX location_code_key ON location (code);
CREATE UNIQUE INDEX pk_location ON location (id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# branch_name
#
# Copy sierra_view.branch_name into the SQLite table `branch_name` and index it.

sql = """\
SELECT
branch_id,
name,
iii_language_id
FROM
sierra_view.branch_name;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'branch_name',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'branch_id': Integer(),
        'iii_language_id': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE UNIQUE INDEX branch_name_pkey ON branch_name (branch_id, iii_language_id);
CREATE INDEX fk46f5c7085804fddb ON branch_name (branch_id);
CREATE INDEX fk46f5c7088eaffe82 ON branch_name (iii_language_id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# branch
#
# Copy sierra_view.branch into the SQLite table `branch` and index it.

sql = """\
SELECT
id,
address,
email_source,
email_reply_to,
address_latitude,
address_longitude,
code_num

FROM 
sierra_view.branch;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'branch',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'id': Integer(),
        'code_num': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE UNIQUE INDEX pk_branch ON branch (id);
CREATE UNIQUE INDEX uniq_branch_code_num ON branch (code_num);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# country_property_myuser
#
# Copy sierra_view.country_property_myuser (ordered by code) into a SQLite
# table of the same name.

sql = """
select
*
from

sierra_view.country_property_myuser

order by code
"""

df = pd.read_sql(con=sierra_engine, sql=sql)

# (re)create the SQLite table from the query result
df.to_sql(
    name='country_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    dtype={'display_order': Integer()},
)

# row count, followed by a preview of the first rows
print(len(df))
print(df.head())

In [None]:
# item_status_property_myuser
#
# Copy sierra_view.item_status_property_myuser (ordered by display_order)
# into a SQLite table of the same name.

sql = """
select
*
from
sierra_view.item_status_property_myuser
order by display_order

"""

df = pd.read_sql(con=sierra_engine, sql=sql)

# (re)create the SQLite table from the query result
df.to_sql(
    name='item_status_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    dtype={'display_order': Integer()},
)

# row count, followed by a preview of the first rows
print(len(df))
print(df.head())

In [None]:
# itype_property_myuser
#
# Copy selected columns of sierra_view.itype_property_myuser into a SQLite
# table of the same name.

sql = """\
SELECT
code,
display_order,
itype_property_category_id,
physical_format_id,
target_audience_id,
name

FROM 
sierra_view.itype_property_myuser
"""

df = pd.read_sql(con=sierra_engine, sql=sql)

# (re)create the SQLite table from the query result
df.to_sql(
    name='itype_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
    dtype={
        'code': Integer(),
        'display_order': Integer(),
        'physical_format_id': Integer(),
    },
)

# row count, followed by a preview of the first rows
print(len(df))
print(df.head())

In [None]:
# physical_format_myuser
#
# Copy sierra_view.physical_format_myuser (ordered by display_order) into a
# SQLite table of the same name.

sql = """\
select
*
from
sierra_view.physical_format_myuser
order by display_order
"""

df = pd.read_sql(con=sierra_engine, sql=sql)

# (re)create the SQLite table from the query result
df.to_sql(
    name='physical_format_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
    dtype={
        'id': Integer(),
        'display_order': Integer(),
    },
)

# row count, followed by a preview of the first rows
print(len(df))
print(df.head())

In [None]:
# bib_level_property_myuser
#
# Copy sierra_view.bib_level_property_myuser (ordered by display_order) into
# a SQLite table of the same name.

sql = """\
select
*
from
sierra_view.bib_level_property_myuser

order by
display_order
"""

df = pd.read_sql(con=sierra_engine, sql=sql)

# (re)create the SQLite table from the query result
df.to_sql(
    name='bib_level_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
    dtype={
        'id': Integer(),
        'display_order': Integer(),
    },
)

# row count, followed by a preview of the first rows
print(len(df))
print(df.head())

In [None]:
# material_property_myuser
#
# Copy sierra_view.material_property_myuser (ordered by display_order) into
# a SQLite table of the same name.

sql = """\
select
*
from
sierra_view.material_property_myuser

order by
display_order
"""

df = pd.read_sql(con=sierra_engine, sql=sql)

# (re)create the SQLite table from the query result
df.to_sql(
    name='material_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
    dtype={
        'id': Integer(),
        'display_order': Integer(),
    },
)

# row count, followed by a preview of the first rows
print(len(df))
print(df.head())

In [None]:
# location
#
# Copy sierra_view.location into the SQLite table `location` and index it.
# NOTE(review): an earlier cell in this notebook already loads `location` with
# the same query; because the table is written with if_exists='replace', this
# cell rebuilds it from scratch -- confirm whether both cells are needed.

sql = """\
SELECT
id,
code,
branch_code_num,
parent_location_code,
is_public,
is_requestable
FROM
sierra_view.location;
;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'location',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'id': Integer(),
        'code': Integer(),
        'branch_code_num': Integer(),
        'parent_location_code': Integer()
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())


sql = """\
CREATE INDEX fk9ff58fb55804fddb ON location (branch_code_num);
CREATE UNIQUE INDEX location_code_key ON location (code);
CREATE UNIQUE INDEX pk_location ON location (id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# location_name
#
# Copy sierra_view.location_name into the SQLite table `location_name` and
# index it.

sql = """\
SELECT
location_id,
name,
iii_language_id
FROM sierra_view.location_name;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'location_name',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'location_id': Integer(),
        'iii_language_id': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE INDEX fk506824d5399f0cbb ON location_name (location_id);
CREATE INDEX fk506824d58eaffe82 ON location_name (iii_language_id);
CREATE UNIQUE INDEX location_name_pkey ON location_name (location_id, iii_language_id);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# hold
#
# Build the SQLite table `hold`: one row per sierra_view.hold row, with the
# bib / item / volume record numbers resolved from the held record's type,
# a readable hold status, and a few derived patron flags.

sql = """\
-- pull all relevant hold data

select
h.id as hold_id,
case
    when r.record_type_code = 'i' then (
        select
        br.record_num
        from
        sierra_view.bib_record_item_record_link as l
        join sierra_view.record_metadata as br on br.id = l.bib_record_id
        where
        l.item_record_id = h.record_id
        limit 1
    )
    when r.record_type_code = 'j' then (
        select
        br.record_num
        from
        sierra_view.bib_record_volume_record_link as l
        join sierra_view.record_metadata as br on br.id = l.bib_record_id
        where
        l.volume_record_id = h.record_id
        limit 1
    )
    when r.record_type_code = 'b' then r.record_num
    else NULL
end as bib_record_num,
r.campus_code,
r.record_type_code as record_type_on_hold,
case
    when r.record_type_code = 'i' then r.record_num
-- i don't think this is really useful, but i may want to come back to this 
-- when r.record_type_code = 'j' then (
-- select
-- ir.record_num
-- from
-- sierra_view.volume_record_item_record_link as l
-- join sierra_view.record_metadata as ir on ir.id = l.item_record_id
-- where
-- l.volume_record_id = h.record_id
-- limit 1
-- )
    else NULL
end as item_record_num,
case
    when r.record_type_code = 'j' then r.record_num
    else NULL
end as volume_record_num,
h.placed_gmt,
h.is_frozen,
h.delay_days,
h.location_code,
h.expires_gmt,
case
when h.status = '0' then 'on hold'
when h.status = 'b' then 'bib hold ready for pickup'
when h.status = 'j' then 'volume hold ready for pickup'
when h.status = 'i' then 'item hold ready for pickup'
when h.status = 't' then 'in transit to pickup location'
else h.status
end as hold_status,
h.is_ir,
h.is_ill,
h.pickup_location_code,
h.ir_pickup_location_code,
h.ir_print_name,
h.ir_delivery_stop_name,
h.is_ir_converted_request,
case
when p.activity_gmt >= (NOW() - '3 years'::INTERVAL) THEN TRUE
else FALSE
end as patron_is_active,
p.ptype_code as patron_ptype_code,
p.home_library_code as patron_home_library_code,
p.mblock_code as patron_mblock_code,
case 
when p.owed_amt > 10.00 then TRUE
else FALSE
end as patron_has_over_10usd_owed
from
sierra_view.hold as h
join sierra_view.record_metadata as r on r.id = h.record_id
left outer join sierra_view.patron_record as p on p.record_id = h.patron_record_id

order by
hold_id
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'hold',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'hold_id': Integer(),
        'bib_record_num': Integer(),
        'item_record_num': Integer(),
        'volume_record_num': Integer(),
        'patron_ptype_code': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())


sql = """\
CREATE UNIQUE INDEX pk_hold ON hold (hold_id);
CREATE UNIQUE INDEX uc_hold_composite ON hold (hold_id, bib_record_num, placed_gmt);
"""

# Run the DDL in one explicit transaction (committed on exit).
# exec_driver_sql hands each plain SQL string straight to the SQLite driver;
# fragments that are only whitespace (the text after the final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# active_items
# 
# create the tables for active items as defined here:
# https://ilsweb.cincinnatilibrary.org/collection-analysis-docs/static_queries_holds.html#defining-active-items
#
# This runs entirely inside the SQLite database: it reads the `item`,
# `volume_record_item_record_link` and `record_metadata` tables and fills
# `active_items`.

sql = """\
CREATE TABLE IF NOT EXISTS active_items (
    bib_record_num BIGINT,
    item_record_num BIGINT,
    volume_record_num BIGINT,
    volume_statement TEXT,
    items_display_order INTEGER
);

with active_items_data as (
  -- "active items"
  -- --------------
  -- This will produce a list of items meeting the following criteria:
  -- * item status is one of the following codes:
  --   ('-', '!', 'b', 'p', '(', '@', ')', '_', '=', '+', 't')
  -- * if the item has a due date, then it must be less than 60 days overdue:
  --   coalesce( (julianday(date('now')) - julianday(item.due_date) > 60.0 ), FALSE)
  select
    item.bib_record_num,
    item.item_record_num,
    v.volume_record_num,
    v.volume_statement,
    v.items_display_order
  from
    item
    left outer join volume_record_item_record_link as v on v.item_record_num = item.item_record_num -- we need to consider volume information for volume-level holds
    join record_metadata as r on (
      r.record_type_code = 'b'
      and r.record_num = item.bib_record_num
      and r.campus_code = ''
    ) -- considers only items belonging to us (no virtual items)
  where
    -- * item status is one of the following codes:
    --   ('-', '!', 'b', 'p', '(', '@', ')', '_', '=', '+', 't')
    item.item_status_code in (
      '-',
      '!',
      'b',
      'p',
      '(',
      '@',
      ')',
      '_',
      '=',
      '+',
      't'
    ) -- * if the item has a due date, then it must be less than 60 days overdue:
    --   coalesce( (julianday(date('now')) - julianday(item.due_date) > 60.0 ), FALSE)
    and coalesce(
      (
        julianday(date('now')) - julianday(item.due_date) > 60.0
      ),
      FALSE
    ) is FALSE
)

INSERT OR IGNORE INTO active_items (
    bib_record_num,
    item_record_num,
    volume_record_num,
    volume_statement,
    items_display_order
)

SELECT
  *
FROM
  active_items_data
;
"""

# The INSERT above starts with a CTE ("with ..."), so it does not look like an
# INSERT to code that classifies statements by their first keyword.  Running
# it inside an explicit transaction makes sure the rows are committed without
# depending on such heuristics.  exec_driver_sql hands each plain SQL string
# straight to the SQLite driver; whitespace-only fragments (the text after the
# final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

sql = """\
CREATE INDEX idx_active_items_bib_record_num ON active_items (bib_record_num);
CREATE INDEX idx_active_items_item_record_num ON active_items (item_record_num);
CREATE INDEX idx_active_items_volume_record_num ON active_items (volume_record_num);
"""

with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# active_holds
# 
# create the tables for active holds as defined here:
# https://ilsweb.cincinnatilibrary.org/collection-analysis-docs/static_queries_holds.html#defining-active-holds
#
# This runs entirely inside the SQLite database: it reads the `hold` and
# `record_metadata` tables and fills `active_holds`.


sql = """\
CREATE TABLE IF NOT EXISTS active_holds (
    hold_id BIGINT
);


with active_holds_data as (
  -- "active holds"
  -- --------------
  -- This will produce a list of holds meeting the following criteria:
  -- * hold that is not Frozen (except for holds placed by patrons with ptype 196)
  -- * hold with zero delay days OR the hold delay has passed (hold placed date + delay days is not a date in the future)
  -- * hold placed by patron with one of the following ptype codes:
  --   ( 0, 1, 2, 5, 6, 10, 11, 12, 15, 22, 30, 31, 32, 40, 41, 196 )
  -- * hold status is "on hold"
  select
    h.hold_id
  from
    hold as h
    join record_metadata as r on (
      -- TODO figure out if maybe we could just use the `is_ill` boolean value to do this (this is still fast since it's an indexed search)
      r.record_type_code = 'b'
      and r.record_num = h.bib_record_num
      and r.campus_code = ''
    ) -- join the record metadata so that we're only concerning ourselves with titles that belong to us (to filter out ILL holds)
  where
    -- * hold that is not Frozen (except for holds placed by patrons with ptype 196)
    (
      h.is_frozen is FALSE
      OR h.patron_ptype_code = 196
    )
    AND -- * hold with zero delay days OR the hold delay has passed (hold placed date + delay days is not in the future)
    (
      julianday(datetime('now')) - (
        julianday(h.placed_gmt) + (h.delay_days * 1.0)
      )
    ) > 0
    AND -- * hold placed by patron with one of the following ptype codes:
    --   ( 0, 1, 2, 5, 6, 10, 11, 12, 15, 22, 30, 31, 32, 40, 41, 196 )
    h.patron_ptype_code IN (
      0,
      1,
      2,
      5,
      6,
      10,
      11,
      12,
      15,
      22,
      30,
      31,
      32,
      40,
      41,
      196
    )
    AND -- * hold status is "on hold"
    h.hold_status = 'on hold'
)

INSERT OR IGNORE INTO active_holds (
    hold_id
)

select
  *
from
  active_holds_data
;
"""

# The INSERT above starts with a CTE ("with ..."), so it does not look like an
# INSERT to code that classifies statements by their first keyword.  Running
# it inside an explicit transaction makes sure the rows are committed without
# depending on such heuristics.  exec_driver_sql hands each plain SQL string
# straight to the SQLite driver; whitespace-only fragments (the text after the
# final ';') are skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

sql = """\
CREATE INDEX idx_active_holds_holds_id ON active_holds (hold_id);
"""

with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.exec_driver_sql(statement)

In [None]:
# !rsync -Pav current_collection.db plchuser@ilsweb.cincinnatilibrary.org://home/plchuser/data/collection-analysis/collection-2021-04-12.db

In [None]:
# show the shell's current working directory
!pwd

In [None]:
# Enable full text search (fts) on the `bib` table of current_collection.db
# via sqlite-utils, covering the columns listed below.

utils_db = sqlite_utils.Database('current_collection.db')
utils_db["bib"].enable_fts(
    [
        "best_author",
        "best_title",
        "publisher",
        "publish_year",
        "bib_level_callnumber",
        "indexed_subjects",
    ]
)

In [None]:
# just to double check if the table now has fts enabled ...
# (the value of detect_fts() is displayed as this cell's output)
utils_db["bib"].detect_fts()

In [None]:
# hold_shelf.db

!pip install -U sqlite-utils

import sqlite_utils
import sqlite3
import os
from base64 import b64encode
from hashlib import pbkdf2_hmac

In [None]:
# export the holds shelf database to a working copy (holds_shelf_output_db)
#
# The copy is made with SQLite's online backup API (sqlite3.Connection.backup,
# Python 3.7+) instead of shelling out to `rm` / the `sqlite3` CLI: os.system()
# ignores a non-zero exit status, so a failed backup previously went unnoticed
# until a later cell broke.

holds_shelf_master_db = '/home/plchuser/plch-holds-shelf/holds_table.db'
holds_shelf_output_db = '/home/plchuser/output/jupyter/collection-analysis/holds_table_data.db'

# sqlite3.connect() would silently create an empty database for a missing path,
# so fail loudly before touching anything
if not os.path.isfile(holds_shelf_master_db):
    raise FileNotFoundError(
        "holds shelf master database not found: {}".format(holds_shelf_master_db)
    )

# always start from a clean working copy (no error if it is not there yet)
if os.path.exists(holds_shelf_output_db):
    os.remove(holds_shelf_output_db)

source_con = sqlite3.connect(holds_shelf_master_db)
try:
    target_con = sqlite3.connect(holds_shelf_output_db)
    try:
        source_con.backup(target_con)
    finally:
        target_con.close()
finally:
    source_con.close()

In [None]:
# engine = create_engine('sqlite:///{}'.format(holds_shelf_output_db))

# A fresh random salt is generated every time this cell runs, so patron hashes
# are only comparable within a single run of the notebook.
salt = os.urandom(256)
iterations = 100

def hash_patron(patron):
    """Return a salted PBKDF2-HMAC-SHA256 digest of ``patron`` as base64 text.

    ``patron`` is converted with ``str()`` before hashing, so ``12345678`` and
    ``"12345678"`` produce the same value. The module-level ``salt`` and
    ``iterations`` are used for every call.

    Returns the base64 string on success, or ``0`` if hashing raises an
    ordinary exception (best-effort: the function is registered as a SQLite
    user-defined function further down). KeyboardInterrupt / SystemExit are
    deliberately NOT swallowed -- with the previous bare ``except:``,
    interrupting the kernel mid-run would silently store 0 as a patron hash
    instead of stopping.
    """
    try:
        return b64encode(
            pbkdf2_hmac(
                hash_name='sha256',
                password=str(patron).encode(),
                salt=salt,
                iterations=iterations
            )
        ).decode('utf-8')
    except Exception:
        return 0

# give it the most basic test
patron = 12345678
print(hash_patron(patron))

In [None]:
# SQL script (split on ';' and run statement-by-statement by the code below):
#   1. drop and re-create the "holds_shelf" table
#   2. copy rows from the "data" table into it, storing
#      hash_patron_record(patron_record_id) in patron_record_hash instead of the
#      raw patron id (hash_patron_record is not built in to SQLite; it has to be
#      registered on the connection before this runs)
#   3. drop the "data" table
# INSERT OR IGNORE means rows that violate a constraint (hash_row is the
# UNIQUE PRIMARY KEY) are skipped rather than raising.
sql = """\
DROP TABLE IF EXISTS "holds_shelf";

CREATE TABLE IF NOT EXISTS "holds_shelf" (
    `hold_id` INTEGER,
    `local_hold_id` INTEGER,
    `hash_row` TEXT UNIQUE PRIMARY KEY, -- so we can track changes made to the row
    `placed_epoch` INTEGER,
    `patron_record_hash` TEXT, -- 
    -- `patron_record_id` INTEGER,
    -- `patron_record_num` INTEGER,
    `record_id` INTEGER,
    `record_type_code` TEXT,
    `record_num` INTEGER,
    `item_location_code` TEXT,
    `agency_code_num` INTEGER,
    `checkin_statistics_group_code_num` INTEGER,
    `checkin_statistics_group_name` TEXT,
    `s_location_code` TEXT,
    -- `is_frozen` INTEGER,
    -- `delay_days` INTEGER,
    `expires_epoch` INTEGER,
    `status` INTEGER,
    -- `is_ir` INTEGER,
    `pickup_location_code` TEXT,
    -- `is_ill` INTEGER,
    -- `note` TEXT,
    -- `ir_pickup_location_code` TEXT,
    -- `ir_print_name` TEXT,
    -- `is_ir_converted_request` INTEGER,
    -- `patron_records_display_order` INTEGER,
    -- `records_display_order` INTEGER,
    `is_deleted` INTEGER NOT NULL DEFAULT 0,
    `deleted_epoch` INTEGER,
    `modified_epoch` INTEGER
);


INSERT OR IGNORE INTO "holds_shelf" (
    `hold_id`,
    `local_hold_id`,
    `hash_row`,
    `placed_epoch`,
    `patron_record_hash`, 
    `record_id`,
    `record_type_code`,
    `record_num`,
    `item_location_code`,
    `agency_code_num`,
    `checkin_statistics_group_code_num`,
    `checkin_statistics_group_name`,
    `s_location_code`,
    `expires_epoch`,
    `status`,    
    `pickup_location_code`,
    `is_deleted`,
    `deleted_epoch`,
    `modified_epoch`
)

select
  `hold_id`,
  `local_hold_id`,
  `hash_row`,
  `placed_epoch`,
  hash_patron_record(`patron_record_id`),
  `record_id`,
  `record_type_code`,
  `record_num`,
  `item_location_code`,
  `agency_code_num`,
  `checkin_statistics_group_code_num`,
  `checkin_statistics_group_name`,
  `s_location_code`,
  `expires_epoch`,
  `status`,    
  `pickup_location_code`,
  `is_deleted`,
  `deleted_epoch`,
  `modified_epoch`
from
  data
;

DROP TABLE data
;
"""

# Run the script in `sql` against the working copy, with hash_patron exposed to
# SQLite as the one-argument SQL function hash_patron_record().
#
# Using a sqlite3 connection as a context manager only commits / rolls back --
# it does NOT close the connection -- so the handle is closed explicitly here
# before later cells read the same file with the sqlite3 CLI.
con = sqlite3.connect(holds_shelf_output_db)
try:
    con.create_function("hash_patron_record", 1, hash_patron)
    with con:
        for statement in sql.split(';'):
            # the fragment after the final ';' is only whitespace
            if statement.strip():
                con.execute(statement)
finally:
    con.close()

# os.system("sqlite3 {} \"VACUUM;\"".format(holds_shelf_output_db))

In [None]:
# with sqlite3.connect(holds_shelf_output_db) as con:
#     con.create_function("hash_patron_record", 1, hash_patron)
#     con.execute("update data set patron_record_id = hash_patron_record(patron_record_id)")

In [None]:
# import the data from the holds_shelf db into the current_collection db
#
# Done with ATTACH + INSERT ... SELECT instead of piping `sqlite3` CLI output
# into `.import`: the CLI round-trip turns every NULL into an empty string,
# breaks on values containing '|' or newlines, and os.system() hides failures.
if not os.path.isfile('holds_table_data.db'):
    # ATTACH would otherwise silently create an empty database file
    raise FileNotFoundError("holds_table_data.db not found in {}".format(os.getcwd()))

collection_con = sqlite3.connect('current_collection.db')
try:
    collection_con.execute("ATTACH DATABASE 'holds_table_data.db' AS holds_src")
    # reuse the exact CREATE TABLE statement stored in the source database
    schema_row = collection_con.execute(
        "select sql from holds_src.sqlite_master "
        "where type = 'table' and name = 'holds_shelf'"
    ).fetchone()
    if schema_row is None:
        raise RuntimeError("table holds_shelf not found in holds_table_data.db")

    with collection_con:
        # drop any copy left by an earlier run so the cell can be re-run
        collection_con.execute('DROP TABLE IF EXISTS main.holds_shelf')
        collection_con.execute(schema_row[0])
        collection_con.execute(
            'INSERT INTO main.holds_shelf SELECT * FROM holds_src.holds_shelf'
        )
    collection_con.execute('DETACH DATABASE holds_src')
finally:
    collection_con.close()

In [None]:
# generate the metadata.json file ...
import json

# HTML blurb stored under 'description_html' in the generated metadata; the
# placeholder is filled with today's date (YYYY-MM-DD) at the time this cell runs.
# Fixes the user-facing typo "reprsents" and escapes the bare "&" as "&amp;".
description_html = """\
<p>
    The <a href="current_collection"><span style="color:#54AC8E;">"current_collection"</span></a>
    dataset represents the current collection state for the Cincinnati &amp; Hamilton County Public Library 
    as of {}
</p>
<p>
    Documentation, Static Queries, and Reports can be found in the "data source"
</p>
""".format(pd.Timestamp('now').strftime('%Y-%m-%d'))


# Canned query: number of items with item_status_code '-' per location code,
# together with the location name and the owning branch's number and name,
# ordered by branch number.
#
# A second query ("branch_code_num_names: select ...") had been pasted inside
# this string after the ORDER BY, which made the whole statement a SQL syntax
# error. It now lives in its own variable below.
sql_location_code_branch_name_available_items = """\
select
  i.location_code,
  ln.name,
  loc.branch_code_num,
  bn.name as branch_name,
  count(*) as count_available_items
from
  item as i
  left outer join location as loc on loc.code = i.location_code
  left outer join location_name as ln on ln.location_id = loc.id
  left outer join branch as br on br.code_num = loc.branch_code_num
  left outer join branch_name as bn on bn.branch_id = br.id
where
  i.item_status_code = '-'
group by
  i.location_code,
  ln.name,
  loc.branch_code_num,
  branch_name
order by
  loc.branch_code_num
"""

# branch_code_num_names: branch code number -> branch name lookup.
# TODO: not registered as a canned query in the metadata yet -- add it under
# the name 'branch_code_num_names' if it should be published.
sql_branch_code_num_names = """\
select
  br.code_num,
  bn.name

from
  branch as br
  join branch_name as bn on bn.branch_id = br.id
"""


# Canned query: rows from `bib` whose indexed_subjects LIKE-matches all three of
# the :subject1, :subject2 and :subject3 parameters. Returns title, author,
# publish_year (cast to integer), the subjects, and a catalog link built from
# bib_record_num followed by '170'.
# NOTE(review): trim() is applied after the value is wrapped in '%', so it can
# never strip whitespace from what the user typed -- confirm whether trimming
# the parameter itself was intended.
sql_search_titles_by_multiple_subjects = """\
with bib_search as (
  select
    bib_record_num,
    best_title,
    best_author,
    publish_year,
    indexed_subjects
  from
    bib
  where
    indexed_subjects like trim(lower('%' || :subject1 || '%'))
    and indexed_subjects like trim(lower('%' || :subject2 || '%'))
    and indexed_subjects like trim(lower('%' || :subject3 || '%'))
)
select
  b.best_title,
  b.best_author,
  cast (b.publish_year as integer) as publish_year,
  b.indexed_subjects,
  'https://cincinnatilibrary.bibliocommons.com/item/show/' || b.bib_record_num || '170' as catalog_link
from
  bib_search as b
"""


# Canned query: for the branch given by :branch_code_num, count the items with
# item_status_code '-' at each of that branch's location codes, and look up
# each location's name with a scalar subquery.
sql_find_locations_with_available_items_given_branch_code_num = """\
with location_data as (
  select
    i.location_code,
    count(*) as count_available_items
  from
    item as i 
  where
    i.item_status_code = '-'
    AND i.location_code in (
      select
        code
      from
        location as loc
      where
        loc.branch_code_num = :branch_code_num
    )
  group by
    i.location_code
)
select
  location_code,
  (
    select
      ln.name
    from
      location as loc
      left outer join location_name as ln on ln.location_id = loc.id
    where
      loc.code = d.location_code
  ) as location_name,
  count_available_items
  from
    location_data as d
"""


# Canned query: for bibs that have a phrase_entry row with index_tag 'd' whose
# index_entry LIKE-matches :subject, sum checkout_total + renewal_total over the
# bib's items and list the titles by that total, highest first.
sql_top_circulating_by_subject = """\
select
  bib.best_title,
  bib.best_author,
  cast(bib.publish_year as integer) as publish_year,
  sum(checkout_total + renewal_total) as total_circ,
  'https://cincinnatilibrary.bibliocommons.com/item/show/' || bib.bib_record_num || '170' as catalog_link
from
  bib
  join item as i on i.bib_record_num = bib.bib_record_num
where
  bib.bib_record_num in (
    select
      r.record_num
    from
      phrase_entry as e
      join record_metadata as r on r.id = e.record_id
    where
      e.index_tag = 'd'
      and index_entry LIKE lower('%' || :subject || '%')
  )
group by
  bib.best_title,
  bib.best_author,
  bib.publish_year,
  bib.bib_record_num
order by
  total_circ DESC
"""

# Canned query: all bib and item columns for the item whose barcode equals the
# :barcode parameter after it has been trimmed and upper-cased.
sql_item_lookup_by_barcode = """\
select 
  bib.*,
  item.* 
from
  item
  join bib on bib.bib_record_num = item.bib_record_num
where
  barcode = upper(trim(:barcode))
"""


# Canned query: per pickup location, summarize holds_shelf rows whose
# modified_epoch falls in the month given by :start_date. '-01' is appended to
# the parameter, so it is expected in 'YYYY-MM' form. Output: average days from
# placed_epoch to modified_epoch, item count, distinct patron-hash count, items
# per patron, and a link to the per-branch drill-down query.
# NOTE(review): the month window is computed with strftime('%s', ...) without
# 'localtime', while the displayed dates use 'localtime' -- confirm the
# boundary handling is what is wanted.
sql_holds_ready_for_pickup_by_month = """\
-- calculate number of items and patrons with items "ready for pickup" on holdshelf per month
with hold_shelf_data as (
  select
    date(modified_epoch, 'unixepoch', 'localtime') as date_hold_on_holdshelf,
    date(placed_epoch, 'unixepoch', 'localtime') as date_hold_placed,
    cast(
      round((modified_epoch - placed_epoch) / 86400.0) as integer
    ) as days_to_holdshelf,
    s_location_code as item_source_location_code,
    record_id,
    patron_record_hash,
    pickup_location_code
  from
    holds_shelf
  where
    modified_epoch >= CAST(strftime('%s', :start_date || '-01') AS INT)
    and modified_epoch < CAST(
      strftime('%s', DATE(:start_date || '-01', '+1 months')) AS INT
    )
)
select
  strftime('%Y-%m', :start_date || '-01') as month,
  -- pickup_location_code,
  coalesce(branch_name.name, pickup_location_code) as pickup_location,
  round(avg(days_to_holdshelf), 2) as avg_days_to_holdshelf,
  count(record_id) as count_items,
  count(DISTINCT patron_record_hash) as count_distinct_patrons,
  round(
    (
      count(record_id) * 1.0 / count(DISTINCT patron_record_hash) * 1.0
    ),
    2
  ) as avg_items_per_patron,
  'https://ilsweb.cincinnatilibrary.org/collection-analysis/current_collection/holds_ready_for_pickup_by_month_branch?start_date=' || :start_date || '&branch_name=' || replace(
    coalesce(branch_name.name, pickup_location_code),
    ' ',
    '%20'
  ) || '&_hide_sql=1' as holds_ready_for_pickup_by_month_branch
from
  hold_shelf_data
  left outer join "location" on "location".code = hold_shelf_data.pickup_location_code
  left outer join branch on branch.code_num = "location".branch_code_num
  left outer join branch_name on branch_name.branch_id = branch.id
group by
  1,
  2
"""

# Canned query (drill-down from the per-month summary): for the branch named by
# :branch_name and the month given by :start_date ('YYYY-MM', since '-01' is
# appended), summarize holds_shelf rows per item format. Holds whose item row is
# missing are reported as 'Not Available / Deleted'. Each row carries a link to
# the item-level drill-down query.
# NOTE(review): the link concatenates item_format directly; if that resolves to
# the un-coalesced column, the link is presumably NULL for the
# 'Not Available / Deleted' row -- confirm.
sql_holds_ready_for_pickup_by_month_branch = """\
with hold_shelf_data as (
  select
    date(modified_epoch, 'unixepoch', 'localtime') as date_hold_on_holdshelf,
    date(placed_epoch, 'unixepoch', 'localtime') as date_hold_placed,
    cast(
      round((modified_epoch - placed_epoch) / 86400.0) as integer
    ) as days_to_holdshelf,
    s_location_code as item_source_location_code,
    item.item_format,
    record_num,
    patron_record_hash,
    pickup_location_code
  from
    holds_shelf as hold_shelf
    left outer join item on item.item_record_num = hold_shelf.record_num
  where
    modified_epoch >= CAST(strftime('%s', :start_date || '-01') AS INT)
    and modified_epoch < CAST(
      strftime('%s', DATE(:start_date || '-01', '+1 months')) AS INT
    )
)
select
  strftime('%Y-%m', :start_date || '-01') as month,
  coalesce(item_format, 'Not Available / Deleted') as item_format,
  round(avg(days_to_holdshelf), 2) as avg_days_to_holdshelf,
  count(record_num) as count_items,
  count(DISTINCT patron_record_hash) as count_distinct_patrons,
  round(
    (
      count(record_num) * 1.0 / count(DISTINCT patron_record_hash) * 1.0
    ),
    2
  ) as avg_items_per_patron,
  'https://ilsweb.cincinnatilibrary.org/collection-analysis/current_collection/holds_ready_for_pickup_by_month_branch_items?branch_name=' || replace(:branch_name, ' ', '%20') || '&start_date=' || :start_date || '&item_format=' || replace(item_format, ' ', '%20') || '&_hide_sql=1' as holds_ready_for_pickup_by_month_branch_items
from
  hold_shelf_data
where
  hold_shelf_data.pickup_location_code in (
    select
      "location".code
    from
      "location"
      left outer join branch on branch.code_num = "location".branch_code_num
      left outer join branch_name on branch_name.branch_id = branch.id
    where
      branch_name.name = :branch_name
  )
group by
  1,
  2
order by
  item_format
"""


# Canned query (item-level drill-down): individual holds_shelf rows for the
# branch named by :branch_name, the month given by :start_date ('YYYY-MM', since
# '-01' is appended) and the item format :item_format. For each hold it shows
# the branch the item came from (looked up from s_location_code), the placed /
# on-holdshelf dates, days to holdshelf, item and bib record numbers and title.
# Rows whose source branch equals :branch_name sort first (a leading space is
# prepended to the sort key), then by days_to_holdshelf.
sql_holds_ready_for_pickup_by_month_branch_items = """\
with hold_shelf_data as (
  select
    date(modified_epoch, 'unixepoch', 'localtime') as date_hold_on_holdshelf,
    date(placed_epoch, 'unixepoch', 'localtime') as date_hold_placed,
    cast(
      round((modified_epoch - placed_epoch) / 86400.0) as integer
    ) as days_to_holdshelf,
    s_location_code as item_source_location_code,
    item.item_format,
    item.bib_record_num,
    record_num as item_record_num,
    patron_record_hash,
    pickup_location_code
  from
    holds_shelf
    left outer join item as item on item.item_record_num = holds_shelf.record_num
  where
    modified_epoch >= CAST(strftime('%s', :start_date || '-01') AS INT)
    and modified_epoch < CAST(
      strftime('%s', DATE(:start_date || '-01', '+1 months')) AS INT
    )
)
select
  strftime('%Y-%m', :start_date || '-01') as month,
  :branch_name as branch_name,
  --item_source_location_code,
  (
    select
      --case
      --  when branch_name.name = :branch_name then ' ' || branch_name.name
      --  else coalesce(branch_name.name, item_source_location_code, '')
      --end
      coalesce(branch_name.name, item_source_location_code, '')
    from
      "location"
      left outer join branch on branch.code_num = "location".branch_code_num
      left outer join branch_name on branch_name.branch_id = branch.id
    where
      "location".code = hold_shelf_data.item_source_location_code
    limit
      1
  ) as item_source_branch_name,
  coalesce(
    hold_shelf_data.item_format,
    'Not Available / Deleted'
  ) as item_format,
  hold_shelf_data.date_hold_placed,
  hold_shelf_data.date_hold_on_holdshelf,
  hold_shelf_data.days_to_holdshelf,
  hold_shelf_data.item_record_num,
  hold_shelf_data.bib_record_num,
  (
    select
      best_title
    from
      bib
    where
      bib.bib_record_num = hold_shelf_data.bib_record_num
    limit
      1
  ) as best_title
from
  hold_shelf_data -- left outer join current_collection.bib as bib on bib.bib_record_num = hold_shelf_data.bib_record_num
where
  hold_shelf_data.pickup_location_code in (
    select
      "location".code
    from
      "location"
      left outer join branch on branch.code_num = "location".branch_code_num
      left outer join branch_name on branch_name.branch_id = branch.id
    where
      branch_name.name = :branch_name
  )
  and hold_shelf_data.item_format = :item_format
order by
  branch_name,
  -- use the trick of placing a ' ' in front of the name to get the branch name to sort to the top
  case
    when item_source_branch_name = :branch_name then ' ' || item_source_branch_name
    else item_source_branch_name
  end,
  days_to_holdshelf
"""


# Metadata document for the 'current_collection' database: site title, source
# link, description, extra CSS, FTS settings for the `bib` table, and the
# canned queries. Every canned query uses its own name as its title, so the
# 'queries' mapping is generated from (name, sql) pairs in publication order.
json_metadata = {
    'title': 'Current Collection Data Set',
    'source_url': 'https://ilsweb.cincinnatilibrary.org/collection-analysis-docs/',
    'description_html': description_html,
    'extra_css_urls': ['/static/my.css'],
    'databases': {
        'current_collection': {
            'tables': {
                'bib': {'fts_table': 'bib_fts', 'search_mode': 'raw'},
            },
            'queries': {
                query_name: {'sql': query_sql, 'title': query_name}
                for query_name, query_sql in [
                    (
                        'location_code_branch_name_available_items',
                        sql_location_code_branch_name_available_items,
                    ),
                    (
                        'holds_ready_for_pickup_by_month',
                        sql_holds_ready_for_pickup_by_month,
                    ),
                    (
                        'holds_ready_for_pickup_by_month_branch',
                        sql_holds_ready_for_pickup_by_month_branch,
                    ),
                    (
                        'holds_ready_for_pickup_by_month_branch_items',
                        sql_holds_ready_for_pickup_by_month_branch_items,
                    ),
                    (
                        'search_titles_by_multiple_subjects',
                        sql_search_titles_by_multiple_subjects,
                    ),
                    (
                        'find_locations_with_available_items_given_branch_code_num',
                        sql_find_locations_with_available_items_given_branch_code_num,
                    ),
                    (
                        'top_circulating_by_subject',
                        sql_top_circulating_by_subject,
                    ),
                    (
                        'item_lookup_by_barcode',
                        sql_item_lookup_by_barcode,
                    ),
                ]
            },
        },
    },
}

# serialize the metadata dictionary to metadata.json in the working directory
with open('metadata.json', 'w') as metadata_file:
    json.dump(json_metadata, metadata_file)

*Note*:

Right now, I'm managing the old snapshots in a folder on the ilsaux2 server `/home/plchuser/output/collection-analysis/datasette_hosted_databases/`

* * * 

**from the ilsaux2 server:**

```bash
rsync -Pav \
    plchuser@ilsweb.cincinnatilibrary.org://home/plchuser/data/collection-analysis/ \
    /home/plchuser/output/collection-analysis/datasette_hosted_databases/
```

* * * 

Next, you may have to remove the oldest snapshot to make room on the `ilsweb` server

**from the ilsweb server:**

for example:

```bash
rm collection-2021-03-22.db
```

* * * 

sync the output from this script 

**from the ilsaux2 server:**

```bash
rsync -Pav /home/plchuser/output/jupyter/collection-analysis/metadata.json plchuser@ilsweb.cincinnatilibrary.org://home/plchuser/data/collection-analysis/metadata.json
```

for example ...

```bash
rsync -Pav /home/plchuser/output/jupyter/collection-analysis/current_collection.db plchuser@ilsweb.cincinnatilibrary.org://home/plchuser/data/collection-analysis/collection-2021-08-23.db
```

* * *

link the correct files:

**from the ilsweb server:**

```bash
rm current_collection.db
rm collection_prev.db
```

link the new files...

```bash
ln collection-2021-08-23.db current_collection.db
ln collection-2021-08-16.db collection_prev.db
```

Edit the `metadata.yaml` file, and change the date

```bash
nano metadata.yaml
```

restart datasette (remember to reactivate the env)

```text
(venv) plchuser@ilsweb:~/data/collection-analysis$ ./start_datasette.sh
```

**Note**
To remove a page from the htcache (if the page is stuck with an old cached version), run this command to clean it:

```bash
sudo htcacheclean -v -p/var/cache/apache2/mod_cache_disk/ "https://ilsweb.cincinnatilibrary.org:443/collection-analysis/?"
```

To remove all entries, use this:

```bash
sudo htcacheclean -v -p/var/cache/apache2/mod_cache_disk/ -r -l1k
```