In [1]:
# Show the directory the notebook is running from.
!pwd

/home/plchuser/output/jupyter/collection-analysis


In [2]:
!pip install -U pip
!pip install -U sqlite-utils



In [3]:
# IMPORTANT!!!
# make sure that you run the backup first!
# /home/plchuser/.backup/plch-ilsaux2-collection-analysis.sh

In [4]:
import os
import re
from decimal import Decimal

import numpy as np
import pandas as pd
import sqlite_utils
from sqlalchemy import create_engine, text
from sqlalchemy.types import Integer, BigInteger, Numeric

import vars

# Local SQLite database that the cells below write their tables into.
engine = create_engine('sqlite:///current_collection.db', echo=False)

# Source database (Sierra, PostgreSQL). Credentials come from the local
# `vars` module. The dialect is spelled `postgresql://` because SQLAlchemy 1.4
# removed the legacy `postgres://` alias.
# NOTE(review): the credentials are interpolated verbatim; a password that
# contains URL-reserved characters (e.g. '@', '/', ':') would need to be
# percent-encoded first.
sierra_engine = create_engine(
    'postgresql://{}:{}@sierra-db.plch.net:1032/iii'.format(
        vars.pg_username,
        vars.pg_password,
    )
)

# Directory holding the exported snapshot files. The path is absolute, so it
# is used as-is (joining it onto the working directory would discard the
# working directory anyway).
collection_file_path = '/home/plchuser/output/collection-analysis/'

# Snapshot file names start with a YYYY-MM-DD date followed by the record kind.
item_re = re.compile(r"^[0-9]{4}\-[0-9]{2}\-[0-9]{2}\-plch\-item\.csv\.xz")
bib_re = re.compile(r"^[0-9]{4}\-[0-9]{2}\-[0-9]{2}\-plch\-bib\.csv\.xz")

In [5]:
# Relative paths used below (such as the SQLite file name) resolve against
# this working directory.
working_directory = os.getcwd()
print(working_directory)

/home/plchuser/output/jupyter/collection-analysis


In [6]:
# REMOVE and refresh the sqlite database file in the local directory.
# Only a missing file is ignored: any other failure (for example a permission
# error) is raised, because continuing would leave the old file in place and
# the cells below would write into stale data.
try:
    os.remove('current_collection.db')
except FileNotFoundError:
    pass

# Create the empty database file; the descriptor is closed immediately.
os.close(os.open('current_collection.db', os.O_CREAT))

In [7]:
# Every file name in the snapshot directory, in descending order. Names that
# match the snapshot patterns begin with a date, so the newest of them sort
# first.
collection_files = sorted(os.listdir(collection_file_path), reverse=True)

In [8]:
# convert price to integer values
# Matches every character that is not an ASCII digit (currency symbols,
# thousands separators, the decimal point, whitespace, signs).
numbers_only_re = re.compile('[^0-9]')

def price_to_int(price):
    """Convert a price string to an integer built from its digits only.

    Every non-digit character is removed and the remaining digits are read
    as one integer, e.g. ``'$1,234.56'`` -> ``123456``.

    NOTE(review): because the decimal point is simply dropped, the result is
    a count of cents only if the input always carries exactly two decimal
    places; a sign character is discarded as well. Confirm against the
    export format.

    Parameters
    ----------
    price : str
        Raw price text, as handed over by ``pandas.read_csv`` converters.

    Returns
    -------
    int
        The digits of ``price`` as an integer, or ``0`` when ``price``
        contains no digits (for example an empty field), instead of raising
        ``ValueError`` from ``int('')``.
    """
    digits = numbers_only_re.sub('', price)
    return int(digits) if digits else 0

In [9]:
# The list is sorted in descending order, so the first name matching the item
# pattern is taken as the snapshot to load.
matching_item_files = list(filter(item_re.match, collection_files))
item_file = matching_item_files[0]
print(item_file)

2021-07-19-plch-item.csv.xz


In [10]:
# this was here to generate the database for the first snapshot of the year
# item_file = '2020-01-06-plch-item.csv.xz'

In [11]:
# Load the pipe-delimited, xz-compressed item export. The `price` column is
# run through price_to_int while parsing.
item_csv_path = os.path.join(collection_file_path, item_file)
df = pd.read_csv(
    item_csv_path,
    sep='|',
    compression='xz',
    converters={'price': price_to_int},
    # nrows=100  (uncomment for a quick trial run)
)

In [12]:
# The converted price column is stored under the name `price_cents`.
df = df.rename({'price': 'price_cents'}, axis='columns')

In [13]:
# Write the item frame to the SQLite table `item`, replacing any existing
# table, in batches of 10000 rows.
df.to_sql(
    'item',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
)

In [14]:
# The list is sorted in descending order, so the first name matching the bib
# pattern is taken as the snapshot to load.
matching_bib_files = list(filter(bib_re.match, collection_files))
bib_file = matching_bib_files[0]
print(bib_file)

2021-07-19-plch-bib.csv.xz


In [15]:
# this was here to generate the database for the first snapshot of the year
# bib_file = '2020-01-06-plch-bib.csv.xz'

In [16]:
# Load the pipe-delimited, xz-compressed bib export ...
bib_csv_path = os.path.join(collection_file_path, bib_file)
df = pd.read_csv(bib_csv_path, sep='|', compression='xz')

# ... and write it to the SQLite table `bib`, replacing any existing table.
# `publish_year` is declared as an integer column.
bib_column_types = {'publish_year': Integer()}
df.to_sql(
    'bib',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
    dtype=bib_column_types,
)

In [17]:
# indexes to create on the `bib` and `item` tables
sql = """\
CREATE INDEX IF NOT EXISTS "idx_bib_bib_record_num" ON "bib" (
    "bib_record_num"
);
CREATE INDEX IF NOT EXISTS "idx_bib_indexed_subjects" on bib (
    "indexed_subjects"
);
CREATE INDEX IF NOT EXISTS "idx_item_item_format_item_status_location_code_item_callnumber" ON "item" (
    "location_code",
    "item_format",
    "item_status_code",
    "item_callnumber"
);
CREATE INDEX IF NOT EXISTS "idx_item_bib_record_num" ON "item" (
    "bib_record_num"
);
CREATE INDEX IF NOT EXISTS "idx_item_item_record_num" ON "item" (
    "item_record_num"
);
CREATE INDEX IF NOT EXISTS "idx_item_agency_code_num_location_code" ON "item" (
    "agency_code_num",
    "location_code"
);
CREATE INDEX IF NOT EXISTS "idx_item_barcode" ON "item" (
    "barcode"
);
CREATE INDEX IF NOT EXISTS "idx_item_item_format" ON "item" (
    "item_format"
);
"""
# Run the statements one at a time:
# - text() is required because SQLAlchemy 2.0 no longer executes plain strings
# - engine.begin() commits when the block exits without an error
# - the whitespace-only fragment after the final ';' is skipped
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

In [18]:
# bib_record
# Copy sierra_view.bib_record (records with an empty campus_code) into the
# SQLite table `bib_record`, one page of rows at a time.

# rows requested per query; the offset advances by the same amount
page_size = 1000000

sql = """\
SELECT
b.id,
b.record_id,
b.language_code,
b.bcode1,
b.bcode2,
b.bcode3,
b.country_code,
b.index_change_count,
b.is_on_course_reserve,
b.is_right_result_exact,
b.allocation_rule_code,
b.skip_num,
b.cataloging_date_gmt,
b.marc_type_code,
b.is_suppressed

FROM
sierra_view.bib_record as b

JOIN
sierra_view.record_metadata as r on r.id = b.record_id

WHERE
r.campus_code = ''

ORDER BY
b.id

LIMIT {limit} OFFSET {offset}
"""

# start the offset at 0, then add page_size to the offset after every page
offset = 0
while True:
    df = pd.read_sql(sql=sql.format(limit=page_size, offset=offset), con=sierra_engine)
    print(offset, df.shape[0], sep=' ', end=', ')

    # an empty page means every row has been copied
    if df.shape[0] == 0:
        break

    df.to_sql(
        name='bib_record',
        con=engine,
        index=False,
        # The first page replaces a table left over from an earlier run of
        # this cell (dropping the table also drops its indexes); later pages
        # append. This keeps the cell re-runnable without duplicating rows.
        if_exists='replace' if offset == 0 else 'append',
        chunksize=10000,
        dtype={
            'id': Integer(),
            'record_id': Integer(),
            'index_change_count': Integer(),
            'skip_num': Integer(),
        },
    )

    offset += page_size


sql = """\
CREATE INDEX bib_record_bcode3_idx ON bib_record (bcode3);
CREATE INDEX bib_record_bib_level_idx ON bib_record (bcode1);
CREATE INDEX bib_record_country_idx ON bib_record (country_code);
CREATE INDEX bib_record_lang_idx ON bib_record (language_code);
CREATE INDEX bib_record_material_type_idx ON bib_record (bcode2);
CREATE UNIQUE INDEX bib_record_record_key ON bib_record (record_id);
CREATE INDEX idx_bib_record_cataloging_date ON bib_record (cataloging_date_gmt);
CREATE UNIQUE INDEX pk_bib_record ON bib_record (id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

0 1000000, 1000000 1000000, 2000000 18964, 3000000 0, 

In [19]:
# language_property
# Copy the language codes, joined with their display names, into the SQLite
# table `language_property`.

sql = """\
select
p.id,
p.code,
p.display_order,
n.name

from
sierra_view.language_property as p

join
sierra_view.language_property_name as n
on n.language_property_id = p.id

order by id
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

df.to_sql(
    name='language_property',
    con=engine,
    index=False,
    # 'replace' (as used for the other lookup tables) keeps this cell
    # re-runnable: appending would duplicate rows and make the unique
    # indexes below fail on a second run.
    if_exists='replace',
    chunksize=10000,
    dtype={
        'id': Integer(),
        'display_order': Integer(),
    },
)


sql = """\
CREATE UNIQUE INDEX language_property_code_key ON language_property (code);
CREATE UNIQUE INDEX pk_language_property ON language_property (id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

In [20]:
# sierra_view.record_metadata
# Copy bib, item and volume record metadata into the SQLite table
# `record_metadata` in two passes: live records first, then deleted ones.

sql_template = """\
select
r.id,
r.record_type_code,
r.record_num,
date(r.creation_date_gmt) as creation_date_gmt,
date(r.deletion_date_gmt) as deletion_date_gmt,
r.campus_code,
r.agency_code_num,
r.record_last_updated_gmt

FROM
sierra_view.record_metadata as r

WHERE
r.campus_code = ''
-- started grabbing the deleted record data 2021-03-15
AND r.deletion_date_gmt {deletion_filter}
AND r.record_type_code in ('b', 'i', 'j' ) -- bibliographic, item, volume
"""

# The deleted records are fetched as a separate second pass since loading
# everything at once may exceed our memory limits.
# NOTE: the first pass "replaces" the table ... the second "appends" to it.
passes = (
    ('IS NULL', 'replace'),      # records that have not been deleted
    ('IS NOT NULL', 'append'),   # deleted records
)
for deletion_filter, if_exists in passes:
    df = pd.read_sql(
        sql=sql_template.format(deletion_filter=deletion_filter),
        con=sierra_engine,
    )

    # output to the sqlite db
    df.to_sql(
        name='record_metadata',
        index=False,
        if_exists=if_exists,
        con=engine,
        chunksize=10000,
    )


sql = """\
CREATE INDEX idx_record_metadata_id_record_last_updated ON record_metadata (id, record_last_updated_gmt);
CREATE INDEX idx_record_metadata_record_creation_date_gmt ON record_metadata (creation_date_gmt);
CREATE INDEX idx_record_metadata_record_num ON record_metadata (record_num);
CREATE UNIQUE INDEX pk_record_id ON record_metadata (id);
CREATE UNIQUE INDEX record_id_unique_constraint ON record_metadata (record_type_code, record_num, campus_code);
CREATE INDEX record_metadata_last_modified ON record_metadata (record_last_updated_gmt, record_type_code, id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

In [21]:
# bib_record_item_record_link
# Copy the bib <-> item links, with the record numbers of both sides resolved
# through record_metadata, into SQLite.
sql = """
select 
l.id,
l.bib_record_id,
r.record_num as bib_record_num,
l.item_record_id,
ir.record_num as item_record_num,
l.items_display_order,
l.bibs_display_order

from
sierra_view.bib_record_item_record_link as l

join sierra_view.record_metadata as r on r.id = l.bib_record_id

join sierra_view.record_metadata as ir on ir.id = l.item_record_id
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'bib_record_item_record_link',
    con=engine,
    index=False,
    if_exists='replace',
    # write in batches, as the other large tables in this notebook do,
    # rather than sending every row in a single insert
    chunksize=10000,
    dtype={
        'id': Integer(),
        'bib_record_id': BigInteger(),
        'bib_record_num': Integer(),
        'item_record_id': BigInteger(),
        'item_record_num': Integer(),
        'items_display_order': Integer(),
        'bibs_display_order': Integer(),
    }
)

sql = """\
CREATE INDEX idx_bib_record_item_record_link_bib_record_id ON bib_record_item_record_link (bib_record_id);
CREATE INDEX idx_bib_record_item_record_link_bib_record_num ON bib_record_item_record_link (bib_record_num);
CREATE INDEX item_record_id_index ON bib_record_item_record_link (item_record_id);
CREATE INDEX item_record_num_index ON bib_record_item_record_link (item_record_num);
CREATE UNIQUE INDEX pk_bib_record_item_record_link ON bib_record_item_record_link (id);
CREATE UNIQUE INDEX uc_bib_record_item_record_link ON bib_record_item_record_link (bib_record_id, item_record_id);
CREATE INDEX ucn_bib_record_item_record_link ON bib_record_item_record_link (bib_record_num, item_record_num);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

print(df.shape[0])
print(df.head())

6016964
         id  bib_record_id  bib_record_num  item_record_id  item_record_num  \
0   1481488   420908780724         1985716    450972577188          1011108   
1   1489279   420908616008         1821000    450972584979          1018899   
2  11451750   420907809642         1014634    450982078657         10512577   
3  12148146   420907809576         1014568    450982706328         11140248   
4  11791765   420907809080         1014072    450982382162         10816082   

   items_display_order  bibs_display_order  
0                  0.0                   0  
1                  0.0                   0  
2               1083.0                   0  
3               7019.0                   0  
4               1710.0                   0  


In [22]:
# volume_record_item_record_link
# Copy the volume <-> item links into SQLite. Each row also carries the
# volume's 'v' varfields joined into one `volume_statement` string.
sql = """
select
l.id,
l.volume_record_id,
vr.record_num as volume_record_num,
l.item_record_id,
ir.record_num as item_record_num,
l.items_display_order,
(
    select
    string_agg(v.field_content, ', ' order by occ_num)

    from
    sierra_view.varfield as v

    where
    v.record_id = l.volume_record_id
    and v.varfield_type_code = 'v'
) as volume_statement

from
sierra_view.volume_record_item_record_link as l
join sierra_view.record_metadata as vr on vr.id = l.volume_record_id
join sierra_view.record_metadata as ir on ir.id = l.item_record_id
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'volume_record_item_record_link',
    con=engine,
    index=False,
    if_exists='replace',
    # write in batches, as the other large tables in this notebook do,
    # rather than sending every row in a single insert
    chunksize=10000,
    dtype={
        'id': Integer(),
        'volume_record_id': BigInteger(),
        'volume_record_num': Integer(),
        'item_record_id': BigInteger(),
        'item_record_num': Integer(),
        'items_display_order': Integer(),
    }
)

sql = """\
CREATE INDEX idx_volume_record_item_record_link_volume_record_id ON volume_record_item_record_link (volume_record_id);
CREATE INDEX idx_volume_record_item_record_link_volume_record_num ON volume_record_item_record_link (volume_record_num);
CREATE UNIQUE INDEX pk_volume_record_item_record_link ON volume_record_item_record_link (id);
CREATE UNIQUE INDEX uc_volume_record_item_record_link ON volume_record_item_record_link (item_record_id);
CREATE INDEX volume_record_item_record_link_item_id_volume_id ON volume_record_item_record_link (item_record_id, volume_record_id);
CREATE INDEX volume_record_item_record_link_item_num_volume_num ON volume_record_item_record_link (item_record_num, volume_record_num);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

print(df.shape[0])
print(df.head())

680713
        id  volume_record_id  volume_record_num  item_record_id  \
0   358209      455267826131            1292755    450974557246   
1   310769      455267788599            1255223    450974381522   
2   249342      455267735960            1202584    450974131592   
3  1249534      455267990013            1456637    450980250593   
4  1635327      455268036961            1503585    450982043103   

   item_record_num  items_display_order volume_statement  
0          2991166                  NaN     1987 v.00.01  
1          2815442                  NaN             v.09  
2          2565512                  NaN             v.02  
3          8684513                  NaN             v.17  
4         10477023                  NaN             v.01  


In [23]:
# phrase_entry
# Copy sierra_view.phrase_entry rows for live bib, item and volume records
# into the SQLite table `phrase_entry`, one page of rows at a time.

# NOTE(review): an earlier TODO here said this load would be skipped "for
# now, since there doesn't seem to be a big benefit to this", but the cell
# does run the load. Confirm whether it should stay.

# index_tag values of interest; only 'd' is loaded at the moment:
# "d": "subject"
# "a": "author",
# "t": "title",
# "o": "ocolc",
# "c": "callnumber",
# "i": "isbn",

# rows requested per query; the offset advances by the same amount
page_size = 1000000

sql = """\
SELECT 
e.id,
e.record_id,
e.index_tag,
e.varfield_type_code,
e.occurrence,
e.is_permuted,
e.type2,
e.type3,
e.index_entry,
e.insert_title,
e.phrase_rule_rule_num,
e.phrase_rule_operation,
e.phrase_rule_subfield_list,
e.original_content,
e.parent_record_id,
e.insert_title_tag,
e.insert_title_occ

FROM sierra_view.phrase_entry as e

JOIN
sierra_view.record_metadata as r
ON
  r.id = e.record_id
  
WHERE
e.index_tag in (
    'd'
    -- add these back later maybe
    -- , 'a', 't', 'o', 'c', 'i'
)
AND r.campus_code = ''
AND r.deletion_date_gmt IS NULL
AND r.record_type_code in ('b', 'i', 'j') -- bibliographic, item, volume

ORDER BY
id

LIMIT {limit} OFFSET {offset}
"""

# start the offset at 0, then add page_size to the offset after every page
offset = 0
while True:
    df = pd.read_sql(sql=sql.format(limit=page_size, offset=offset), con=sierra_engine)
    print(offset, df.shape[0], sep=' ', end=', ')

    # an empty page means every row has been copied
    if df.shape[0] == 0:
        break

    df.to_sql(
        name='phrase_entry',
        con=engine,
        index=False,
        # The first page replaces a table left over from an earlier run of
        # this cell (dropping the table also drops its indexes); later pages
        # append. This keeps the cell re-runnable without duplicating rows.
        if_exists='replace' if offset == 0 else 'append',
        chunksize=10000,
        dtype={
            'id': Integer(),
            'record_id': Integer(),
            'occurrence': Integer(),
            'type2': Integer(),
            'phrase_rule_rule_num': Integer(),
            'parent_record_id': Integer(),
            'insert_title_occ': Integer(),
        },
    )

    offset += page_size

# NOTE(review): idx_phrase_entry_record and idx_phrase_entry_record_key both
# index (record_id) alone; one of them looks redundant. Confirm before
# removing either.
sql = """\
CREATE INDEX idx_phrase_entry ON phrase_entry (((index_tag || index_entry)), type2, insert_title, record_id);
CREATE INDEX idx_phrase_entry_parent_record_id ON phrase_entry (parent_record_id);
CREATE INDEX idx_phrase_entry_record ON phrase_entry (record_id);
CREATE INDEX idx_phrase_entry_record_key ON phrase_entry (record_id);
CREATE UNIQUE INDEX pk_phrase_entry ON phrase_entry (id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

0 1000000, 1000000 1000000, 2000000 1000000, 3000000 1000000, 4000000 1000000, 5000000 1000000, 6000000 1000000, 7000000 63431, 8000000 0, 

In [24]:
# location
# Copy sierra_view.location into the SQLite table `location`.

sql = """\
SELECT
id,
code,
branch_code_num,
parent_location_code,
is_public,
is_requestable
FROM
sierra_view.location;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'location',
    con=engine,
    index=False,
    if_exists='replace',
    # Only the numeric columns are forced to INTEGER. `code` and
    # `parent_location_code` hold text codes (e.g. 'fotab'), so they are left
    # to pandas' default text mapping; an INTEGER column would convert any
    # code that happens to look numeric.
    dtype={
        'id': Integer(),
        'branch_code_num': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE INDEX fk9ff58fb55804fddb ON location (branch_code_num);
CREATE UNIQUE INDEX location_code_key ON location (code);
CREATE UNIQUE INDEX pk_location ON location (id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

2981
     id   code  branch_code_num parent_location_code  is_public  \
0   884  fotab             14.0                 None      False   
1    81  y0604              1.0                 None      False   
2  1683  nsjdn             29.0                 None      False   
3    27  avjpl              3.0                 None      False   
4   616  crzzz              9.0                 None      False   

   is_requestable  
0            True  
1            True  
2            True  
3            True  
4            True  


In [25]:
# branch_name
# Copy sierra_view.branch_name into the SQLite table `branch_name`.

sql = """\
SELECT
branch_id,
name,
iii_language_id
FROM
sierra_view.branch_name;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'branch_name',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'branch_id': Integer(),
        'iii_language_id': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE UNIQUE INDEX branch_name_pkey ON branch_name (branch_id, iii_language_id);
CREATE INDEX fk46f5c7085804fddb ON branch_name (branch_id);
CREATE INDEX fk46f5c7088eaffe82 ON branch_name (iii_language_id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

52
   branch_id          name  iii_language_id
0          1  Main Library                1
1          2      Anderson                1
2          3      Avondale                1
3          4      Blue Ash                1
4          5     Bond Hill                1


In [26]:
# branch
# Copy sierra_view.branch into the SQLite table `branch`.

sql = """\
SELECT
id,
address,
email_source,
email_reply_to,
address_latitude,
address_longitude,
code_num

FROM 
sierra_view.branch;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'branch',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'id': Integer(),
        'code_num': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE UNIQUE INDEX pk_branch ON branch (id);
CREATE UNIQUE INDEX uniq_branch_code_num ON branch (code_num);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

52
   id                                            address  \
0  52                                                      
1  50  Main Library$800 Vine Street$Cincinnati, Ohio ...   
2  17  Greenhills Branch$8 Enfield St.$Cincinnati, Oh...   
3  49                                                      
4  44                                                      

                          email_source                       email_reply_to  \
0                                                                             
1  patronnotices@cincinnatilibrary.org  patronnotices@cincinnatilibrary.org   
2  patronnotices@cincinnatilibrary.org  patronnotices@cincinnatilibrary.org   
3                                                                             
4  patronnotices@cincinnatilibrary.org  patronnotices@cincinnatilibrary.org   

  address_latitude address_longitude  code_num  
0       39.1057790       -84.5133140        52  
1       39.2305206       -84.3749388        50  
2       39.268

In [27]:
# country_property_myuser
# Copy the country lookup view, ordered by code, into SQLite.

sql = """
select
*
from

sierra_view.country_property_myuser

order by code
"""

df = pd.read_sql(sql, sierra_engine)

# write results to sqlite db
country_column_types = {'display_order': Integer()}
df.to_sql(
    name='country_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    dtype=country_column_types,
)

# row count, then a preview of the first rows
print(len(df))
print(df.head())

333
  code  display_order                      name
0                   0                No country
1   aa              1                   Albania
2  abc              2                   Alberta
3  aca              3  Australian Capital Terr.
4   ae              4                   Algeria


In [28]:
# item_status_property_myuser
# Copy the item status lookup view, ordered by display_order, into SQLite.

sql = """
select
*
from
sierra_view.item_status_property_myuser
order by display_order

"""

df = pd.read_sql(sql, sierra_engine)

# write results to sqlite db
item_status_column_types = {'display_order': Integer()}
df.to_sql(
    name='item_status_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    dtype=item_status_column_types,
)

# row count, then a preview of the first rows
print(len(df))
print(df.head())

31
  code  display_order                       name
0    !              0               ON HOLDSHELF
1    #              1       SearchOH/OL RECEIVED
2    $              2              LOST AND PAID
3    %              3       SearchOH/OL RETURNED
4    &              4  SearchOH/OHIOLINK REQUEST


In [29]:
# itype_property_myuser
# Copy the item type lookup view into SQLite.

sql = """\
SELECT
code,
display_order,
itype_property_category_id,
physical_format_id,
target_audience_id,
name

FROM 
sierra_view.itype_property_myuser
"""

df = pd.read_sql(sql, sierra_engine)

# write results to sqlite db
itype_column_types = {
    'code': Integer(),
    'display_order': Integer(),
    'physical_format_id': Integer(),
}
df.to_sql(
    name='itype_property_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
    dtype=itype_column_types,
)

# row count, then a preview of the first rows
print(len(df))
print(df.head())

117
   code  display_order itype_property_category_id  physical_format_id  \
0   105             54                       None                 NaN   
1     6              6                       None                 NaN   
2   102             51                       None                 NaN   
3   110             55                       None                 NaN   
4     2              2                       None                 1.0   

  target_audience_id                  name  
0               None            Leased DVD  
1               None           Leased Book  
2               None                Bluray  
3               None  MakerSpace Equipment  
4               None         Juvenile Book  


In [30]:
# physical_format_myuser
# Copy the physical format lookup view, ordered by display_order, into SQLite.

sql = """\
select
*
from
sierra_view.physical_format_myuser
order by display_order
"""

df = pd.read_sql(sql, sierra_engine)

# write results to sqlite db
physical_format_column_types = {
    'id': Integer(),
    'display_order': Integer(),
}
df.to_sql(
    name='physical_format_myuser',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10000,
    dtype=physical_format_column_types,
)

# row count, then a preview of the first rows
print(len(df))
print(df.head())

9
   id  is_default  display_order         name
0   1        True              1         Book
1  29       False              2   Book on CD
2  34       False              3          DVD
3  35       False              4  Large Print
4  33       False              5     Magazine


In [31]:
# bib_level_property_myuser
# Copy the bib level lookup view, ordered by display_order, into SQLite.

sql = """\
select
*
from
sierra_view.bib_level_property_myuser

order by
display_order
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'bib_level_property_myuser',
    con=engine,
    index=False,
    if_exists='replace',
    # The view returns code, display_order and name. There is no `id` column,
    # so the former `id` entry had no effect and has been dropped.
    dtype={
        'display_order': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

8
  code  display_order          name
0    -              0           ---
1    a              1  MONO COMP PT
2    b              2   SER COMP PT
3    c              3    COLLECTION
4    d              4       SUBUNIT


In [32]:
# material_property_myuser
# Copy the material type lookup view, ordered by display_order, into SQLite.

sql = """\
select
*
from
sierra_view.material_property_myuser

order by
display_order
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'material_property_myuser',
    con=engine,
    index=False,
    if_exists='replace',
    # The view returns no `id` column, so the former `id` entry had no effect
    # and has been dropped.
    dtype={
        'display_order': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

28
  code  display_order  is_public material_property_category_id  \
0    -              0       True                          None   
1    1              1       True                          None   
2    2              2       True                          None   
3    3              3       True                          None   
4    4              4       True                          None   

  physical_format_id                    name  
0               None               Undefined  
1               None  Downloadable Audiobook  
2               None       Downloadable Book  
3               None      Downloadable Music  
4               None      Downloadable Video  


In [33]:
# location
# Copy sierra_view.location into the SQLite table `location`.
# NOTE(review): this notebook already contains an identical `location` cell
# earlier on; this one rebuilds the same table a second time and is probably
# redundant.

sql = """\
SELECT
id,
code,
branch_code_num,
parent_location_code,
is_public,
is_requestable
FROM
sierra_view.location;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'location',
    con=engine,
    index=False,
    if_exists='replace',
    # Only the numeric columns are forced to INTEGER. `code` and
    # `parent_location_code` hold text codes (e.g. 'fotab'), so they are left
    # to pandas' default text mapping; an INTEGER column would convert any
    # code that happens to look numeric.
    dtype={
        'id': Integer(),
        'branch_code_num': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())


sql = """\
CREATE INDEX fk9ff58fb55804fddb ON location (branch_code_num);
CREATE UNIQUE INDEX location_code_key ON location (code);
CREATE UNIQUE INDEX pk_location ON location (id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

2981
     id   code  branch_code_num parent_location_code  is_public  \
0   884  fotab             14.0                 None      False   
1    81  y0604              1.0                 None      False   
2  1683  nsjdn             29.0                 None      False   
3    27  avjpl              3.0                 None      False   
4   616  crzzz              9.0                 None      False   

   is_requestable  
0            True  
1            True  
2            True  
3            True  
4            True  


In [34]:
# location_name
# Copy sierra_view.location_name into the SQLite table `location_name`.

sql = """\
SELECT
location_id,
name,
iii_language_id
FROM sierra_view.location_name;
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'location_name',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'location_id': Integer(),
        'iii_language_id': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())

sql = """\
CREATE INDEX fk506824d5399f0cbb ON location_name (location_id);
CREATE INDEX fk506824d58eaffe82 ON location_name (iii_language_id);
CREATE UNIQUE INDEX location_name_pkey ON location_name (location_id, iii_language_id);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

2981
   location_id                                 name  iii_language_id
0          884          Forest Park Teen Audiobooks                1
1         2177         Walnut Hills Television DVDs                1
2         1683  Northside Juvenile New Release DVDs                1
3         2338                Wyoming Foreign Films                1
4          616                   Corryville Cleanup                1


In [35]:
# hold
# Copy every hold into the SQLite table `hold`. Each row carries the record
# the hold is placed on (resolved to a bib record number where possible), a
# readable hold status, and a few non-identifying attributes of the patron.

sql = """\
-- pull all relevant hold data

select
h.id as hold_id,
case
    when r.record_type_code = 'i' then (
        select
        br.record_num
        from
        sierra_view.bib_record_item_record_link as l
        join sierra_view.record_metadata as br on br.id = l.bib_record_id
        where
        l.item_record_id = h.record_id
        limit 1
    )
    when r.record_type_code = 'j' then (
        select
        br.record_num
        from
        sierra_view.bib_record_volume_record_link as l
        join sierra_view.record_metadata as br on br.id = l.bib_record_id
        where
        l.volume_record_id = h.record_id
        limit 1
    )
    when r.record_type_code = 'b' then r.record_num
    else NULL
end as bib_record_num,
r.campus_code,
r.record_type_code as record_type_on_hold,
case
    when r.record_type_code = 'i' then r.record_num
-- i don't think this is really useful, but i may want to come back to this 
-- when r.record_type_code = 'j' then (
-- select
-- ir.record_num
-- from
-- sierra_view.volume_record_item_record_link as l
-- join sierra_view.record_metadata as ir on ir.id = l.item_record_id
-- where
-- l.volume_record_id = h.record_id
-- limit 1
-- )
    else NULL
end as item_record_num,
case
    when r.record_type_code = 'j' then r.record_num
    else NULL
end as volume_record_num,
h.placed_gmt,
h.is_frozen,
h.delay_days,
h.location_code,
h.expires_gmt,
case
when h.status = '0' then 'on hold'
when h.status = 'b' then 'bib hold ready for pickup'
when h.status = 'j' then 'volume hold ready for pickup'
when h.status = 'i' then 'item hold ready for pickup'
when h.status = 't' then 'in transit to pickup location'
else h.status
end as hold_status,
h.is_ir,
h.is_ill,
h.pickup_location_code,
h.ir_pickup_location_code,
h.ir_print_name,
h.ir_delivery_stop_name,
h.is_ir_converted_request,
case
when p.activity_gmt >= (NOW() - '3 years'::INTERVAL) THEN TRUE
else FALSE
end as patron_is_active,
p.ptype_code as patron_ptype_code,
p.home_library_code as patron_home_library_code,
p.mblock_code as patron_mblock_code,
case 
when p.owed_amt > 10.00 then TRUE
else FALSE
end as patron_has_over_10usd_owed
from
sierra_view.hold as h
join sierra_view.record_metadata as r on r.id = h.record_id
left outer join sierra_view.patron_record as p on p.record_id = h.patron_record_id

order by
hold_id
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

# write results to sqlite db
df.to_sql(
    'hold',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={
        'hold_id': Integer(),
        'bib_record_num': Integer(),
        'item_record_num': Integer(),
        # The key must match the query's column name. The former
        # 'vol_record_num' matched nothing, so this column (which contains
        # NULLs and is therefore a float in pandas) was not declared as an
        # integer column.
        'volume_record_num': Integer(),
        'patron_ptype_code': Integer(),
    },
    chunksize=10000
)

print(df.shape[0])
print(df.head())


sql = """\
CREATE UNIQUE INDEX pk_hold ON hold (hold_id);
CREATE UNIQUE INDEX uc_hold_composite ON hold (hold_id, bib_record_num, placed_gmt);
"""

# text() is required by SQLAlchemy 2.0; engine.begin() commits on success; the
# whitespace-only fragment after the final ';' is skipped.
with engine.begin() as con:
    for statement in sql.split(';'):
        if statement.strip():
            con.execute(text(statement))

158907
   hold_id  bib_record_num campus_code record_type_on_hold  item_record_num  \
0  2707601         2621589                               j              NaN   
1  2707604         2621589                               j              NaN   
2  3473847         2621589                               j              NaN   
3  3473848         2621589                               j              NaN   
4  3473856         2823181                               j              NaN   

   volume_record_num                 placed_gmt  is_frozen  delay_days  \
0          1386395.0  2013-01-21 16:11:15-05:00       True         255   
1          1366848.0  2013-01-21 16:11:23-05:00       True         255   
2          1386395.0  2013-03-11 19:09:43-04:00       True         255   
3          1366848.0  2013-03-11 19:09:51-04:00       True         255   
4          1401384.0  2013-03-11 19:10:11-04:00       True         255   

  location_code  ... pickup_location_code ir_pickup_location_code  \
0   

In [36]:
# !rsync -Pav current_collection.db plchuser@ilsweb.cincinnatilibrary.org://home/plchuser/data/collection-analysis/collection-2021-04-12.db

In [37]:
# Show the directory that holds the finished current_collection.db file.
!pwd

/home/plchuser/output/jupyter/collection-analysis


In [38]:
# Enable full text search (fts) on the `bib` table via sqlite-utils, covering
# the columns listed in `fts_columns`.
utils_db = sqlite_utils.Database('current_collection.db')
fts_columns = [
    "best_author",
    "best_title",
    "publisher",
    "publish_year",
    "bib_level_callnumber",
    "indexed_subjects",
]
utils_db["bib"].enable_fts(fts_columns)

<Table bib (bib_record_num, creation_date, record_last_updated, isbn, best_author, best_title, publisher, publish_year, bib_level_callnumber, indexed_subjects)>

In [39]:
# Double check that fts is now enabled: detect_fts() reports the name of the
# fts table attached to `bib`.
bib_table = utils_db["bib"]
bib_table.detect_fts()

'bib_fts'