In [1]:
import pandas as pd
import os
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Text, JSON, Boolean, BigInteger, DateTime

import vars

engine = create_engine(
    'sqlite:///current_orders.db', 
    echo=False
)

sierra_engine = create_engine(
    'postgresql://{}:{}@sierra-db.plch.net:1032/iii'.format(
        vars.pg_username, vars.pg_password
    )
)

# remove the previous database
try:
    os.remove('current_orders.db')
except:
    pass

os.close(os.open('current_orders.db', os.O_CREAT))

In [2]:
%%time
sql = """\
-- build the aggreate cmf data associated with the order record
WITH order_record_cmf_data AS (
    SELECT
    o_r.record_id AS order_record_id,
    count(cmf.id) AS count_cmf,
    count(DISTINCT cmf.fund_code) AS count_distinct_cmf_fund_codes,
    sum(cmf.copies) AS sum_cmf_copies,
    json_agg(
        json_build_object(
            'display_order', cmf.display_order,
            'cmf_id', cmf.id,
            'fund_code_num', fm.code_num,
            'fund_code', fm.code,
            'acct_unit_code_num', au.code_num,
            'copies', cmf.copies,
            'location_code', cmf.location_code
        )
        ORDER BY
        cmf.display_order ASC
    ) AS cmf_data
    FROM 
    sierra_view.order_record AS o_r
    LEFT OUTER JOIN sierra_view.order_record_cmf AS cmf ON cmf.order_record_id = o_r.record_id
    -- it's unfortunate, but it seems like the code number can have TEXT values like 'none' for example..
    -- so, it's necessary to filter those out with a regex
    LEFT OUTER JOIN sierra_view.fund_master AS fm ON fm.code_num = NULLIF(regexp_replace(cmf.fund_code, '[^0-9]*', '', 'g'),'')::int
    LEFT OUTER JOIN sierra_view.accounting_unit AS au ON au.id = fm.accounting_unit_id 
    WHERE 
    cmf.location_code != 'multi'
    GROUP BY 1
)
SELECT
-- build order record data
rm.record_num AS order_record_num,
d.*,
brorl.orders_display_order,
brorl.bib_record_id  AS bib_record_id,
rm.creation_date_gmt,
order_record.accounting_unit_code_num,
(order_record.estimated_price * 100.0) :: INTEGER AS estimated_price_cents,
order_record.form_code AS physical_form_code,
fpn."name" AS physical_form_name,
order_record.order_date_gmt,
order_record.catalog_date_gmt,
order_record.order_type_code,
otpm."name" AS order_type_name,
order_record.received_date_gmt,
order_record.receiving_location_code,
order_record.order_status_code,
ospn."name" AS order_status_name,
order_record.vendor_record_code,
vr.record_id AS vendor_record_id,
order_record.volume_count
FROM 
order_record_cmf_data AS d
LEFT OUTER JOIN sierra_view.order_record AS order_record ON order_record.record_id = d.order_record_id
LEFT OUTER JOIN sierra_view.record_metadata AS rm ON rm.id = order_record.record_id
LEFT OUTER JOIN sierra_view.bib_record_order_record_link AS brorl ON brorl.order_record_id = order_record.record_id	
LEFT OUTER JOIN sierra_view.form_property AS fp ON fp.code = order_record.form_code
LEFT OUTER JOIN sierra_view.form_property_name AS fpn ON fpn.form_property_id = fp.id 
LEFT OUTER JOIN sierra_view.order_type_property_myuser AS otpm ON otpm.code = order_record.order_type_code
LEFT OUTER JOIN sierra_view.order_status_property AS osp ON osp.code = order_record.order_status_code 
LEFT OUTER JOIN sierra_view.order_status_property_name AS ospn ON ospn.order_status_property_id = osp.id
LEFT OUTER JOIN sierra_view.vendor_record AS vr ON vr.code = order_record.vendor_record_code
"""

df = pd.read_sql(sql=sql, con=sierra_engine)

CPU times: user 14 s, sys: 1.64 s, total: 15.7 s
Wall time: 41.8 s


In [3]:
df.to_sql(
    name='orders', 
    index=False, 
    if_exists='replace', 
    con=engine, 
    chunksize=10000,
    dtype={
        'order_record_num': Integer(), 
        'order_record_id': BigInteger(), 
        'count_cmf': Integer(),
        'count_distinct_cmf_fund_codes': Integer(), 
        'sum_cmf_copies': Integer(), 
        'cmf_data': JSON(),
        'orders_display_order': Integer(), 
        'bib_record_id': BigInteger(), 
        'creation_date_gmt': DateTime(),
        'accounting_unit_code_num': Integer(),
        'estimated_price_cents': Integer(),
        'physical_form_code': Text(),
        'physical_form_name': Text(),
        'order_date_gmt': DateTime(),
        'catalog_date_gmt': DateTime(),
        'order_type_code': Text(),
        'order_type_name': Text(),
        'received_date_gmt': DateTime(),
        'receiving_location_code': Text(),
        'order_status_code': Text(),
        'order_status_name': Text(), 
        'vendor_record_code': Text(), 
        'vendor_record_id': BigInteger(),
        'volume_count': Integer(),        
    }
)

In [4]:
# sql = """\
# -- search for a barcode stored in the 'b' tagged varfield for the item...
# SELECT
# items.item_record_id,
# items.item_record_num,
# json_extract(value, '$.field_content') as barcode
# FROM
# items, 
# json_each(items.json_item_varfields)
# WHERE 
# json_extract(value, '$.varfield_type_code') = 'b'
# and json_extract(value, '$.field_content') like '{}'
# """

# pd.read_sql(sql=sql.format('a000073209167'), con=engine).head()

In [5]:
!tar -cvvf - ./current_orders.db | xz -9 -T0 > current_orders.db.tar.xz

-rwxr-xr-x plchuser/plchuser 466841600 2022-01-19 23:28 ./current_orders.db
