## 01_Truncate_Dataset Notebook

Author: Daniel Hui

License: MIT

Description: This notebook takes very large source datasets and outputs truncated datasets, filtered by parameters chosen for the research.

In [2]:
import pandas as pd

## I. Collection Inventory Dataset

In [3]:
# Read the complete collection inventory CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# '../01_Data/...' paths used elsewhere in this notebook -- confirm the intended location.
inventory_csv = '/Users/dhui/Downloads/01_Source_Data/Library_Collection_Inventory.csv'
inventory = pd.read_csv(inventory_csv)

### A. Collection: Truncate by Report Date
Since I will be predicting on books borrowed in 2017, I will use the report dated Jan 1, 2018 to define the library collection as it existed in 2017.

In [4]:
# Show every distinct report date that occurs in the inventory
inventory.loc[:, "ReportDate"].unique()

array(['08/01/2018', '09/01/2018', '10/01/2018', '09/01/2017',
       '10/01/2017', '11/01/2017', '12/01/2017', '01/01/2018',
       '02/01/2018', '03/01/2018', '04/01/2018', '05/01/2018',
       '06/01/2018', '07/01/2018'], dtype=object)

In [5]:
# Choose which inventory report defines the collection snapshot.
#   report_date -- value compared against the "ReportDate" column
#   file_date   -- label used in the export file name later
#
# Alternative (truncate by the most recent report):
#   report_date, file_date = '10/01/2018', 'oct_2018'
#
# Selected: truncate by the first report after the feature set.
report_date, file_date = '01/01/2018', 'jan_2018'

In [6]:
# Keep only the inventory rows whose ReportDate equals the chosen report_date
on_report_date = inventory["ReportDate"].eq(report_date)
inventory_trunc = inventory.loc[on_report_date]

### B. Collection: Truncate by Type Book
I will keep only non-reference printed books.

In [7]:
# Read the ILS data dictionary (the table of codes) into a DataFrame
dictionary_csv = '../01_Data/01_Source/Integrated_Library_System__ILS__Data_Dictionary.csv'
data_dctnry = pd.read_csv(dictionary_csv)

In [8]:
# Collect the codes for printed, non-reference books from the data dictionary
is_book = data_dctnry["Format Subgroup"].eq("Book")                 # rows whose format subgroup is Book
is_item_collection = data_dctnry["Code Type"].eq("ItemCollection")  # codes from the 'ItemCollection' field
not_reference = data_dctnry["Category Group"].ne("Reference")       # leave out Reference (non-circulating)

# The resulting Series of codes is the set we want to include
book_codes = data_dctnry.loc[is_book & is_item_collection & not_reference, "Code"]

In [9]:
# Keep only the rows whose ItemCollection is one of the selected book codes;
# rows with any other ItemCollection value are dropped.
is_book_item = inventory_trunc["ItemCollection"].isin(book_codes)
inventory_trunc = inventory_trunc.loc[is_book_item]

### C. Collection: Truncate by Location
Only look for books that are in one of the 27 branch libraries, and exclude interlibrary loan and the mobile unit

In [12]:
# Build the Series of location codes used to filter the inventory.
# The data dictionary is read again here (the same load also appears further down).
# NOTE(review): every "ItemLocation" code in the dictionary is kept; nothing in this
# cell explicitly removes Interlibrary Loan, the temporary mobile unit, etc. --
# confirm the dictionary lists only the 27 branch library locations.
data_dctnry = pd.read_csv('../01_Data/01_Source/Integrated_Library_System__ILS__Data_Dictionary.csv')

is_location = data_dctnry["Code Type"].eq("ItemLocation")
locations = (
    data_dctnry.loc[is_location, ["Code", "Description"]]  # location codes with their descriptions
    .reset_index(drop=True)
    .drop_duplicates()                                      # one row per distinct Code/Description pair
    .loc[:, "Code"]                                         # only the codes are needed for the filter
)

In [15]:
# Keep only the rows whose ItemLocation appears among the location codes
known_location = inventory_trunc["ItemLocation"].isin(locations)
inventory_trunc = inventory_trunc.loc[known_location]

#### Export Book Codes so other notebooks can use this set

In [7]:
# Write the selected book codes to CSV so other notebooks can load them
book_codes_csv = "../01_Data/03_Cleaned/ItemCollection_Book_Codes.csv"
book_codes.to_csv(book_codes_csv)

### D. Collection: Export to CSV

In [16]:
# Renumber the surviving rows from 0, then write the truncated inventory to a
# CSV whose name carries the file_date label.
inventory_trunc = inventory_trunc.reset_index(drop=True)
inventory_export_csv = f"../01_Data/02_Truncated/Library_Collection_Inventory_{file_date}.csv"
inventory_trunc.to_csv(inventory_export_csv)

## II. Seattle Public Library ILS Data Dictionary
Load and truncate the data dictionary to the useful codes

In [6]:
# Read the ILS data dictionary (the table of codes) into a DataFrame
dictionary_csv = '../01_Data/01_Source/Integrated_Library_System__ILS__Data_Dictionary.csv'
data_dctnry = pd.read_csv(dictionary_csv)

### A. Dictionary: Locations - Export to CSV

In [7]:
# Pull the Code and Description columns for every ItemLocation entry
is_location = data_dctnry["Code Type"].eq("ItemLocation")
locations = data_dctnry.loc[is_location, ["Code", "Description"]]

In [8]:
# Renumber the rows, drop repeated Code/Description pairs, then write the table to CSV
locations = locations.reset_index(drop=True).drop_duplicates()
locations.to_csv("../01_Data/02_Truncated/Library_Locations.csv")

### B. Dictionary: Book Codes (Non Reference) - Export to CSV

In [36]:
# Prerequisite: first run the cells in section I.B (Truncate by Type Book) above so that `book_codes` is defined

In [41]:
# Reduce the data dictionary to the rows whose Code is one of the book codes,
# then renumber the rows from 0.
# NOTE(review): the match is on "Code" alone, with no "Code Type" condition --
# confirm no other code type reuses one of these code values.
matches_book_code = data_dctnry["Code"].isin(book_codes)
data_dctnry_trunc = data_dctnry.loc[matches_book_code].reset_index(drop=True)

In [43]:
# Write the book-code subset of the data dictionary to CSV
book_dictionary_csv = "../01_Data/02_Truncated/Data_Dictionary_Book_Codes.csv"
data_dctnry_trunc.to_csv(book_dictionary_csv)