## 11_Merge_Features_Target

Author: Daniel Hui

License: MIT

This notebook merges the different sets holding features along with the target. The output will be the set used for training and testing of the model.

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', 500)

### Global Variables
There are three sets of target options for predicting different ranges of checkout activity: January, Quarter 1, and the first half of the year. `target_set` selects which one is merged in this run.

In [2]:
target_set = "18_Half"       # Checkout Targets from Jan-Jun 2018

## A. Load Datasets
---
### 1. Load Checkout Records

In [3]:
checkout_target_df = pd.read_csv(f'../01_Data/05_Target/{target_set}_Target.csv',index_col=0)

In [4]:
# Cast the target to integer with a vectorized astype instead of a per-row
# Python lambda; this is the same conversion used for the checkout features.
checkout_target_df["Checkout"] = checkout_target_df["Checkout"].astype(int)
checkout_target_df.head()

Unnamed: 0,BibNum,Checkout
0,3177276,0
1,395432,0
2,123754,1
3,193328,1
4,1764894,1


In [5]:
len(checkout_target_df)

397147

### 2. Load in Dictionary Features

In [6]:
dictionary_features_df = pd.read_csv('../01_Data/06_Features/Dictionary_Features.csv',index_col=0)
dictionary_features_df.head()

Unnamed: 0,Code,Description,Category,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic
0,canf,CA-Nonfiction,Nonfiction,0,0,1,0,0,0,0,0,0,0,0
1,nanf,NA-Nonfiction,Nonfiction,0,0,1,0,0,0,0,0,0,0,0
2,cafic,CA3-Fiction,Fiction,1,0,0,0,0,0,0,0,0,0,0
3,caln,CA1-Language,Language,0,1,0,0,0,0,0,0,0,0,0
4,nafic,NA-Fiction,Fiction,1,0,0,0,0,0,0,0,0,0,0


In [7]:
len(dictionary_features_df)    #this contains 104 unique codes

104

### 3. Load in Inventory Features

In [8]:
inventory_features_df = pd.read_csv('../01_Data/03_Cleaned/Library_Collection_Inventory_jan_2018_clean.csv',index_col=0)
inventory_features_df = inventory_features_df[["BibNum","Title","ISBN","ItemCollection","PublicationYear"]]

In [9]:
# Rename to the short column names used in the rest of the notebook ("Code" is
# the key for the dictionary-features merge) and drop exact duplicate rows.
inventory_features_df = (
    inventory_features_df
    .rename(columns={"ItemCollection": "Code", "PublicationYear": "Year"})
    .drop_duplicates()
)
inventory_features_df.head()

Unnamed: 0,BibNum,Title,ISBN,Code,Year
0,3177276,Day of the Dead.,,naover,2016.0
1,395432,Swan Lake / Ann Nugent.,812056744.0,canf,1985.0
2,123754,Best short stories of Jack London.,,cs3fic,1945.0
3,193328,The comedy of errors.,,canf,1962.0
4,1764894,Below the belt : play / by Richard Dresser.,573696306.0,canf,1997.0


In [10]:
len(inventory_features_df)   #this contains non-unique titles

567995

### 4. Load in Collection Features

In [11]:
collection_features_df = pd.read_csv('../01_Data/06_Features/Collection_Features.csv',index_col=0)
collection_features_df.head()

Unnamed: 0,BibNum,Branches,Copies
0,2609423,27,49
1,3313042,27,28
2,3297286,27,32
3,15661,27,60
4,2772449,27,48


In [12]:
len(collection_features_df)    #this set has 397,147 unique titles

397147

### 5. Load in Location Features

In [13]:
location_features_df = pd.read_csv('../01_Data/06_Features/Location_Features.csv',index_col=0)
location_features_df.head()

Unnamed: 0,BibNum,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
0,7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,24,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,25,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,32,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,33,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
len(location_features_df)    #this set has 397,147 unique titles

397147

### 6. Load Checkout Features

In [15]:
checkout_features_df = pd.read_csv('../01_Data/06_Features/Checkout_Features.csv',index_col=0)

In [16]:
checkout_features_df = checkout_features_df.astype(int)    #convert to integer
checkout_features_df.head()

Unnamed: 0,BibNum,30 Days,90 Days,180 Days,365 Days
0,3177276,14,45,78,94
1,395432,0,0,0,1
2,123754,1,1,1,1
3,193328,0,0,0,0
4,1764894,0,0,0,0


In [17]:
len(checkout_features_df)   #this set has 397,147 unique titles

397147

## B. Merge Datasets
---
### Merge 1: Combine Target with Inventory Features

In [18]:
# Left join keeps every target row. A BibNum that appears under several
# collection codes in the inventory produces several rows here, so the result
# is longer than the target set.
merge1_df = pd.merge(
    checkout_target_df,
    inventory_features_df,
    how="left",
    on="BibNum",
)
merge1_df.head()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Code,Year
0,3177276,0,Day of the Dead.,,naover,2016.0
1,395432,0,Swan Lake / Ann Nugent.,812056744.0,canf,1985.0
2,123754,1,Best short stories of Jack London.,,cs3fic,1945.0
3,193328,1,The comedy of errors.,,canf,1962.0
4,1764894,1,Below the belt : play / by Richard Dresser.,573696306.0,canf,1997.0


In [19]:
len(merge1_df)  

567995

### Merge 2: Add Dictionary Features

In [20]:
# Attach the per-code indicator columns from the dictionary features.
merge2_df = merge1_df.merge(dictionary_features_df, how="left", on="Code")

# Keep only BibNum and the indicator columns; the descriptive columns are not
# needed for the per-title aggregation that follows.
unneeded_cols = ["Code", "Description", "Category", "Checkout", "Title", "ISBN", "Year"]
merge2_df = merge2_df.drop(columns=unneeded_cols)
len(merge2_df)

567995

In [21]:
merge2_df.nunique()

BibNum         397147
Fiction             2
Language            2
Nonfiction          2
Biography           2
Large Print         2
Picture             2
Children            2
Teen                2
Mystery             2
AfAm                2
Comic               2
dtype: int64

Since a single title may appear several times in the inventory (for example, classified both as a children's fiction book and as a teen graphic novel), we group by the unique BibNum and aggregate with MAX. This collapses all the variants into one row while preserving every feature attributed to the book across the different ways the library has classified it.

In [22]:
# Collapse to one row per BibNum. Taking the max of each 0/1 indicator keeps a
# flag set if any of the title's inventory rows had it set.
merge2_df = merge2_df.groupby("BibNum", as_index=False).max()
len(merge2_df)

397147

In [23]:
merge2_df.head()

Unnamed: 0,BibNum,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic
0,7,0,0,1,1,0,0,0,0,0,0,0
1,24,0,0,1,0,0,0,0,0,0,0,0
2,25,0,0,1,0,0,0,0,0,0,0,0
3,32,0,0,1,0,0,0,0,0,0,0,0
4,33,0,0,1,0,0,0,0,0,0,0,0


Now add back in the Title, ISBN, Year and Target info by reforming the merge1 data with only the relevant columns.

In [24]:
# Without the collection code, rows for the same title become identical, so
# dropping duplicates leaves the per-title Checkout/Title/ISBN/Year record.
merge1_without_code = merge1_df.drop(columns="Code")
merge1_lite_df = merge1_without_code.drop_duplicates()
len(merge1_lite_df)

397147

In [25]:
merge1_lite_df.head()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year
0,3177276,0,Day of the Dead.,,2016.0
1,395432,0,Swan Lake / Ann Nugent.,812056744.0,1985.0
2,123754,1,Best short stories of Jack London.,,1945.0
3,193328,1,The comedy of errors.,,1962.0
4,1764894,1,Below the belt : play / by Richard Dresser.,573696306.0,1997.0


In [26]:
merge2a_df = merge1_lite_df.merge(merge2_df,on="BibNum",how="left")
merge2a_df.head()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic
0,3177276,0,Day of the Dead.,,2016.0,0,0,1,0,0,0,0,0,0,0,0
1,395432,0,Swan Lake / Ann Nugent.,812056744.0,1985.0,0,0,1,0,0,0,0,0,0,0,0
2,123754,1,Best short stories of Jack London.,,1945.0,1,0,0,0,0,0,0,0,0,0,0
3,193328,1,The comedy of errors.,,1962.0,0,0,1,0,0,0,0,0,0,0,0
4,1764894,1,Below the belt : play / by Richard Dresser.,573696306.0,1997.0,0,0,1,0,0,0,0,0,0,0,0


### Merge 3: Add Checkout Features

In [27]:
merge3_df = merge2a_df.merge(checkout_features_df,on="BibNum",how='left')
len(merge3_df)

397147

In [28]:
merge3_df.head(3)

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days
0,3177276,0,Day of the Dead.,,2016.0,0,0,1,0,0,0,0,0,0,0,0,14,45,78,94
1,395432,0,Swan Lake / Ann Nugent.,812056744.0,1985.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2,123754,1,Best short stories of Jack London.,,1945.0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1


### Merge 4: Add Collection Features

In [29]:
merge4_df = merge3_df.merge(collection_features_df,on="BibNum",how="left")
len(merge4_df)

397147

In [30]:
merge4_df.head()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies
0,3177276,0,Day of the Dead.,,2016.0,0,0,1,0,0,0,0,0,0,0,0,14,45,78,94,17,18
1,395432,0,Swan Lake / Ann Nugent.,812056744.0,1985.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1
2,123754,1,Best short stories of Jack London.,,1945.0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1
3,193328,1,The comedy of errors.,,1962.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,1764894,1,Below the belt : play / by Richard Dresser.,573696306.0,1997.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1


### Merge 5: Add Location Features

In [31]:
merge5_df = merge4_df.merge(location_features_df,on="BibNum",how="left")
len(merge5_df)

397147

In [32]:
merge5_df.head()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
0,3177276,0,Day of the Dead.,,2016.0,0,0,1,0,0,0,0,0,0,0,0,14,45,78,94,17,18,0,1,1,0,0,0,1,1,1,0,1,1,0,1,1,0,1,1,1,0,1,0,1,1,1,0,1
1,395432,0,Swan Lake / Ann Nugent.,812056744.0,1985.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,123754,1,Best short stories of Jack London.,,1945.0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,193328,1,The comedy of errors.,,1962.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1764894,1,Below the belt : play / by Richard Dresser.,573696306.0,1997.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Inspect and Clean

In [33]:
merge5_df.describe()

Unnamed: 0,BibNum,Checkout,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
count,397147.0,397147.0,393577.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0
mean,2234259.0,0.494515,2001.668296,0.278685,0.082463,0.63302,0.037457,0.021168,0.038646,0.15463,0.047222,0.024029,0.019086,0.032011,0.723694,2.348508,4.872443,9.60918,2.49912,2.884453,0.112225,0.072379,0.084717,0.070805,0.895988,0.065203,0.032693,0.095033,0.034703,0.052522,0.107338,0.040438,0.023357,0.100615,0.04913,0.020033,0.033096,0.128612,0.069151,0.024943,0.046421,0.065303,0.031716,0.101136,0.057452,0.014967,0.069143
std,953967.6,0.499971,16.898001,0.448353,0.275069,0.481982,0.189879,0.143946,0.192749,0.361552,0.212113,0.153139,0.136828,0.176029,5.128003,12.296913,20.522126,34.405515,3.094677,4.779715,0.315644,0.259114,0.27846,0.256499,0.305276,0.246883,0.177833,0.293261,0.183026,0.223078,0.309543,0.196985,0.151033,0.300819,0.216141,0.140113,0.178888,0.334771,0.253711,0.155951,0.210396,0.247061,0.175244,0.30151,0.232705,0.12142,0.253698
min,7.0,0.0,1784.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1848334.0,0.0,1997.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2554562.0,0.0,2007.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2957798.0,1.0,2013.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,7.0,3.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3343666.0,1.0,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,585.0,1436.0,1613.0,2621.0,27.0,290.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
merge5_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397147 entries, 0 to 397146
Data columns (total 49 columns):
BibNum         397147 non-null int64
Checkout       397147 non-null int64
Title          396434 non-null object
ISBN           345671 non-null object
Year           393577 non-null float64
Fiction        397147 non-null int64
Language       397147 non-null int64
Nonfiction     397147 non-null int64
Biography      397147 non-null int64
Large Print    397147 non-null int64
Picture        397147 non-null int64
Children       397147 non-null int64
Teen           397147 non-null int64
Mystery        397147 non-null int64
AfAm           397147 non-null int64
Comic          397147 non-null int64
30 Days        397147 non-null int64
90 Days        397147 non-null int64
180 Days       397147 non-null int64
365 Days       397147 non-null int64
Branches       397147 non-null int64
Copies         397147 non-null int64
bal            397147 non-null int64
bea            397147 non-null int

### Clean and Null Handling

A function to change some fields into binary values:

In [35]:
def check_field(row):
    """Return 1 when a field holds a value and 0 when it is missing.

    Despite the parameter name, this is applied element-wise to a single
    column (``Series.apply(check_field)``), so ``row`` is one cell value.

    Parameters
    ----------
    row : scalar
        A single cell value, e.g. a title string, an ISBN, or a missing
        marker (NaN / None).

    Returns
    -------
    int
        0 if the value is missing, 1 otherwise.
    """
    # Test for missingness directly rather than checking ``type(row) == float``:
    # the type check reports a genuine float value as missing, and reports
    # None or a NumPy float NaN as present.
    return 0 if pd.isna(row) else 1

#### Missing Titles
What kind of book is missing a title? I think this is a relevant metric since the library card catalog relies on search functionality. How can someone find a book if there is no record of the title in the catalog?

In [36]:
merge5_df[merge5_df["Title"].isnull()].head()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
1140,429574,1,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2429,488256,1,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3412,429597,1,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5269,422741,1,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6125,461945,1,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Turn Title into a binary: 1 if there is a title, 0 if not.

In [37]:
# Replace each title with a has-title flag.
# NOTE: this overwrites the column in place, so re-running the cell turns every
# value into 1 (the 0/1 integers are themselves non-missing values).
title_flags = merge5_df["Title"].map(check_field)
merge5_df["Title"] = title_flags
merge5_df.head(5)

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
0,3177276,0,1,,2016.0,0,0,1,0,0,0,0,0,0,0,0,14,45,78,94,17,18,0,1,1,0,0,0,1,1,1,0,1,1,0,1,1,0,1,1,1,0,1,0,1,1,1,0,1
1,395432,0,1,812056744.0,1985.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,123754,1,1,,1945.0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,193328,1,1,,1962.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1764894,1,1,573696306.0,1997.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Missing ISBNs
What kind of book is missing an ISBN code? I think this is a relevant metric since many people will become interested in a book by finding it on Amazon. Books with ISBNs are more searchable on retail websites. How can someone find a book if there is no ISBN? 

In [38]:
merge5_df[merge5_df["ISBN"].isnull()].head()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
0,3177276,0,1,,2016.0,0,0,1,0,0,0,0,0,0,0,0,14,45,78,94,17,18,0,1,1,0,0,0,1,1,1,0,1,1,0,1,1,0,1,1,1,0,1,0,1,1,1,0,1
2,123754,1,1,,1945.0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,193328,1,1,,1962.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,7845,1,1,,1971.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14,14748,1,1,,1971.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Turn ISBN into a binary. 1 if there is a code and 0 if not

In [39]:
# Replace each ISBN with a has-ISBN flag.
# NOTE: this overwrites the column in place, so re-running the cell turns every
# value into 1 (the 0/1 integers are themselves non-missing values).
isbn_flags = merge5_df["ISBN"].map(check_field)
merge5_df["ISBN"] = isbn_flags
merge5_df.head(5)

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
0,3177276,0,1,0,2016.0,0,0,1,0,0,0,0,0,0,0,0,14,45,78,94,17,18,0,1,1,0,0,0,1,1,1,0,1,1,0,1,1,0,1,1,1,0,1,0,1,1,1,0,1
1,395432,0,1,1,1985.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,123754,1,1,0,1945.0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,193328,1,1,0,1962.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1764894,1,1,1,1997.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Missing Year
Only a few thousand titles (3,570 of 397,147) are missing a publication year, so it should be OK to fill those in with 0.

In [40]:
# Unknown publication years become 0; with no NaN left the column can be
# stored as integers instead of floats.
merge5_df["Year"] = merge5_df["Year"].fillna(0).astype(int)

### Much Cleaner Dataset

In [41]:
merge5_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397147 entries, 0 to 397146
Data columns (total 49 columns):
BibNum         397147 non-null int64
Checkout       397147 non-null int64
Title          397147 non-null int64
ISBN           397147 non-null int64
Year           397147 non-null int64
Fiction        397147 non-null int64
Language       397147 non-null int64
Nonfiction     397147 non-null int64
Biography      397147 non-null int64
Large Print    397147 non-null int64
Picture        397147 non-null int64
Children       397147 non-null int64
Teen           397147 non-null int64
Mystery        397147 non-null int64
AfAm           397147 non-null int64
Comic          397147 non-null int64
30 Days        397147 non-null int64
90 Days        397147 non-null int64
180 Days       397147 non-null int64
365 Days       397147 non-null int64
Branches       397147 non-null int64
Copies         397147 non-null int64
bal            397147 non-null int64
bea            397147 non-null int64
b

### Export CSV

In [42]:
# Write the merged feature/target set for the selected target range.
# The DataFrame index is written as the first CSV column (to_csv default).
output_path = f"../01_Data/07_Merged_Target_Feature_Data/{target_set}_set.csv"
merge5_df.to_csv(output_path)