In [98]:
import pandas as pd
import joblib
from geo_utils import *
from time import sleep
from geopy.distance import geodesic
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the four cleaned tables from their pickles, in the same order as before.
# NOTE: joblib.load unpickles arbitrary objects, so only use trusted files here.
customers, checkouts, books, libraries = (
    joblib.load(pickle_path)
    for pickle_path in (
        'data/customers_clean.pkl',
        'data/checkouts_clean.pkl',
        'data/books_clean.pkl',
        'data/libraries_clean.pkl',
    )
)

## Distance-based features
---

* Features based on library location data are added, since the earlier analysis showed differences in return rates across library locations
* Additional location data is gathered using OpenStreetMap, the Overpass API and geopy

In [7]:
# Fetch the public-transport station data once; the same object is passed to
# every per-library lookup further down.
# `get_public_transport` is not defined in this notebook; presumably it comes
# from the `geo_utils` star import — TODO confirm.
stations = get_public_transport()

In [10]:
def fetch_data(x, stations, delay=3):
    """Look up public-transport proximity for one address, with throttling.

    Parameters
    ----------
    x : str
        Address query, forwarded unchanged to ``find_nearest_stations``.
    stations
        Station data, forwarded unchanged to ``find_nearest_stations``.
    delay : float, default 3
        Seconds to pause before the lookup so that repeated calls avoid
        rate limits. The default keeps the previous hard-coded behaviour;
        pass ``0`` to skip the pause (e.g. when results are cached or mocked).

    Returns
    -------
    tuple
        ``(min_distance, num_near)`` read from the mapping returned by
        ``find_nearest_stations``.
    """
    if delay > 0:
        sleep(delay)  # avoiding rate limits
    nearest = find_nearest_stations(x, stations)
    return nearest['min_distance'], nearest['num_near']

In [68]:
# Replace missing city/region values with empty strings so that string
# formatting of these fields never renders a literal "nan".
location_text_cols = ['city', 'region']
libraries[location_text_cols] = libraries[location_text_cols].fillna('')

In [71]:
# One throttled lookup per library row; each result is the
# (min_distance, num_near) pair returned by fetch_data.
# NOTE(review): the query uses street address and region only — `city` is
# not included; confirm this is intended.
tmp_df = libraries.apply(
    lambda library: fetch_data(
        f"{library['street_address']}, {library['region']}",
        stations,
    ),
    axis=1,
)

In [72]:
# Unpack the (min_distance, num_near) pairs into two feature columns.
# The pairs are aligned on tmp_df's own index instead of positional labels
# 0..n-1, so this stays correct when `libraries` does not carry a default
# RangeIndex (label lookups with .loc[i] would otherwise raise KeyError or
# append spurious rows).
transport_features = pd.DataFrame(
    tmp_df.tolist(),
    index=tmp_df.index,
    columns=['closest_transport', 'num_close_transport'],
)
libraries['closest_transport'] = transport_features['closest_transport']
libraries['num_close_transport'] = transport_features['num_close_transport']

## Merging of data

In [79]:
# Join checkouts with book, customer and library attributes. All three are
# default (inner) merges; clashing column names coming from the customers and
# libraries tables receive the `_customers` / `_libraries` suffixes.
df_merged = (
    checkouts
    .merge(books, on='id')
    .merge(customers, left_on='patron_id', right_on='id', suffixes=('', '_customers'))
    .merge(libraries, left_on='library_id', right_on='id', suffixes=('', '_libraries'))
)

# Age of the book at checkout time, in years.
age_in_days = (df_merged['date_checkout'] - df_merged['publishedDate']).dt.days
df_merged['book_age'] = age_in_days / 365.25

# A negative age means the book was checked out before its publication date;
# such values are replaced with the median of the strictly positive ages.
is_negative_age = df_merged['book_age'] < 0
is_positive_age = df_merged['book_age'] > 0
median_positive_age = df_merged.loc[is_positive_age, 'book_age'].median()
df_merged.loc[is_negative_age, 'book_age'] = median_positive_age

In [82]:
# Number of days after checkout beyond which a return is labelled positive.
return_deadline = 28

# Whole days between checkout and return.
loan_duration = df_merged['date_returned'] - df_merged['date_checkout']
df_merged['days_return'] = loan_duration.dt.days

# Binary label: True when the book came back after the deadline.
# NOTE(review): a missing days_return (NaN) compares as False, so rows without
# a return date are labelled as not late — confirm this is intended.
df_merged['target'] = df_merged['days_return'].gt(return_deadline)

In [93]:
# Columns removed before modelling.
cols_to_exclude = [
    'id', 'patron_id', 'library_id', 'id_customers', 'id_libraries', # ids not useful for modelling
    'title', 'authors', 'name', 'publisher', 'name_libraries',
    'street_address', 'street_address_libraries',
    'date_checkout', 'date_returned', 'birth_date', 'publishedDate',
    'gender',
    # `target` is defined as days_return > return_deadline, so keeping
    # days_return as a feature would leak the label into the model inputs.
    'days_return',
]

In [94]:
# Modelling frame: the merged data without the columns listed in cols_to_exclude.
df = df_merged.drop(cols_to_exclude, axis='columns')

Categorical columns are encoded using one-hot encoding. Another option would be ordinal encoding for columns whose values have a natural order (e.g. education level).

In [97]:
# One-hot encode the categorical columns; with no explicit `columns` argument
# pandas selects the columns to encode by dtype and leaves the rest unchanged.
df = pd.get_dummies(data=df)

## Additional Cleaning & Feature Selection

Features which are constant, or have too low variance can be removed.

Features that are strongly correlated with one another can also be removed (in that case we would keep the one with the higher correlation to the target). Mutual information can be used instead of correlation to capture non-linear relationships.

Since tree-based methods are most likely to be used in the modelling exercise, this step is skipped: such models are less affected by uninformative features.

The data is split into train and test sets and saved to disk.

In [103]:
# 80/20 split, stratified on the label so both parts keep the same class
# balance; the fixed random_state makes the split reproducible.
model_inputs = df.drop(columns=['target'])
model_labels = df['target']
x_train, x_test, y_train, y_test = train_test_split(
    model_inputs,
    model_labels,
    test_size=0.2,
    stratify=model_labels,
    random_state=42,
)

In [104]:
# Persist each part of the split to disk, in the same order as before.
split_artifacts = [
    (x_train, 'data/x_train.pkl'),
    (x_test, 'data/x_test.pkl'),
    (y_train, 'data/y_train.pkl'),
    (y_test, 'data/y_test.pkl'),
]
for split_obj, split_path in split_artifacts:
    written_files = joblib.dump(split_obj, split_path)
# Show the file list returned by the final dump as the cell output.
written_files

['data/y_test.pkl']