# Data Science Challenge
-----------------------

## [ Part. 2 ] Model Training

## 1. Train / Test Split

## 2. Feature Selection by Random Forest

## 3. Model Evaluation

In [1]:
# Load the autoreload extension and reload imported modules before each cell runs,
# so edits to local modules (e.g. source.utils) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence library warnings before the remaining imports, as the original cell did.
warnings.filterwarnings("ignore")

# Needed by the first model-fitting cell, which runs before the later sklearn import cell.
from sklearn.ensemble import RandomForestClassifier

from source.utils import plot_feature_selection
from source.utils import plot_precision_recall_curve

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Load the feature-engineered data

In [3]:
# Read the events table from its parquet file.
events_path = './data/events.parquet'
data = pd.read_parquet(events_path)

#### Encode the target as binary (search = 0, book = 1)


In [4]:
# Encode the target as 0/1 ('search' -> 0, 'book' -> 1).
# Guarded so that re-running this cell is a no-op: mapping the already-encoded
# integers through the dict a second time would turn every value into NaN.
event_type_codes = {'search': 0, 'book': 1}
if not pd.api.types.is_numeric_dtype(data['event_type']):
    data['event_type'] = data['event_type'].map(event_type_codes)

#### Encode 'origin' and 'destination' as numerical categories

In [5]:
# Build one integer code per distinct value, numbered in order of first appearance,
# then replace the raw 'origin' / 'destination' values with those codes.
# The two columns are coded independently of each other.
origin_dic = {place: code for code, place in enumerate(data['origin'].unique())}
destin_dic = {place: code for code, place in enumerate(data['destination'].unique())}

data['origin'] = data['origin'].map(origin_dic)
data['destination'] = data['destination'].map(destin_dic)

#### Cast the categorical columns to the `category` dtype

In [6]:
# Columns holding discrete codes rather than quantities; cast them all to the
# pandas 'category' dtype in a single astype call.
category_columns = ['origin', 'destination', 'ts_dow', 'date_from_dow', 'date_to_dow']
data = data.astype({name: 'category' for name in category_columns})

## 1. Train / Test Split

In [7]:
# Class balance of the binary target (0 = search, 1 = book).
data['event_type'].value_counts()

0    45177
1     1808
Name: event_type, dtype: int64

In [8]:
# Users with at least one booking event, and the remaining users (search only).
is_booking = data['event_type'] == 1
book_userid = list(data.loc[is_booking, 'user_id'].unique())

all_users = set(data['user_id'].unique())
booking_users = set(book_userid)
search_userid = list(all_users - booking_users)

In [None]:
# Number of booking users and of search-only users.
n_book_users = len(book_userid)
n_search_users = len(search_userid)
print(n_book_users, n_search_users)

In [None]:
# Partition the rows by user group: every event of a booking user vs.
# every event of a user who only searched.
from_booking_user = data['user_id'].isin(book_userid)
from_search_user = data['user_id'].isin(search_userid)
book_data = data[from_booking_user]
search_data = data[from_search_user]
book_data.shape, search_data.shape

In [None]:
# Share of all event rows that belong to users who booked at least once.
n_book_rows = len(book_data)
n_search_rows = len(search_data)
n_book_rows / (n_book_rows + n_search_rows)

In [None]:
# Share of all users who booked at least once.
booking_user_count = len(book_userid)
search_user_count = len(search_userid)
booking_user_count / (booking_user_count + search_user_count)

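##### [ Comments ] 
A. The target is highly imbalanced: 1808 book events vs. 45177 search events, i.e. 3.8% of all 46985 events are bookings. <br>
B. At the user level the imbalance is slightly milder: 1804 users who booked vs. 29361 users who only searched, i.e. 5.8% of all users booked.<br>
C. At the activity level it is milder still: users who booked account for 9486 events vs. 37499 events from search-only users, i.e. 20.2% of all events.<br>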

------------------

#### Train and test sets should have similar distributions

##### 1) Randomly select 20% of the booking users and 20% of the search-only users as test users
##### 2) Split the data into train / test sets by user id, so each user's events stay on one side


In [None]:
# Split at the user level: each user's events all land on the same side, and the
# booking / search-only user groups are each cut with the same train ratio.
train_data_ratio = 0.8
split_seed = 42  # fixed so the split is reproducible across kernel restarts

# Sort before shuffling: search_userid was built from a set, so its incoming order
# is not guaranteed to be stable between sessions. Assumes user ids are mutually
# comparable (all strings or all numbers).
book_userid = sorted(book_userid)
search_userid = sorted(search_userid)
rng = np.random.RandomState(split_seed)
rng.shuffle(book_userid)
rng.shuffle(search_userid)

# Cut points derived from the configured ratio rather than a repeated literal.
idx_book = int(len(book_userid) * train_data_ratio)
idx_search = int(len(search_userid) * train_data_ratio)

train_user = book_userid[:idx_book] + search_userid[:idx_search]
test_user = book_userid[idx_book:] + search_userid[idx_search:]

train_data = data[data['user_id'].isin(train_user)]
test_data = data[data['user_id'].isin(test_user)]
len(train_data), len(test_data)

Select the feature and target columns for each dataset

In [None]:
# Model input columns, one per line, followed by the single target column.
select_features = [
    'origin',
    'destination',
    'num_adults',
    'num_children',
    'act_count',
    'diff_ts',
    'ts_dow',
    'date_from_dow',
    'date_to_dow',
    'hours_ts',
    'ts_day',
    'date_from_day',
    'date_to_day',
    'trip_duration',
    'trip_distance',
]
target_features = ['event_type']

In [None]:
# Peek at the first two training rows.
train_data.head(2)

In [None]:
# Feature matrix and target frame for each side of the split.
# The targets stay single-column DataFrames because target_features is a list.
X_train, y_train = train_data[select_features], train_data[target_features]
X_test, y_test = test_data[select_features], test_data[target_features]


In [None]:

# Fit a 100-tree random forest (fixed seed) on the training split.
# scikit-learn expects a 1-D target, so flatten the single-column y_train frame
# explicitly instead of relying on its implicit, warning-raising conversion.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train.values.ravel())


In [None]:
# plot_feature_selection is a project helper from source.utils; presumably it plots the
# fitted forest's feature importances and returns the ranked features — TODO confirm.
features = plot_feature_selection(clf, X_train.columns)

In [None]:
# First three entries of the helper's result.
features[:3]

In [None]:
# Mean accuracy on the held-out users. With only ~4% positive rows in the data this
# number is dominated by the majority class, so compare it with the always-"search" baseline.
clf.score(X_test, y_test)

In [None]:
# Score the test rows with the predicted probability of the positive class (book = 1).
# Hard 0/1 labels from predict() collapse a precision-recall curve to a single
# operating point; continuous scores let it sweep over decision thresholds.
positive_col = list(clf.classes_).index(1)
y_score = clf.predict_proba(X_test)[:, positive_col]

In [None]:
# Project helper from source.utils; presumably draws the precision-recall curve of
# y_score against the true test labels — TODO confirm its expected score format.
plot_precision_recall_curve(y_test, y_score)

In [None]:
# Down-sample the events of search-only users to 7500 rows.
# A fixed random_state makes the sample identical on every run.
sample_search_data = search_data.sample(n=7500, random_state=42)

In [None]:
# Stack all rows of booking users on top of the down-sampled search-only rows.
frames = [book_data, sample_search_data]
sample_data = pd.concat(frames)
sample_data.shape

In [None]:
# Peek at the first two rows of the combined sample.
sample_data.head(2)

In [None]:
# Same input and target columns as used for the first model, grouped by theme.
select_features = [
    # route and party size
    'origin', 'destination', 'num_adults', 'num_children',
    # user activity
    'act_count', 'diff_ts',
    # day-of-week codes
    'ts_dow', 'date_from_dow', 'date_to_dow',
    # hour / day parts
    'hours_ts', 'ts_day', 'date_from_day', 'date_to_day',
    # trip shape
    'trip_duration', 'trip_distance',
]
target_features = ['event_type']

In [None]:

from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import average_precision_score


In [None]:
# Keep only the model inputs plus the target from the combined sample.
model_columns = select_features + target_features
df = sample_data[model_columns]

In [None]:
# Mean accuracy on the training split.
# NOTE(review): clf is still the forest fitted on X_train above; no model is fitted
# on sample_data / df in the cells visible here.
clf.score(X_train, y_train)

In [None]:
# Mean accuracy on the test split (same call as earlier in the notebook).
clf.score(X_test, y_test)

In [None]:
# Score the test rows with the predicted probability of the positive class (book = 1).
# Hard 0/1 labels from predict() collapse a precision-recall curve to a single
# operating point; continuous scores let it sweep over decision thresholds.
positive_col = list(clf.classes_).index(1)
y_score = clf.predict_proba(X_test)[:, positive_col]

In [None]:
# event_type is 0/1, so this is the fraction of test rows that are searches (class 0),
# i.e. the accuracy of always predicting "search".
np.average(1-y_test)

In [None]:
# Project helper from source.utils; presumably draws the precision-recall curve of
# y_score against the true test labels — TODO confirm its expected score format.
plot_precision_recall_curve(y_test, y_score)

In [None]:
# Show the full result returned by plot_feature_selection.
features

In [None]:
# Same precision-recall plot as the previous plotting cell; inputs are unchanged.
plot_precision_recall_curve(y_test, y_score)

In [None]:
# Peek at the first two rows of the test feature matrix.
X_test.head(2)

In [None]:
# Show the result of plot_feature_selection again (unchanged since it was computed).
features