In [1]:
# The purpose of this notebook is to compare the
# efficacy of various Machine Learning models on
# the dataset.

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

np.random.seed(0)

script_dir = os.path.abspath('')

file = os.path.realpath(script_dir + '/../data/interim/train_users_2_2.csv')

df = pd.read_csv(file)

df.head()

print(df.columns)

Index(['Unnamed: 0', 'id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'country_destination', 'days_thinking',
       'number_of_actions'],
      dtype='object')


In [3]:
# Prepare the dataset for our regression

df2 = 0

# Remove undesired columns

# Note we do NOT want date_first_booking in
# our model because it is not included in
# the test set of the competition.

df2 = df[['date_account_created',
          'gender',
          'age',
          'signup_method',
          'language',
          'affiliate_channel',
          'affiliate_provider',
          'first_affiliate_tracked',
          'signup_app',
          'first_device_type',
          'first_browser',
          'number_of_actions',
          'country_destination']]

# Convert time-based columns to datetime
# objects, then to numbers that the model
# can use.

df2['week_account_created'] = pd.to_datetime(df2['date_account_created']).dt.week

df2 = df2.drop(['date_account_created'], axis=1)

df2.fillna(0, inplace=True)

# Use get_dummies to convert our categorical
# features to numerical features so that our
# model can use them.

dummiescols = ['gender', 'signup_method', 'language',
               'affiliate_channel', 'affiliate_provider',
               'first_affiliate_tracked', 'signup_app',
               'first_device_type', 'first_browser']

df2 = pd.get_dummies(df2, prefix=dummiescols, columns=dummiescols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
# Create the training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    df2.drop('country_destination', axis=1).values,
    df2['country_destination'].values,
    random_state=0)

In [5]:
# Logistic Regression
# Train the model, then score it on the test set

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

logreg.fit(X_train, y_train)

logreg.score(X_test, y_test)



0.5180819409475853

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import matplotlib.pyplot as plt

ylr_pred = logreg.predict(X_test)

labels = [i for i in unique_labels(y_test, ylr_pred)]

print(labels)

cm = confusion_matrix(y_test, ylr_pred)

print(cm)

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
[[   0    0    0    0    0    0    0   16    0    0   16    0]
 [   0    0    0    0    0    0    0   19    0    0   50    0]
 [   0    0    0    0    0    0    0   20    0    0   28    0]
 [   0    0    0    0    0    0    0   39    0    0   78    0]
 [   0    0    0    0    0    0    0   75    0    0  166    0]
 [   0    0    0    0    0    0    0   43    0    0  106    0]
 [   0    0    0    0    0    0    0   58    0    0  101    0]
 [   0    0    0    0    0    0    0 2166    0    0 1578    0]
 [   0    0    0    0    0    0    0   18    0    0   25    0]
 [   0    0    0    0    0    0    0    2    0    0   10    0]
 [   0    0    0    0    0    0    0 1185    0    0 2361    0]
 [   0    0    0    0    0    0    0  196    0    0  382    0]]


In [10]:
# Random Forests

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)

rf.fit(X_train, y_train)

rf.score(X_test, y_test)

0.47894254978255896

In [11]:
yrf_pred = rf.predict(X_test)

print(labels)

cm2 = confusion_matrix(y_test, yrf_pred)

print(cm2)

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
[[   0    0    0    0    0    0    0   18    0    0   14    0]
 [   0    1    0    0    0    0    0   28    0    0   39    1]
 [   0    0    0    0    0    0    0   25    0    0   21    2]
 [   1    1    0    0    1    0    0   54    0    0   57    3]
 [   0    0    0    0    2    2    0  110    0    0  121    6]
 [   0    0    0    0    0    0    1   62    0    0   86    0]
 [   0    0    0    1    3    0    0   75    0    0   77    3]
 [   1    1    2    9   21    8    4 2202    0    0 1460   36]
 [   0    0    0    0    0    0    0   21    0    0   21    1]
 [   0    0    0    0    0    0    0    4    0    0    8    0]
 [   1    5    2   13   16    6   15 1458    3    0 1968   59]
 [   0    1    0    0    3    0    2  254    0    0  306   12]]


In [12]:
# AdaBoost

from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier()

ab.fit(X_train, y_train)

ab.score(X_test, y_test)

0.5203707942320898

In [13]:
yab_pred = ab.predict(X_test)

print(labels)

cm3 = confusion_matrix(y_test, yab_pred)

print(cm3)

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
[[   0    0    0    0    0    0    0   15    0    0   17    0]
 [   0    0    0    0    0    0    0   20    0    0   49    0]
 [   0    0    0    0    0    0    0   20    0    0   28    0]
 [   0    0    0    0    0    0    0   42    0    0   75    0]
 [   0    0    0    0    0    0    0   83    0    0  158    0]
 [   0    0    0    0    0    0    0   48    0    1  100    0]
 [   0    0    0    0    0    0    0   61    0    0   98    0]
 [   0    0    0    0    0    0    0 2232    0    4 1508    0]
 [   0    0    0    0    0    0    0   19    0    0   24    0]
 [   0    0    0    0    0    0    0    4    0    0    8    0]
 [   0    0    0    0    0    0    0 1230    0    1 2315    0]
 [   0    0    0    0    0    0    0  210    0    0  368    0]]


In [14]:
# Looks like our algorithms are only ever
# predicting NDF or US as the classifier.
# I wonder if we can get a better result
# by removing NDF entirely. If so, we can
# just create a separate algorithm to sort
# the data into "NDF" or "Made a Booking".

df3 = df2[df2['country_destination'] != 'NDF']

print(df3['country_destination'].unique())

['other' 'US' 'FR' 'GB' 'IT' 'AU' 'CA' 'ES' 'DE' 'NL' 'PT']


In [15]:
X3_train, X3_test, y3_train, y3_test = train_test_split(
    df3.drop('country_destination', axis=1).values,
    df3['country_destination'].values,
    random_state=0)

In [16]:
logreg3 = LogisticRegression()

logreg3.fit(X3_train, y3_train)

logreg3.score(X3_test, y3_test)



0.7182475884244373

In [17]:
ylr3_pred = logreg3.predict(X3_test)

labels3 = [i for i in unique_labels(y3_test, ylr3_pred)]

print(labels3)

cm = confusion_matrix(y3_test, ylr3_pred)

print(cm)

# Nope. It's still just saying everyone travels to the US.
# Let's try one more time, eliminating the US as well.

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'US', 'other']
[[   0    0    0    0    0    0    0    0    0   33    0]
 [   0    0    0    0    0    0    0    0    0   85    0]
 [   0    0    0    0    0    0    0    0    0   40    0]
 [   0    0    0    0    0    0    0    0    0  107    0]
 [   0    0    0    0    0    0    0    0    0  238    0]
 [   0    0    0    0    0    0    0    0    0  119    0]
 [   0    0    0    0    0    0    0    0    0  144    0]
 [   0    0    0    0    0    0    0    0    0   43    0]
 [   0    0    0    0    0    0    0    0    0    8    0]
 [   0    0    0    0    0    0    0    0    0 3574    0]
 [   0    0    0    0    0    0    0    0    0  585    0]]


In [18]:
df4 = df3[df3['country_destination'] != 'US']

print(df4['country_destination'].unique())

X4_train, X4_test, y4_train, y4_test = train_test_split(
    df4.drop('country_destination', axis=1).values,
    df4['country_destination'].values,
    random_state=0)

['other' 'FR' 'GB' 'IT' 'AU' 'CA' 'ES' 'DE' 'NL' 'PT']


In [19]:
logreg4 = LogisticRegression()

logreg4.fit(X4_train, y4_train)

logreg4.score(X4_test, y4_test)



0.4271639690358902

In [20]:
ylr4_pred = logreg4.predict(X4_test)

labels4 = [i for i in unique_labels(y4_test, ylr4_pred)]

print(labels4)

cm = confusion_matrix(y4_test, ylr4_pred)

print(cm)

# Our model is pretty weak. It looks like it just
# assigns the data to the most popular destination in
# the dataset. Let's remove 'other' and see what
# happens.

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'other']
[[  0   0   0   0   0   0   0   0   0  30]
 [  0   0   0   0   1   0   0   0   0  90]
 [  0   0   0   0   1   0   1   0   0  43]
 [  0   0   1   0   5   0   0   0   0 106]
 [  0   0   1   0   7   0   1   0   0 232]
 [  0   0   0   0   2   0   0   0   0  98]
 [  0   0   1   0   4   0   1   0   0 133]
 [  0   0   0   0   0   0   0   0   0  40]
 [  0   0   0   0   0   0   0   0   0  10]
 [  0   0   1   2   9   0   2   0   0 599]]


In [21]:
df5 = df4[df4['country_destination'] != 'other']

print(df5['country_destination'].unique())

X5_train, X5_test, y5_train, y5_test = train_test_split(
    df5.drop('country_destination', axis=1).values,
    df5['country_destination'].values,
    random_state=0)

['FR' 'GB' 'IT' 'AU' 'CA' 'ES' 'DE' 'NL' 'PT']


In [22]:
logreg5 = LogisticRegression()

logreg5.fit(X5_train, y5_train)

logreg5.score(X5_test, y5_test)



0.2733812949640288

In [23]:
ylr5_pred = logreg5.predict(X5_test)

labels5 = [i for i in unique_labels(y5_test, ylr5_pred)]

print(labels5)

cm = confusion_matrix(y5_test, ylr5_pred)

print(cm)

# Ok. It's turtles all the way down. Just
# to be sure, try Random Forests and AdaBoost.

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT']
[[  0   0   0   4  25   1   1   0   0]
 [  0   1   1   5  68   4   4   0   0]
 [  0   1   1   1  40   0   3   0   0]
 [  0   1   1   3  97   1   1   0   0]
 [  0   1   0  12 215   2   9   0   0]
 [  0   0   0   2 114   3   4   0   0]
 [  0   0   1   2 140   3   5   0   0]
 [  0   0   0   2  38   2   4   0   0]
 [  0   0   0   0  11   0   0   0   0]]


In [24]:
rf5 = RandomForestClassifier(n_estimators=300)

rf5.fit(X5_train, y5_train)

rf5.score(X5_test, y5_test)

0.22302158273381295

In [25]:
ab5 = AdaBoostClassifier()

ab5.fit(X5_train, y5_train)

ab5.score(X5_test, y5_test)

0.24700239808153476

Very clear what's going on here: the models have little ability to discern between the less popular destination spots. We were getting such accurate results in the first case solely because two features—NDF and US—absolutely dwarf the other target categories. To proceed, I will need to understand how to predict even despite very unevenly distributed target classes.

Perhaps it is time to re-wrangle things and re-introduce users for whom
data is missing. It could be that a larger training set would allow for
a better result. I think I will also start performing these 