In [60]:
# Import libraries.
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning) 

In [2]:
# Import data files from Kaggle.
DATA_PATH = './data/extracted'
dfs_raw = {}
dfs = {}
for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        dfs_raw[file.split('.')[0]] = pd.read_csv(f'{DATA_PATH}/{file}')
        dfs = dfs_raw.copy()
        print(file)

age_gender_bkts.csv
train_users_2.csv
countries.csv
sample_submission_NDF.csv
sessions.csv
test_users.csv


In [3]:
# Split training dataset into data and labels.
train_data_all = dfs["train_users_2"]
train_labels_all = dfs["train_users_2"].iloc[:, -1:]

# Evaluate existing representation of classes.
print(pd.value_counts(train_labels_all['country_destination']))
countries = train_labels_all['country_destination'].unique()

# Create pd for each country.
train_data_country = {}
train_labels_country = {}
min_count = -1
for country in countries:
    train_data_country[country] = train_data_all.loc[train_labels_all['country_destination'] == country]
    train_labels_country[country] = train_labels_all.loc[train_labels_all['country_destination'] == country]
    count = train_labels_country[country].shape[0]
    if (min_count == -1 or count < min_count):
        min_count = count

# Create balanced training dataset.
balanced_train_data = pd.DataFrame(columns=train_data_all.columns.values)
for country in countries:
    country_pd = train_data_country[country].sample(n=min_count, random_state=1)
    balanced_train_data = pd.concat([balanced_train_data, country_pd])

NDF      124543
US        62376
other     10094
FR         5023
IT         2835
GB         2324
ES         2249
CA         1428
DE         1061
NL          762
AU          539
PT          217
Name: country_destination, dtype: int64


In [7]:
# Split into data and labels (panda dataframes).
#reduced this to 10k / 213k since it was taking forever to even test anything
#train_data   = dfs["train_users_2"][:10000].iloc[:, 0:-1] #we should randomize since accounts are in chronological order
#train_labels = dfs["train_users_2"][:10000]["country_destination"].ravel()

# Set train/dev split to 0.04685/0.95315 to give train size of 10k.  0.04685 = 10000/213451
test_size = 0.95315

# Use (train_test_split) to randomize train_users_2 before splitting into train/dev.
train_data, dev_data, train_labels, dev_labels = train_test_split(dfs["train_users_2"].iloc[:, 0:-1], dfs["train_users_2"].iloc[:, -1:], test_size=test_size, random_state=42)

# Final test data for Kaggle submission.
test_data = dfs["test_users"]

In [6]:
#Function to bucket ages prior to one-hot encoding
def age_bucketer(df_input):
    df = df_input
    df.loc[(pd.isnull(df.age), 'age_bucket')] = 'unknown'
    df.loc[(pd.notnull(df.age), 'age_bucket')] = pd.cut(df['age'],
                                                        [0, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94,99,10000],
                                                        labels=['0-4', '5-9', '10-14','15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49',
                                                                '50-54', '55-59','60-64', '65-69','70-74','75-79','80-84','85-89','90-94','95-99','100+'],
                                                        include_lowest=True)
    return df.drop(['age'], axis=1)

#Since NaN's in categorical data will cause issues with our pipeline we will replace that with "unknown".
def clean_first_affiliate_tracked_nulls(df_input):
    df_input['first_affiliate_tracked'] = df_input['first_affiliate_tracked'].fillna("unknown", inplace=False)
    return df_input

#Add month and year features
def feature_creator (df_input):
    df = df_input
    df['first_active_date'] = pd.to_datetime(df.timestamp_first_active,format='%Y%m%d%H%M%S')
    df['year_first_active'] = df['first_active_date'].dt.year
    df['month_first_active'] = df['first_active_date'].dt.month
    df['season'] = ''
    df.loc[(df['month_first_active'].isin([12, 1, 2]), 'season')] = 'Winter'
    df.loc[(df['month_first_active'].isin([3, 4, 5]), 'season')] = 'Spring'
    df.loc[(df['month_first_active'].isin([6, 7, 8]), 'season')] = 'Summer'
    df.loc[(df['month_first_active'].isin([9, 10, 11]), 'season')] = 'Fall'
    return df.drop(['first_active_date'], axis=1) #consider dropping month as well

def session_feature_creator(df_input):
    df = df_input
    session_agg = dfs['sessions'].groupby('user_id').agg({"secs_elapsed": np.sum, "device_type": pd.Series.nunique, 'action': 'count'}).reset_index(
        ).rename(columns={'secs_elapsed':'total_time', 'device_type':'unique_device_types', 'action': 'unique_actions'})
    return df.merge(session_agg, left_on='id', right_on='user_id', how='left')

def nan_destroyer(df_input):
    # funcation to remove nans from numerical fields, need to determine strategy
    return none

def feautre_selector(df_input):
    # final function to remove IDs, repetitive features, and any features we do not want to be passed to our models
    return none

In [8]:
# Create Preprocessor pipeline.
def create_preprocessor_pipeline():

    column_transformer = make_column_transformer(
        (['gender',
          'signup_method',
          'signup_flow',
          'language',
          'affiliate_channel',
          'affiliate_provider',
          'first_affiliate_tracked',
          'signup_app',
          'first_device_type',
          'first_browser',
          'age_bucket',
          'season'
         ], OneHotEncoder(handle_unknown='ignore')),remainder='drop') # when we add in sessions features we will want to pass remainders
    
    preprocessor = make_pipeline(
        FunctionTransformer(age_bucketer, validate=False),
        FunctionTransformer(feature_creator, validate=False),
        FunctionTransformer(clean_first_affiliate_tracked_nulls, validate=False),
        column_transformer)
    
    return preprocessor

In [9]:
# Stage: Data Preprocessor.
preprocessor = create_preprocessor_pipeline()



In [13]:
balanced_data = balanced_train_data.iloc[:, 0:-1]
balanced_labels = balanced_train_data.iloc[:,-1:]

## Model Selection

In this section, we will attempt to answer our research question through a variety of machine learning algorithms, as well as through an ensemble method combining several different approaches.  We will examine the efficacy of K-neighbors, Bernoulli Naive-Bayes, Random Forest, Logistic Regression, and XGBoost at predicting the destination country of new users, before combining several into an ensemble classifier.

### K-Neighbors Classifier

To begin, we start with a k-neighbors classifier.  Given the heavy skew of our data towards the NDF and US classes, we select a k-neighbors classifier to start, as the algorithm can perform well with unbalanced classes (CITATION) due to the model predicting off of nearby cases rather than generalized rules.

In [146]:
params={'n_neighbors':[3]}
knn = KNeighborsClassifier(n_jobs=-1)
knn_gs = GridSearchCV(knn, params, cv=3, scoring='f1_macro', n_jobs=-1)
pipeline = make_pipeline(preprocessor, knn_gs)
pipeline.fit(train_data, train_labels.values.ravel())
dev_pred = pipeline.predict(dev_data)
print('Accuracy: ',accuracy_score(dev_pred, dev_labels.values.ravel()))
print(classification_report(dev_pred, dev_labels.values.ravel()))

Accuracy:  0.523015369794201
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00       928
          CA       0.02      0.01      0.01      2719
          DE       0.01      0.01      0.01      1862
          ES       0.02      0.01      0.01      3038
          FR       0.04      0.03      0.04      6805
          GB       0.02      0.02      0.02      2563
          IT       0.02      0.01      0.02      4688
         NDF       0.75      0.65      0.70    136454
          NL       0.00      0.00      0.00        24
          PT       0.00      0.00      0.00        10
          US       0.29      0.40      0.33     42705
       other       0.01      0.06      0.02      1655

   micro avg       0.52      0.52      0.52    203451
   macro avg       0.10      0.10      0.10    203451
weighted avg       0.57      0.52      0.54    203451



In [140]:
knn_gs.best_params_

{'n_neighbors': 9}

Indeed, the model does perform fairly well on our dataset, and even returns the best Macro F1 scores our of any of our models when the number of neighbors is fairly low.  As the number of neighbors increases, the model's Macro F1 score declines while Weighted F1 improves, suggesting that there our imbalanced classes impose a tradeoff for the model.  The model predict time also slows as the number of neighbors increases, making further experimentation difficult.

### Bernoulli Naive Bayes

Bernoulli Naive Bayes offers a number of advantages for our dataset.  Our initial pipeline one-hot encodes all of our categorical features, a significant portion of our data is already in the binarized form required for Bernoulli Naive Bayes.  Furthermore, the model's use of Bayesian probabilities should help it avoid incorrectly predicting our less common classes. (REWORD)

In [86]:
params={'alpha': [1.05, .1]}
bnb = BernoulliNB()
bnb_gs = GridSearchCV(bnb, params, cv=3, scoring='f1_weighted', n_jobs=-1)
pipeline = make_pipeline(preprocessor, bnb_gs)
pipeline.fit(train_data, train_labels.values.ravel())
dev_pred = pipeline.predict(dev_data)
print('Accuracy: ',accuracy_score(dev_pred, dev_labels.values.ravel()))
print(classification_report(dev_pred, dev_labels.values.ravel()))

Accuracy:  0.5676698566239536
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00        16
          CA       0.00      0.05      0.00        22
          DE       0.00      0.00      0.00       144
          ES       0.00      0.01      0.00       147
          FR       0.00      0.02      0.00        58
          GB       0.00      0.00      0.00        83
          IT       0.00      0.03      0.00       152
         NDF       0.67      0.70      0.69    113654
          NL       0.00      0.00      0.00        18
          PT       0.00      0.00      0.00         7
          US       0.60      0.40      0.48     88602
       other       0.00      0.05      0.01       548

   micro avg       0.57      0.57      0.57    203451
   macro avg       0.11      0.10      0.10    203451
weighted avg       0.64      0.57      0.59    203451



In [87]:
bnb_gs.best_params_

{'alpha': 1.05}

After tuning the Laplace smoothing parameter (alpha), we are able to generate a model with a reasonably high accuracy and weighted F1 score, however, the model rarely predicts outside of our two main classes, NDF and US.  This results in a reduced macro F1 score.  This is likely because of the rarity of the other classes. (ADD MORE)

## Random Forst Classifier

For our next model, we choose a random forest classifier.  There is extensive literature on the history of decision-tree based classifiers for purchase prediction(CITATIONS NEEDED), and we examine the efficacy of this below. (ADD REASONS FOR EFFICACY IN OTHER CASES).

In [59]:
params={'n_estimators':[300], 'max_depth':[5,10,20]}
rf = RandomForestClassifier(n_jobs = -1, class_weight = 'balanced')
rf_gs = GridSearchCV(rf, params, cv=3, scoring='f1_weighted', n_jobs=-1)
pipeline = make_pipeline(preprocessor, rf_gs)
pipeline.fit(train_data, train_labels.values.ravel())
dev_pred = pipeline.predict(dev_data)
print('Accuracy: ',accuracy_score(dev_pred, dev_labels.values.ravel()))
print(classification_report(dev_pred, dev_labels.values.ravel()))

Accuracy:  0.4634482012867963
              precision    recall  f1-score   support

          AU       0.01      0.00      0.00      2452
          CA       0.02      0.01      0.01      6347
          DE       0.01      0.01      0.01      1798
          ES       0.02      0.01      0.01      4858
          FR       0.03      0.02      0.03      5246
          GB       0.02      0.01      0.02      3506
          IT       0.03      0.01      0.02      6082
         NDF       0.60      0.66      0.63    106768
          NL       0.01      0.00      0.01      2964
          PT       0.00      0.00      0.00       983
          US       0.38      0.43      0.40     52595
       other       0.06      0.06      0.06      9852

   micro avg       0.46      0.46      0.46    203451
   macro avg       0.10      0.10      0.10    203451
weighted avg       0.42      0.46      0.44    203451



In [94]:
rf_gs.best_params_

{'max_depth': 20, 'n_estimators': 300}

In acccuracy, weighted F1, and macro F1 the random forest classifier performs worse than our other classifiers above.  Even after optimizing for the number of extimators and tree depth, the model still performs somewhat worse.  Unlike our two prior examples, this model predicts the uncommon classes at a much higher rate, but is inaccurate at doing so, as evidenced by the class-level F1s.  This could be due to the model "overlearning" feature combinations in the training phase, building out decision tree branches that ultimately do not generalize.

## Logistic Regression

In [105]:
params={'C':[.001,.01, .1, 1, 10, 100, .007], 'penalty':['l1', 'l2']}
lr = LogisticRegression(class_weight = 'balanced', tol=.01)
lr_gs = GridSearchCV(lr, params, cv=3, scoring='f1_weighted', n_jobs=-1)
pipeline = make_pipeline(preprocessor, lr_gs)
pipeline.fit(train_data, train_labels.values.ravel())
dev_pred = pipeline.predict(dev_data)
print('Accuracy: ',accuracy_score(dev_pred, dev_labels.values.ravel()))
print(classification_report(dev_pred, dev_labels.values.ravel()))

Accuracy:  0.5527964964536916
              precision    recall  f1-score   support

          AU       0.07      0.00      0.01      7144
          CA       0.03      0.01      0.02      3861
          DE       0.01      0.00      0.00      1755
          ES       0.03      0.02      0.02      2750
          FR       0.00      0.02      0.00        92
          GB       0.01      0.01      0.01       890
          IT       0.01      0.02      0.01       819
         NDF       0.78      0.69      0.74    134627
          NL       0.03      0.01      0.01      3667
          PT       0.03      0.00      0.00      7473
          US       0.32      0.47      0.38     40287
       other       0.00      0.15      0.00        86

   micro avg       0.55      0.55      0.55    203451
   macro avg       0.11      0.12      0.10    203451
weighted avg       0.59      0.55      0.56    203451



In [106]:
lr_gs.best_params_

{'C': 0.007, 'penalty': 'l2'}

## XGBoost

In [129]:
#params={'reg_lambda':[100]}
params={'booster':['gbtree', 'gblinear','dart']}
xgb = XGBClassifier(class_weight = 'balanced', nthread=-1)
xgb_gs = GridSearchCV(xgb, params, cv=3, scoring='f1_weighted', n_jobs=-1)
pipeline = make_pipeline(preprocessor, xgb_gs)
pipeline.fit(train_data, train_labels.values.ravel())
dev_pred = pipeline.predict(dev_data)
print('Accuracy: ',accuracy_score(dev_pred, dev_labels.values.ravel()))
print(classification_report(dev_pred, dev_labels.values.ravel()))

Accuracy:  0.6335333815021799


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

          AU       0.00      0.00      0.00         0
          CA       0.00      0.00      0.00         0
          DE       0.00      0.00      0.00         0
          ES       0.00      0.00      0.00         0
          FR       0.00      0.00      0.00         0
          GB       0.00      0.00      0.00         0
          IT       0.00      0.00      0.00         0
         NDF       0.85      0.69      0.76    146802
          NL       0.00      0.00      0.00         0
          PT       0.00      0.00      0.00         0
          US       0.47      0.49      0.48     56649
       other       0.00      0.00      0.00         0

   micro avg       0.63      0.63      0.63    203451
   macro avg       0.11      0.10      0.10    203451
weighted avg       0.74      0.63      0.68    203451



  'recall', 'true', average, warn_for)


In [130]:
xgb_gs.best_params_

{'booster': 'gbtree'}

## Ensemble Voting Classifier

In [None]:
# Train

bnb = BernoulliNB(alpha=1.05)
rf = RandomForestClassifier(n_jobs = -1, n_estimators=300, max_depth=10, class_weight='balanced')
lr = LogisticRegression()#C=.007, class_weight='balanced')
xgb = XGBClassifier(n_jobs=-1)

vc = VotingClassifier(estimators = [('bnb', bnb),
                                    ('rf', rf),
                                    ('lr', lr),
                                    ('xgb', xgb)], voting='hard')

pipeline = make_pipeline(preprocessor, vc)

final_model = cross_validate(pipeline, train_data, train_labels,
                      scoring=["f1_weighted"],
                      return_train_score=True, cv=3, n_jobs = -1)

In [119]:
# Show accuracy results.
pipeline.fit(train_data, train_labels.values.ravel())
score = pipeline.score(dev_data, dev_labels.values.ravel())
print("Pipeline Score: %.4f" %(score))
display(pd.DataFrame(final_model))

Pipeline Score: 0.6306


Unnamed: 0,fit_time,score_time,test_f1_weighted,train_f1_weighted
0,6.050808,1.058576,0.573121,0.603624
1,5.908152,1.053457,0.585729,0.596188
2,6.193894,0.947466,0.574207,0.600389


In [135]:
pipeline.fit(train_data, train_labels.values.ravel())
dev_pred = pipeline.predict(dev_data)
print('Accuracy: ',accuracy_score(dev_pred, dev_labels.values.ravel()))
print(classification_report(dev_pred, dev_labels.values.ravel()))

Accuracy:  0.29775719952224367
              precision    recall  f1-score   support

          AU       0.19      0.01      0.01     14840
          CA       0.17      0.01      0.03     15641
          DE       0.14      0.01      0.02     11540
          ES       0.13      0.02      0.03     16542
          FR       0.13      0.04      0.06     18262
          GB       0.11      0.02      0.03     14730
          IT       0.07      0.02      0.03     10079
         NDF       0.45      0.78      0.57     68193
          NL       0.14      0.01      0.02     11078
          PT       0.15      0.00      0.01      6301
          US       0.08      0.45      0.13     10608
       other       0.04      0.07      0.05      5637

   micro avg       0.30      0.30      0.30    203451
   macro avg       0.15      0.12      0.08    203451
weighted avg       0.24      0.30      0.22    203451



In [9]:
# Generate predictions for test data to submit to Kaggle for scoring.
predictions = pipeline.predict(test_data)

# Save to csv
final_csv = 'kaggle_submission.csv'
predictions_pd = pd.DataFrame(data=predictions, columns=['country'])
test_result = pd.concat([test_data['id'], predictions_pd], axis=1, sort=False)
test_result.to_csv(final_csv, index=False)
print("WRITTEN: %s" %(final_csv))

WRITTEN: kaggle_submission.csv
