In [7]:
# Import libraries.
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [8]:
# Import data files from Kaggle.
# Every CSV found under DATA_PATH is loaded into `dfs_raw`, keyed by the file
# name up to its first '.', e.g. 'train_users_2.csv' -> 'train_users_2'.
DATA_PATH = './data/extracted'
dfs_raw = {}
for root, dirs, files in os.walk(DATA_PATH):
    # Sorted so the load/print order does not depend on the filesystem.
    for file in sorted(files):
        if not file.lower().endswith('.csv'):
            # Skip stray non-CSV files (e.g. hidden OS files) that read_csv would choke on.
            continue
        # Join with `root` (not DATA_PATH) so files inside sub-directories
        # resolve to their real location on disk.
        dfs_raw[file.split('.')[0]] = pd.read_csv(os.path.join(root, file))
        print(file)

# Working dict, built once after loading. This is a shallow copy: both dicts
# reference the same DataFrame objects, so rebinding keys in `dfs` is safe but
# mutating a frame in place would also affect `dfs_raw`.
dfs = dfs_raw.copy()

age_gender_bkts.csv
countries.csv
sample_submission_NDF.csv
sessions.csv
test_users.csv
train_users_2.csv


In [9]:
# Split into data and labels (pandas DataFrames / NumPy array).
# Reduced to the first 10k of ~213k rows since it was taking forever to even test anything.
# TODO: we should randomize since accounts are in chronological order.
N_TRAIN_ROWS = 10000
LABEL_COLUMN = "country_destination"

# Take the row subset once and derive both features and labels from it.
train_subset = dfs["train_users_2"].iloc[:N_TRAIN_ROWS]
# Drop the label by name rather than by position ("everything but the last column");
# assumes the label is the last column of train_users_2, as the original positional slice did.
train_data   = train_subset.drop(columns=[LABEL_COLUMN])
# .to_numpy() replaces the deprecated Series.ravel() and yields the same 1-D array.
train_labels = train_subset[LABEL_COLUMN].to_numpy()
test_data    = dfs["test_users"]

In [10]:
#Function to bucket ages prior to one-hot encoding
def age_bucketer(df_input):
    """Replace the numeric ``age`` column with a categorical ``age_bucket`` column.

    Ages are grouped into 5-year buckets ('0-4', '5-9', ..., '95-99') plus a
    catch-all '100+' bucket for ages up to 10000. Missing ages, and ages that
    fall outside every bucket (negative or above 10000), are labelled
    'unknown' so that no NaN reaches the downstream one-hot encoder.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing an ``age`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``age`` removed and a string ``age_bucket`` column added.
    """
    bin_edges = [0, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 99, 10000]
    bin_labels = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49',
                  '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90-94', '95-99', '100+']

    # Work on a copy so the caller's frame does not silently gain a column.
    df = df_input.copy()

    # include_lowest=True makes the first bucket closed on the left, so age 0 lands in '0-4'.
    buckets = pd.cut(df['age'], bin_edges, labels=bin_labels, include_lowest=True)

    # Convert to plain objects first: 'unknown' is not one of the categorical's
    # declared categories, so it cannot be written into the categorical directly.
    df['age_bucket'] = buckets.astype(object).where(buckets.notna(), 'unknown')
    return df.drop(['age'], axis=1)

#Since NaN's in categorical data will cause issues with our pipeline we will replace that with "unknown".
def clean_first_affiliate_tracked_nulls(df_input):
    """Replace missing ``first_affiliate_tracked`` values with the string "unknown".

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing a ``first_affiliate_tracked`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``first_affiliate_tracked`` column has no nulls.
    """
    # assign() returns a new frame, so the caller's data keeps its original NaNs.
    return df_input.assign(
        first_affiliate_tracked=df_input['first_affiliate_tracked'].fillna("unknown"))

#Add month and year features
def feature_creator(df_input):
    """Add ``year_first_active`` and ``month_first_active`` columns.

    Both are derived from ``timestamp_first_active``, which is parsed with the
    format ``%Y%m%d%H%M%S`` (e.g. 20090319043255).

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing a ``timestamp_first_active`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two extra columns appended after the existing ones.
    """
    # Keep the parsed timestamps in a local Series instead of a temporary
    # column, so nothing is written to (or left behind on) the caller's frame.
    first_active = pd.to_datetime(df_input['timestamp_first_active'], format='%Y%m%d%H%M%S')
    return df_input.assign(
        year_first_active=first_active.dt.year,
        month_first_active=first_active.dt.month)

In [11]:
# Create Preprocessor pipeline.
def create_preprocessor_pipeline():
    """Build the (unfitted) preprocessing pipeline for the user table.

    The pipeline applies, in order: ``age_bucketer``, ``feature_creator`` and
    ``clean_first_affiliate_tracked_nulls`` as stateless function transformers,
    then one-hot encodes the categorical columns listed below. Every column not
    listed is dropped.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The preprocessing pipeline, ready to be chained with an estimator.
    """
    categorical_columns = [
        'gender',
        'signup_method',
        'signup_flow',
        'language',
        'affiliate_channel',
        'affiliate_provider',
        'first_affiliate_tracked',
        'signup_app',
        'first_device_type',
        'first_browser',
        'age_bucket',
    ]

    # make_column_transformer expects (transformer, columns) tuples; the older
    # (columns, transformer) order is rejected by current scikit-learn releases.
    # handle_unknown='ignore' encodes categories unseen during fit as all zeros.
    # NOTE(review): remainder='drop' also discards year_first_active and
    # month_first_active produced by feature_creator - confirm that is intended.
    column_transformer = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        remainder='drop')  # when we add in sessions features we will want to pass remainders

    preprocessor = make_pipeline(
        FunctionTransformer(age_bucketer, validate=False),
        FunctionTransformer(feature_creator, validate=False),
        FunctionTransformer(clean_first_affiliate_tracked_nulls, validate=False),
        column_transformer)

    return preprocessor

In [12]:
# Stage: Data Preprocessor.
# Build the unfitted preprocessing pipeline; it is chained with the classifier in the training cell.
preprocessor = create_preprocessor_pipeline()



In [13]:
# Train
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Fixed seed so the random forest, and therefore the CV scores, are reproducible across runs.
RANDOM_STATE = 42

bnb = BernoulliNB()
rf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)
lr = LogisticRegression()

# Hard voting: the ensemble predicts the class chosen by the majority of the three models.
vc = VotingClassifier(estimators=[('bnb', bnb), ('rf', rf), ('lr', lr)], voting='hard')

pipeline = make_pipeline(preprocessor, vc)

# NOTE: despite its name, `final_model` is the dict of per-fold timings and
# scores returned by cross_validate, not a fitted estimator. The name is kept
# because the results cell below reads it.
final_model = cross_validate(pipeline, train_data, train_labels,
                             scoring=["f1_macro"],
                             return_train_score=True, cv=3, n_jobs=-1)

In [15]:
# Show cross-validation results: per-fold fit/score times and macro-F1 on the test and train folds.
display(pd.DataFrame(final_model))

Unnamed: 0,fit_time,score_time,test_f1_macro,train_f1_macro
0,2.382014,0.387995,0.095529,0.185941
1,2.358514,0.410005,0.0933,0.189705
2,2.490518,0.301499,0.091781,0.197438


In [None]:
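# Step 7
# Generate predictions for test data to submit to Kaggle for scoring.
# TODO: `clf` is not defined in this notebook; fit `pipeline` on the training data first, e.g.
# predictions = pipeline.fit(train_data, train_labels).predict(test_data)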