In [1]:
# IMPORTS 

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
import scipy.stats


# Set the randomizer seed so results are the same each time.
np.random.seed(0)

In [2]:
# Load the training split from the shared train/dev data directory.
train = pd.read_csv('../train_dev_data/train_data.csv')

# Preview only a handful of rows: a 200-row dump bloats the saved notebook
# and buries the cells that follow.
train.head(10)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,g936neasyy,2013-05-12,20130512210934,2013-05-13,-unknown-,,basic,0,en,direct,direct,linked,Web,Mac Desktop,Chrome,other
1,duq2vabpp2,2013-03-02,20130302054534,,FEMALE,31.0,facebook,0,en,direct,direct,untracked,Web,iPad,Mobile Safari,NDF
2,xiymwcsklc,2011-05-17,20110517211429,,-unknown-,105.0,facebook,2,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF
3,8kkcksa0dw,2013-12-02,20131202180650,2013-12-11,-unknown-,37.0,basic,0,en,sem-brand,google,omg,Web,iPad,Mobile Safari,US
4,zk8qx61d9m,2013-11-07,20131107183734,,FEMALE,25.0,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF
5,m279vg69ql,2013-06-20,20130620041107,2013-07-03,FEMALE,26.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,FR
6,08j7fujqln,2012-11-23,20121123122454,,-unknown-,,basic,12,en,api,other,,Moweb,Other/Unknown,-unknown-,NDF
7,zfn6qt4riu,2011-01-22,20110122230445,2011-02-18,FEMALE,35.0,basic,2,en,sem-non-brand,vast,untracked,Web,Windows Desktop,IE,other
8,bzj8eo9zt1,2013-08-30,20130830170502,2013-08-30,-unknown-,,basic,0,en,direct,direct,untracked,Web,Windows Desktop,Chrome,US
9,59firgevbj,2014-04-07,20140407225228,,-unknown-,,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,IE,NDF


In [3]:
# Load the development ("dev") split; it lives next to the training CSV.
dev = pd.read_csv(filepath_or_buffer='../train_dev_data/dev_data.csv')

In [4]:
#print(len(train))
#print(len(dev))

In [5]:
# Pull the target column (country_destination) out of each split so it can
# be passed to the classifier as the label vector.
train_labels, dev_labels = train['country_destination'], dev['country_destination']



In [6]:
# Distinct values observed in each categorical column of the training data.
languages = train.language.unique()
genders = train.gender.unique()
signup_methods = train.signup_method.unique()
affiliate_channels = train.affiliate_channel.unique()
signup_apps = train.signup_app.unique()
device_types = train.first_device_type.unique()
browsers = train.first_browser.unique()
destinations = train.country_destination.unique()

# (description, values) pairs, in the order they are reported.
option_summaries = [
    ("language", languages),
    ("gender", genders),
    ("sign up methods", signup_methods),
    ("affiliate channels", affiliate_channels),
    ("sign up apps", signup_apps),
    ("device types", device_types),
    ("browsers", browsers),
    ("destinations", destinations),
]

# A single pre-formatted string is handed to print(), so the line parses and
# emits the same text under both Python 2 (print statement with one
# parenthesised expression) and Python 3 (print function).
for description, values in option_summaries:
    print("Possible %s options: \n%s \n" % (description, values))


Possible language options: 
['en' 'zh' 'fr' 'sv' 'ja' 'it' 'es' 'ko' 'no' 'ru' 'da' 'de' 'pl' 'pt' 'el'
 'nl' 'fi' 'tr' 'th' 'cs' 'hu' 'id' 'is' 'ca' 'hr'] 

Possible gender options: 
['-unknown-' 'FEMALE' 'MALE' 'OTHER'] 

Possible sign up methods options: 
['basic' 'facebook' 'google'] 

Possible affiliate channels options: 
['direct' 'sem-brand' 'api' 'sem-non-brand' 'content' 'other' 'seo'
 'remarketing'] 

Possible sign up apps options: 
['Web' 'Moweb' 'iOS' 'Android'] 

Possible device types options: 
['Mac Desktop' 'iPad' 'Windows Desktop' 'Other/Unknown' 'iPhone'
 'Android Phone' 'Desktop (Other)' 'Android Tablet' 'SmartPhone (Other)'] 

Possible browsers options: 
['Chrome' 'Mobile Safari' 'Firefox' '-unknown-' 'IE' 'Safari' 'Silk'
 'Opera' 'AOL Explorer' 'Chrome Mobile' 'Iron' 'Android Browser'
 'BlackBerry Browser' 'RockMelt' 'Sogou Explorer' 'Apple Mail' 'Chromium'
 'SlimBrowser' 'IceWeasel' 'Maxthon' 'Yandex.Browser' 'IE Mobile'
 'Pale Moon' 'SiteKiosk' 'Opera Mobile' 'wOS

In [80]:
# CORRELATION TRIAL 1:
# Pairwise correlation between the one-hot columns of gender and destination.

columns_of_interest_corr = ['gender', 'country_destination']

# Select and encode through the same list so the two cannot drift apart.
corr_dummy = pd.get_dummies(train[columns_of_interest_corr], columns=columns_of_interest_corr)
corr_dummy.corr()

Unnamed: 0,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,country_destination_AU,country_destination_CA,country_destination_DE,country_destination_ES,country_destination_FR,country_destination_GB,country_destination_IT,country_destination_NDF,country_destination_NL,country_destination_PT,country_destination_US,country_destination_other
gender_-unknown-,1.0,-0.583299,-0.52756,-0.033369,-0.018201,-0.016904,-0.02585,-0.025241,-0.033181,-0.026422,-0.020332,0.207554,-0.017285,-0.008586,-0.16327,-0.046899
gender_FEMALE,-0.583299,1.0,-0.378869,-0.023964,0.010233,0.003879,0.005252,0.01823,0.032463,0.020016,0.023697,-0.119394,0.005131,0.004245,0.096462,0.008241
gender_MALE,-0.52756,-0.378869,1.0,-0.021674,0.00997,0.014736,0.023701,0.009565,0.003414,0.009174,-0.001764,-0.11044,0.013924,0.00507,0.084469,0.044423
gender_OTHER,-0.033369,-0.023964,-0.021674,1.0,0.001006,0.005739,0.003421,0.00181,0.005542,0.000277,0.001924,-0.016431,0.005001,0.003331,0.009957,0.005323
country_destination_AU,-0.018201,0.010233,0.00997,0.001006,1.0,-0.00413,-0.003584,-0.005177,-0.007829,-0.00526,-0.005853,-0.059391,-0.003006,-0.001599,-0.03228,-0.011161
country_destination_CA,-0.016904,0.003879,0.014736,0.005739,-0.00413,1.0,-0.005869,-0.008479,-0.012822,-0.008614,-0.009586,-0.097267,-0.004923,-0.00262,-0.052867,-0.018278
country_destination_DE,-0.02585,0.005252,0.023701,0.003421,-0.003584,-0.005869,1.0,-0.007358,-0.011127,-0.007475,-0.008319,-0.084409,-0.004273,-0.002273,-0.045878,-0.015862
country_destination_ES,-0.025241,0.01823,0.009565,0.00181,-0.005177,-0.008479,-0.007358,1.0,-0.016074,-0.010799,-0.012017,-0.121936,-0.006172,-0.003284,-0.066275,-0.022914
country_destination_FR,-0.033181,0.032463,0.003414,0.005542,-0.007829,-0.012822,-0.011127,-0.016074,1.0,-0.01633,-0.018172,-0.184396,-0.009334,-0.004966,-0.100223,-0.034652
country_destination_GB,-0.026422,0.020016,0.009174,0.000277,-0.00526,-0.008614,-0.007475,-0.010799,-0.01633,1.0,-0.012209,-0.12388,-0.00627,-0.003336,-0.067331,-0.02328


In [67]:
#TRIAL 1: language, gender, sign up method (knn best accuracy: 0.59)

columns_of_interest = ['language', 'gender', 'signup_method']
# Alternative feature set that was tried: ['language', 'gender', 'signup_app']

# One-hot encode the categorical columns of interest. The frames are selected
# through columns_of_interest so the selection and the encoding always agree.
train_dummy = pd.get_dummies(train[columns_of_interest], columns=columns_of_interest)
dev_dummy = pd.get_dummies(dev[columns_of_interest], columns=columns_of_interest)

# get_dummies only emits a column for a category that actually occurs in the
# frame it is given, so a value present in just one split would leave train
# and dev with different feature columns. Re-index dev onto train's columns:
# columns missing from dev are added as all-zero, columns unknown to train
# are dropped, and the column order matches train.
dev_dummy = dev_dummy.reindex(columns=train_dummy.columns, fill_value=0)




In [64]:
# TRIAL 2: gender, sign up method (knn best accuracy: 0.58)
#
# NOTE: this cell assigns the same names (columns_of_interest, train_dummy,
# dev_dummy) as the Trial 1 cell, so whichever trial cell ran last decides
# what the later cells see.

columns_of_interest = ['gender', 'signup_method']
# Alternative feature set that was tried: ['language', 'gender', 'signup_app']

# One-hot encode the categorical columns of interest. The frames are selected
# through columns_of_interest so the selection and the encoding always agree.
train_dummy = pd.get_dummies(train[columns_of_interest], columns=columns_of_interest)
dev_dummy = pd.get_dummies(dev[columns_of_interest], columns=columns_of_interest)

# get_dummies only emits a column for a category that actually occurs in the
# frame it is given. Re-index dev onto train's columns so both splits expose
# identical features: missing columns become all-zero, extras are dropped.
dev_dummy = dev_dummy.reindex(columns=train_dummy.columns, fill_value=0)

EXPLORATORY ANALYSIS

In [73]:
# Sanity check: train and dev must expose the same dummy columns before they
# are handed to the classifier.
# An earlier note here recorded that the training data had two languages
# (ca & hr) that dev lacked -- verify with the check below rather than
# assuming it still holds.

# List of columns in transformed training data & dev data
train_dummy_list = list(train_dummy.columns.values)
dev_dummy_list = list(dev_dummy.columns.values)

# Compare in both directions: a column present only in dev is as much of a
# mismatch as one present only in train.
missing_from_dev = np.setdiff1d(train_dummy_list, dev_dummy_list)
missing_from_train = np.setdiff1d(dev_dummy_list, train_dummy_list)
missing_from_dev, missing_from_train

array([], 
      dtype='|S22')

In [74]:
# Column labels of the encoded training frame.
list(train_dummy.columns)

['language_ca',
 'language_cs',
 'language_da',
 'language_de',
 'language_el',
 'language_en',
 'language_es',
 'language_fi',
 'language_fr',
 'language_hr',
 'language_hu',
 'language_id',
 'language_is',
 'language_it',
 'language_ja',
 'language_ko',
 'language_nl',
 'language_no',
 'language_pl',
 'language_pt',
 'language_ru',
 'language_sv',
 'language_th',
 'language_tr',
 'language_zh',
 'gender_-unknown-',
 'gender_FEMALE',
 'gender_MALE',
 'gender_OTHER',
 'signup_method_basic',
 'signup_method_facebook',
 'signup_method_google']

In [75]:
# Column labels of the encoded dev frame.
list(dev_dummy.columns)

['language_ca',
 'language_cs',
 'language_da',
 'language_de',
 'language_el',
 'language_en',
 'language_es',
 'language_fi',
 'language_fr',
 'language_hr',
 'language_hu',
 'language_id',
 'language_is',
 'language_it',
 'language_ja',
 'language_ko',
 'language_nl',
 'language_no',
 'language_pl',
 'language_pt',
 'language_ru',
 'language_sv',
 'language_th',
 'language_tr',
 'language_zh',
 'gender_-unknown-',
 'gender_FEMALE',
 'gender_MALE',
 'gender_OTHER',
 'signup_method_basic',
 'signup_method_facebook',
 'signup_method_google']

In [71]:
# Create arrays based on the variables we want to use
train_array = np.array(train_dummy)
train_label_array = np.array(train_labels)

dev_array = np.array(dev_dummy)
dev_label_array = np.array(dev_labels)

In [72]:
def knn_model_test(k_vals, train_data, train_labels, dev_data, dev_labels):
    """Fit a k-nearest-neighbours classifier per k and report dev accuracy.

    For every value in ``k_vals`` a fresh ``KNeighborsClassifier`` is fitted
    on ``train_data`` / ``train_labels`` and used to predict ``dev_data``.
    One summary line (total, correct, accuracy) is printed per k.

    Args:
        k_vals: iterable of neighbour counts to evaluate.
        train_data: training feature matrix passed to ``fit``.
        train_labels: training labels passed to ``fit``.
        dev_data: feature matrix passed to ``predict``.
        dev_labels: true labels for ``dev_data``, one per row.

    Returns:
        dict mapping each k to its accuracy (a float in [0, 1]) on the dev
        set, so callers can pick the best k without parsing printed text.

    Raises:
        ValueError: if ``dev_labels`` is empty (accuracy is undefined) or if
            the number of predictions differs from the number of labels.
    """
    # Materialise the labels once; this also strips any pandas index so the
    # comparison below is purely positional.
    dev_labels = np.asarray(dev_labels)
    total = len(dev_labels)
    if total == 0:
        raise ValueError("dev_labels is empty; accuracy is undefined")

    accuracies = {}
    for elem in k_vals:
        # Fit a classifier for this k and predict the dev set.
        knn = KNeighborsClassifier(n_neighbors=elem)
        knn.fit(train_data, train_labels)
        preds = np.asarray(knn.predict(dev_data))

        # Fail loudly instead of silently scoring a truncated prefix.
        if len(preds) != total:
            raise ValueError(
                "dev_data yielded %d predictions but dev_labels has %d entries"
                % (len(preds), total))

        # Vectorised count of matching predictions.
        correct = int(np.sum(preds == dev_labels))
        accuracy = 1.0 * correct / total

        # Single pre-formatted argument: valid and identical on Python 2 and 3.
        print('For k=%s, total: %3d  correct: %3d  accuracy: %3.2f' % (elem, total, correct, accuracy))
        accuracies[elem] = accuracy

    return accuracies

# Neighbour counts to compare.
k_vals = [7, 10, 15, 20, 50]

# Number of leading rows taken from each split. Named once here instead of
# repeating the literal in every slice (presumably a cap to keep the KNN run
# fast -- confirm before raising it).
N_SAMPLES = 2000

knn_model_test(
    k_vals,
    train_array[:N_SAMPLES], train_label_array[:N_SAMPLES],
    dev_array[:N_SAMPLES], dev_label_array[:N_SAMPLES],
)

For k=7, total: 2000  correct: 1183  accuracy: 0.59
For k=10, total: 2000  correct: 1158  accuracy: 0.58
For k=15, total: 2000  correct: 1161  accuracy: 0.58
For k=20, total: 2000  correct: 1160  accuracy: 0.58
For k=50, total: 2000  correct: 1161  accuracy: 0.58
