In [2]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *


# Set the randomizer seed so results are the same each time.
np.random.seed(0)



In [3]:
train = pd.read_csv('../train_dev_data/train_data.csv')
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,g936neasyy,2013-05-12,20130512210934,2013-05-13,-unknown-,,basic,0,en,direct,direct,linked,Web,Mac Desktop,Chrome,other
1,duq2vabpp2,2013-03-02,20130302054534,,FEMALE,31.0,facebook,0,en,direct,direct,untracked,Web,iPad,Mobile Safari,NDF
2,xiymwcsklc,2011-05-17,20110517211429,,-unknown-,105.0,facebook,2,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF
3,8kkcksa0dw,2013-12-02,20131202180650,2013-12-11,-unknown-,37.0,basic,0,en,sem-brand,google,omg,Web,iPad,Mobile Safari,US
4,zk8qx61d9m,2013-11-07,20131107183734,,FEMALE,25.0,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF


In [4]:
dev = pd.read_csv('../train_dev_data/dev_data.csv')
#dev.head()

In [4]:
#print(len(train))
#print(len(dev))

In [6]:
#creating labels sets from our train and dev data sets
train_labels = train.country_destination
dev_labels = dev.country_destination



In [26]:
min_age = train[['age']].min(axis=0)
max_age = train[['age']].max(axis=0)

min_age

age    1.0
dtype: float64

In [28]:
max_age

age    2014.0
dtype: float64

In [18]:
languages = train.language.unique()
genders = train.gender.unique()
signup_methods = train.signup_method.unique()
affiliate_channels = train.affiliate_channel.unique()
signup_apps = train.signup_app.unique()
device_types = train.first_device_type.unique()
browsers = train.first_browser.unique()
destinations = train.country_destination.unique()
signup_flow = train.signup_flow.unique()
ages = train.age.unique()

print"Possible language options: \n", languages, "\n"
print"Possible gender options: \n", genders, "\n"
print"Possible sign up methods options: \n", signup_methods, "\n"
print"Possible affiliate channels options: \n", affiliate_channels, "\n"
print"Possible sign up apps options: \n", signup_apps, "\n"
print"Possible device types options: \n", device_types, "\n"
print"Possible browsers options: \n", browsers, "\n"
print"Possible destinations options: \n", destinations, "\n"
print"Possible signup flow options: \n", signup_flow, "\n"
print"Possible age options: \n", ages, "\n"

Possible language options: 
['en' 'zh' 'fr' 'sv' 'ja' 'it' 'es' 'ko' 'no' 'ru' 'da' 'de' 'pl' 'pt' 'el'
 'nl' 'fi' 'tr' 'th' 'cs' 'hu' 'id' 'is' 'ca' 'hr'] 

Possible gender options: 
['-unknown-' 'FEMALE' 'MALE' 'OTHER'] 

Possible sign up methods options: 
['basic' 'facebook' 'google'] 

Possible affiliate channels options: 
['direct' 'sem-brand' 'api' 'sem-non-brand' 'content' 'other' 'seo'
 'remarketing'] 

Possible sign up apps options: 
['Web' 'Moweb' 'iOS' 'Android'] 

Possible device types options: 
['Mac Desktop' 'iPad' 'Windows Desktop' 'Other/Unknown' 'iPhone'
 'Android Phone' 'Desktop (Other)' 'Android Tablet' 'SmartPhone (Other)'] 

Possible browsers options: 
['Chrome' 'Mobile Safari' 'Firefox' '-unknown-' 'IE' 'Safari' 'Silk'
 'Opera' 'AOL Explorer' 'Chrome Mobile' 'Iron' 'Android Browser'
 'BlackBerry Browser' 'RockMelt' 'Sogou Explorer' 'Apple Mail' 'Chromium'
 'SlimBrowser' 'IceWeasel' 'Maxthon' 'Yandex.Browser' 'IE Mobile'
 'Pale Moon' 'SiteKiosk' 'Opera Mobile' 'wOS

In [6]:
# CORRELATION TRIAL 1:

columns_of_interest_corr = ['gender', 'age']

corr_dummy = pd.get_dummies(train[['gender', 'age']], columns=columns_of_interest_corr)
corr_dummy.corr()

Unnamed: 0,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,age_1.0,age_2.0,age_4.0,age_5.0,age_15.0,age_16.0,...,age_1935.0,age_1936.0,age_1938.0,age_1942.0,age_1947.0,age_1949.0,age_1953.0,age_1995.0,age_2013.0,age_2014.0
gender_-unknown-,1.000000,-0.583299,-0.527560,-0.033369,-0.002076,-0.001836,-0.000922,-0.009999,-0.003742,-0.010171,...,-0.002076,-0.002936,-0.002076,-0.002076,-0.002936,-0.002936,-0.002076,-0.002076,-0.010107,-0.042065
gender_FEMALE,-0.583299,1.000000,-0.378869,-0.023964,-0.001491,0.002068,-0.002582,0.007672,-0.002036,0.007128,...,-0.001491,0.001462,0.003559,0.003559,0.001462,0.001462,-0.001491,0.003559,0.008529,0.041164
gender_MALE,-0.527560,-0.378869,1.000000,-0.021674,0.003935,-0.000055,0.003765,0.003424,0.006418,0.004179,...,0.003935,0.001829,-0.001348,-0.001348,0.001829,0.001829,0.003935,-0.001348,0.002647,0.004666
gender_OTHER,-0.033369,-0.023964,-0.021674,1.000000,-0.000085,-0.000171,-0.000148,-0.000533,-0.000226,-0.000418,...,-0.000085,-0.000121,-0.000085,-0.000085,-0.000121,-0.000121,-0.000085,-0.000085,-0.000483,0.002931
age_1.0,-0.002076,-0.001491,0.003935,-0.000085,1.000000,-0.000011,-0.000009,-0.000033,-0.000014,-0.000026,...,-0.000005,-0.000008,-0.000005,-0.000005,-0.000008,-0.000008,-0.000005,-0.000005,-0.000030,-0.000131
age_2.0,-0.001836,0.002068,-0.000055,-0.000171,-0.000011,1.000000,-0.000018,-0.000066,-0.000028,-0.000052,...,-0.000011,-0.000015,-0.000011,-0.000011,-0.000015,-0.000015,-0.000011,-0.000011,-0.000060,-0.000263
age_4.0,-0.000922,-0.002582,0.003765,-0.000148,-0.000009,-0.000018,1.000000,-0.000057,-0.000024,-0.000045,...,-0.000009,-0.000013,-0.000009,-0.000009,-0.000013,-0.000013,-0.000009,-0.000009,-0.000052,-0.000228
age_5.0,-0.009999,0.007672,0.003424,-0.000533,-0.000033,-0.000066,-0.000057,1.000000,-0.000088,-0.000162,...,-0.000033,-0.000047,-0.000033,-0.000033,-0.000047,-0.000047,-0.000033,-0.000033,-0.000187,-0.000821
age_15.0,-0.003742,-0.002036,0.006418,-0.000226,-0.000014,-0.000028,-0.000024,-0.000088,1.000000,-0.000069,...,-0.000014,-0.000020,-0.000014,-0.000014,-0.000020,-0.000020,-0.000014,-0.000014,-0.000079,-0.000348
age_16.0,-0.010171,0.007128,0.004179,-0.000418,-0.000026,-0.000052,-0.000045,-0.000162,-0.000069,1.000000,...,-0.000026,-0.000037,-0.000026,-0.000026,-0.000037,-0.000037,-0.000026,-0.000026,-0.000147,-0.000644


In [9]:
f, ax = pl.subplots(figsize=(10, 8))
corr = dataframe.corr()
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

NameError: name 'pl' is not defined

In [8]:
#TRIAL 1: language, gender, sign up method (knn best accuracy: 0.59)


columns_of_interest = ['language', 'gender', 'signup_method']
#columns_of_interest = ['language', 'gender', 'signup_app']

#creating new dataframe from train with integer values for categorical variables
#only including columns / variables of interest
train_dummy = pd.get_dummies(train[['language', 'gender', 'signup_method']], columns=columns_of_interest)
dev_dummy = pd.get_dummies(dev[['language', 'gender', 'signup_method']], columns=columns_of_interest)




In [64]:
# TRIAL 2: gender, sign up method (knn best accuracy: 0.58)
    
columns_of_interest = ['gender', 'signup_method']
#columns_of_interest = ['language', 'gender', 'signup_app']

#creating new dataframe from train with integer values for categorical variables
#only including columns / variables of interest
train_dummy = pd.get_dummies(train[['gender', 'signup_method']], columns=columns_of_interest)
dev_dummy = pd.get_dummies(dev[['gender', 'signup_method']], columns=columns_of_interest)

In [8]:
# Note that training data has two extra languages in it - ca & hr
# We need to add dummy columns for ca & hr to the dev data

# List of columns in transformed training data & dev data
train_dummy_list = list(train_dummy.columns.values)
dev_dummy_list = list(dev_dummy.columns.values)

# difference between the sets
np.setdiff1d(train_dummy_list, dev_dummy_list)

array(['language_ca', 'language_hr'], 
      dtype='|S22')

In [9]:
dev_dummy.insert(0, 'language_ca', 0, allow_duplicates=False)
dev_dummy.insert(9, 'language_hr', 0, allow_duplicates=False)
dev_dummy.head()

Unnamed: 0,language_ca,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,language_hr,...,language_th,language_tr,language_zh,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_method_basic,signup_method_facebook,signup_method_google
0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [9]:
list(train_dummy)

['language_ca',
 'language_cs',
 'language_da',
 'language_de',
 'language_el',
 'language_en',
 'language_es',
 'language_fi',
 'language_fr',
 'language_hr',
 'language_hu',
 'language_id',
 'language_is',
 'language_it',
 'language_ja',
 'language_ko',
 'language_nl',
 'language_no',
 'language_pl',
 'language_pt',
 'language_ru',
 'language_sv',
 'language_th',
 'language_tr',
 'language_zh',
 'gender_-unknown-',
 'gender_FEMALE',
 'gender_MALE',
 'gender_OTHER',
 'signup_method_basic',
 'signup_method_facebook',
 'signup_method_google']

In [10]:
list(dev_dummy)

['language_cs',
 'language_da',
 'language_de',
 'language_el',
 'language_en',
 'language_es',
 'language_fi',
 'language_fr',
 'language_hu',
 'language_id',
 'language_is',
 'language_it',
 'language_ja',
 'language_ko',
 'language_nl',
 'language_no',
 'language_pl',
 'language_pt',
 'language_ru',
 'language_sv',
 'language_th',
 'language_tr',
 'language_zh',
 'gender_-unknown-',
 'gender_FEMALE',
 'gender_MALE',
 'gender_OTHER',
 'signup_method_basic',
 'signup_method_facebook',
 'signup_method_google']

In [10]:
# Create arrays based on the variables we want to use
train_array = np.array(train_dummy)
train_label_array = np.array(train_labels)

dev_array = np.array(dev_dummy)
dev_label_array = np.array(dev_labels)

In [11]:
def knn_model_test(k_vals, train_data, train_labels, dev_data, dev_labels):

### STUDENT START ###

    # We want to evaluate a variety of values for k, so we need
    # to enclose our work in a loop.
    for elem in k_vals:
        
        # Create a classifier object, fit our training data &
        # initialize a variable to hold the predictions
        knn = KNeighborsClassifier(n_neighbors=elem)
        knn.fit(train_data, train_labels)
        preds = knn.predict(dev_data)
        
        # We evaluate the accuracy for each value of k by comparing
        # the predictions and the labels, then updating values for
        # correct and total
        correct, total = 0, 0
        for pred, label in zip(preds, dev_labels):
            if pred == label: 
                correct += 1
            total += 1
        print 'For k=%s, total: %3d  correct: %3d  accuracy: %3.2f' %(elem, total, correct, 1.0*correct/total)

k_vals = [7, 10, 15, 20, 50]
knn_model_test(k_vals, train_array[:2000], train_label_array[:2000], dev_array[:2000], dev_label_array[:2000])

For k=7, total: 2000  correct: 1183  accuracy: 0.59
For k=10, total: 2000  correct: 1158  accuracy: 0.58
For k=15, total: 2000  correct: 1161  accuracy: 0.58
For k=20, total: 2000  correct: 1160  accuracy: 0.58
For k=50, total: 2000  correct: 1161  accuracy: 0.58


In [14]:
def mod_test2(train_data, train_labels, dev_data, dev_labels):
    
    # LOGISTIC REGRESSION
    log = LogisticRegression()
    
    C_options = {'C': np.arange(0.1, 1, 0.1)}
    log_grid = GridSearchCV(log, C_options)

    log_grid.fit(train_data, train_labels)
    log_preds = log_grid.predict(dev_data)
    
    # Output best param
    print "Best value for C: %.2f" %log_grid.best_params_['C']
    print "F1 score for Logistic Regression: %.3f" %metrics.f1_score(dev_labels, log_preds, average="weighted") + "\n"
    

mod_test2(train_array[:2000], train_label_array[:2000], dev_array[:2000], dev_label_array[:2000])



Best value for C: 0.50
F1 score for Logistic Regression: 0.550



  'precision', 'predicted', average, warn_for)


In [None]:
 def mod_test3(train_data, train_labels, dev_data, dev_labels):
    mult = MultinomialNB(alpha = 1)
    # floor converts data into a discrete-like set
    mult.fit(dig_mini_train_data, mini_train_labels) 
    mult_score = mult.score(dig_dev_data, dev_labels) 
    
  
    print 'bernoulli accuracy: %3.2f' %bern.score(dev_data, dev_labels)
    print 'multinomial accuracy: %3.2f' %mult_score
    
    print '\n'
    plt.hist(dev_data)
    
mod_test2(train_array[:2000], train_label_array[:2000], dev_array[:2000], dev_label_array[:2000])

In [33]:
barplot_affiliate = train[['signup_flow']].copy()

In [None]:
barplot_affiliate.plot.bar(stacked=True)

In [5]:
train[['signup_method']]

Unnamed: 0,signup_method
0,basic
1,facebook
2,facebook
3,basic
4,basic
5,facebook
6,basic
7,basic
8,basic
9,basic
