In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *



In [2]:
train = pd.read_csv('../train_dev_data/train_data.csv')
dev = pd.read_csv('../train_dev_data/dev_data.csv')

# Split off training labels & dev labels
train_labels = train.country_destination
dev_labels = dev.country_destination

In [3]:
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,g936neasyy,2013-05-12,20130512210934,2013-05-13,-unknown-,,basic,0,en,direct,direct,linked,Web,Mac Desktop,Chrome,other
1,duq2vabpp2,2013-03-02,20130302054534,,FEMALE,31.0,facebook,0,en,direct,direct,untracked,Web,iPad,Mobile Safari,NDF
2,xiymwcsklc,2011-05-17,20110517211429,,-unknown-,105.0,facebook,2,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF
3,8kkcksa0dw,2013-12-02,20131202180650,2013-12-11,-unknown-,37.0,basic,0,en,sem-brand,google,omg,Web,iPad,Mobile Safari,US
4,zk8qx61d9m,2013-11-07,20131107183734,,FEMALE,25.0,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF


In [4]:
ses = pd.read_csv('../unzipped_files/sessions.csv')
ses.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [199]:
# We're going to attempt to undersample the examples in which NDF is the outcome
# Split off rows where NDF is target variable
NDF = train[train['country_destination']=='NDF']
US = train[train['country_destination']=='US']
other = train[((train['country_destination']!='NDF') & (train['country_destination']!='US'))]

# Concatenate a portion of NDF with everything else
objs = [NDF[:10000], US[:10000], other]
new_df = pd.concat(objs, axis=0)

In [200]:
# Split off training labels & dev labels
train_labels = new_df.country_destination
dev_labels = dev.country_destination

In [201]:
# Transform variables we want to use into binary
test_cols = ['language', 'gender', 'signup_app']
tr_binary_vars = pd.get_dummies(new_df[['language', 'gender', 'signup_app', 'age']], columns=test_cols)

dev_binary_vars = pd.get_dummies(dev[['language', 'gender', 'signup_app', 'age']], columns=test_cols)

In [202]:
# Note that training data has two extra languages in it - ca & hr
# We need to add dummy columns for ca to the dev data

# List of columns in transformed training data & dev data
tr_list = list(tr_binary_vars.columns.values)
dev_list = list(dev_binary_vars.columns.values)

# difference between the sets - tr has language_ca, dev has language_id
np.setdiff1d(tr_list, dev_list)

array([], 
      dtype='|S18')

In [203]:
# tr_binary_vars = tr_binary_vars.drop(["language_ca"], axis=1)
dev_binary_vars = dev_binary_vars.drop(["language_id"], axis=1)

In [204]:
# Create arrays based on the variables we want to use
train_array = np.array(tr_binary_vars)
train_label_array = np.array(train_labels)

dev_array = np.array(dev_binary_vars)
dev_label_array = np.array(dev_labels)

In [205]:
# Try a model
def mod_test(k_vals, train_data, train_labels, dev_data, dev_labels=None):

### STUDENT START ###

    # We want to evaluate a variety of values for k, so we need
    # to enclose our work in a loop.
    for elem in k_vals:
        
        # Create a classifier object, fit our training data &
        # initialize a variable to hold the predictions
        knn = KNeighborsClassifier(n_neighbors=elem)
        knn.fit(train_data, train_labels)
        preds = knn.predict(dev_data)
        
        # We evaluate the accuracy for each value of k by comparing
        # the predictions and the labels, then updating values for
        # correct and total
        correct, total = 0, 0
        for pred, label in zip(preds, dev_labels):
            if pred == label: 
                correct += 1
            total += 1
        print 'For k=%s, total: %3d  correct: %3d  accuracy: %3.4f' %(elem, total, correct, 1.0*correct/total)

k_vals = [7, 10, 15, 20, 50]
mod_test(k_vals, train_array[:10000], train_label_array[:10000], dev_array[:2000], dev_label_array[:2000])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [134]:
new_df.language.value_counts()

en    42176
zh      258
fr      238
es      162
de      160
ko      142
it       70
ru       69
ja       41
pt       37
nl       24
sv       18
tr       15
da       10
pl        9
no        7
el        6
cs        5
fi        3
th        2
hu        2
is        1
Name: language, dtype: int64

# Bin languages

In [165]:
#Define a generic function using Pandas replace function
def coding(col, codeDict):
    colCoded = pd.Series(col, copy=True)
    for key, value in codeDict.items():
        colCoded.replace(key, value, inplace=True)
    return colCoded

new_df["lang_mod"] = coding(new_df["language"], {'en':'en', 'zh':'zh', 'fr':'fr', 'es':'es', 'de':'de', 'ko':'ko',
                                                'it':'other', 'ru':'other', 'ja':'other', 'pt':'other', 'nl':'other',
                                                'sv':'other', 'tr':'other', 'da':'other', 'pl':'other', 'no':'other',
                                                'el':'other', 'cs':'other', 'fi':'other', 'th':'other', 'hu':'other',
                                                'is':'other', 'id':'other'})

dev["lang_mod"] = coding(dev["language"], {'en':'en', 'zh':'zh', 'fr':'fr', 'es':'es', 'de':'de', 'ko':'ko',
                                                'it':'other', 'ru':'other', 'ja':'other', 'pt':'other', 'nl':'other',
                                                'sv':'other', 'tr':'other', 'da':'other', 'pl':'other', 'no':'other',
                                                'el':'other', 'cs':'other', 'fi':'other', 'th':'other', 'hu':'other',
                                                'is':'other', 'id':'other'})


# Reduce number of english language examples
en = new_df[new_df['lang_mod']=='en']
other = new_df[new_df['lang_mod']!='en']

# Concatenate a portion of en with everything else
objs = [en[:20000], other]
new_df = pd.concat(objs, axis=0)

# Split off training labels & dev labels
train_labels = new_df.country_destination
dev_labels = dev.country_destination

In [166]:
# Transform variables we want to use into binary
test_cols = ['lang_mod', 'gender']
tr_binary_vars = pd.get_dummies(new_df[['lang_mod', 'gender']], columns=test_cols)

dev_binary_vars = pd.get_dummies(dev[['lang_mod', 'gender']], columns=test_cols)


In [167]:
# Create arrays based on the variables we want to use
train_array = np.array(tr_binary_vars)
train_label_array = np.array(train_labels)

dev_array = np.array(dev_binary_vars)
dev_label_array = np.array(dev_labels)

In [168]:
# Try a model
def mod_test(k_vals, train_data, train_labels, dev_data, dev_labels=None):

### STUDENT START ###

    # We want to evaluate a variety of values for k, so we need
    # to enclose our work in a loop.
    for elem in k_vals:
        
        # Create a classifier object, fit our training data &
        # initialize a variable to hold the predictions
        knn = KNeighborsClassifier(n_neighbors=elem)
        knn.fit(train_data, train_labels)
        preds = knn.predict(dev_data)
        
        # We evaluate the accuracy for each value of k by comparing
        # the predictions and the labels, then updating values for
        # correct and total
        correct, total = 0, 0
        for pred, label in zip(preds, dev_labels):
            if pred == label: 
                correct += 1
            total += 1
        print 'For k=%s, total: %3d  correct: %3d  accuracy: %3.4f' %(elem, total, correct, 1.0*correct/total)

k_vals = [7, 10, 15, 20, 50]
mod_test(k_vals, train_array[:10000], train_label_array[:10000], dev_array[:2000], dev_label_array[:2000])

For k=7, total: 2000  correct: 1159  accuracy: 0.5795
For k=10, total: 2000  correct: 1157  accuracy: 0.5785
For k=15, total: 2000  correct: 1157  accuracy: 0.5785
For k=20, total: 2000  correct: 1155  accuracy: 0.5775
For k=50, total: 2000  correct: 1154  accuracy: 0.5770
