In [2]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [3]:
train = pd.read_csv('../train_dev_data/train_data.csv')
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,g936neasyy,2013-05-12,20130512210934,2013-05-13,-unknown-,,basic,0,en,direct,direct,linked,Web,Mac Desktop,Chrome,other
1,duq2vabpp2,2013-03-02,20130302054534,,FEMALE,31.0,facebook,0,en,direct,direct,untracked,Web,iPad,Mobile Safari,NDF
2,xiymwcsklc,2011-05-17,20110517211429,,-unknown-,105.0,facebook,2,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF
3,8kkcksa0dw,2013-12-02,20131202180650,2013-12-11,-unknown-,37.0,basic,0,en,sem-brand,google,omg,Web,iPad,Mobile Safari,US
4,zk8qx61d9m,2013-11-07,20131107183734,,FEMALE,25.0,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF


In [3]:
dev = pd.read_csv('../train_dev_data/dev_data.csv')
dev.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,vwcvba5tbi,2013-05-10,20130510181208,2013-05-17,MALE,43.0,facebook,0,en,direct,direct,linked,Web,Mac Desktop,Chrome,US
1,lh49nmvbhr,2014-05-30,20140530230300,2014-12-15,FEMALE,35.0,basic,0,en,sem-brand,google,omg,Web,Windows Desktop,Chrome,CA
2,peowrqmisu,2013-03-15,20130315150346,,-unknown-,,basic,0,en,direct,direct,linked,Web,Mac Desktop,Safari,NDF
3,568wanlyy1,2014-02-10,20140210064831,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari,NDF
4,ihi4gy0dyl,2014-01-01,20140101193508,2014-01-06,FEMALE,29.0,basic,0,en,direct,direct,untracked,Web,Windows Desktop,Chrome,other


In [4]:
# Split off training labels & dev labels
train_labels = train.country_destination
dev_labels = dev.country_destination

In [5]:
# Explore relationships between variables and outcome
#pd.crosstab(train.language, train.country_destination)

In [6]:
# pd.crosstab(train.gender, train.country_destination)

In [7]:
# pd.crosstab(train.first_device_type, train.country_destination)

In [35]:
# Transform variables we want to use into binary
test_cols = ['language', 'gender']
tr_binary_vars = pd.get_dummies(train[['language', 'gender']], columns=test_cols)
# tr_binary_vars.head()

dev_binary_vars = pd.get_dummies(train[['language', 'gender']], columns=test_cols)
# dev_binary_vars.head()

In [36]:
# Create arrays based on the variables we want to use
train_array = np.array(tr_binary_vars)
train_label_array = np.array(train_labels)

dev_array = np.array(dev_binary_vars)
dev_label_array = np.array(dev_labels)

In [37]:
# Try a model
def mod_test(k_vals, train_data, train_labels, dev_data, dev_labels):

### STUDENT START ###

    # We want to evaluate a variety of values for k, so we need
    # to enclose our work in a loop.
    for elem in k_vals:
        
        # Create a classifier object, fit our training data &
        # initialize a variable to hold the predictions
        knn = KNeighborsClassifier(n_neighbors=elem)
        knn.fit(train_data, train_labels)
        preds = knn.predict(dev_data)
        
        # We evaluate the accuracy for each value of k by comparing
        # the predictions and the labels, then updating values for
        # correct and total
        correct, total = 0, 0
        for pred, label in zip(preds, dev_labels):
            if pred == label: 
                correct += 1
            total += 1
        print 'For k=%s, total: %3d  correct: %3d  accuracy: %3.2f' %(elem, total, correct, 1.0*correct/total)

k_vals = [7, 10, 15, 20, 50]
mod_test(k_vals, train_array[:2000], train_label_array[:2000], dev_array[:2000], dev_label_array[:2000])

For k=7, total: 2000  correct: 1009  accuracy: 0.50
For k=10, total: 2000  correct: 1158  accuracy: 0.58
For k=15, total: 2000  correct: 1155  accuracy: 0.58
For k=20, total: 2000  correct: 1157  accuracy: 0.58
For k=50, total: 2000  correct: 1158  accuracy: 0.58


In [30]:
# Try another model
def mod_test2(train_data, train_labels, dev_data, dev_labels):
    
    # LOGISTIC REGRESSION
    log = LogisticRegression()
    
    C_options = {'C': np.arange(0.1, 1, 0.1)}
    log_grid = GridSearchCV(log, C_options)

    log_grid.fit(train_data, train_labels)
    log_preds = log_grid.predict(dev_data)
    
    # Output best param
    print "Best value for C: %.2f" %log_grid.best_params_['C']
    print "F1 score for Logistic Regression: %.3f" %metrics.f1_score(dev_labels, log_preds, average="weighted") + "\n"
    

### STUDENT END ###

mod_test2(train_array[:2000], train_label_array[:2000], dev_array[:2000], dev_label_array[:2000])

Best value for C: 0.10
F1 score for Logistic Regression: 0.425

