In [45]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import preprocessing

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [46]:
train = pd.read_csv('../train_dev_data/train_data.csv')
dev = pd.read_csv('../train_dev_data/dev_data.csv')

# Split off training labels & dev labels
train_labels = train.country_destination
dev_labels = dev.country_destination

# Make the dataset tiny for testing purposes
train = train[:15000]
train_labels = train_labels[:15000]

In [47]:
ses = pd.read_csv('../unzipped_files/sessions.csv')
# ses.head()

In [48]:
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,g936neasyy,2013-05-12,20130512210934,2013-05-13,-unknown-,,basic,0,en,direct,direct,linked,Web,Mac Desktop,Chrome,other
1,duq2vabpp2,2013-03-02,20130302054534,,FEMALE,31.0,facebook,0,en,direct,direct,untracked,Web,iPad,Mobile Safari,NDF
2,xiymwcsklc,2011-05-17,20110517211429,,-unknown-,105.0,facebook,2,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF
3,8kkcksa0dw,2013-12-02,20131202180650,2013-12-11,-unknown-,37.0,basic,0,en,sem-brand,google,omg,Web,iPad,Mobile Safari,US
4,zk8qx61d9m,2013-11-07,20131107183734,,FEMALE,25.0,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF


In [49]:
def parse_month(col):
    start = col.find("-")
    end = col.find("-", start+1)
    month = col[start+1:end]
    return month

train["month_created"] = train.date_account_created.apply(parse_month)
dev["month_created"] = dev.date_account_created.apply(parse_month)

In [50]:
def parse_season(col):
    if col in ('12', '01', '02'):
        return 'Winter'
    elif col in ('03', '04', '05'):
        return 'Spring'
    elif col in ('06', '07', '08'):
        return 'Summer'
    else:
        return 'Fall'
    
train["season_created"] = train.month_created.apply(parse_season)
dev["season_created"] = dev.month_created.apply(parse_season)

In [51]:
def parse_year(col):
    stop = col.find("-")
    year = col[:stop]
    return year

train["year_created"] = train.date_account_created.apply(parse_year)
dev["year_created"] = dev.date_account_created.apply(parse_year)

In [52]:
def bin_age(col):
    if col >= 65 and col < 100:
        return "100+"
    elif col >=45 and col < 65:
        return "45-65"
    elif col >=30 and col < 45:
        return "30-45"
    elif col >= 0 and col < 30:
        return "Under30"
    else:
        return "Unknown"
    
train["bin_age"] = train.age.apply(bin_age)
dev["bin_age"] = train.age.apply(bin_age)

In [53]:
def booking_flag(col):
    if type(col) != 'str':
        return 0
    else:
        return 1

train["booking_flag"] = train.date_first_booking.apply(booking_flag)
dev["booking_flag"] = dev.date_first_booking.apply(booking_flag)

In [54]:
def bin_lang(col):
    if col in ('en', 'zh', 'es', 'fr'):
        return col
    else:
        return 'other'

train["bin_lang"] = train.language.apply(booking_flag)
dev["bin_lang"] = dev.language.apply(booking_flag)

# Test Model 1

In [55]:
# Define a function that will encode columns for decision tree
def label_encode(df, cols):
    le = preprocessing.LabelEncoder()
    for elem in cols:
        df[[elem]] = le.fit_transform(df[[elem]]) 
    return df

dt_train = train[['age', 'season_created', 'month_created', 'year_created', 'bin_age', 'bin_lang', 'gender', 
                  'booking_flag', 'signup_method', 'signup_app', 'first_device_type']]
label_encode(dt_train, ['age', 'season_created', 'month_created', 'year_created', 'bin_age', 'bin_lang', 'gender', 
                        'booking_flag', 'signup_method', 'signup_app', 'first_device_type'])

dt_dev = dev[['age', 'season_created', 'month_created', 'year_created', 'bin_age', 'bin_lang', 'gender', 
              'booking_flag', 'signup_method', 'signup_app', 'first_device_type']]
label_encode(dt_dev, ['age', 'season_created', 'month_created', 'year_created', 'bin_age', 'bin_lang', 'gender', 
                      'booking_flag', 'signup_method', 'signup_app', 'first_device_type'])

# Preprocess train & dev labels for decision tree
le = preprocessing.LabelEncoder()
dt_train_labels = le.fit_transform(train_labels)
dt_dev_labels = le.fit_transform(dev_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [56]:
dt_dev.head()

Unnamed: 0,age,season_created,month_created,year_created,bin_age,bin_lang,gender,booking_flag,signup_method,signup_app,first_device_type
0,31,1,4,3,5,0,2,0,1,2,3
1,23,1,4,4,2,0,1,0,0,2,6
2,2620,1,2,3,5,0,0,0,0,2,3
3,2618,3,1,4,2,0,0,0,0,2,3
4,17,3,0,4,4,0,1,0,0,2,6


In [57]:
sample_sizes = [5, 10, 15, 20, 25, 50, 75, 100, 125]
scores = []
for elem in sample_sizes:
    print "min samples: ", elem
     # Fit decision tree model
    dt = DecisionTreeClassifier(criterion="gini", min_samples_split=elem, random_state=99)
    dt.fit(dt_train, dt_train_labels)
    # Predict with decision tree
    preds = dt.predict(dt_dev)
    print dt.score(dt_dev, dt_dev_labels)
    print dt.feature_importances_ 
print pd.Series(preds).unique()

min samples:  5
0.48768
[ 0.50052473  0.03685583  0.16612106  0.08749844  0.01036871  0.
  0.03791529  0.          0.04908795  0.03137728  0.0802507 ]
min samples:  10
0.5238
[ 0.52400028  0.03074652  0.14324197  0.07898131  0.00846581  0.
  0.03661282  0.          0.0673457   0.03282834  0.07777724]
min samples:  15
0.5574
[ 0.54077868  0.02831252  0.13011395  0.07347955  0.00370956  0.
  0.03278973  0.          0.07979592  0.03641982  0.07460027]
min samples:  20
0.56668
[ 0.55519083  0.0268705   0.1163006   0.07012413  0.00543209  0.
  0.03059055  0.          0.09004741  0.03622698  0.06921691]
min samples:  25
0.57
[ 0.56572218  0.02438781  0.10741165  0.06636065  0.00584636  0.
  0.02965026  0.          0.09878867  0.03574839  0.06608403]
min samples:  50
0.58452
[ 0.58465445  0.01562376  0.08648928  0.06238281  0.00407386  0.
  0.02807987  0.          0.12949784  0.0368225   0.05237562]
min samples:  75
0.59096
[ 0.6012535   0.01424203  0.06840027  0.06176467  0.00334828  0.
  0.

In [58]:
sample_sizes = [25, 50, 75, 100]
max_depth = [0, 1, 2, 3, 4, 5]
scores = []
for m in max_depth:
    for elem in sample_sizes:
        # Fit decision tree model
        dt = DecisionTreeClassifier(criterion="gini", min_samples_split=elem, random_state=99)
        dt.fit(dt_train, dt_train_labels)
        scores.append((dt.score(dt_dev, dt_dev_labels), m, elem))
        # Predict with decision tree
    #     preds = dt.predict(dt_dev)
for elem in scores:
    print elem


(0.56999999999999995, 0, 25)
(0.58452000000000004, 0, 50)
(0.59096000000000004, 0, 75)
(0.59516000000000002, 0, 100)
(0.56999999999999995, 1, 25)
(0.58452000000000004, 1, 50)
(0.59096000000000004, 1, 75)
(0.59516000000000002, 1, 100)
(0.56999999999999995, 2, 25)
(0.58452000000000004, 2, 50)
(0.59096000000000004, 2, 75)
(0.59516000000000002, 2, 100)
(0.56999999999999995, 3, 25)
(0.58452000000000004, 3, 50)
(0.59096000000000004, 3, 75)
(0.59516000000000002, 3, 100)
(0.56999999999999995, 4, 25)
(0.58452000000000004, 4, 50)
(0.59096000000000004, 4, 75)
(0.59516000000000002, 4, 100)
(0.56999999999999995, 5, 25)
(0.58452000000000004, 5, 50)
(0.59096000000000004, 5, 75)
(0.59516000000000002, 5, 100)


In [59]:
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")

In [60]:
# Predict with decision tree
preds = dt.predict(dt_dev)
dt.score(dt_dev, dt_dev_labels)

0.59516000000000002

# Trim features

In [61]:
dt_train = train[['age', 'season_created', 'month_created', 'year_created', 'gender', 
                  'signup_method', 'signup_app', 'first_device_type']]
label_encode(dt_train, ['age', 'season_created', 'month_created', 'year_created', 'gender', 
                        'signup_method', 'signup_app', 'first_device_type'])

dt_dev = dev[['age', 'season_created', 'month_created', 'year_created', 'gender', 
              'signup_method', 'signup_app', 'first_device_type']]
label_encode(dt_dev, ['age', 'season_created', 'month_created', 'year_created', 'gender', 
                      'signup_method', 'signup_app', 'first_device_type'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,age,season_created,month_created,year_created,gender,signup_method,signup_app,first_device_type
0,31,1,4,3,2,1,2,3
1,23,1,4,4,1,0,2,6
2,2620,1,2,3,0,0,2,3
3,2618,3,1,4,0,0,2,3
4,17,3,0,4,1,0,2,6
5,2617,1,4,3,0,0,2,6
6,30,1,4,4,1,0,2,6
7,8,0,9,3,0,0,3,8
8,2610,3,1,3,0,0,2,3
9,12,2,5,4,1,0,2,8


In [62]:
sample_sizes = [50, 60, 70, 80, 90, 100]
scores = []
for elem in sample_sizes:
    print "min samples: ", elem
     # Fit decision tree model
    dt = DecisionTreeClassifier(criterion="gini", min_samples_split=elem, random_state=99)
    dt.fit(dt_train, dt_train_labels)
    # Predict with decision tree
    preds = dt.predict(dt_dev)
    print dt.score(dt_dev, dt_dev_labels)
    print dt.feature_importances_ 
print pd.Series(preds).unique()

min samples:  50
0.58496
[ 0.58714056  0.01608624  0.08621576  0.06214601  0.02849332  0.12935007
  0.03702348  0.05354456]
min samples:  60
0.5894
[ 0.59384591  0.01605689  0.07652862  0.06198564  0.02809606  0.13853254
  0.03567003  0.0492843 ]
min samples:  70
0.59272
[ 0.60065385  0.015593    0.0680145   0.06155913  0.02849016  0.14517088
  0.03246111  0.04805737]
min samples:  80
0.59036
[ 0.60670319  0.01596219  0.06363447  0.06221003  0.02769316  0.15051434
  0.02927823  0.04400438]
min samples:  90
0.59496
[ 0.60869423  0.015308    0.05885167  0.06156502  0.02801813  0.1583998
  0.02978986  0.03937329]
min samples:  100
0.59612
[ 0.60794866  0.0132112   0.05973006  0.06190021  0.02818757  0.16220467
  0.02930171  0.03751594]
[ 7 10  4  2  5 11  6  0]


# Add sessions data

In [None]:
ses_agg = pd.read_csv()