In [1]:
# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Draw inline
%matplotlib inline

# Set seaborn style
sns.set_context("notebook", font_scale=1.5)

# Sklearn preprocessing
from sklearn.preprocessing import OneHotEncoder

# Sklearn libraries.
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
np.random.seed(0)



### Prepare Data

In [2]:
# Load full data set (combined with actions)
train = pd.read_csv('../data/train_combined_actions.zip')
print("train shape:", train.shape)

# load test data
test = pd.read_csv('../data/test_combined_actions.zip')
print("test shape:", test.shape)

  interactivity=interactivity, compiler=compiler, result=result)


train shape: (213451, 541)
test shape: (62096, 540)


In [3]:
train = train.reset_index().iloc[: , 1:]

# Shuffle training set
shuffle = np.random.permutation(np.arange(train.shape[0]))
train = train.reindex(shuffle)

# Split labels from training set
train_labels = train['country_destination']
train_data = train.drop('country_destination', axis=1)

# Concatenate test data (so able to binarize categorical features later)
data = pd.concat((train_data, test))

# remove action features
col = data.columns.get_loc("last_action") # last column index before action columns begin
data = data.iloc[: , :col+1]

# Define row index on where to split full dataset for dev, train, and test
dev_cutoff = 25000   # 12% of training data
test_cutoff = train.shape[0]

dev_labels, train_labels = train_labels[:25000], train_labels[25000:]

print("full dataset shape:", data.shape)
print("dev labels shape", dev_labels.shape)
print("train_labels shape", train_labels.shape)
print("features:", list(data))

full dataset shape: (275547, 26)
dev labels shape (25000,)
train_labels shape (188451,)
features: ['id', 'date_account_created', 'timestamp_first_active', 'gender', 'age', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'month_created', 'season_created', 'year_created', 'bin_age', 'bin_lang', 'days_since_creation', 'first_hour', 'count_actions', 'number_devices', 'longest_session', 'total_time', 'last_action']


### Encode Labels

In [4]:
# Output 3 classes: NDF, US, Non-US
def country(col):
    if col == 'NDF':
        return "NDF"
    elif col == 'US':
        return 'US'
    else:
        return 'non-US'
    
t = train_labels.apply(country)
d = dev_labels.apply(country)

# Convert labels to numeric
le = preprocessing.LabelEncoder() # Initialize label_encoder
t_lab, d_lab = le.fit_transform(t), le.fit_transform(d)

print(t_lab.shape)
print(d_lab.shape)

country_code = dict(zip(le.classes_, le.transform(le.classes_)))
print(country_code)

(188451,)
(25000,)
{'NDF': 0, 'US': 1, 'non-US': 2}


### Iterating through features using Logistic Regression 

In [5]:
def binarize(data, devcut, testcut, column): 
    dev_bin = pd.get_dummies(data[column])[:devcut]
    train_bin = pd.get_dummies(data[column])[devcut:testcut]
    test_bin = pd.get_dummies(data[column])[testcut:]
    return dev_bin, train_bin, test_bin

def logreg(c, penalty, train, train_label, dev, dev_label):
    lm = LogisticRegression(C = c, penalty = penalty)
    lm.fit(tr, t_lab)
    score = lm.score(d, d_lab)
    return lm, score

# cat_feat = ['bin_age', 'first_browser', 'gender', 'signup_method', 'last_action']

cat_feat = ['bin_age', 'first_browser', 'gender', 'signup_method', 'last_action', 
            'season_created', 'first_device_type', 'signup_app', 'affiliate_channel', 'first_hour']

# Initialize with first feature column 
d, tr, te = binarize(data, dev_cutoff, test_cutoff, cat_feat[0])

print("First Feature:", cat_feat[0])
lm1, score1 = logreg(50, "l1", tr, t_lab, d, d_lab) # Run logreg with l1 penalty
print("L1 num cols: ", tr.shape[1], "score:", score1)

lm2, score2 = logreg(50, "l2", tr, t_lab, d, d_lab) # Run logreg with l2 penalty
print("L2 num cols: ", tr.shape[1], "score:", score2)

pred = lm2.predict(d)
conf = confusion_matrix(pred, d_lab)
print(conf)

d2, tr2, te2 = binarize(data, dev_cutoff, test_cutoff, cat_feat[2])

# Binarize categorical features, add to dataset, run logistic regression on binarized features
for x in cat_feat[1:]:
    d_bin, tr_bin, te_bin = binarize(data, dev_cutoff, test_cutoff, x)
    d = np.concatenate((d, d_bin), axis=1)
    tr = np.concatenate((tr, tr_bin), axis=1)
    te = np.concatenate((te, te_bin), axis=1)

    # Run logistic regression on data set with added feature
    print("\nAdded Feature:", x)
    lm1, score1 = logreg(50, "l1", tr, t_lab, d, d_lab)
    print("L1 num cols: ", tr.shape[1], "score:", score1)
    lm2, score2 = logreg(50, "l2", tr, t_lab, d, d_lab)
    print("L2 num cols: ", tr.shape[1], "score:", score2)

    pred = lm2.predict(d)
    conf = confusion_matrix(pred, d_lab)
    print(conf)

# Iterate over numerical features
num_feat = ['count_actions', 'days_since_creation', 'total_time', 
            'number_devices', 'longest_session']
# num_feat = ['count_actions']

# underperforming numerical features:
# 'days_since_creation', 'total_time', 'number_devices', 'longest_session', 'request_photography'


for y in num_feat:
    # get column and fill in NaNs with training data column mean
    mean = np.mean(data[y][dev_cutoff:test_cutoff])
    
    tr_col = data[y][dev_cutoff:test_cutoff].fillna(mean)
    d_col = data[y][:dev_cutoff].fillna(mean)
    te_col = data[y][test_cutoff:].fillna(mean)
    
    tr_col = tr_col.reshape(tr.shape[0], 1)
    d_col = d_col.reshape(d.shape[0], 1)
    te_col = te_col.reshape(te.shape[0], 1)
    
    # add column to previous training set
    tr = np.concatenate((tr, tr_col), axis=1)
    d = np.concatenate((d, d_col), axis=1)
    te = np.concatenate((te, te_col), axis=1)

    # Run logistic regression on data set with added feature
    print("\nAdded Feature:", y)
    lm1, score1 = logreg(50, "l1", tr, t_lab, d, d_lab)
    print("L1 num cols: ", tr.shape[1], "score:", score1)
    lm2, score2 = logreg(50, "l2", tr, t_lab, d, d_lab)
    print("L2 num cols: ", tr.shape[1], "score:", score2)

    pred = lm2.predict(d)
    conf = confusion_matrix(pred, d_lab)
    print(conf)

First Feature: bin_age
L1 num cols:  5 score: 0.58604
L2 num cols:  5 score: 0.58604
[[14651  7272  3077]
 [    0     0     0]
 [    0     0     0]]

Added Feature: first_browser
L1 num cols:  60 score: 0.58764
L2 num cols:  60 score: 0.58764
[[12963  5543  2385]
 [ 1688  1728   692]
 [    0     1     0]]

Added Feature: gender
L1 num cols:  64 score: 0.59548
L2 num cols:  64 score: 0.59548
[[12871  5255  2262]
 [ 1780  2016   815]
 [    0     1     0]]

Added Feature: signup_method
L1 num cols:  68 score: 0.62764
L2 num cols:  68 score: 0.62764
[[12518  4098  1789]
 [ 2133  3173  1288]
 [    0     1     0]]

Added Feature: last_action
L1 num cols:  200 score: 0.63364
L2 num cols:  200 score: 0.63364
[[12595  4024  1772]
 [ 2056  3246  1305]
 [    0     2     0]]

Added Feature: season_created
L1 num cols:  204 score: 0.6324
L2 num cols:  204 score: 0.6324
[[12609  4069  1786]
 [ 2042  3201  1291]
 [    0     2     0]]

Added Feature: first_device_type
L1 num cols:  213 score: 0.63204





Added Feature: count_actions
L1 num cols:  250 score: 0.63396
L2 num cols:  250 score: 0.63384
[[12652  4078  1787]
 [ 1987  3187  1283]
 [   12     7     7]]

Added Feature: days_since_creation
L1 num cols:  251 score: 0.63512
L2 num cols:  251 score: 0.63484
[[12676  4076  1789]
 [ 1965  3189  1282]
 [   10     7     6]]

Added Feature: total_time
L1 num cols:  252 score: 0.6356
L2 num cols:  252 score: 0.58604
[[14651  7272  3077]
 [    0     0     0]
 [    0     0     0]]

Added Feature: number_devices
L1 num cols:  253 score: 0.63496
L2 num cols:  253 score: 0.58604
[[14651  7272  3077]
 [    0     0     0]
 [    0     0     0]]

Added Feature: longest_session
L1 num cols:  254 score: 0.63544
L2 num cols:  254 score: 0.58604
[[14651  7272  3077]
 [    0     0     0]
 [    0     0     0]]


In [6]:
# Run last model and iterate over Cs
Cs = [0.5, 1, 10, 50, 100]

max1, max2 = 0, 0
c1, c2 = 0, 0

for c in Cs: 
    lm1, score1 = logreg(c, "l1", tr, t_lab, d, d_lab)
    print("\nL1 num cols: %4s,  C: %2s,  score: %.4f" %(tr.shape[1], c, score1))
    
    lm2, score2 = logreg(c, "l2", tr, t_lab, d, d_lab)
    print("L2 num cols: %4s,  C: %2s,  score: %.4f" %(tr.shape[1], c, score2))

    pred = lm1.predict(d)
    conf = confusion_matrix(pred, d_lab)
    print(conf)
    
    if score1 > max1:
        max1 = score1
        c1 = c
    if score2 > max2:
        max2 = score2
        c2 = c
    
print("\nBest L1 c:", c1, "score:", max1)
print("Best L2 c:", c2, "score:", max2)


L1 num cols:  254,  C: 0.5,  score: 0.6354
L2 num cols:  254,  C: 0.5,  score: 0.5860
[[12707  4089  1802]
 [ 1933  3176  1272]
 [   11     7     3]]

L1 num cols:  254,  C:  1,  score: 0.6352
L2 num cols:  254,  C:  1,  score: 0.5860
[[12702  4090  1803]
 [ 1938  3175  1271]
 [   11     7     3]]

L1 num cols:  254,  C: 10,  score: 0.6352
L2 num cols:  254,  C: 10,  score: 0.5860
[[12701  4088  1803]
 [ 1938  3176  1270]
 [   12     8     4]]

L1 num cols:  254,  C: 50,  score: 0.6352
L2 num cols:  254,  C: 50,  score: 0.5860
[[12700  4087  1803]
 [ 1939  3176  1270]
 [   12     9     4]]

L1 num cols:  254,  C: 100,  score: 0.6353
L2 num cols:  254,  C: 100,  score: 0.5860
[[12704  4088  1802]
 [ 1935  3175  1271]
 [   12     9     4]]

Best L1 c: 0.5 score: 0.63544
Best L2 c: 0.5 score: 0.58604
