In [1]:
# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Draw inline
%matplotlib inline

# Set seaborn style
sns.set_context("notebook", font_scale=1.5)

# Sklearn preprocessing
from sklearn.preprocessing import OneHotEncoder

# Sklearn libraries.
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
np.random.seed(0)



### Prepare Data

In [2]:
# Locations of the zipped CSVs (user features combined with action counts).
TRAIN_CSV = '../data/train_combined_actions.zip'
TEST_CSV = '../data/test_combined_actions.zip'

# Training set: read it and report (rows, columns).
train = pd.read_csv(TRAIN_CSV)
print("train shape:", train.shape)

# Test set: same procedure.
test = pd.read_csv(TEST_CSV)
print("test shape:", test.shape)

  interactivity=interactivity, compiler=compiler, result=result)


train shape: (213451, 541)
test shape: (62096, 540)


In [3]:
# Preview the first five rows of the test set.
test.head(5)

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,view_resolutions,view_search_results,view_security_checks,view_user_real_names,wishlist,wishlist_content_update,wishlist_note,your_listings,your_reservations,your_trips
0,5uwns89zht,2014-07-01,20140701000006,FEMALE,35.0,facebook,0,en,direct,direct,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,jtl0dijy2j,2014-07-01,20140701000051,-unknown-,,basic,0,en,direct,direct,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,xx0ulgorjt,2014-07-01,20140701000148,-unknown-,,basic,0,en,direct,direct,...,0.0,48.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0
3,6c6puo6ix0,2014-07-01,20140701000215,-unknown-,,basic,0,en,direct,direct,...,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
4,czqhjk3yfe,2014-07-01,20140701000305,-unknown-,,basic,0,en,direct,direct,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
# Keep only the users that have sessions data (a non-null action count) and
# renumber the rows 0..n-1 so the permutation below can be used as an index.
mod_train = train[train.count_actions.notnull()].reset_index(drop=True)

# Shuffle training set (uses NumPy's global RNG).
shuffle = np.random.permutation(np.arange(mod_train.shape[0]))
mod_train = mod_train.reindex(shuffle)

# Split labels from training set
train_labels = mod_train['country_destination']
train_data = mod_train.drop('country_destination', axis=1)

# Stack the test rows under the training rows so categorical features can be
# binarized over both at once (identical dummy columns for every split).
data = pd.concat((train_data, test))

# Remove the per-action count features: keep every column up to and including
# "last_action", which is the last column before the action columns begin.
col = data.columns.get_loc("last_action")
data = data.iloc[:, :col + 1]

# Row positions that split the stacked frame:
#   [0, dev_cutoff)           -> dev
#   [dev_cutoff, test_cutoff) -> train
#   [test_cutoff, end)        -> test
dev_cutoff = 8850   # 12% of training data
test_cutoff = mod_train.shape[0]

# Positional split of the (shuffled) labels to match the row ranges above.
dev_labels, train_labels = train_labels.iloc[:dev_cutoff], train_labels.iloc[dev_cutoff:]

print("full dataset shape:", data.shape)
print("dev labels shape", dev_labels.shape)
print("train_labels shape", train_labels.shape)
print("features:", list(data))

full dataset shape: (135911, 26)
dev labels shape (8850,)
train_labels shape (64965,)
features: ['id', 'date_account_created', 'timestamp_first_active', 'gender', 'age', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'month_created', 'season_created', 'year_created', 'bin_age', 'bin_lang', 'days_since_creation', 'first_hour', 'count_actions', 'number_devices', 'longest_session', 'total_time', 'last_action']


In [5]:
# (rows, columns) of the filtered, shuffled training frame built in the data-preparation cell.
mod_train.shape

(73815, 541)

### Encode Labels

In [16]:
# Output 3 classes: NDF, US, Non-US
def country(col):
    """Collapse a destination label into one of three classes.

    'NDF' and 'US' are returned unchanged; any other value maps to 'non-US'.
    """
    for kept in ('NDF', 'US'):
        if col == kept:
            return kept
    return 'non-US'
    
# Collapse the raw destinations into the three target classes.
# NOTE: `d` is reassigned to the dev feature matrix in the feature-iteration cell.
t = train_labels.apply(country)
d = dev_labels.apply(country)

# Convert labels to numeric. Fit the encoder ONCE, on the training labels, and
# reuse that mapping for the dev labels. Re-fitting on dev (fit_transform twice)
# would silently assign different integer codes if a class were absent from dev.
le = preprocessing.LabelEncoder() # Initialize label_encoder
t_lab = le.fit_transform(t)
d_lab = le.transform(d)

print(t_lab.shape)
print(d_lab.shape)

# Class name -> integer code, for reference when reading confusion matrices.
country_code = dict(zip(le.classes_, le.transform(le.classes_)))
print(country_code)

(64965,)
(8850,)
{'NDF': 0, 'US': 1, 'non-US': 2}


### Iterating through features using Logistic Regression 

In [17]:
import time
# Start a wall-clock timer; the elapsed seconds are printed at the end of this cell.
start = time.time()

def binarize(data, devcut, testcut, column):
    """One-hot encode one column and split the result by row position.

    The column is encoded once over all rows, so the three returned frames
    always share the same indicator columns.

    Parameters
    ----------
    data : DataFrame with the dev, train and test rows stacked in that order.
    devcut : row position where the dev rows end and the train rows begin.
    testcut : row position where the train rows end and the test rows begin.
    column : name of the column to encode.

    Returns
    -------
    (dev_bin, train_bin, test_bin) : indicator DataFrames for rows
    [:devcut], [devcut:testcut] and [testcut:].
    """
    # Encode once instead of once per split; slices are positional (iloc).
    dummies = pd.get_dummies(data[column])
    return dummies.iloc[:devcut], dummies.iloc[devcut:testcut], dummies.iloc[testcut:]

def logreg(c, penalty, train, train_label, dev, dev_label):
    """Fit a logistic regression on the training split and score it on dev.

    Parameters
    ----------
    c : inverse regularization strength passed as ``C``.
    penalty : "l1" or "l2".
    train, train_label : feature matrix and labels to fit on.
    dev, dev_label : feature matrix and labels to score on.

    Returns
    -------
    (lm, score) : the fitted model and its mean accuracy on the dev split.
    """
    # liblinear supports both the l1 and l2 penalties. It was scikit-learn's
    # default solver in older releases; pinning it keeps "l1" working on newer
    # releases whose default solver rejects that penalty.
    lm = LogisticRegression(C=c, penalty=penalty, solver="liblinear")
    # Use the arguments that were passed in, not the notebook-level globals.
    lm.fit(train, train_label)
    score = lm.score(dev, dev_label)
    return lm, score
    
def fit_and_report(train_x, train_y, dev_x, dev_y, c=50):
    """Fit L1- and L2-penalized models, print dev scores and a confusion matrix.

    The confusion matrix is for the L2 model and is called as
    confusion_matrix(pred, truth), so rows are PREDICTED classes and columns
    are TRUE classes (transposed relative to sklearn's (y_true, y_pred) order).

    Returns (lm1, score1, lm2, score2, pred, conf).
    """
    lm1, score1 = logreg(c, "l1", train_x, train_y, dev_x, dev_y)
    print("L1 num cols: ", train_x.shape[1], "score:", score1)
    lm2, score2 = logreg(c, "l2", train_x, train_y, dev_x, dev_y)
    print("L2 num cols: ", train_x.shape[1], "score:", score2)

    pred = lm2.predict(dev_x)
    conf = confusion_matrix(pred, dev_y)
    print(conf)
    return lm1, score1, lm2, score2, pred, conf


cat_feat = ['bin_age', 'first_browser', 'gender', 'signup_method', 'last_action']
# cat_feat = ['bin_age', 'first_browser', 'gender', 'signup_method', 'last_action', 
#             'season_created', 'first_device_type', 'signup_app', 'affiliate_channel', 'first_hour']

# underperforming categorical variables: 
# 'season_created', 'first_device_type', signup_app', 'affiliate_channel', 'first_hour'

# Initialize with first feature column 
d, tr, te = binarize(data, dev_cutoff, test_cutoff, cat_feat[0])

print("First Feature:", cat_feat[0])
lm1, score1, lm2, score2, pred, conf = fit_and_report(tr, t_lab, d, d_lab)

# NOTE(review): d2/tr2/te2 are not used anywhere else in the visible notebook;
# kept in case something outside this view reads them.
d2, tr2, te2 = binarize(data, dev_cutoff, test_cutoff, cat_feat[2])

# Binarize categorical features, add to dataset, run logistic regression on binarized features
for x in cat_feat[1:]:
    d_bin, tr_bin, te_bin = binarize(data, dev_cutoff, test_cutoff, x)
    # Append the new indicator columns to every split so they stay aligned.
    d = np.concatenate((d, d_bin), axis=1)
    tr = np.concatenate((tr, tr_bin), axis=1)
    te = np.concatenate((te, te_bin), axis=1)

    # Run logistic regression on data set with added feature
    print("\nAdded Feature:", x)
    lm1, score1, lm2, score2, pred, conf = fit_and_report(tr, t_lab, d, d_lab)

# underperforming numerical features:
# 'days_since_creation', 'total_time', 'number_devices', 'longest_session', 'request_photography'

# num_feat = ['count_actions', 'days_since_creation', 'total_time', 
#             'number_devices', 'longest_session']

num_feat = ['count_actions']

for y in num_feat:
    # get column and fill in NaNs with training data column mean
    # (the mean comes from the training rows only, then is applied to all splits)
    mean = np.mean(data[y][dev_cutoff:test_cutoff])
    
    tr_col = data[y][dev_cutoff:test_cutoff].fillna(mean)
    d_col = data[y][:dev_cutoff].fillna(mean)
    te_col = data[y][test_cutoff:].fillna(mean)
    
    # Reshape the underlying NumPy arrays into single-column matrices.
    # Series.reshape is deprecated/removed in pandas, so go through .values.
    # Passing the explicit row count doubles as a length check against each split.
    tr_col = tr_col.values.reshape(tr.shape[0], 1)
    d_col = d_col.values.reshape(d.shape[0], 1)
    te_col = te_col.values.reshape(te.shape[0], 1)
    
    # add column to previous training set
    tr = np.concatenate((tr, tr_col), axis=1)
    d = np.concatenate((d, d_col), axis=1)
    te = np.concatenate((te, te_col), axis=1)

    # Run logistic regression on data set with added feature
    print("\nAdded Feature:", y)
    lm1, score1, lm2, score2, pred, conf = fit_and_report(tr, t_lab, d, d_lab)
    
# Elapsed wall-clock seconds for the whole cell.
end = time.time()
print(end - start)

First Feature: bin_age
L1 num cols:  5 score: 0.602824858757
L2 num cols:  5 score: 0.602824858757
[[5335 2455 1060]
 [   0    0    0]
 [   0    0    0]]

Added Feature: first_browser
L1 num cols:  45 score: 0.610847457627
L2 num cols:  45 score: 0.610847457627
[[4380 1429  664]
 [ 954 1026  396]
 [   1    0    0]]

Added Feature: gender
L1 num cols:  49 score: 0.620677966102
L2 num cols:  49 score: 0.620790960452
[[4728 1689  754]
 [ 606  766  306]
 [   1    0    0]]

Added Feature: signup_method
L1 num cols:  53 score: 0.656271186441
L2 num cols:  53 score: 0.656271186441
[[4583 1230  578]
 [ 752 1225  482]
 [   0    0    0]]

Added Feature: last_action
L1 num cols:  185 score: 0.664971751412
L2 num cols:  185 score: 0.664858757062
[[4620 1191  570]
 [ 715 1264  490]
 [   0    0    0]]

Added Feature: count_actions




L1 num cols:  186 score: 0.664745762712
L2 num cols:  186 score: 0.664971751412
[[4621 1191  566]
 [ 713 1264  494]
 [   1    0    0]]
14.860193967819214


In [12]:
start = time.time()
# Re-fit the final feature set for a range of C values (inverse regularization
# strength) and remember the best dev score per penalty.
Cs = [0.5, 1, 10, 50, 100]

# Best dev score so far for L1 / L2, and the C that produced it.
max1 = max2 = 0
c1 = c2 = 0

for c in Cs:
    lm1, score1 = logreg(c, "l1", tr, t_lab, d, d_lab)
    print("\nL1 num cols: %4s,  C: %2s,  score: %.4f" % (tr.shape[1], c, score1))

    lm2, score2 = logreg(c, "l2", tr, t_lab, d, d_lab)
    print("L2 num cols: %4s,  C: %2s,  score: %.4f" % (tr.shape[1], c, score2))

    # Confusion matrix of the L1 model; called as (pred, truth), so rows are
    # predicted classes and columns are true classes.
    pred = lm1.predict(d)
    conf = confusion_matrix(pred, d_lab)
    print(conf)

    # Strict '>' keeps the first (smallest) C when scores tie.
    if score1 > max1:
        max1, c1 = score1, c
    if score2 > max2:
        max2, c2 = score2, c

print("\nBest L1 c:", c1, "score:", max1)
print("Best L2 c:", c2, "score:", max2)

# Elapsed wall-clock seconds for the sweep.
end = time.time()
print(end - start)


L1 num cols:  186,  C: 0.5,  score: 0.6650
L2 num cols:  186,  C: 0.5,  score: 0.6654
[[4618 1188  564]
 [ 716 1267  496]
 [   1    0    0]]

L1 num cols:  186,  C:  1,  score: 0.6650
L2 num cols:  186,  C:  1,  score: 0.6650
[[4620 1190  566]
 [ 714 1265  494]
 [   1    0    0]]

L1 num cols:  186,  C: 10,  score: 0.6649
L2 num cols:  186,  C: 10,  score: 0.6649
[[4617 1188  567]
 [ 717 1267  493]
 [   1    0    0]]

L1 num cols:  186,  C: 50,  score: 0.6649
L2 num cols:  186,  C: 50,  score: 0.6650
[[4618 1189  567]
 [ 716 1266  493]
 [   1    0    0]]

L1 num cols:  186,  C: 100,  score: 0.6647
L2 num cols:  186,  C: 100,  score: 0.6647
[[4618 1190  567]
 [ 716 1265  493]
 [   1    0    0]]

Best L1 c: 0.5 score: 0.664971751412
Best L2 c: 0.5 score: 0.665423728814
23.81705641746521


# Running through Test data

In [9]:
# Use L1 regularization with the best C found by the sweep above (c1).
# liblinear is pinned because it supports the l1 penalty; newer scikit-learn
# defaults to a solver that rejects it.
lm = LogisticRegression(C=c1, penalty="l1", solver="liblinear")
lm.fit(tr, t_lab)
preds = lm.predict(te)

# Class name -> integer code, printed for reference.
country_code = dict(zip(le.classes_, le.transform(le.classes_)))
print(country_code)

# Decode the integer predictions through the fitted encoder instead of
# hard-coding 0/1/2, then rename the collapsed 'non-US' class to 'other'.
# NOTE(review): 'other' is presumably the label the submission format expects; confirm.
submission_names = {'non-US': 'other'}
countries = [submission_names.get(name, name) for name in le.inverse_transform(preds)]


{'NDF': 0, 'US': 1, 'non-US': 2}


In [10]:
# Build the submission table: the ids of the test rows (everything from
# test_cutoff onward in the stacked frame) next to the predicted country names.
ids = data['id'].iloc[test_cutoff:].tolist()
logreg_sub = pd.DataFrame({'id': ids, 'country': countries}).loc[:, ['id', 'country']]

# Write a comma-separated file without the DataFrame index column.
logreg_sub.to_csv('./logreg_submission.csv', index=False, sep=',')