In [119]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)


In [127]:
# Read in csv and create arrays
users_train_raw = pd.read_csv('../zip_files/train_users_2.csv.zip')
sessions_raw = pd.read_csv('../zip_files/sessions.csv.zip')
demographics = pd.read_csv('../zip_files/age_gender_bkts.csv.zip')
countries = pd.read_csv('../zip_files/countries.csv.zip')
test = pd.read_csv('../zip_files/test_users.csv.zip')



In [262]:
# Shuffle data
np.random.seed(0)
shuffle = np.random.permutation(np.arange(users_train_raw.shape[0]))
len(shuffle)
x = users_train_raw.reindex(shuffle).ix[:,1:] # remove user_id

# encode all values in numbers 
y = pd.DataFrame()
for column in list(x):
    y[column] = pd.factorize(x[column], sort=True)[0]

# split out labels from features
# normalize features
data, labels = preprocessing.normalize(np.asarray(y)[:,:-1]), np.asarray(y)[:,-1]

# Split into train and dev.
dev_data, dev_labels = data[:25000], labels[:25000]
train_data, train_labels = data[25000:], labels[25000:]



In [263]:
print((train_data.shape))
print((dev_data.shape))

(188451, 14)
(25000, 14)


In [270]:
label_names = np.unique(x["country_destination"])

print ('training data shape:', train_data.shape)
print ('training label shape:', train_labels.shape)
print ('dev data shape:', dev_data.shape)
print ('dev label shape:', dev_labels.shape)
print ('labels names:', label_names)

training data shape: (188451, 14)
training label shape: (188451,)
dev data shape: (25000, 14)
dev label shape: (25000,)
labels names: ['AU' 'CA' 'DE' 'ES' 'FR' 'GB' 'IT' 'NDF' 'NL' 'PT' 'US' 'other']


In [271]:
# Bernoulli Naive Bayes
# finding the best alpha
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

bnb_clf = BernoulliNB()
bnb = GridSearchCV(estimator=bnb_clf, param_grid=[alphas], cv=5, scoring="accuracy", refit=True)
bnb.fit(train_data, train_labels)
for params, mean_score, scores in bnb.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" %(mean_score, scores.std()/2, params))

print("\nOptimized Parameters: ", bnb.best_estimator_)
print("optimized accuracy: %.4f" %bnb.score(dev_data, dev_labels))
print("Best alpha:", bnb.best_params_)

  self.feature_log_prob_ = (np.log(smoothed_fc) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  jll += self.class_log_prior_ + neg_prob.sum(axis=1)
  self.feature_log_prob_ = (np.log(smoothed_fc) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  jll += self.class_log_prior_ + neg_prob.sum(axis=1)
  self.feature_log_prob_ = (np.log(smoothed_fc) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  jll += self.class_log_prior_ + neg_prob.sum(axis=1)
  self.feature_log_prob_ = (np.log(smoothed_fc) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  jll += self.class_log_prior_ + neg_prob.sum(axis=1)
  self.feature_log_prob_ = (np.log(smoothed_fc) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  jll += self.class_log_prior_ + neg_prob.sum(axis=1)


0.003 (+/-0.000) for {'alpha': 0.0}
0.876 (+/-0.000) for {'alpha': 0.0001}
0.876 (+/-0.000) for {'alpha': 0.001}
0.876 (+/-0.000) for {'alpha': 0.01}
0.876 (+/-0.000) for {'alpha': 0.1}
0.876 (+/-0.000) for {'alpha': 0.5}
0.876 (+/-0.000) for {'alpha': 1.0}
0.876 (+/-0.000) for {'alpha': 2.0}
0.876 (+/-0.000) for {'alpha': 10.0}

Optimized Parameters:  BernoulliNB(alpha=0.0001, binarize=0.0, class_prior=None, fit_prior=True)
optimized accuracy: 0.8769
Best alpha: {'alpha': 0.0001}




In [272]:
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

for alpha in alphas.get("alpha"):
    bnb = BernoulliNB(alpha = alpha)
    bnb.fit(train_data, train_labels)
    score = bnb.score(dev_data, dev_labels)
    print("alpha: %f  dev accuracy: %.2f" %(alpha, score))

  self.feature_log_prob_ = (np.log(smoothed_fc) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  jll += self.class_log_prior_ + neg_prob.sum(axis=1)


alpha: 0.000000  dev accuracy: 0.00
alpha: 0.000100  dev accuracy: 0.88
alpha: 0.001000  dev accuracy: 0.88
alpha: 0.010000  dev accuracy: 0.88
alpha: 0.100000  dev accuracy: 0.88
alpha: 0.500000  dev accuracy: 0.88
alpha: 1.000000  dev accuracy: 0.88
alpha: 2.000000  dev accuracy: 0.88
alpha: 10.000000  dev accuracy: 0.88
