In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)



In [2]:
# Read in csv and create arrays
users_train_raw = pd.read_csv('../zip_files/train_users_2.csv.zip')
sessions_raw = pd.read_csv('../zip_files/sessions.csv.zip')
demographics = pd.read_csv('../zip_files/age_gender_bkts.csv.zip')
countries = pd.read_csv('../zip_files/countries.csv.zip')
test = pd.read_csv('../zip_files/test_users.csv.zip')

In [3]:
# Shuffle data
np.random.seed(0)
shuffle = np.random.permutation(np.arange(users_train_raw.shape[0]))
len(shuffle)
x = users_train_raw.reindex(shuffle).ix[:,1:] # remove user_id

# encode all values in numbers 
y = pd.DataFrame()
for column in list(x):
    y[column] = pd.factorize(x[column], sort=True)[0]

# split out labels from features
# normalize features
data, labels = preprocessing.normalize(np.asarray(y)[:,:-1]), np.asarray(y)[:,-1]

# Split into train and dev.
dev_data, dev_labels = data[:25000], labels[:25000]
train_data, train_labels = data[25000:], labels[25000:]



In [14]:
# MODEL WITHOUT SIGNUP_FLOW VARIABLE

train = pd.read_csv('../train_dev_data/train_data.csv')
dev = pd.read_csv('../train_dev_data/dev_data.csv')

In [21]:
#CREATING A SMALLER DATA SET

small_train = train[:10000]
small_dev = dev[:10000]
small_train_labels = train_labels[:10000]
small_dev_labels = dev_labels[:10000]

In [22]:
#variables = ['id', 'date_first_booking', 'gender', 'signup_method', 'language', 'signup_flow', 'affiliate_channel', 'affiliate_provider', 'signup_app', 'first_device_type', 'first_browser']
#variables = ['id', 'date_first_booking', 'gender', 'language', 'first_device_type', 'first_browser']
variables = ['gender']

train_variables_data = pd.get_dummies(small_train[['gender']], columns=variables)
train_variables_data.head()

Unnamed: 0,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER
0,1,0,0,0
1,0,1,0,0
2,1,0,0,0
3,1,0,0,0
4,0,1,0,0


In [23]:
dev_variables_data = pd.get_dummies(small_dev[['gender']], columns=variables)
dev_variables_data.head()

Unnamed: 0,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER
0,0,0,1,0
1,0,1,0,0
2,1,0,0,0
3,1,0,0,0
4,0,1,0,0


In [24]:
#convert that dataframe back into an array to test the model
train_variables_array = np.array(train_variables_data)
dev_variables_array = np.array(dev_variables_data)

In [25]:
print((train_data.shape))
print((dev_data.shape))
print "--------"
print((small_train.shape))
print((small_dev.shape))
print((small_train_labels.shape))
print((small_dev_labels.shape))
print "--------"
print((train_variables_array.shape))
print((dev_variables_array.shape))

(188451, 14)
(25000, 14)
--------
(50000, 16)
(25000, 16)
(50000,)
(25000,)
--------
(50000, 4)
(25000, 4)


#### GENDER    ||    Best Accuracy: __0.59____

In [17]:
variables = ['gender']

train_variables_data = pd.get_dummies(small_train[['gender']], columns=variables)
dev_variables_data = pd.get_dummies(small_dev[['gender']], columns=variables)

#convert that dataframe back into an array to test the model
train_variables_array = np.array(train_variables_data)
dev_variables_array = np.array(dev_variables_data)

alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

for alpha in alphas.get("alpha"):
    bnb = BernoulliNB(alpha = alpha)
    bnb.fit(train_variables_data, small_train_labels)
    score = bnb.score(dev_variables_data, small_dev_labels)
    print("alpha: %f  dev accuracy: %.2f" %(alpha, score))

alpha: 0.000000  dev accuracy: 0.00
alpha: 0.000100  dev accuracy: 0.59
alpha: 0.001000  dev accuracy: 0.59
alpha: 0.010000  dev accuracy: 0.59
alpha: 0.100000  dev accuracy: 0.59
alpha: 0.500000  dev accuracy: 0.59
alpha: 1.000000  dev accuracy: 0.59
alpha: 2.000000  dev accuracy: 0.59
alpha: 10.000000  dev accuracy: 0.59


#### AGE    ||    Best Accuracy: __????____

In [26]:
variables = ['age']

train_variables_data = pd.get_dummies(small_train[['age']], columns=variables)
dev_variables_data = pd.get_dummies(small_dev[['age']], columns=variables)

#convert that dataframe back into an array to test the model
train_variables_array = np.array(train_variables_data)
dev_variables_array = np.array(dev_variables_data)

alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

for alpha in alphas.get("alpha"):
    bnb = BernoulliNB(alpha = alpha)
    bnb.fit(train_variables_data, small_train_labels)
    score = bnb.score(dev_variables_data, small_dev_labels)
    print("alpha: %f  dev accuracy: %.2f" %(alpha, score))

ValueError: Expected input with 110 features, got 104 instead

#### CENDY'S MODEL WITH ALL VARIABLES || Best Accuracy: 0.88

In [27]:
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

for alpha in alphas.get("alpha"):
    bnb = BernoulliNB(alpha = alpha)
    bnb.fit(train_data, train_labels)
    score = bnb.score(dev_data, dev_labels)
    print("alpha: %f  dev accuracy: %.2f" %(alpha, score))

alpha: 0.000000  dev accuracy: 0.00
alpha: 0.000100  dev accuracy: 0.88
alpha: 0.001000  dev accuracy: 0.88
alpha: 0.010000  dev accuracy: 0.88
alpha: 0.100000  dev accuracy: 0.88
alpha: 0.500000  dev accuracy: 0.88
alpha: 1.000000  dev accuracy: 0.88
alpha: 2.000000  dev accuracy: 0.88
alpha: 10.000000  dev accuracy: 0.88
