In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)



## CSV Files to DataFrames

In [2]:
users_train_raw = pd.read_csv('../w207_group_project/zip_files/train_users_2.csv.zip')
sessions_raw = pd.read_csv('../w207_group_project/zip_files/sessions.csv.zip')
demographics = pd.read_csv('../w207_group_project/zip_files/age_gender_bkts.csv.zip')
countries = pd.read_csv('../w207_group_project/zip_files/countries.csv.zip')
test = pd.read_csv('../w207_group_project/zip_files/test_users.csv.zip')
sessions = pd.read_csv('../w207_group_project/zip_files/sessions.csv.zip')


In [104]:
users_train_raw.rename(columns={'id': 'user_id'}, inplace=True)
users_train_raw.head()

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [5]:
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


## Converting RAW training data to have the columns we want + binarize

!! Note : We will not split the data into training & dev set until later.

1. language
2. gender
3. signup_app



In [62]:
interested_columns = ['language', 'gender', 'signup_app']

In [90]:
raw_train_binary_vars = pd.get_dummies(users_train_raw[['language', 'gender', 'signup_app']], columns=interested_columns)
#dev_binary_vars = pd.get_dummies(dev[['language', 'gender', 'signup_app']], columns=interested_columns)

In [91]:
raw_train_binary_vars.head()

Unnamed: 0,language_ca,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,language_hr,...,language_tr,language_zh,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_app_Android,signup_app_Moweb,signup_app_Web,signup_app_iOS
0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [93]:
#adding back the user_id column to the binarized training & dev data sets:
raw_train_binary_vars = pd.concat([users_train_raw['user_id'], train_binary_vars], axis=1)
#dev_data = pd.concat([dev['user_id'], dev_binary_vars], axis=1)

Unnamed: 0,user_id,language_ca,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,...,language_tr,language_zh,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_app_Android,signup_app_Moweb,signup_app_Web,signup_app_iOS
0,gxn3p5htnn,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,820tgsjxq7,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,4ft3gnwmtx,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,bjjt8pjhuk,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,87mebub9p4,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


## Converting Sessions DataFrame to have the columns we want + binarize

1. action: request photography
2. action: use mobile site

In [4]:
columns_of_interest = ['action']

sessions_dummy = pd.get_dummies(sessions[['action']], columns=columns_of_interest)

In [8]:
sessions_dummy.head()

Unnamed: 0,action_10,action_11,action_12,action_15,action_about_us,action_accept_decline,action_account,action_acculynk_bin_check_failed,action_acculynk_bin_check_success,action_acculynk_load_pin_pad,...,action_view,action_views,action_views_campaign,action_views_campaign_rules,action_webcam_upload,action_weibo_signup_referral_finish,action_why_host,action_widget,action_wishlists,action_zendesk_login_jwt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
imp_actions = sessions_dummy[['action_request_photography', 'action_use_mobile_site']]

In [106]:
concat_imp_actions = pd.concat([sessions, imp_actions], axis=1)
#axis = 1 to join columns together
concat_imp_actions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed,action_request_photography,action_use_mobile_site
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0,0,0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0,0,0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0,0,0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0,0,0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0,0,0


In [26]:
#taking average value of action_request_photography and use mobile site for each user
concat_imp_actions_avg = concat_imp_actions.groupby('user_id', as_index=False)['action_request_photography', 'action_use_mobile_site'].mean()

In [31]:
concat_imp_actions_avg.head()

Unnamed: 0,user_id,action_request_photography,action_use_mobile_site
0,00023iyk9l,0.0,0.0
1,0010k6l0om,0.0,0.0
2,001wyh0pz8,0.0,0.0
3,0028jgx1x1,0.0,0.0
4,002qnbzfs5,0.0,0.0


In [33]:
#checking to see if users actually requested photography 
print(concat_imp_actions_avg.loc[concat_imp_actions_avg['action_request_photography'] != 0])

           user_id  action_request_photography  action_use_mobile_site
1088    0aa9bzfc36                    0.008230                     0.0
1523    0ed0aw6d99                    0.008065                     0.0
1813    0h8lhey944                    0.003215                     0.0
5971    1kc5uhld53                    0.005682                     0.0
10057   2obwu7bnph                    0.007407                     0.0
10650   2tlvbo5nl2                    0.011628                     0.0
13890   3nspuyh617                    0.002604                     0.0
16301   4be33gw9y1                    0.006897                     0.0
18566   4x0y9grxsq                    0.005291                     0.0
18570   4x1hmm43sc                    0.002817                     0.0
22574   5z4igikflt                    0.003717                     0.0
23344   66u5vgencd                    0.008403                     0.0
24117   6dwlpzgivk                    0.010204                     0.0
27782 

In [34]:
#how many people performed the action use mobile site
print(concat_imp_actions_avg.loc[concat_imp_actions_avg['action_use_mobile_site'] != 0])


          user_id  action_request_photography  action_use_mobile_site
95713  pf0xhdin6y                         0.0                    0.25


It is a little odd that very few users used any of two actions and yet they came out to be the most important/predictive variables.

In [46]:
# changing columns values to either 0/1 for requesting the action or not
concat_imp_actions_avg['action_request_photography'] = (concat_imp_actions_avg['action_request_photography']!= 0).astype(int)
concat_imp_actions_avg['action_use_mobile_site'] = (concat_imp_actions_avg['action_use_mobile_site']!= 0).astype(int)



In [107]:
merge_raw_train_data_totalcols = pd.merge(raw_train_binary_vars,concat_imp_actions_avg, on='user_id')

In [109]:
merge_raw_train_data_totalcols.head()

Unnamed: 0,user_id,language_ca,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,...,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_app_Android,signup_app_Moweb,signup_app_Web,signup_app_iOS,action_request_photography,action_use_mobile_site
0,d1mm9tcy42,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,yo8nz8bqcq,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,4grx6yxeby,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,ncf87guaf0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,4rvqpxoh3h,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0


## LET'S TEST IT WITH A MODEL - CENDY'S MODEL

### We need to first split raw training data into train & dev sets

### !! Note here we remove the user_id from our data sets -- to allow models to not use user_id as a variable

In [110]:
# Shuffle data
np.random.seed(0)
shuffle = np.random.permutation(np.arange(merge_raw_train_data_totalcols.shape[0]))
len(shuffle)
x = merge_raw_train_data_totalcols.reindex(shuffle).ix[:,1:] # remove user_id

# encode all values in numbers 
y = pd.DataFrame()
for column in list(x):
    y[column] = pd.factorize(x[column], sort=True)[0]

# split out labels from features
# normalize features
data, labels = preprocessing.normalize(np.asarray(y)[:,:-1]), np.asarray(y)[:,-1]

# Split into train and dev.
dev_data, dev_labels = data[:25000], labels[:25000]
train_data, train_labels = data[25000:], labels[25000:]

In [111]:
print((train_data.shape))
print((dev_data.shape))

(48815, 34)
(25000, 34)


In [99]:
# Bernoulli Naive Bayes
# finding the best alpha
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

bnb_clf = BernoulliNB()
bnb = GridSearchCV(estimator=bnb_clf, param_grid=[alphas], cv=5, scoring="accuracy", refit=True)
bnb.fit(train_data, train_labels)
for params, mean_score, scores in bnb.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" %(mean_score, scores.std()/2, params))

print("\nOptimized Parameters: ", bnb.best_estimator_)
print("optimized accuracy: %.4f" %bnb.score(dev_data, dev_labels))
print("Best alpha:", bnb.best_params_)

  self.feature_log_prob_ = (np.log(smoothed_fc) -
  np.log(smoothed_cc.reshape(-1, 1)))
  np.log(smoothed_cc.reshape(-1, 1)))
  self.class_log_prior_ = (np.log(self.class_count_) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))


1.000 (+/-0.000) for {'alpha': 0.0}
1.000 (+/-0.000) for {'alpha': 0.0001}
1.000 (+/-0.000) for {'alpha': 0.001}
1.000 (+/-0.000) for {'alpha': 0.01}
1.000 (+/-0.000) for {'alpha': 0.1}
1.000 (+/-0.000) for {'alpha': 0.5}
1.000 (+/-0.000) for {'alpha': 1.0}
1.000 (+/-0.000) for {'alpha': 2.0}
1.000 (+/-0.000) for {'alpha': 10.0}
('\nOptimized Parameters: ', BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True))
optimized accuracy: 1.0000
('Best alpha:', {'alpha': 0.0})




In [112]:
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

for alpha in alphas.get("alpha"):
    bnb = BernoulliNB(alpha = alpha)
    bnb.fit(train_data, train_labels)
    score = bnb.score(dev_data, dev_labels)
    print("alpha: %f  dev accuracy: %.2f" %(alpha, score))

alpha: 0.000000  dev accuracy: 1.00
alpha: 0.000100  dev accuracy: 1.00
alpha: 0.001000  dev accuracy: 1.00
alpha: 0.010000  dev accuracy: 1.00
alpha: 0.100000  dev accuracy: 1.00
alpha: 0.500000  dev accuracy: 1.00
alpha: 1.000000  dev accuracy: 1.00
alpha: 2.000000  dev accuracy: 1.00
alpha: 10.000000  dev accuracy: 1.00


## LET'S TEST IT WITH A MODEL - MELANIE'S MODEL

In [113]:
# Try a model
def mod_test(k_vals, train_data, train_labels, dev_data, dev_labels=None):

### STUDENT START ###

    # We want to evaluate a variety of values for k, so we need
    # to enclose our work in a loop.
    for elem in k_vals:
        
        # Create a classifier object, fit our training data &
        # initialize a variable to hold the predictions
        knn = KNeighborsClassifier(n_neighbors=elem)
        knn.fit(train_data, train_labels)
        preds = knn.predict(dev_data)
        
        # We evaluate the accuracy for each value of k by comparing
        # the predictions and the labels, then updating values for
        # correct and total
        correct, total = 0, 0
        for pred, label in zip(preds, dev_labels):
            if pred == label: 
                correct += 1
            total += 1
        print 'For k=%s, total: %3d  correct: %3d  accuracy: %3.2f' %(elem, total, correct, 1.0*correct/total)

k_vals = [7, 10, 15, 20, 50]
mod_test(k_vals, train_data[:5000], train_labels[:5000], dev_data[:2000], dev_labels[:2000])

For k=7, total: 2000  correct: 2000  accuracy: 1.00
For k=10, total: 2000  correct: 2000  accuracy: 1.00
For k=15, total: 2000  correct: 2000  accuracy: 1.00
For k=20, total: 2000  correct: 2000  accuracy: 1.00
For k=50, total: 2000  correct: 2000  accuracy: 1.00
