In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import chi2

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)




In [2]:
# Read in csv and create arrays
users_train_raw = pd.read_csv('./w207_group_project/zip_files/train_users_2.csv.zip')
sessions_raw = pd.read_csv('./w207_group_project/zip_files/sessions.csv.zip')

In [4]:
# rename sessions_raw
d = sessions_raw
print(list(d))
d.head()

['user_id', 'action', 'action_type', 'action_detail', 'device_type', 'secs_elapsed']


Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


# actions

In [18]:
print(len(d.action.unique()))
print(d.action.unique()[:10])

360
['lookup' 'search_results' 'personalize' 'index' 'similar_listings'
 'ajax_refresh_subtotal' 'show' 'header_userpic' 'ask_question' nan]


In [6]:
# count of actions and their proportion to total
actions_total = pd.crosstab(index=d["action"],     # Make a crosstab
                            columns="count")      # Name the count column
# sort actions descending
actions_total = actions_total.sort('count', ascending=False)
actions_total.head(10)



col_0,count
action,Unnamed: 1_level_1
show,2768278
index,843699
search_results,725226
personalize,706824
search,536057
ajax_refresh_subtotal,487744
update,365130
similar_listings,364624
social_connections,339000
reviews,320591


In [7]:
# count of action per user id
actions = pd.crosstab(index=d["user_id"], columns=d["action"])

# reset index so user_id is its own column
actions.reset_index(level=0, inplace=True)

actions.head()

action,user_id,10,11,12,15,about_us,accept_decline,account,acculynk_bin_check_failed,acculynk_bin_check_success,...,view,views,views_campaign,views_campaign_rules,webcam_upload,weibo_signup_referral_finish,why_host,widget,wishlists,zendesk_login_jwt
0,00023iyk9l,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0010k6l0om,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,001wyh0pz8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0028jgx1x1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,002qnbzfs5,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# append user location from training data
country = pd.DataFrame({
    'country_destination': users_train_raw['country_destination'],
    'user_id': users_train_raw['id']
})

# set key
country.set_index(country['user_id'])

country.head()

Unnamed: 0,country_destination,user_id
0,NDF,gxn3p5htnn
1,NDF,820tgsjxq7
2,US,4ft3gnwmtx
3,other,bjjt8pjhuk
4,US,87mebub9p4


In [9]:
# merge action table with country table on user_id (keeps only users who have country_destinations)
actions = pd.merge(actions, country, on='user_id', how='inner')

actions.head()

Unnamed: 0,user_id,10,11,12,15,about_us,accept_decline,account,acculynk_bin_check_failed,acculynk_bin_check_success,...,views,views_campaign,views_campaign_rules,webcam_upload,weibo_signup_referral_finish,why_host,widget,wishlists,zendesk_login_jwt,country_destination
0,00023iyk9l,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,US
1,001wyh0pz8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,NDF
2,0028jgx1x1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,NDF
3,002qnbzfs5,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,US
4,0035hobuyj,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,US


In [10]:
# Shuffle data
np.random.seed(0)
shuffle = np.random.permutation(np.arange(actions.shape[0]))
z = actions.reindex(shuffle).ix[:,1:] # remove user_id

# encode country to numbers 
country_code = pd.factorize(z['country_destination'], sort=True)[0]
z['country_code'] = country_code
z

# split out labels from features
data, labels = np.asarray(z)[:,:-2], np.asarray(z)[:,-1]
len(data)

# Split into train and dev & cast as integers
s_dev_data, s_dev_labels = data[:58000].astype(int), labels[:58000].astype(int)
s_train_data, s_train_labels = data[58000:].astype(int), labels[58000:].astype(int)


In [57]:
# fit bernoulliNB across all features
b = BernoulliNB(fit_prior=False)
b.fit(s_train_data, s_train_labels)
b.score(s_dev_data, s_dev_labels)

0.48194827586206895

In [58]:
# feature selection using chi2
from sklearn.feature_selection import SelectKBest

k = SelectKBest(chi2, k=2)
x = k.fit_transform(s_train_data, s_train_labels)
k.get_support(indices=True)

b = BernoulliNB(fit_prior=False)
b.fit(x, s_train_labels)
score = b.score(s_dev_data[:,k.get_support(indices=True)], s_dev_labels)
print("score:", score)

# which indices? 
indices = list(k.get_support(indices=True))

for i in indices:
    print(list(actions)[i])

score: 0.366413793103
ajax_referral_banner_type
request_photography


In [59]:
# feature selection using f_regression
from sklearn.feature_selection import f_regression

k = SelectKBest(f_regression, k=2)
x = k.fit_transform(s_train_data, s_train_labels)
k.get_support(indices=True)

b = BernoulliNB(fit_prior=False)
b.fit(x, s_train_labels)
score = b.score(s_dev_data[:,k.get_support(indices=True)], s_dev_labels)
print("score:", score)

# which indices? 
indices = list(k.get_support(indices=True))

for i in indices:
    print(list(actions)[i])

score: 0.551810344828
request_photography
use_mobile_site


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [60]:
# feature selection using f_classif
from sklearn.feature_selection import f_classif

k = SelectKBest(f_classif, k=2)
x = k.fit_transform(s_train_data, s_train_labels)
k.get_support(indices=True)

b = BernoulliNB(fit_prior=False)
b.fit(x, s_train_labels)
score = b.score(s_dev_data[:,k.get_support(indices=True)], s_dev_labels)
print("score:", score)

# which indices? 
indices = list(k.get_support(indices=True))

for i in indices:
    print(list(actions)[i])

score: 0.551810344828
request_photography
use_mobile_site


  73  86  95  99 101 102 103 108 111 115 120 122 130 139 147 168 170 171
 176 177 188 190 191 193 197 209 234 235 238 253 259 263 270 272 273 278
 283 301 303 304 305 308 311 313 318 320 323 325 336 339 342 349 352 356] are constant.
  f = msb / msw


In [61]:
# feature selection using mutual_info_classif
from sklearn.feature_selection import mutual_info_classif

k = SelectKBest(mutual_info_classif, k=2)
x = k.fit_transform(s_train_data, s_train_labels)
k.get_support(indices=True)

b = BernoulliNB(fit_prior=False)
b.fit(x, s_train_labels)
score = b.score(s_dev_data[:,k.get_support(indices=True)], s_dev_labels)
print("score:", score)

# which indices? 
indices = list(k.get_support(indices=True))

for i in indices:
    print(list(actions)[i])

score: 0.594637931034
payout_update
request_photography


# action_detail

In [21]:
print(len(d.action_detail.unique()))
print(d.action_detail.unique()[:10])

156
[nan 'view_search_results' 'wishlist_content_update' 'similar_listings'
 'change_trip_characteristics' 'p3' 'header_userpic' 'contact_host'
 'message_post' '-unknown-']


In [23]:
# count of action_detail and their proportion to total
action_detail_total = pd.crosstab(index=d["action_detail"],     # Make a crosstab
                                  columns="count")      # Name the count column
# sort action_detail descending
action_detail_total = action_detail_total.sort('count', ascending=False)
action_detail_total.head(10)



col_0,count
action_detail,Unnamed: 1_level_1
view_search_results,1776885
p3,1376550
-unknown-,1031141
wishlist_content_update,706824
user_profile,656839
change_trip_characteristics,487744
similar_listings,364624
user_social_connections,336799
update_listing,269779
listing_reviews,269021


In [24]:
# count of action per user id
ad = pd.crosstab(index=d["user_id"], columns=d["action_detail"])

# reset index so user_id is its own column
ad.reset_index(level=0, inplace=True)

ad.head()

action_detail,user_id,-unknown-,account_notification_settings,account_payment_methods,account_payout_preferences,account_privacy_settings,account_transaction_history,admin_templates,airbnb_picks_wishlists,alteration_field,...,view_resolutions,view_search_results,view_security_checks,view_user_real_names,wishlist,wishlist_content_update,wishlist_note,your_listings,your_reservations,your_trips
0,00023iyk9l,0,0,0,0,0,0,0,0,0,...,0,5,0,0,0,4,0,0,0,2
1,0010k6l0om,5,0,0,0,0,0,0,0,0,...,0,10,0,0,0,8,0,0,0,0
2,001wyh0pz8,6,0,0,0,0,0,0,0,0,...,0,66,0,0,0,0,0,0,0,0
3,0028jgx1x1,1,0,0,0,0,0,0,0,0,...,0,9,0,0,0,0,0,0,0,0
4,002qnbzfs5,184,0,0,0,0,0,0,0,0,...,0,125,0,0,0,0,0,0,0,0


In [25]:
# merge action_detail table with country table on user_id (keeps only users who have country_destinations)
ad = pd.merge(ad, country, on='user_id', how='inner')

ad.head()

Unnamed: 0,user_id,-unknown-,account_notification_settings,account_payment_methods,account_payout_preferences,account_privacy_settings,account_transaction_history,admin_templates,airbnb_picks_wishlists,alteration_field,...,view_search_results,view_security_checks,view_user_real_names,wishlist,wishlist_content_update,wishlist_note,your_listings,your_reservations,your_trips,country_destination
0,00023iyk9l,0,0,0,0,0,0,0,0,0,...,5,0,0,0,4,0,0,0,2,US
1,001wyh0pz8,6,0,0,0,0,0,0,0,0,...,66,0,0,0,0,0,0,0,0,NDF
2,0028jgx1x1,1,0,0,0,0,0,0,0,0,...,9,0,0,0,0,0,0,0,0,NDF
3,002qnbzfs5,184,0,0,0,0,0,0,0,0,...,125,0,0,0,0,0,0,0,0,US
4,0035hobuyj,9,0,0,0,0,0,0,0,0,...,200,0,0,0,26,0,0,0,0,US


In [27]:
# Shuffle data
np.random.seed(0)
shuffle = np.random.permutation(np.arange(actions.shape[0]))
a = ad.reindex(shuffle).ix[:,1:] # remove user_id

# encode country to numbers 
country_code = pd.factorize(z['country_destination'], sort=True)[0]
a['country_code'] = country_code

# split out labels from features
data, labels = np.asarray(a)[:,:-2], np.asarray(a)[:,-1]
len(data)

# Split into train and dev & cast as integers
ad_dev_data, ad_dev_labels = data[:58000].astype(int), labels[:58000].astype(int)
ad_train_data, ad_train_labels = data[58000:].astype(int), labels[58000:].astype(int)


In [31]:
# fit bernoulliNB across all features
b = BernoulliNB(fit_prior=True)
b.fit(ad_train_data, ad_train_labels)
b.score(ad_dev_data, ad_dev_labels)

0.58244827586206893

In [45]:
# feature selection using chi2
from sklearn.feature_selection import SelectKBest

k = SelectKBest(chi2, k=2)
x = k.fit_transform(ad_train_data, ad_train_labels)
k.get_support(indices=True)

b = BernoulliNB(fit_prior=True)
b.fit(x, ad_train_labels)
score = b.score(ad_dev_data[:,k.get_support(indices=True)], ad_dev_labels)
print("score:", score)

# which indices? 
indices = list(k.get_support(indices=True))

for i in indices:
    print(list(ad)[i])

score: 0.609862068966
account_transaction_history
p1


In [47]:
# feature selection using f-regression
from sklearn.feature_selection import f_regression

k = SelectKBest(f_regression, k=2)
x = k.fit_transform(ad_train_data, ad_train_labels)
k.get_support(indices=True)

b = BernoulliNB(fit_prior=True)
b.fit(x, ad_train_labels)
score = b.score(ad_dev_data[:,k.get_support(indices=True)], ad_dev_labels)
print("score:", score)

# which indices? 
indices = list(k.get_support(indices=True))

for i in indices:
    print(list(ad)[i])

score: 0.610224137931
cancellation_policy_click
set_password


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [49]:
# feature selection using f-classif
from sklearn.feature_selection import f_classif

k = SelectKBest(f_classif, k=2)
x = k.fit_transform(ad_train_data, ad_train_labels)
k.get_support(indices=True)

b = BernoulliNB(fit_prior=True)
b.fit(x, ad_train_labels)
score = b.score(ad_dev_data[:,k.get_support(indices=True)], ad_dev_labels)
print("score:", score)

# which indices? 
indices = list(k.get_support(indices=True))

for i in indices:
    print(list(ad)[i])

score: 0.609844827586
account_transaction_history
host_standard_suspension


 100 109 112 117 121 139 140 141 143 144 145 147 148] are constant.
  f = msb / msw
