# Model Analysis with Imputed Session Data

In [33]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.externals import joblib

In [34]:
# Load the feature table.
# Alternative input file: "./data/df_all_inc_sess_imputed.csv"
imputed_csv = "./data/df_all_inc_sess_imputed_w_train_data_only.csv"
data = pd.read_csv(imputed_csv)

In [35]:
# get X and y
# Load the labels first so the number of labelled rows comes from the label
# file itself instead of a hard-coded row count.
y = pd.read_csv("./data/y_2.csv")
n_train = len(y)

# The labelled rows are the leading rows of `data`; take exactly one feature
# row per label so X and y always have matching lengths.
X = data.iloc[:n_train]

In [36]:
# Remove the user_id column before modelling.
X = X.drop(labels=['user_id'], axis=1)

In [5]:
# For users without session info, set the missing features to -1.
# Rebind X to the filled copy rather than mutating in place: this avoids
# chained-assignment warnings when X is a slice of another frame.
X = X.fillna(value=-1)

In [37]:
# Sanity check: features and labels should have the same number of rows.
for frame in (X, y):
    print(frame.shape)

(213451, 770)
(213451, 1)


In [38]:
# encode y values
# Assign each destination label an integer code in order of first appearance
# (unique() preserves that order), giving the label -> code lookup table.
y_labels = {label: code
            for code, label in enumerate(y.country_destination.unique())}

# Apply the lookup in a single vectorised pass. This replaces one whole-frame
# boolean mask per class and leaves an integer column instead of a column of
# mixed Python objects.
y['country_destination'] = y.country_destination.map(y_labels)

print("y is indexed by dict:\n%s" % y_labels)

y is indexed by dict:
{'FR': 3, 'NL': 9, 'PT': 8, 'CA': 4, 'DE': 10, 'IT': 7, 'US': 1, 'NDF': 0, 'other': 2, 'AU': 11, 'GB': 5, 'ES': 6}


In [39]:
# set y to array-like type
# Keep only the label column and store the integer conversion back into y;
# a conversion that is merely displayed would leave y with its old dtype.
y = y.country_destination.astype(int)

# Display the underlying integer array as a quick check.
y.values

array([0, 0, 1, ..., 0, 0, 0])

In [40]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=1,
)

In [41]:
# Report the shape of each piece of the split, in train-then-test order.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(149415, 770)
(149415,)
(64036, 770)
(64036,)


## GradientBoostingClassifier with imputed session data (using all data)
### n_estimators=50,   NO  scaling    => Acc= 0.7649 
### n_estimators=100, NO  scaling    => Acc= 0.7753 
### n_estimators=150, NO scaling     => Acc= 0.7809 
## n_estimators=200, NO scaling     => Acc= 0.7849 [*****BEST*******]
### n_estimators=100, max_depth=6, NO scaling     => Acc= 0.7834


## GradientBoostingClassifier with imputed session data (using training data only - no cheating)
### n_estimators=50,   NO  scaling    => Acc= 0.7631

In [43]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a 50-tree gradient boosting model on the training split.
# random_state is fixed so that re-running the cell rebuilds the same model
# and the accuracy recorded for it can be reproduced.
gbc = GradientBoostingClassifier(n_estimators=50, random_state=1, verbose=1)
gbc.fit(X_train, y_train.astype(int))

# Hard class predictions for the held-out rows.
y_pred_gbc = gbc.predict(X_test)

      Iter       Train Loss   Remaining Time 
         1      274243.7061          123.77m
         2      245266.2249          126.17m
         3      224299.0047          124.95m
         4      207925.6332          122.50m
         5      195193.5485          120.44m
         6      184980.7752          117.68m
         7      176691.3161          115.12m
         8      169830.3406          113.46m
         9      164105.3818          109.20m
        10      159415.8832          105.37m
        20      136063.2687           74.74m
        30      127962.4660           48.57m
        40      122948.2810           24.02m
        50      119284.3689            0.00s


In [18]:
# NOTE(review): bare reference to a name that is not assigned anywhere in the
# visible notebook, so on a fresh kernel this cell would raise NameError.
# Presumably a leftover label for an earlier model run - TODO confirm and
# remove or replace with an explicit model load.
gbc_100_0_7753

In [44]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_gbc = metrics.accuracy_score(y_test.astype(int), y_pred_gbc.astype(int))
print(accuracy_gbc)

0.763117621338


In [45]:
# Confusion matrix of true vs. predicted classes on the held-out rows.
y_true_int = y_test.astype(int)
y_pred_int = y_pred_gbc.astype(int)
conf_matrix_gbc = metrics.confusion_matrix(y_true_int, y_pred_int)
print(conf_matrix_gbc)

# Every test row lands in exactly one cell, so the grand total should equal
# the number of test rows.
print(conf_matrix_gbc.sum())

[[35855  1656     5     2     4     1     4     8     0     3     6     8]
 [ 5606 12941     3     5     4     8     7    15     2    11     5     9]
 [ 1097  1816     5     1     1     4     1     2     1     2     2     1]
 [  522   941     4    17     0     3     1     4     0     1     0     1]
 [  124   298     1     0     5     0     0     0     0     0     0     0]
 [  233   466     0     0     0     6     0     2     1     1     1     0]
 [  228   427     0     1     1     0    15     2     0     0     0     1]
 [  318   526     1     1     0     3     0    10     0     1     4     0]
 [   19    46     0     0     0     0     0     1     0     1     0     0]
 [   74   142     0     1     0     0     2     0     0     6     0     0]
 [  119   189     0     1     0     0     0     1     0     0     4     0]
 [   47   108     0     0     0     0     0     0     0     0     0     3]]
64036


In [None]:
# Per-class probability estimates for every held-out row.
predict_proba_test_gbc = gbc.predict_proba(X_test)

# Display the shape of the probability matrix.
np.shape(predict_proba_test_gbc)

In [None]:
# Show the label -> integer code lookup table.
print(y_labels)

In [None]:
# Invert the lookup table: integer code -> country label.
y_indexes = {index: country for country, index in y_labels.items()}
print(y_indexes)

In [None]:
# Show the five most probable destinations for the first (up to) five test rows.
for i, probas in enumerate(predict_proba_test_gbc[:5]):
    # gbc.classes_ holds the class code belonging to each probability column,
    # so pair codes with probabilities explicitly instead of assuming that
    # column j is class j and that there are exactly 12 classes.
    probas_indexed = [[y_indexes[label], proba]
                      for label, proba in zip(gbc.classes_, probas)]
    # Most likely destination first.
    probas_indexed.sort(key=lambda pair: pair[1], reverse=True)
    print("item %d: labels: %s" % (i, probas_indexed[0:5]))
    

In [14]:
# Pair every feature column with its importance in the fitted model and
# list the pairs from most to least important.
feat_importances = sorted(
    zip(X_test.columns, gbc.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, importance in feat_importances:
    print("%s %s" % (name, importance))

action_detail_p5 0.0405285304906
action_requested 0.0253737876286
action_type_booking_request 0.0187705553983
action_detail_pending 0.0184869621895
tfa_year 0.0176889774667
avg_secs_elapsed 0.0174196605897
action_detail_post_checkout_action 0.017326563924
age 0.0167951157472
action_pending 0.0158747628336
action_ajax_google_translate_reviews 0.0144712281812
dac_year 0.0132245399572
action_detail_change_trip_characteristics 0.0128842526416
action_detail_translate_listing_reviews 0.0125849205879
device_type_Mac Desktop 0.0118006987243
action_other_hosting_reviews_first 0.0116267757907
action_message_to_host_change 0.0109804415942
action_detail_wishlist 0.0109062630521
action_detail_similar_listings 0.00962232494596
action_ajax_refresh_subtotal 0.00916184791921
device_type_Tablet 0.00906499556038
action_detail_message_post 0.00906232981451
action_detail_create_phone_numbers 0.00886368021428
action_type_message_post 0.00867759240817
action_ 0.00852570073054
action_travel_plans_current 0.00

In [46]:
# Persist good models. The file name records the estimator count and the
# accuracy measured for this model.
model_path = './models/gbc_50_0_7631_train.pkl'
joblib.dump(gbc, model_path)
# To load a saved model later:
# clf = joblib.load('filename.pkl')

['./models/gbc_50_0_7631_train.pkl',
 './models/gbc_50_0_7631_train.pkl_01.npy',
 './models/gbc_50_0_7631_train.pkl_02.npy',
 './models/gbc_50_0_7631_train.pkl_03.npy',
 './models/gbc_50_0_7631_train.pkl_04.npy']

In [27]:
# Rows from position 213451 onward, i.e. everything after the slice used for X.
X_test_subm = data.iloc[213451:]

In [28]:
# Display the submission rows whose hit_count is missing.
missing_hit_count = X_test_subm['hit_count'].isnull()
X_test_subm[missing_hit_count]

Unnamed: 0,user_id,age,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,gender_-unknown-,gender_FEMALE,...,device_type_Opera Phone,num_sessions_1_hrs,num_sessions_2_hrs,num_sessions_4_hrs,num_sessions_8_hrs,num_sessions_16_hrs,num_sessions_24_hrs,secs_elapsed,hit_count,avg_secs_elapsed
213558,scojulukfu,0,2014,7,1,2014,7,1,1,0,...,,,,,,,,,,
213984,s781ycpax3,42,2014,7,1,2014,7,1,0,1,...,,,,,,,,,,
213985,jl44r564wq,22,2014,7,1,2014,7,1,0,0,...,,,,,,,,,,
213986,n6ol4as9g0,0,2014,7,1,2014,7,1,1,0,...,,,,,,,,,,
213990,o0e86z6ufg,0,2014,7,1,2014,7,1,1,0,...,,,,,,,,,,
213995,uvgqiech5j,0,2014,7,1,2014,7,1,1,0,...,,,,,,,,,,
214312,zmxlayd386,40,2014,7,2,2014,7,2,1,0,...,,,,,,,,,,
214404,eym9cf0pps,33,2014,7,2,2014,7,2,0,1,...,,,,,,,,,,
214561,dfujsgd070,0,2014,7,2,2014,7,2,1,0,...,,,,,,,,,,
214587,plqi5snlmb,0,2014,7,2,2014,7,2,1,0,...,,,,,,,,,,
