In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)



In [7]:
# Read the raw CSV inputs into DataFrames.
users_train_raw = pd.read_csv('../data/train_users_2.csv.zip')
test = pd.read_csv('../data/test_users.csv')

# The sessions archive used to be parsed twice (once per name); parse it once and
# bind both names to the result.
# NOTE(review): both names now refer to the same object - take a .copy() if either
# is ever modified in place.
sessions_raw = pd.read_csv('../data/sessions.csv.zip')
sessions = sessions_raw

In [8]:
# Call the key column 'user_id' in both user tables, the name used to group and
# merge the sessions data.
for users_frame in (users_train_raw, test):
    users_frame.rename(columns={'id': 'user_id'}, inplace=True)

In [9]:
# Show a bounded preview rather than asking the notebook to render the whole frame.
users_train_raw.head(10)

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US


In [7]:
# Per-user aggregates of the sessions log, built here so they can be joined back
# onto the user tables.

def _as_user_frame(per_user_series):
    '''Convert a Series indexed by user_id into a DataFrame whose columns are the
    value column followed by 'user_id', with a plain 0..n-1 integer index.'''
    frame = per_user_series.to_frame()
    frame['user_id'] = frame.index
    frame.index = list(range(len(frame)))
    return frame


x = sessions.groupby(['user_id'], as_index=False)

# count() tallies the non-null entries of every column per user; the device_type
# tally is the one kept as the user's number of visits.
count_df = x.count()
user_num_visits = count_df[['user_id', 'device_type']]

sessions_by_user = sessions.groupby('user_id')

# Number of distinct device types each user appeared with.
y = _as_user_frame(sessions_by_user['device_type'].nunique())

# Largest single secs_elapsed value per user.
max_time = _as_user_frame(sessions_by_user['secs_elapsed'].max())

# Sum of secs_elapsed per user.
total_time = _as_user_frame(sessions_by_user['secs_elapsed'].sum())


In [8]:
# Final action_detail recorded for each user: GroupBy.last() picks the last
# non-null value in row order within each user_id group.
last_action = (
    sessions.groupby('user_id')['action_detail']
    .last()
    .reset_index()[['action_detail', 'user_id']]  # value column first, then user_id
)

In [None]:
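# The longest action of a user  <-- TODO: still unsolved, help wanted.
# The attempt below yields the max secs_elapsed for every (user_id, action_detail) pair,
# not the single action_detail that has the largest secs_elapsed for each user.

# longest_action = sessions[['user_id','action_detail','secs_elapsed']].groupby(['user_id','action_detail']).max()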

In [9]:
# Parse the two date columns of the train and test user tables into datetimes, then
# derive signup_delta on each table.
for users_frame in (users_train_raw, test):
    for date_col in ('date_account_created', 'date_first_booking'):
        users_frame[date_col] = pd.to_datetime(users_frame[date_col])

    # signup_delta = account creation date minus first booking date, so it is
    # negative when the booking came after the account was created.
    users_frame['signup_delta'] = (
        users_frame['date_account_created'] - users_frame['date_first_booking']
    )

In [10]:
# Join the per-user session aggregates onto the train and test user tables.

def _merge_session_features(users):
    '''Left-join every per-user session aggregate onto `users` by user_id.

    Each aggregate's value column is given its final name before the merge, so the
    result does not depend on the '_x' / '_y' suffixes pandas adds when two merged
    frames share a column name. Resulting columns and their order: the columns of
    `users`, then number_visits, number_devices, longest_session,
    total_time_on_site, action_detail.
    '''
    session_features = (
        user_num_visits.rename(columns={'device_type': 'number_visits'}),
        y.rename(columns={'device_type': 'number_devices'}),
        max_time.rename(columns={'secs_elapsed': 'longest_session'}),
        total_time.rename(columns={'secs_elapsed': 'total_time_on_site'}),
        last_action,
    )
    merged = users
    for feature in session_features:
        # how='left' keeps every user row, including users with no session data.
        merged = pd.merge(merged, feature, on='user_id', how='left')
    return merged


new_train_raw = _merge_session_features(users_train_raw)
test_w_sessions = _merge_session_features(test)


In [11]:
# Extracting the Hour the user first accessed Airbnb
def first_hour_(x):
    '''Return the hour field of a YYYYMMDDHHMMSS-style timestamp as an int.

    The value is converted to text and the two characters at positions 8 and 9
    are parsed as an integer.
    '''
    stamp = str(x)
    hour_digits = stamp[8:10]
    return int(hour_digits)

# Derive first_hour for each table from that table's own timestamp column.
new_train_raw['first_hour'] = new_train_raw.timestamp_first_active.apply(first_hour_)
# This used to read the timestamps from `test`, which only produced the right rows
# when `test` and test_w_sessions happened to share an identical index; read them
# from the merged frame itself.
test_w_sessions['first_hour'] = test_w_sessions.timestamp_first_active.apply(first_hour_)

In [12]:
# Write the session-enriched train and test tables to CSV files.
for csv_name, out_frame in (
    ('train_w_sessions.csv', new_train_raw),
    ('test_w_sessions.csv', test_w_sessions),
):
    out_frame.to_csv(csv_name, sep=',')

## Work below builds on the above, but adds the trimming related to addressing NDF, first booking date, and the introduction of session data

In [69]:
# Booking-derived columns, removed from both tables below. For test they are
# misleading because, by definition, a test user has not had a booking.
booking_columns = ['date_first_booking', 'signup_delta']

modern_train = new_train_raw.copy()
modern_test = test_w_sessions.copy()

# Keep only the training rows that have session information (number_visits is null
# for users that predate the sessions data).
modern_train = modern_train[modern_train.number_visits.notnull()]
# axis is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas 2.x.
modern_train = modern_train.drop(booking_columns, axis=1)
modern_train.reset_index(drop=True, inplace=True)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(new_train_raw))
print(len(modern_train))

modern_test = modern_test.drop(booking_columns, axis=1)

# Store first_hour as text in both tables.
modern_train['first_hour'] = modern_train['first_hour'].astype(str)
modern_test['first_hour'] = modern_test['first_hour'].astype(str)

213451
73815


In [71]:
# Write the trimmed train and test tables to CSV files.
for csv_name, out_frame in (
    ('modern_train.csv', modern_train),
    ('modern_test.csv', modern_test),
):
    out_frame.to_csv(csv_name, sep=',')

In [15]:
# Show a bounded preview rather than asking the notebook to render the whole frame.
modern_train.head(10)

Unnamed: 0,user_id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,signup_app,first_device_type,first_browser,country_destination,number_visits,number_devices,longest_session,total_time_on_site,action_detail,first_hour
0,d1mm9tcy42,2014-01-01,20140101000936,MALE,62.0,basic,0,en,sem-non-brand,google,...,Web,Windows Desktop,Chrome,other,127.0,2.0,606881.0,3427529.0,p3,0
1,yo8nz8bqcq,2014-01-01,20140101001558,-unknown-,,basic,0,en,direct,direct,...,Web,Mac Desktop,Firefox,NDF,9.0,1.0,115983.0,207842.0,wishlist_content_update,0
2,4grx6yxeby,2014-01-01,20140101001639,-unknown-,,basic,0,en,sem-brand,google,...,Web,Windows Desktop,Firefox,NDF,16.0,2.0,336801.0,1135444.0,wishlist_content_update,0
3,ncf87guaf0,2014-01-01,20140101002146,-unknown-,,basic,0,en,direct,direct,...,Web,Windows Desktop,Chrome,NDF,152.0,3.0,732296.0,3755100.0,wishlist_content_update,0
4,4rvqpxoh3h,2014-01-01,20140101002619,-unknown-,,basic,25,en,direct,direct,...,iOS,iPhone,-unknown-,GB,8.0,1.0,886.0,2555.0,-unknown-,0
5,c8mfesvkv0,2014-01-01,20140101002626,-unknown-,,basic,0,en,direct,direct,...,Web,iPad,Mobile Safari,NDF,3.0,1.0,1371616.0,1380288.0,create_user,0
6,xwxei6hdk4,2014-01-01,20140101002742,FEMALE,32.0,facebook,0,en,seo,google,...,Web,iPad,Mobile Safari,US,7.0,2.0,46262.0,49673.0,confirm_email_link,0
7,5f45ro5uzk,2014-01-01,20140101003535,-unknown-,,basic,0,en,direct,direct,...,Web,Windows Desktop,Chrome,NDF,46.0,1.0,117638.0,605413.0,-unknown-,0
8,ro2stddszp,2014-01-01,20140101005503,-unknown-,19.0,basic,0,en,sem-brand,google,...,Web,Mac Desktop,Safari,other,43.0,1.0,658848.0,1284401.0,change_trip_characteristics,0
9,qtw88d9pbl,2014-01-01,20140101005837,MALE,25.0,basic,0,en,direct,direct,...,Web,Mac Desktop,Chrome,NDF,364.0,1.0,485255.0,2868205.0,wishlist_content_update,0


## Work below attempts to bias the training data towards non-NDF labels

In [25]:
# Reduce the share of NDF labels in the training data.

def _drop_ndf_sample(frame, frac):
    '''Return `frame` without a randomly sampled `frac` share of its NDF rows.'''
    ndf_rows = frame[frame['country_destination'] == 'NDF']
    return frame.drop(ndf_rows.sample(frac=frac).index)


# The suffix is the percentage of NDF rows removed. No random_state is passed to
# sample(), so the call order is kept as it was in case the draws share random state.
biased_train_33 = _drop_ndf_sample(modern_train, 0.33)
biased_train_60 = _drop_ndf_sample(modern_train, 0.60)
biased_train_15 = _drop_ndf_sample(modern_train, 0.15)
biased_train_05 = _drop_ndf_sample(modern_train, 0.05)


In [26]:
# Row counts left after each NDF down-sampling. print(...) with a single argument
# behaves the same on Python 2 and Python 3.
print(len(biased_train_15))
print(len(biased_train_33))
print(len(biased_train_60))
print(len(biased_train_05))

67059
58951
46790
71563


In [27]:
# Write each NDF-reduced training table to its own CSV file.
for csv_name, out_frame in (
    ('biased_train_15.csv', biased_train_15),
    ('biased_train_33.csv', biased_train_33),
    ('biased_train_60.csv', biased_train_60),
    ('biased_train_05.csv', biased_train_05),
):
    out_frame.to_csv(csv_name, sep=',')

In [31]:
# Group modern_train by destination label; as_index=False keeps the label as a
# regular column in aggregated output instead of moving it into the index.
grouped_modern = modern_train.groupby(['country_destination'],as_index=False)

In [32]:
# Number of non-null entries in every column, per destination label.
grouped_modern.count()

Unnamed: 0,country_destination,user_id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,...,first_affiliate_tracked,signup_app,first_device_type,first_browser,number_visits,number_devices,longest_session,total_time_on_site,action_detail,first_hour
0,AU,152,152,152,152,121,152,152,152,152,...,152,152,152,152,152,152,150,150,152,152
1,CA,440,440,440,440,335,440,440,440,440,...,440,440,440,440,440,440,431,431,440,440
2,DE,250,250,250,250,204,250,250,250,250,...,250,250,250,250,250,250,249,249,250,250
3,ES,707,707,707,707,525,707,707,707,707,...,707,707,707,707,707,707,702,702,707,707
4,FR,1435,1435,1435,1435,1057,1435,1435,1435,1435,...,1434,1435,1435,1435,1435,1435,1419,1419,1435,1435
5,GB,731,731,731,731,552,731,731,731,731,...,730,731,731,731,731,731,727,727,731,731
6,IT,979,979,979,979,673,979,979,979,979,...,979,979,979,979,979,979,974,974,979,979
7,NDF,45041,45041,45041,45041,18867,45041,45041,45041,45041,...,44744,45041,45041,45041,45041,45041,44222,44222,45039,45041
8,NL,247,247,247,247,194,247,247,247,247,...,247,247,247,247,247,247,246,246,247,247
9,PT,83,83,83,83,49,83,83,83,83,...,83,83,83,83,83,83,82,82,83,83


In [55]:
def bias_maker(data, labels, factor):
    '''Oversample the rows of the given destination labels.

    Args:
        data: DataFrame with a `country_destination` column; it is not modified.
        labels: iterable of destination labels whose rows are duplicated.
        factor: int, the number of EXTRA copies of each label's rows to append.
            A label therefore ends up with (factor + 1) times the rows it had,
            e.g. factor=2 triples it. A non-integer factor raises TypeError.

    Returns:
        A new DataFrame: the rows of `data` followed, label by label, by the
        appended copies. The index is renumbered 0..n-1 whenever `labels` is
        non-empty and factor > 0; otherwise an unchanged copy is returned.
    '''
    df = data.copy()
    for destination in labels:
        one_country = df[df.country_destination == destination]
        extra_copies = [one_country for _ in range(factor)]
        if not extra_copies:
            continue  # factor <= 0: nothing to append for this label
        # One concat per label replaces `factor` separate DataFrame.append calls:
        # each append re-copied the whole growing frame, and DataFrame.append no
        # longer exists in pandas 2.x.
        df = pd.concat([df] + extra_copies, ignore_index=True)
    return df



In [56]:
# Increasing under-represented labels in the dataset (every label except NDF and US).
# NOTE: bias_maker appends `factor` extra copies of the matching rows, so "x2" leaves
# each listed label with three times its original rows, "x3" with four times, etc.

labels = ["AU", 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'other']

up_bias_x2 = bias_maker(modern_train, labels, 2)
up_bias_x3 = bias_maker(modern_train, labels, 3)
up_bias_x5 = bias_maker(modern_train, labels, 5)
up_bias_x10 = bias_maker(modern_train, labels, 10)
up_bias_x20 = bias_maker(modern_train, labels, 20)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(up_bias_x2))
print(len(up_bias_x3))
print(len(up_bias_x5))
print(len(up_bias_x10))
print(len(up_bias_x20))


91173
99852
117210
160605
247395


In [58]:
# Compare the oversampled frame's (rows, columns) with the original's.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(up_bias_x5.shape)
print(modern_train.shape)

(117210, 21)
(73815, 21)


In [59]:
# Show a bounded preview rather than asking the notebook to render the whole frame.
up_bias_x10.head(10)

Unnamed: 0,user_id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,signup_app,first_device_type,first_browser,country_destination,number_visits,number_devices,longest_session,total_time_on_site,action_detail,first_hour
0,d1mm9tcy42,2014-01-01,20140101000936,MALE,62.0,basic,0,en,sem-non-brand,google,...,Web,Windows Desktop,Chrome,other,127.0,2.0,606881.0,3427529.0,p3,0
1,yo8nz8bqcq,2014-01-01,20140101001558,-unknown-,,basic,0,en,direct,direct,...,Web,Mac Desktop,Firefox,NDF,9.0,1.0,115983.0,207842.0,wishlist_content_update,0
2,4grx6yxeby,2014-01-01,20140101001639,-unknown-,,basic,0,en,sem-brand,google,...,Web,Windows Desktop,Firefox,NDF,16.0,2.0,336801.0,1135444.0,wishlist_content_update,0
3,ncf87guaf0,2014-01-01,20140101002146,-unknown-,,basic,0,en,direct,direct,...,Web,Windows Desktop,Chrome,NDF,152.0,3.0,732296.0,3755100.0,wishlist_content_update,0
4,4rvqpxoh3h,2014-01-01,20140101002619,-unknown-,,basic,25,en,direct,direct,...,iOS,iPhone,-unknown-,GB,8.0,1.0,886.0,2555.0,-unknown-,0
5,c8mfesvkv0,2014-01-01,20140101002626,-unknown-,,basic,0,en,direct,direct,...,Web,iPad,Mobile Safari,NDF,3.0,1.0,1371616.0,1380288.0,create_user,0
6,xwxei6hdk4,2014-01-01,20140101002742,FEMALE,32.0,facebook,0,en,seo,google,...,Web,iPad,Mobile Safari,US,7.0,2.0,46262.0,49673.0,confirm_email_link,0
7,5f45ro5uzk,2014-01-01,20140101003535,-unknown-,,basic,0,en,direct,direct,...,Web,Windows Desktop,Chrome,NDF,46.0,1.0,117638.0,605413.0,-unknown-,0
8,ro2stddszp,2014-01-01,20140101005503,-unknown-,19.0,basic,0,en,sem-brand,google,...,Web,Mac Desktop,Safari,other,43.0,1.0,658848.0,1284401.0,change_trip_characteristics,0
9,qtw88d9pbl,2014-01-01,20140101005837,MALE,25.0,basic,0,en,direct,direct,...,Web,Mac Desktop,Chrome,NDF,364.0,1.0,485255.0,2868205.0,wishlist_content_update,0


In [45]:
# Inspect the label balance of up_bias_x20: number of non-null entries in every
# column, per destination label (the label stays a regular column via as_index=False).
grouped_bias = up_bias_x20.groupby(['country_destination'],as_index=False)
grouped_bias.count()

Unnamed: 0,country_destination,user_id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,...,first_affiliate_tracked,signup_app,first_device_type,first_browser,number_visits,number_devices,longest_session,total_time_on_site,action_detail,first_hour
0,AU,3192,3192,3192,3192,2541,3192,3192,3192,3192,...,3192,3192,3192,3192,3192,3192,3150,3150,3192,3192
1,CA,9240,9240,9240,9240,7035,9240,9240,9240,9240,...,9240,9240,9240,9240,9240,9240,9051,9051,9240,9240
2,DE,5250,5250,5250,5250,4284,5250,5250,5250,5250,...,5250,5250,5250,5250,5250,5250,5229,5229,5250,5250
3,ES,14847,14847,14847,14847,11025,14847,14847,14847,14847,...,14847,14847,14847,14847,14847,14847,14742,14742,14847,14847
4,FR,30135,30135,30135,30135,22197,30135,30135,30135,30135,...,30114,30135,30135,30135,30135,30135,29799,29799,30135,30135
5,GB,15351,15351,15351,15351,11592,15351,15351,15351,15351,...,15330,15351,15351,15351,15351,15351,15267,15267,15351,15351
6,IT,20559,20559,20559,20559,14133,20559,20559,20559,20559,...,20559,20559,20559,20559,20559,20559,20454,20454,20559,20559
7,NDF,45041,45041,45041,45041,18867,45041,45041,45041,45041,...,44744,45041,45041,45041,45041,45041,44222,44222,45039,45041
8,NL,5187,5187,5187,5187,4074,5187,5187,5187,5187,...,5187,5187,5187,5187,5187,5187,5166,5166,5187,5187
9,PT,1743,1743,1743,1743,1029,1743,1743,1743,1743,...,1743,1743,1743,1743,1743,1743,1722,1722,1743,1743


In [63]:
# Increasing all label types (including NDF and US) by the same factor, so that
# proportions stay the same but there are more under-represented examples.
# NOTE: bias_maker appends `factor` extra copies, so "x2" triples every label.

labels = ["AU", 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'other', 'NDF', 'US']

all_bias_x2 = bias_maker(modern_train, labels, 2)
all_bias_x3 = bias_maker(modern_train, labels, 3)
all_bias_x5 = bias_maker(modern_train, labels, 5)
all_bias_x10 = bias_maker(modern_train, labels, 10)
all_bias_x20 = bias_maker(modern_train, labels, 20)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(all_bias_x2))
print(len(all_bias_x3))
print(len(all_bias_x5))
print(len(all_bias_x10))
print(len(all_bias_x20))

# Non-null count of every column per destination label in the x20 frame.
grouped_allbias = all_bias_x20.groupby(['country_destination'],as_index=False)
print(grouped_allbias.count())


221445
295260
442890
811965
1550115
   country_destination  user_id  date_account_created  timestamp_first_active  \
0                   AU     3192                  3192                    3192   
1                   CA     9240                  9240                    9240   
2                   DE     5250                  5250                    5250   
3                   ES    14847                 14847                   14847   
4                   FR    30135                 30135                   30135   
5                   GB    15351                 15351                   15351   
6                   IT    20559                 20559                   20559   
7                  NDF   945861                945861                  945861   
8                   NL     5187                  5187                    5187   
9                   PT     1743                  1743                    1743   
10                  US   421995                421995                  42

In [65]:
# Starting from all_bias_x2 (every label already oversampled), append two more
# copies of the NDF and US rows only; the other labels are left as they are.

labels = ['NDF', 'US']

mixed_bias_x2 = bias_maker(all_bias_x2, labels, 2)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(all_bias_x2))
print(len(mixed_bias_x2))

# Non-null count of every column per destination label in the mixed frame.
grouped_mixedbias = mixed_bias_x2.groupby(['country_destination'],as_index=False)
print(grouped_mixedbias.count())

221445
612261
   country_destination  user_id  date_account_created  timestamp_first_active  \
0                   AU      456                   456                     456   
1                   CA     1320                  1320                    1320   
2                   DE      750                   750                     750   
3                   ES     2121                  2121                    2121   
4                   FR     4305                  4305                    4305   
5                   GB     2193                  2193                    2193   
6                   IT     2937                  2937                    2937   
7                  NDF   405369                405369                  405369   
8                   NL      741                   741                     741   
9                   PT      249                   249                     249   
10                  US   180855                180855                  180855   
11            

In [67]:
# Increasing US only: bias_maker appends two extra copies of every US row.

labels = ['US']

biased_usa = bias_maker(modern_train, labels, 2)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(biased_usa))
print(len(modern_train))



114005
73815


In [64]:
# Write every oversampled training table to its own CSV file.
# NOTE(review): 'biaed_usa.csv' looks like a typo for 'biased_usa.csv'; the name is
# kept as-is so anything that already reads that file keeps working.
oversampled_exports = [
    ('up_bias_x2.csv', up_bias_x2),
    ('up_bias_x3.csv', up_bias_x3),
    ('up_bias_x5.csv', up_bias_x5),
    ('up_bias_x10.csv', up_bias_x10),
    ('up_bias_x20.csv', up_bias_x20),
    ('all_bias_x2.csv', all_bias_x2),
    ('all_bias_x3.csv', all_bias_x3),
    ('all_bias_x5.csv', all_bias_x5),
    ('all_bias_x10.csv', all_bias_x10),
    ('all_bias_x20.csv', all_bias_x20),
    ('mixedbias.csv', mixed_bias_x2),
    ('biaed_usa.csv', biased_usa),
]
for csv_name, out_frame in oversampled_exports:
    out_frame.to_csv(csv_name, sep=',')