In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)



In [2]:
# Read the zipped competition CSVs into pandas DataFrames.
users_train_raw = pd.read_csv('../zip_files/train_users_2.csv.zip')
sessions_raw = pd.read_csv('../zip_files/sessions.csv.zip')
demographics = pd.read_csv('../zip_files/age_gender_bkts.csv.zip')
countries = pd.read_csv('../zip_files/countries.csv.zip')
test = pd.read_csv('../zip_files/test_users.csv.zip')
# `sessions` is the working copy of the sessions log. It comes from the same
# file as `sessions_raw`, so copy it in memory instead of decompressing and
# parsing the archive a second time.
sessions = sessions_raw.copy()

In [3]:
# The user tables call the key 'id'; rename it to 'user_id' in both so it can
# be used as the join key against the per-user session aggregates.
id_to_user_id = {'id': 'user_id'}
for user_table in (users_train_raw, test):
    user_table.rename(columns=id_to_user_id, inplace=True)

In [None]:
# Preview only the first rows instead of asking the notebook to render the whole table.
users_train_raw.head(10)

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US


In [None]:
# Per-user aggregates of the sessions log. Each result is a small DataFrame
# carrying a 'user_id' column so it can be joined back onto the user tables.

def _per_user_stat(frame, column, how):
    """Reduce `column` of `frame` to one value per user_id.

    `how` is the name of a pandas groupby reduction method (e.g. 'nunique',
    'max', 'sum'). Returns a DataFrame with columns [column, 'user_id'] and a
    plain 0..n-1 index.
    """
    stat = getattr(frame.groupby('user_id')[column], how)()
    # reset_index() turns the user_id index into a column; reorder so the
    # aggregated value comes first and user_id second.
    return stat.reset_index()[[column, 'user_id']]


x = sessions.groupby(['user_id'], as_index=False)  # sessions grouped by user

# Total number of actions per user.
# count() gives the number of non-null entries per column for every user;
# device_type is used as the visit count because, per the original author's
# note, it is always populated and therefore holds the maximum count.
count_df = x.count()
user_num_visits = count_df[['user_id', 'device_type']]

# Number of different devices used per user
y = _per_user_stat(sessions, 'device_type', 'nunique')

# Longest single action (largest secs_elapsed) per user
max_time = _per_user_stat(sessions, 'secs_elapsed', 'max')

# Total time spent on site (sum of secs_elapsed) per user
total_time = _per_user_stat(sessions, 'secs_elapsed', 'sum')


In [None]:
# Last recorded action_detail per user, following the row order of `sessions`.
# The result has columns ['action_detail', 'user_id'] and a 0..n-1 index.
last_action = (
    sessions.groupby('user_id')['action_detail']
    .last()
    .reset_index()[['action_detail', 'user_id']]
)

In [None]:
# TODO: the longest action of a user (help wanted on this one)

# longest_action = sessions[['user_id','action_detail','secs_elapsed']].groupby(['user_id','action_detail']).max()

In [None]:
# For both the train and the test user tables: parse the two date columns into
# datetimes, then store the difference (account creation minus first booking)
# as 'signup_delta'.
for users_frame in (users_train_raw, test):
    for date_column in ('date_account_created', 'date_first_booking'):
        users_frame[date_column] = pd.to_datetime(users_frame[date_column])
    users_frame['signup_delta'] = (
        users_frame['date_account_created'] - users_frame['date_first_booking']
    )

In [None]:
def _attach_session_features(users):
    """Left-join the per-user session aggregates onto `users`.

    Every aggregate column is given its final name *before* merging, so the
    result does not depend on the '_x'/'_y' suffixes pandas generates when two
    merged frames share a column name. Users without session data keep their
    row and get NaN in the added columns (how='left').

    Adds, in order: number_visits, number_devices, longest_session,
    total_time_on_site, action_detail.
    """
    session_features = [
        user_num_visits.rename(columns={'device_type': 'number_visits'}),
        y.rename(columns={'device_type': 'number_devices'}),
        max_time.rename(columns={'secs_elapsed': 'longest_session'}),
        total_time.rename(columns={'secs_elapsed': 'total_time_on_site'}),
        last_action,
    ]
    merged = users
    for feature in session_features:
        merged = pd.merge(merged, feature, on='user_id', how='left')
    return merged


# Add the new parameters into Train
new_train_raw = _attach_session_features(users_train_raw)

# Add the new parameters to Test
test_w_sessions = _attach_session_features(test)


In [None]:
# Extracting the Hour the user first accessed Airbnb
def first_hour_(x):
    """Return characters 8-9 of str(x) as an int.

    For a YYYYMMDDHHMMSS timestamp such as 20090319043255 this is the
    hour of day (here 4).
    """
    stamp = str(x)
    hour_digits = stamp[8:10]
    return int(hour_digits)

# Derive 'first_hour' for each frame from that frame's own timestamp column.
# The test line previously read from `test`, which only gave the right values
# as long as `test` and `test_w_sessions` happened to share the same row index.
new_train_raw['first_hour'] = new_train_raw.timestamp_first_active.apply(first_hour_)
test_w_sessions['first_hour'] = test_w_sessions.timestamp_first_active.apply(first_hour_)

In [None]:
# Write the session-enriched train and test tables to CSV in the working directory.
for export_frame, export_path in (
    (new_train_raw, 'train_w_sessions.csv'),
    (test_w_sessions, 'test_w_sessions.csv'),
):
    export_frame.to_csv(export_path, sep=',')

## Work below builds on the above, adding the trimming needed to address NDF, the first booking date, and the introduction of session data

In [None]:
# Columns removed from both modern frames: the first-booking date and the
# signup-to-booking delta derived from it.
booking_columns = ['date_first_booking', 'signup_delta']

# Train: keep only users that have session information (number_visits is NaN
# for rows that predate the sessions log), drop the booking columns and
# renumber the rows 0..n-1.
modern_train = new_train_raw[new_train_raw['number_visits'].notnull()]
modern_train = modern_train.drop(booking_columns, axis=1)
modern_train = modern_train.reset_index(drop=True)

# Row counts before and after trimming. The parenthesised print form behaves
# the same under Python 2 and Python 3.
print(len(new_train_raw))
print(len(modern_train))

# Test: same column removal; no rows are filtered out.
modern_test = test_w_sessions.drop(booking_columns, axis=1)


In [None]:
# Write the trimmed ("modern") train and test tables to CSV in the working directory.
for export_frame, export_path in (
    (modern_train, 'modern_train.csv'),
    (modern_test, 'modern_test.csv'),
):
    export_frame.to_csv(export_path, sep=',')

In [None]:
# Preview only the first rows instead of asking the notebook to render the whole table.
modern_train.head()

## Work Below Attempts to Bias Towards Non-NDF Data

In [25]:
def _drop_ndf_fraction(frame, frac, seed=0):
    """Return `frame` without a random `frac` share of its 'NDF' rows.

    Rows whose country_destination is 'NDF' are sampled with an explicit
    `seed`, so the same subset is produced on every run regardless of global
    RNG state or the order in which cells were executed. The sampled rows are
    dropped by index label, so `frame` is expected to have a unique index.
    """
    ndf_rows = frame[frame['country_destination'] == 'NDF']
    dropped = ndf_rows.sample(frac=frac, random_state=seed)
    return frame.drop(dropped.index)


# Training sets with 33%, 60%, 15% and 5% of the NDF rows removed.
biased_train_33 = _drop_ndf_fraction(modern_train, 0.33)
biased_train_60 = _drop_ndf_fraction(modern_train, 0.60)
biased_train_15 = _drop_ndf_fraction(modern_train, 0.15)
biased_train_05 = _drop_ndf_fraction(modern_train, 0.05)


In [26]:
# Row counts of the down-sampled training sets, in the order 15%, 33%, 60%, 5%.
# The parenthesised print form behaves the same under Python 2 and Python 3.
print(len(biased_train_15))
print(len(biased_train_33))
print(len(biased_train_60))
print(len(biased_train_05))

67059
58951
46790
71563


In [27]:
# Write each down-sampled training set to its own CSV in the working directory.
for export_frame, export_path in (
    (biased_train_15, 'biased_train_15.csv'),
    (biased_train_33, 'biased_train_33.csv'),
    (biased_train_60, 'biased_train_60.csv'),
    (biased_train_05, 'biased_train_05.csv'),
):
    export_frame.to_csv(export_path, sep=',')