## ADDRESSING THE NDFs
This notebook will attempt to hardcode rules that address to if-and-only-if relationship between Date of First Booking and NDF

In [1]:
# Import Statements

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Numpy libraries
from numpy import nan

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)



In [2]:
# Read in csv and create arrays
users_train_raw = pd.read_csv('../train_dev_data/train_w_sessions.csv', index_col=0)   # Note: this is the user data with new paramters from sessions
test = pd.read_csv('../train_dev_data/test_w_sessions.csv', index_col=0)
demographics = pd.read_csv('../zip_files/age_gender_bkts.csv.zip')
countries = pd.read_csv('../zip_files/countries.csv.zip') 

users_train_raw = users_train_raw.drop('country_destination', 1)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Removing all NDF's from training data
train_no_ndf = users_train_raw[pd.isnull(users_train_raw.date_first_booking) !=  True]
train_no_ndf
train_no_ndf.reset_index(drop=True, inplace=True)


len(train_no_ndf)

88908

In [10]:
# Creating a new version of 
train_no_ndf_2 = train_no_ndf.copy()
train_no_ndf_2 = train_no_ndf_2.drop('date_first_booking', 1)  #Removing date of first booking
train_no_ndf_2 = train_no_ndf_2.drop('signup_delta', 1)   # Removing Delta between signup and booking
train_no_ndf_2 = users_train_raw[pd.isnull(users_train_raw.number_visits) !=  True] # REmoving all rows that predated sessions information
train_no_ndf_2.reset_index(drop=True, inplace=True) 

len(train_no_ndf_2)

73815

In [14]:
train_no_old = users_train_raw[pd.isnull(users_train_raw.number_visits) !=  True]
train_no_old = train_no_old.drop('date_first_booking', 1)  #Removing date of first booking
train_no_ndf_2 = train_no_old.drop('signup_delta', 1)   # Removing Delta between signup and booking

len(train_no_old)

73815

In [6]:
# Shuffle data
np.random.seed(0)
shuffle = np.random.permutation(np.arange(train_no_ndf.shape[0]))
len(shuffle)
x = train_no_ndf.reindex(shuffle)

# encode all values in numbers 
y = pd.DataFrame()
for column in list(x):
    y[column] = pd.factorize(x[column], sort=True)[0]

# split out labels from data frame
data, labels = preprocessing.normalize(np.asarray(y)[:,:-1]), np.asarray(y)[:,-1]

# Split into train and dev.
dev_data, dev_labels = data[:8000], labels[:8000]
train_data, train_labels = data[8000:], labels[8000:]



In [7]:
# Shuffle data - Attempt 2 
np.random.seed(0)
shuffle = np.random.permutation(np.arange(train_no_ndf_2.shape[0]))
len(shuffle)
x = train_no_ndf.reindex(shuffle)

# encode all values in numbers 
y = pd.DataFrame()
for column in list(x):
    y[column] = pd.factorize(x[column], sort=True)[0]

# split out labels from data frame
data_2, labels_2 = preprocessing.normalize(np.asarray(y)[:,:-1]), np.asarray(y)[:,-1]

# Split into train and dev.
dev_data_2, dev_labels_2 = data_2[:8000], labels_2[:8000]
train_data_2, train_labels_2 = data_2[8000:], labels_2[8000:]

ValueError: Found array with 0 sample(s) (shape=(0, 21)) while a minimum of 1 is required by the normalize function.

In [17]:
# Shuffle data - Attempt 3
np.random.seed(0)
shuffle = np.random.permutation(np.arange(train_no_old.shape[0]))
len(shuffle)
x = train_no_ndf.reindex(shuffle)

# encode all values in numbers 
y = pd.DataFrame()
for column in list(x):
    y[column] = pd.factorize(x[column], sort=True)[0]

# split out labels from data frame
data_3, labels_3 = preprocessing.normalize(np.asarray(y)[:,:-1]), np.asarray(y)[:,-1]

# Split into train and dev.
dev_data_3, dev_labels_3 = data_3[:8000], labels_3[:8000]
train_data_3, train_labels_3 = data_3[8000:], labels_3[8000:]

In [22]:
# Bernoulli Naive Bayes - For Training/Dev that have all NDF removed but does still have date_first_booking
# finding the best alpha
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

bnb_clf = BernoulliNB()
bnb = GridSearchCV(estimator=bnb_clf, param_grid=[alphas], cv=5, scoring="accuracy", refit=True)
bnb.fit(train_data, train_labels)
for params, mean_score, scores in bnb.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" %(mean_score, scores.std()/2, params))

print("\nOptimized Parameters: ", bnb.best_estimator_)
print("optimized accuracy: %.4f" %bnb.score(dev_data, dev_labels))
print("Best alpha:", bnb.best_params_)

0.676 (+/-0.001) for {'alpha': 0.0}
0.720 (+/-0.001) for {'alpha': 0.0001}
0.720 (+/-0.001) for {'alpha': 0.001}
0.720 (+/-0.001) for {'alpha': 0.01}
0.720 (+/-0.001) for {'alpha': 0.1}
0.720 (+/-0.000) for {'alpha': 0.5}
0.720 (+/-0.000) for {'alpha': 1.0}
0.720 (+/-0.000) for {'alpha': 2.0}
0.721 (+/-0.000) for {'alpha': 10.0}
('\nOptimized Parameters: ', BernoulliNB(alpha=10.0, binarize=0.0, class_prior=None, fit_prior=True))
optimized accuracy: 0.7220
('Best alpha:', {'alpha': 10.0})


In [None]:
# Bernoulli Naive Bayes - For Training/Dev that have all NDF & pre-session removed, as well as booking date and booking delta columns
# finding the best alpha
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

bnb_clf_2 = BernoulliNB()
bnb_2 = GridSearchCV(estimator=bnb_clf_2, param_grid=[alphas], cv=5, scoring="accuracy", refit=True)
bnb_2.fit(train_data_2, train_labels_2)
for params, mean_score, scores in bnb_2.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" %(mean_score, scores.std()/2, params))

print("\nOptimized Parameters: ", bnb_2.best_estimator_)
print("optimized accuracy: %.4f" %bnb_2.score(dev_data_2, dev_labels_2))
print("Best alpha:", bnb_2.best_params_)

In [18]:
# Bernoulli Naive Bayes - For Training/Dev that have all NDF & pre-session removed, as well as booking date and booking delta columns
# finding the best alpha
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

bnb_clf_3 = BernoulliNB()
bnb_3 = GridSearchCV(estimator=bnb_clf_3, param_grid=[alphas], cv=5, scoring="accuracy", refit=True)
bnb_3.fit(train_data_3, train_labels_3)
for params, mean_score, scores in bnb_3.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" %(mean_score, scores.std()/2, params))

print("\nOptimized Parameters: ", bnb_3.best_estimator_)
print("optimized accuracy: %.4f" %bnb_3.score(dev_data_3, dev_labels_3))
print("Best alpha:", bnb_3.best_params_)

  self.feature_log_prob_ = (np.log(smoothed_fc) -
  neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
  jll += self.class_log_prior_ + neg_prob.sum(axis=1)


0.815 (+/-0.001) for {'alpha': 0.0}
0.843 (+/-0.001) for {'alpha': 0.0001}
0.843 (+/-0.001) for {'alpha': 0.001}
0.843 (+/-0.001) for {'alpha': 0.01}
0.843 (+/-0.001) for {'alpha': 0.1}
0.843 (+/-0.001) for {'alpha': 0.5}
0.843 (+/-0.001) for {'alpha': 1.0}
0.843 (+/-0.001) for {'alpha': 2.0}
0.844 (+/-0.001) for {'alpha': 10.0}
('\nOptimized Parameters: ', BernoulliNB(alpha=10.0, binarize=0.0, class_prior=None, fit_prior=True))
optimized accuracy: 0.8384
('Best alpha:', {'alpha': 10.0})




In [None]:
# Processing Test data like Train & Dev

# test_data = pd.DataFrame()
# for column in list(test):
#     test_data[column] = pd.factorize(test[column], sort=True)[0]
    
# test_labels = []
# for label in test.user_id:
#     test_labels.append(label)
    
# test_data = preprocessing.normalize(np.asarray(test_data)[:,:])

# # NB Model on Test - Just for Funsies
# test_prediction = bnb.predict(test_data)

In [19]:
# Removing all NDF's from training data
test_updated = test.drop('date_first_booking', 1)  #Removing date of first booking
test_updated = test.drop('signup_delta', 1)   # Removing Delta between signup and booking


In [20]:
# Processing Test data like Train & Dev

test_data_2 = pd.DataFrame()
for column in list(test_updated):
    test_data_2[column] = pd.factorize(test_updated[column], sort=True)[0]
    
test_labels_2 = []
for label in test_updated.user_id:
    test_labels_2.append(label)
    
test_data_2 = preprocessing.normalize(np.asarray(test_data_2)[:,:])

# NB Model on Test - Just for Funsies
# test_prediction_2 = bnb_2.predict(test_data_2)
test_prediction_3 = bnb_3.predict(test_data_2)

In [None]:
# submission = pd.DataFrame(data=test_prediction,index=test_labels)
# submission.columns.name = 'id'
# submission.rename(columns={0:'country'}, inplace=True)
# vals_to_replace = { 0 : 'AU', 1 : 'CA', 2 : 'DE', 3 : 'ES', 4 : 'FR', 5 : 'GB', 6 : 'IT', 7 : 'NDF', 8 : 'NL', 9 : 'PT', 10 : 'US', 11 : 'other' }
# submission['country'] = submission['country'].map(vals_to_replace)
# submission.to_csv('airbnb_submission_1.csv',sep=',',index_label='id')
# submission

In [None]:
# Submission for data with no NDF in training data, no data_first_reservation, no date_Delta, as well as all pre-sessions data removed
submission = pd.DataFrame(data=test_prediction_2,index=test_labels_2)
submission.columns.name = 'id'
submission.rename(columns={0:'country'}, inplace=True)
vals_to_replace = { 0 : 'AU', 1 : 'CA', 2 : 'DE', 3 : 'ES', 4 : 'FR', 5 : 'GB', 6 : 'IT', 7 : 'NDF', 8 : 'NL', 9 : 'PT', 10 : 'US', 11 : 'other'}
submission['country'] = submission['country'].map(vals_to_replace)
submission = submission.fillna(value='NDF')
submission.to_csv('airbnb_submission_1.csv',sep=',',index_label='id')
submission

In [21]:
# Submission#3 : for data with no NDF in training data, no data_first_reservation, no date_Delta, as well as all pre-sessions data removed
submission = pd.DataFrame(data=test_prediction_3,index=test_labels_2)
submission.columns.name = 'id'
submission.rename(columns={0:'country'}, inplace=True)
vals_to_replace = { 0 : 'AU', 1 : 'CA', 2 : 'DE', 3 : 'ES', 4 : 'FR', 5 : 'GB', 6 : 'IT', 7 : 'NDF', 8 : 'NL', 9 : 'PT', 10 : 'US', 11 : 'other'}
submission['country'] = submission['country'].map(vals_to_replace)
submission = submission.fillna(value='NDF')
submission.to_csv('airbnb_submission_1.csv',sep=',',index_label='id')
submission

id,country
5uwns89zht,NDF
jtl0dijy2j,NDF
xx0ulgorjt,NL
6c6puo6ix0,NL
czqhjk3yfe,NL
szx28ujmhf,NDF
guenkfjcbq,US
tkpq0mlugk,NL
3xtgd5p9dn,NDF
md9aj22l5a,NDF
