# Expedia hotel recommendation

Uses a one-hot encoder to convert the categorical variables and build a sparse feature matrix.
___

In [168]:
# Import libraries

import pandas as pd
import numpy as np
from scipy import sparse
import random
import re
import sys
import multiprocessing
from collections import defaultdict, OrderedDict

from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import ml_metrics
import joblib

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Functions to load and format data

In [156]:
def formatdata(chunk):
    '''Format the data of one (optionally stratified) chunk.

    The frame is modified in place and also returned.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain the columns 'orig_destination_distance', 'date_time',
        'srch_ci' and 'srch_co'.

    Returns
    -------
    pandas.DataFrame
        The same frame with 'bookmonth', 'stay' and 'staydoy' added and the
        three raw date columns removed.
    '''
    # Fill NAs for origin-destination distance. The result is assigned back
    # rather than using ``inplace=True`` on the selected column: the chained
    # in-place form is deprecated and does not update ``chunk`` when pandas
    # copy-on-write is active.
    chunk['orig_destination_distance'] = chunk['orig_destination_distance'].fillna(0.0)

    # parse datetime objects; values that cannot be parsed become NaT
    for col in ['date_time', 'srch_ci', 'srch_co']:
        chunk[col] = pd.to_datetime(chunk[col], errors='coerce')

    # month of the 'date_time' timestamp
    chunk['bookmonth'] = chunk['date_time'].dt.month

    # missing check-in/check-out dates fall back to 'date_time', which yields
    # a stay length of zero for those rows
    chunk['srch_co'] = chunk['srch_co'].fillna(chunk['date_time'])
    chunk['srch_ci'] = chunk['srch_ci'].fillna(chunk['date_time'])

    # stay length in whole days
    chunk['stay'] = (chunk['srch_co'] - chunk['srch_ci']).dt.days

    # day of year of the check-in date
    chunk['staydoy'] = chunk['srch_ci'].dt.dayofyear

    # drop processed columns
    chunk.drop(['date_time', 'srch_ci', 'srch_co'], axis=1, inplace=True)

    return chunk

def stratshufspl(chunk, fraction, ylabel):
    '''Draw a stratified random subset of a chunk.

    A single stratified shuffle split is built on ``chunk[ylabel]`` with
    ``test_size=fraction`` and a fixed seed; the rows selected by the second
    index array of that split are returned as an independent copy.
    '''
    splitter = StratifiedShuffleSplit(chunk[ylabel], n_iter=1,
                                      test_size=fraction, random_state=42)
    # n_iter=1, so the splitter yields exactly one pair of index arrays
    _, sampled_idx = next(iter(splitter))
    return chunk.iloc[sampled_idx].copy()


def fractionate(trainiter, fraction, ylabel):
    '''Load chunked training data sequentially and split it into X and y.

    Each chunk from ``trainiter`` is optionally sub-sampled with
    ``stratshufspl`` (only when ``fraction`` is below 1.0), passed through
    ``formatdata``, and the results are concatenated row-wise.

    Returns a tuple ``(X, y)`` where ``y`` is the ``ylabel`` column and ``X``
    is every other column.
    '''
    print('')
    print('loading data...')

    def _prepare(chunk):
        # sub-sampling is skipped when the whole dataset is requested
        if fraction < 1.0:
            chunk = stratshufspl(chunk, fraction, ylabel)
        return formatdata(chunk)

    # format every chunk, then stack them into a single frame
    train = pd.concat([_prepare(chunk) for chunk in trainiter], axis=0)

    # separate the target column from the features for model fitting
    y = train[ylabel]
    X = train.drop(ylabel, axis=1)
    return X, y

Load and format data.

In [157]:
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
dataurl = '/Users/dbricare/Documents/Python/datasets/expedia/'

# columns read from the raw CSV; the last entry is the prediction target
rawcols = [
    'site_name', 'user_location_country', 'user_location_region',
    'is_package', 'srch_adults_cnt', 'srch_children_cnt',
    'srch_destination_type_id', 'orig_destination_distance', 'hotel_country',
    'hotel_market', 'srch_ci', 'srch_co', 'date_time', 'hotel_cluster',
]
ylabel = rawcols[-1]

# alternative reader for the gzip-compressed train.csv:
# csviter = pd.read_csv(dataurl+'train.csv.gz', sep=',', compression='gzip', chunksize=2000000, usecols=rawcols)
csviter = pd.read_csv(dataurl+'train200th.csv', sep=',', chunksize=200000, usecols=rawcols)

# fraction of 1.0 means no sub-sampling of the chunks
X, y = fractionate(csviter, 1.0, ylabel)

print(X.shape)
print(X.columns)
X.head()


loading data...
(188351, 13)
Index(['site_name', 'user_location_country', 'user_location_region',
       'orig_destination_distance', 'is_package', 'srch_adults_cnt',
       'srch_children_cnt', 'srch_destination_type_id', 'hotel_country',
       'hotel_market', 'bookmonth', 'stay', 'staydoy'],
      dtype='object')


Unnamed: 0,site_name,user_location_country,user_location_region,orig_destination_distance,is_package,srch_adults_cnt,srch_children_cnt,srch_destination_type_id,hotel_country,hotel_market,bookmonth,stay,staydoy
0,18,231,68,0.0,1,4,0,6,105,29,2,2,89
1,13,46,171,5763.2976,0,2,0,1,203,253,10,7,339
2,2,66,220,188.2171,0,3,0,1,50,682,8,3,266
3,34,205,155,60.4896,0,2,0,1,198,401,11,1,326
4,2,66,435,4362.4117,0,2,0,1,204,27,6,3,192


## One hot encoding

In [154]:
# list the dtype of every column of X, in column order

dtypes = [X[col].dtype for col in X.columns]
print(dtypes)

[dtype('int64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64')]


In [183]:
# compare the number of distinct hotel markets before and after label encoding
le = LabelEncoder()
test = le.fit_transform(X['hotel_market'])
print(X['hotel_market'].unique().size)
print(np.unique(test).size)

1956
1956


In [173]:
# Expected width of the sparse matrix: each categorical column contributes
# (max value + 1) indicator columns because codes start at zero, and each
# non-categorical column contributes one column. Summed over all columns this
# equals sum(max of categorical columns) + total number of columns.

notcats = ['orig_destination_distance', 'stay', 'is_package']
cats = X.columns.tolist()
for name in notcats:
    cats.remove(name)

catmax = X[cats].max()
print('expected number of columns in sparse matrix:', catmax.sum()+len(X.columns))
catmax

expected number of columns in sparse matrix: 4065


site_name                     53
user_location_country        239
user_location_region        1027
srch_adults_cnt                9
srch_children_cnt              9
srch_destination_type_id       9
hotel_country                212
hotel_market                2117
bookmonth                     12
staydoy                      365
dtype: int64

**The test data must be encoded consistently with the training data: every value that can occur in the test data has to be covered by the encoding. See below for the appropriate choice of the maximum number of features for each categorical variable.**

In [163]:
# ordered mapping column -> True when the column is categorical, in X's column order

catdict = OrderedDict((col, col not in notcats) for col in X.columns)

# number of encoding slots reserved for each categorical column, in column order
# NOTE(review): these appear to be the per-column test-set maxima plus one
# computed further down in this notebook; confirm before changing
maxfeatures = [54, 240, 1028, 10, 10, 10, 213, 2118, 13, 367]

# boolean mask over X's columns selecting the categorical ones
mask = np.array(list(catdict.values()))

print(mask)
print(maxfeatures)

[ True  True  True False False  True  True  True  True  True  True False
  True]
[54, 240, 1022, 10, 10, 10, 213, 2118, 13, 367]


In [164]:
# one-hot encode the columns selected by `mask`, reserving `maxfeatures`
# slots per categorical column; the result is a sparse integer matrix
enc = OneHotEncoder(n_values=maxfeatures, categorical_features=mask, dtype=int, sparse=True)

Xsparse = enc.fit_transform(X.values)

# columns not selected by the mask are not one-hot encoded; count them from
# the mask instead of hard-coding their number
n_notcat = int((~mask).sum())
print('all features:', sum(maxfeatures)+n_notcat)
print('sparse matrix shape:', Xsparse.shape)
print('total encoding values:', sum(enc.n_values))

all features: 4060
sparse matrix shape: (188351, 4060)
total encoding values: 4057


In [165]:
# check sparsity ratio

def sparsity_ratio(X):
    '''Return the fraction of zero entries in a 2-D matrix.

    Accepts either a scipy sparse matrix or a dense array. Sparse input is
    counted directly, so the matrix never has to be converted to a dense
    array just to measure how sparse it is.
    '''
    if sparse.issparse(X):
        nonzero = X.count_nonzero()
    else:
        nonzero = np.count_nonzero(X)
    return 1.0 - nonzero / float(X.shape[0] * X.shape[1])

spratio = sparsity_ratio(Xsparse)
print('')
print("sparsity ratio:", spratio)


sparsity ratio: 0.9970723130823798


## Train model

In [166]:
# random forest with fixed size and depth; only max_features is searched
est = RandomForestClassifier(random_state=42, n_jobs=2, n_estimators=50, max_depth=8)
gridprms = {'max_features': ['sqrt', 0.1, 0.2]}

# 3-fold cross-validated grid search using the 'log_loss' scorer
grid = GridSearchCV(est, param_grid=gridprms, cv=3, scoring='log_loss')
grid.fit(Xsparse, y)

print(grid.best_params_)
grid.grid_scores_

{'max_features': 0.2}


[mean: -4.19819, std: 0.00824, params: {'max_features': 'sqrt'},
 mean: -3.92320, std: 0.00424, params: {'max_features': 0.1},
 mean: -3.85129, std: 0.00193, params: {'max_features': 0.2}]

## Test encoding of test data

In [178]:
# Scan the test set chunk by chunk and track the overall minimum and maximum
# of every categorical column after formatting.
testiter = pd.read_csv(dataurl+'test.csv.gz', sep=',', compression='gzip',
                       chunksize=100000, usecols=rawcols[:-1])
first = True
for chunk in testiter:
    chunk = formatdata(chunk)
    currmin = chunk[cats].min()
    currmax = chunk[cats].max()
    if first:
        # always keep plain lists (ordered like `cats`) so the result has the
        # same type no matter how many chunks were read
        minvals = list(currmin)
        maxvals = list(currmax)
        first = False
    else:
        # element-wise running minimum / maximum across chunks
        minvals = [min(both) for both in zip(minvals, currmin)]
        maxvals = [max(both) for both in zip(maxvals, currmax)]
print(minvals,maxvals)

[0, 0, 0, 0, 0, 1, 0, 0, 1, 1] [53, 239, 1027, 9, 9, 9, 212, 2117, 12, 366]


In [184]:
# read the whole gzip-compressed test file into one frame (all columns, no chunking)
dffulltest = pd.read_csv(dataurl+'test.csv.gz', compression='gzip')

In [188]:
# possible to reduce number of features in sparse matrix?
# print the number of distinct values actually present in the test data
for col in ('user_location_region', 'hotel_market'):
    print(len(dffulltest[col].unique()))

988
2115


In [191]:
# number of test rows with a missing origin-destination distance
int(dffulltest['orig_destination_distance'].isnull().sum())

847461

In [179]:
# Tabulate the test-set minimum and maximum per categorical column.
# The values were computed over chunk[cats], so `cats` supplies the row labels
# (the previously referenced `sermin` is not defined anywhere in this notebook).
dftest = pd.DataFrame({'min':minvals, 'max':maxvals}, index=cats)
print(dftest['max'].sum())
dftest

4053


Unnamed: 0,max,min
site_name,53,0
user_location_country,239,0
user_location_region,1027,0
srch_adults_cnt,9,0
srch_children_cnt,9,0
srch_destination_type_id,9,1
hotel_country,212,0
hotel_market,2117,0
bookmonth,12,1
staydoy,366,1


In [180]:
# per-column maximum plus one (codes start at zero), and the total over all columns
maxplusone = (dftest['max'] + 1).tolist()
print(maxplusone)
print(sum(maxplusone))

[54, 240, 1028, 10, 10, 10, 213, 2118, 13, 367]
4063


## Revise model size down

- Try larger grain on booking time, use month instead of day of year.

In [172]:
# show the full parameter set of the estimator selected by the grid search
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=0.2, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=2,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
