### Number of clicks prediction
---

In [23]:
import pandas as pd
import numpy as np
import requests
from io import StringIO

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor,  BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.base import clone, TransformerMixin
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import KFold
import time
import pickle
from pandas.plotting import scatter_matrix
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
import xgboost
from xgboost import XGBRegressor




In [2]:
# First, let's read our cleaned data

In [3]:
# Load the cleaned hotel data set and preview its first rows.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so only load
# a 'cleaned.pkl' that was produced by a trusted source.
hotel_data = pd.read_pickle('cleaned.pkl')
hotel_data.head()

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,n_clicks,...,city_id_878630.0,city_id_878634.0,city_id_878644.0,city_id_878652.0,city_id_878668.0,city_id_878678.0,city_id_878696.0,city_id_878704.0,city_id_878708.0,city_id_878736.0
0,70.0,2.0,1199.0,77.0,4.0,861.0,17.55,81.64,18.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,189.38,28.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,59.0,8.0,3291.0,73.0,2.0,3084.0,7.0,72.16,2.0,4.0,...,0,0,0,0,0,0,0,0,0,0
4,66.0,1.0,288.0,80.0,0.0,603.0,12.564,173.25,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
5,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,96.7,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# experimenting with all columns from the encoding of the city_id columns proved to be very slow, so for now I am dropping those columns

In [5]:
# Keep only the first 10 columns (the nine numeric features plus n_clicks in the
# output displayed below), which discards the one-hot encoded city_id columns.
# NOTE(review): the slice is positional, so it depends on the column order stored
# in cleaned.pkl — confirm that order if the cleaning step changes.
hotel_data = hotel_data.iloc[:,:10]
hotel_data 

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,n_clicks
0,70.0,2.0,1199.0,77.0,4.0,861.0,17.550,81.64,18.0,0.0
1,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,189.38,28.0,4.0
3,59.0,8.0,3291.0,73.0,2.0,3084.0,7.000,72.16,2.0,4.0
4,66.0,1.0,288.0,80.0,0.0,603.0,12.564,173.25,0.0,10.0
5,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,96.70,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
396469,48.0,1.0,469.0,83.0,0.0,0.0,21.000,78.86,0.0,0.0
396470,42.0,0.0,689.0,83.0,0.0,72.0,14.174,116.56,4.0,0.0
396471,70.0,1.0,1424.0,83.0,0.0,0.0,16.211,16.11,0.0,4.0
396472,42.0,0.0,164.0,84.0,0.0,222.0,20.000,253.81,0.0,0.0


#### Initial Modeling

In [6]:
# Build the feature matrix X and the target y from a copy of the data,
# so the original hotel_data frame is left untouched
initial_model_data = hotel_data.copy()

# every column except the target is a feature
X_initial_model = initial_model_data.drop(columns=['n_clicks'])
# the target is the number of clicks
y_inital_model = initial_model_data['n_clicks']

In [7]:
# now let's split the data into train and test sets

In [8]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_initial_model,
    y_inital_model,
    test_size=0.2,
    random_state=2020,
)

### Pipeline

In [None]:
# To create a pipeline we need to specify which columns we want to preprocess

In [9]:
# All feature columns are treated as numerical and go through the same preprocessing
numerical_cols = X_train.columns.tolist()
numerical_cols

['content_score',
 'n_images',
 'distance_to_center',
 'avg_rating',
 'stars',
 'n_reviews',
 'avg_rank',
 'avg_price',
 'avg_saving_percent']

In [10]:
# Numerical preprocessing: impute missing values (SimpleImputer defaults), then standardize
numeric_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

In [11]:
# Apply the numerical transformer to the numerical columns
numeric_branch = ('num', numerical_transformer, numerical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

### Baseline: Linear Model :
Linear models are fast and easy to build and interpret, so we will start with a Ridge model, which applies linear regression with regularization.


In [12]:
# Baseline: preprocessing followed by RidgeCV, which picks alpha from the candidates below
ridge_alphas = [1e-3, 1e-2, 1e-1, 0.2, 0.4, 0.6, 0.8, 1]
reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RidgeCV(alphas=ridge_alphas)),
])

In [13]:
# Fit the baseline and report its scores, wall-clock time and test MSE.
# The timed window covers fitting, both score() calls and the test prediction.
start_time = time.time()
reg.fit(X_train, y_train)

train_score = reg.score(X_train, y_train)
print('model score on training set = ', train_score)
test_score = reg.score(X_test, y_test)
print('model score on test set = ', test_score)

y_pred = reg.predict(X_test)
elapsed_time = time.time() - start_time
print('elapsed time = ', elapsed_time)

test_mse = mean_squared_error(y_test, y_pred)
print('MSE = ', test_mse)

model score on training set =  0.1140293197510267
model score on test set =  0.11193137467656533
elapsed time =  0.4749009609222412
MSE =  28.67729133190142


> The model is fast, but it has very poor performance.

### Testing other Models:

We will now test several models, including SGD, Gradient Boosting, Random Forest and XGBoost, and evaluate them based on mean squared error and elapsed time.

In [19]:
def build_models(random_state=2020):
    """
    Create the candidate regressors, each with its default hyperparameters.

    input :
    ----
    random_state : seed passed to every regressor (default 2020), so that
        all four models are seeded consistently.
    returns:
    -------
    Two lists: the regressor instances and, in the same order, their display names.
    """
    sgd = SGDRegressor(random_state=random_state)
    grad = GradientBoostingRegressor(random_state=random_state)
    rf = RandomForestRegressor(random_state=random_state)
    # XGBRegressor is seeded as well, for consistency with the other models
    xgb = XGBRegressor(random_state=random_state)
    regressors = [sgd, grad, rf, xgb]
    regressors_names = ['SGDRegressor', 'Gradient Boost', 'Random Forest', 'xgb']
    return regressors, regressors_names

In [22]:
# now let's test these models and evaluate their performance

regressors,regressor_names = build_models()

for regressor,regressor_name in zip(regressors,regressor_names):
    # Pipeline does not copy its steps, so fitting would re-fit the shared
    # `preprocessor` object in place; clone it so each pipeline owns its own copy.
    regressor_pipeline = Pipeline(steps = [('preprocessor',clone(preprocessor)),
                                      ('regressor',regressor)])
    # the timed window covers fitting, both score() calls and the test prediction
    start_time = time.time()
    regressor_pipeline.fit(X_train,y_train)
    print('Results for ',regressor_name)
    print('Training score = ',regressor_pipeline.score(X_train,y_train))
    print('Test score = ',regressor_pipeline.score(X_test,y_test))
    y_pred = regressor_pipeline.predict(X_test)
    elapsed_time = time.time() - start_time
    print('elapsed time = ',elapsed_time )
    print('MSE = ',mean_squared_error(y_test, y_pred))
    print('______________________________________________________')

Results for  SGDRegressor
Training score =  0.1129600916843917
Test score =  0.11090880631948974
elapsed time =  0.9910900592803955
MSE =  28.710311855143033
______________________________________________________
Results for  Gradient Boost
Training score =  0.21398156141352231
Test score =  0.2109019065342218
elapsed time =  38.67053008079529
MSE =  25.481359514896205
______________________________________________________
Results for  Random Forest
Training score =  0.885254852220597
Test score =  0.18821664050586462
elapsed time =  158.27261209487915
MSE =  26.213906487378154
______________________________________________________
Results for  xgb
Training score =  0.33790573991777506
Test score =  0.2627092981409206
elapsed time =  5.937211990356445
MSE =  23.808408101135683
______________________________________________________


 #### According to these results, we have the following observations:
- The linear models (Ridge regression and SGDRegressor) are the fastest but also have the lowest accuracy.
- The Gradient Boosting model has a bias issue (it underfits), while the Random Forest has a variance issue (it overfits: high training score, low test score).
- XGBRegressor seems to be the most promising in terms of MSE. We will adopt this model and fine-tune its parameters to improve the MSE.

### Model Improvements: Hyperparameter optimization

In [24]:
# Wrap the train and test splits in xgboost DMatrix objects (features plus labels).
# The cells below pass data_train to xgboost.cv, which performs its own
# cross-validation splits on it.
data_train = xgboost.DMatrix(X_train, label=y_train)
data_test = xgboost.DMatrix(X_test, label=y_test)

In [25]:
# we will now create our initial set of parameters, which are the defaults

In [111]:
# Initial parameter dictionary passed to xgboost.cv / xgboost.train.
# Only max_depth is set explicitly; the commented-out entries are left out of
# the dictionary, and the tuning cells below add them one group at a time.
params = {
    'max_depth':6,
#     'min_child_weight': 1,
#     'eta':.3,
#     'subsample': 1,
#     'colsample_bytree': 1,
#     # Other parameters
#     'objective':'reg:squarederror',
}

In [112]:
# Upper bound on the number of boosting rounds; the cv/train calls below also
# pass early_stopping_rounds, so training can stop before this limit is reached.
num_boost_round = 999

In [113]:
# now let's perform cross validation using the built-in function

In [121]:
# 11-fold cross-validation on the training DMatrix, tracking RMSE and stopping
# once the metric has not improved for 20 rounds
cv_options = dict(
    nfold=11,
    seed=2020,
    metrics={'rmse'},
    num_boost_round=num_boost_round,
    early_stopping_rounds=20,
)
cv_results = xgboost.cv(params, data_train, **cv_options)


In [119]:
# Show the per-round mean and std of the train/test RMSE across the folds
cv_results


Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,5.561825,0.004163,5.565379,0.085006
1,5.338302,0.003944,5.345159,0.082214
2,5.216655,0.003717,5.226262,0.080064
3,5.141550,0.004085,5.155470,0.078434
4,5.096869,0.003355,5.115657,0.078065
...,...,...,...,...
235,4.308867,0.011122,4.870575,0.070481
236,4.306961,0.011159,4.870630,0.070494
237,4.305108,0.010985,4.871042,0.070627
238,4.302811,0.010836,4.870680,0.070432


In [122]:
# Square the best mean test RMSE to put it on the same scale as the MSE values above.
# NOTE(review): the square of a fold-averaged RMSE is not exactly the fold-averaged
# MSE, so treat this as an approximate comparison.
cv_results['test-rmse-mean'].min()**2


23.820700052975212

In [123]:
## Tuning parameters

In [None]:
#Parameters max_depth and min_child_weight

In [124]:
# Every (max_depth, min_child_weight) pair to try: depths 1..14, weights 1..7
depth_candidates = range(1, 15)
child_weight_candidates = range(1, 8)
gridsearch_params = [
    (depth, child_weight)
    for depth in depth_candidates
    for child_weight in child_weight_candidates
]

In [None]:
# Grid search over (max_depth, min_child_weight) using cross-validated RMSE.
# Track the best (lowest) mean test RMSE and the parameters that produced it.
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgboost.cv(
        params,
        data_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=10,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best RMSE (the metric requested from xgboost.cv is 'rmse', not MAE)
    mean_rmse = cv_results['test-rmse-mean'].min()
    # argmin() is a 0-based row index, so the number of rounds is index + 1
    boost_rounds = cv_results['test-rmse-mean'].argmin() + 1
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with max_depth=1, min_child_weight=1
	MAE 5.123396699999999 for 1099 rounds
CV with max_depth=1, min_child_weight=2
	MAE 5.1233697 for 1099 rounds
CV with max_depth=1, min_child_weight=3
	MAE 5.123360699999999 for 1099 rounds
CV with max_depth=1, min_child_weight=4
	MAE 5.1233264 for 1099 rounds
CV with max_depth=1, min_child_weight=5
	MAE 5.1233264 for 1099 rounds
CV with max_depth=1, min_child_weight=6
	MAE 5.1233264 for 1099 rounds
CV with max_depth=1, min_child_weight=7
	MAE 5.1233264 for 1099 rounds
CV with max_depth=2, min_child_weight=1
	MAE 4.867987600000001 for 1099 rounds
CV with max_depth=2, min_child_weight=2
	MAE 4.865995 for 1099 rounds
CV with max_depth=2, min_child_weight=3
	MAE 4.8670746000000005 for 1099 rounds
CV with max_depth=2, min_child_weight=4
	MAE 4.8675370000000004 for 1099 rounds
CV with max_depth=2, min_child_weight=5
	MAE 4.865147500000001 for 1099 rounds
CV with max_depth=2, min_child_weight=6
	MAE 4.8687001 for 1091 rounds
CV with max_depth=2, min_chi

In [None]:
# Fix the tree-structure parameters for the next tuning steps.
# NOTE(review): the values are hard-coded, presumably from the grid search above — confirm.
params.update(max_depth=9, min_child_weight=6)

In [None]:
#Parameters subsample and colsample_bytree

In [None]:
# Every (subsample, colsample_bytree) pair to try: both range over 0.7, 0.8, 0.9, 1.0
fraction_grid = [tenths / 10. for tenths in range(7, 11)]
gridsearch_params = [
    (subsample_fraction, colsample_fraction)
    for subsample_fraction in fraction_grid
    for colsample_fraction in fraction_grid
]

In [None]:
# Grid search over (subsample, colsample_bytree) using cross-validated RMSE.
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV on the training DMatrix (named data_train in this notebook)
    cv_results = xgboost.cv(
        params,
        data_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score (the metric requested from xgboost.cv is 'rmse', not MAE)
    mean_rmse = cv_results['test-rmse-mean'].min()
    # argmin() is a 0-based row index, so the number of rounds is index + 1
    boost_rounds = cv_results['test-rmse-mean'].argmin() + 1
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

In [None]:
# Fix the sampling parameters for the next tuning step.
# NOTE(review): the values are hard-coded, presumably from the grid search above — confirm.
params.update(subsample=0.9, colsample_bytree=0.8)

In [None]:
#Parameter ETA

In [None]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgboost.cv(params,dtrain,num_boost_round=num_boost_round,seed=42,nfold=5,metrics=['rmse'],early_stopping_rounds=10)
    # Update best score
    mean_mae = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))


In [None]:
# Fix the learning rate for the final model.
# NOTE(review): 0.03 is not one of the eta values searched above
# (.3, .2, .1, .05, .01, .005) — confirm this is intentional and not a typo.
params['eta'] = .03

In [None]:
# Let's look at our final parameters
params

In [None]:
# Cross-validate the final parameter set on the training DMatrix
# (named data_train in this notebook)
cv_results = xgboost.cv(
    params,
    data_train,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)

In [None]:
# Show the per-round cross-validation results for the final parameters
cv_results

In [None]:
# Best (lowest) mean test RMSE reached during cross-validation
cv_results['test-rmse-mean'].min()



In [None]:
## Training the final model

In [None]:
# Train on the training DMatrix, using the test DMatrix as the evaluation set
# for early stopping (the DMatrix objects are named data_train / data_test).
# NOTE(review): early stopping is driven by the test set here, so the test error
# reported afterwards may be optimistic — consider a separate validation split.
model = xgboost.train(
    params,
    data_train,
    num_boost_round=num_boost_round,
    evals=[(data_test, "Test")],
    early_stopping_rounds=10
)

In [None]:
# Retrain with exactly the best number of rounds found by early stopping
# (best_iteration is 0-based, hence the + 1); no early stopping is needed now.
num_boost_round = model.best_iteration + 1
best_model = xgboost.train(
    params,
    data_train,
    num_boost_round=num_boost_round,
    evals=[(data_test, "Test")]
)

In [None]:
# Test-set MSE of the final model; arguments follow the (y_true, y_pred) convention
mean_squared_error(y_test, best_model.predict(data_test))