### Number of clicks prediction
---

In [5]:
import pandas as pd
import numpy as np
import requests
from io import StringIO

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor,  BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.base import clone, TransformerMixin
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import KFold
import time
import pickle
from pandas.plotting import scatter_matrix
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
import xgboost
from xgboost import XGBRegressor




In [6]:
# First, let's read our cleaned data

In [7]:
# Load the cleaned dataset from a local pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
hotel_data = pd.read_pickle('cleaned.pkl')
hotel_data.head()

Unnamed: 0,hotel_id,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,n_clicks,count
0,97674060000.0,70.0,2.0,1199.0,77.0,4.0,861.0,17.55,81.64,18.0,0.0,80.0
1,97688890000.0,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,189.38,28.0,4.0,751.0
3,98242790000.0,59.0,8.0,3291.0,73.0,2.0,3084.0,7.0,72.16,2.0,4.0,5.0
4,98334380000.0,66.0,1.0,288.0,80.0,0.0,603.0,12.564,173.25,0.0,10.0,73.0
5,98393260000.0,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,96.7,0.0,0.0,68.0


In [8]:
# experimenting with all columns from the encoding of the city_id columns proved to be very slow, so for now I am dropping those columns

In [9]:
# hotel_data = hotel_data.iloc[:,:10]
# hotel_data 

#### Initial Modeling

In [10]:
# Work on a copy so the loaded frame stays untouched, then separate the
# target column (n_clicks) from the remaining feature columns.
initial_model_data = hotel_data.copy()

X_initial_model = initial_model_data.drop('n_clicks', axis=1)
y_inital_model = initial_model_data.loc[:, 'n_clicks']

In [11]:
# now let's split the data into train and test sets

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_initial_model,
    y_inital_model,
    test_size=0.2,
    random_state=2020,
)

### Pipeline

In [None]:
# To create a pipeline we need to specify the columns we want to preprocess

In [None]:
# All feature columns are treated as numerical.
numerical_cols = X_train.columns.tolist()
numerical_cols

In [None]:
# Numeric preprocessing: fill missing values with SimpleImputer's defaults,
# then standardize each column.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

In [None]:
# Route the numerical columns through the numeric preprocessing pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
])

### Baseline: Linear Model :
Linear models are fast and easy to build and interpret, so we will start with Ridge models, which apply linear regression with regularization.


In [None]:
# Baseline model: the shared preprocessing followed by ridge regression, with
# RidgeCV choosing the regularization strength among the listed alphas.
reg = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 0.2, 0.4, 0.6, 0.8, 1])),
])

In [None]:
# Fit the baseline pipeline, then report its score on both splits and the test MSE.
start_time = time.time()
reg.fit(X_train,y_train)
print('model score on training set = ',reg.score(X_train, y_train))
print('model score on test set = ',reg.score(X_test, y_test))
y_pred = reg.predict(X_test)
# The elapsed time covers fitting, both score calls and the prediction above.
elapsed_time = time.time() - start_time
print('elapsed time = ',elapsed_time )
print('MSE = ',mean_squared_error(y_test, y_pred))

> The model is fast, but it has very poor performance.

### Testing other Models:

We will now test several models, including SGD, Gradient Boost, Random Forest and XGB, and evaluate them based on mean squared error and performance

In [None]:
def build_models():
    """
    Create the selected regression models with their default hyperparameters.

    Every model is given random_state=2020 so that repeated runs of the
    comparison produce the same results.

    input :
    ----
    None
    returns:
    -------
    Two lists of equal length: the regressor instances, and the display names
    of those regressors in the same order.
    """
    sgd = SGDRegressor(random_state=2020)
    grad = GradientBoostingRegressor(random_state=2020)
    rf = RandomForestRegressor(random_state=2020)
    # Seeded like the other models so the comparison is reproducible.
    xgb = XGBRegressor(random_state=2020)
    regressors = [sgd, grad, rf, xgb]
    regressors_names = ['SGDRegressor', 'Gradient Boost', 'Random Forest', 'xgb']
    return regressors, regressors_names

In [None]:
# Fit each candidate model behind the shared preprocessing and compare
# train/test scores, elapsed time and test MSE.
regressors, regressor_names = build_models()

for regressor, regressor_name in zip(regressors, regressor_names):
    # clone() gives every pipeline its own unfitted copy of the preprocessor, so
    # fitting one pipeline never refits a transformer owned by another pipeline.
    regressor_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                         ('regressor', regressor)])
    start_time = time.time()
    regressor_pipeline.fit(X_train, y_train)
    print('Results for ', regressor_name)
    print('Training score = ', regressor_pipeline.score(X_train, y_train))
    print('Test score = ', regressor_pipeline.score(X_test, y_test))
    y_pred = regressor_pipeline.predict(X_test)
    # The elapsed time covers fitting, both score calls and the prediction above.
    elapsed_time = time.time() - start_time
    print('elapsed time = ', elapsed_time)
    print('MSE = ', mean_squared_error(y_test, y_pred))
    print('______________________________________________________')

#### According to these results, we have the following observations:
- Linear Regression and SGDRegressor are the fastest models but also have the lowest accuracy
- The Gradient Boost model has a bias issue, while Random Forest has a variance issue.
- XGBRegressor seems to be the most promising in terms of MSE. We will adopt this model and try to fine-tune the parameters to improve the MSE.

### Model Improvements: Hyperparameter optimization

In [13]:
# Wrap the train and test splits (features plus labels) in xgboost DMatrix
# objects, the input type passed to xgboost.train and xgboost.cv below.
data_train = xgboost.DMatrix(X_train, label=y_train)
data_test = xgboost.DMatrix(X_test, label=y_test)

In [None]:
# we will now create our initial set of parameters, which are the defaults

In [17]:
# Starting point for the search: the first five entries are the values tuned
# in the cells below, while the objective stays fixed.
params = dict(
    max_depth=6,
    min_child_weight=1,
    eta=.3,
    subsample=1,
    colsample_bytree=1,
    objective='reg:squarederror',
)

In [18]:
# Report RMSE on the evaluation sets during training.
params['eval_metric'] = "rmse"


In [19]:
# Upper bound on boosting rounds; the train/cv calls below use early stopping,
# so training typically ends well before this limit.
num_boost_round = 999

In [23]:
# Train with the initial parameters; training stops once Test-rmse has not
# improved for 10 consecutive rounds.
# NOTE(review): the early-stopping set here is the held-out test split, so the
# test data influences the number of rounds — consider a separate validation split.
model = xgboost.train(
    params,
    data_train,
    num_boost_round=num_boost_round,
    evals=[(data_test, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:5.55954
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:5.32165
[2]	Test-rmse:5.17549
[3]	Test-rmse:5.09130
[4]	Test-rmse:5.02825
[5]	Test-rmse:4.99746
[6]	Test-rmse:4.97401
[7]	Test-rmse:4.94375
[8]	Test-rmse:4.93054
[9]	Test-rmse:4.92417
[10]	Test-rmse:4.91023
[11]	Test-rmse:4.89676
[12]	Test-rmse:4.89233
[13]	Test-rmse:4.88596
[14]	Test-rmse:4.87754
[15]	Test-rmse:4.87577
[16]	Test-rmse:4.85856
[17]	Test-rmse:4.85533
[18]	Test-rmse:4.85155
[19]	Test-rmse:4.83889
[20]	Test-rmse:4.83113
[21]	Test-rmse:4.82732
[22]	Test-rmse:4.82468
[23]	Test-rmse:4.81992
[24]	Test-rmse:4.81599
[25]	Test-rmse:4.81128
[26]	Test-rmse:4.80826
[27]	Test-rmse:4.80396
[28]	Test-rmse:4.80261
[29]	Test-rmse:4.80074
[30]	Test-rmse:4.79898
[31]	Test-rmse:4.79578
[32]	Test-rmse:4.79217
[33]	Test-rmse:4.79119
[34]	Test-rmse:4.78560
[35]	Test-rmse:4.78445
[36]	Test-rmse:4.78051
[37]	Test-rmse:4.77498
[38]	Test-rmse:4.76942
[39]	Test-rmse:4.76817
[40]	Test-rmse:4.76787
[41]	Test-

In [None]:
# now let's perform cross validation using the built-in function

In [24]:
# 10-fold cross-validation on the training DMatrix with the current params,
# tracking RMSE and stopping early after 20 rounds without improvement.
cv_results = xgboost.cv(
    params,
    data_train,
    num_boost_round=num_boost_round,
    seed=2020,
    nfold=10,
    metrics={'rmse'},
    early_stopping_rounds=20
)


In [25]:
# Per-round mean and std of the train/test RMSE across the CV folds.
cv_results


Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,5.544535,0.005901,5.549616,0.059906
1,5.306300,0.006526,5.316954,0.058216
2,5.157468,0.006689,5.171986,0.055148
3,5.069151,0.006889,5.088351,0.055494
4,5.010549,0.007095,5.035326,0.053641
...,...,...,...,...
288,3.942795,0.012129,4.667424,0.058483
289,3.941149,0.012147,4.667588,0.058322
290,3.939369,0.012376,4.667427,0.058150
291,3.937337,0.012343,4.667178,0.058254


In [26]:
# Square the best mean test RMSE to put it on the MSE scale used for the
# earlier models (approximate: the squared mean of per-fold RMSEs is not
# exactly the mean of per-fold MSEs).
cv_results['test-rmse-mean'].min()**2


21.780264559875615

In [None]:
## Tuning parameters

In [None]:
#Parameters max_depth and min_child_weight

In [27]:
# Every (max_depth, min_child_weight) pair to evaluate: depths 3..14 crossed
# with child weights 1..3, depth varying slowest.
gridsearch_params = [
    (depth, child_weight)
    for depth in range(3, 15)
    for child_weight in range(1, 4)
]

In [None]:
# Grid-search max_depth and min_child_weight with 10-fold CV, keeping the
# combination with the lowest mean test RMSE.
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Evaluate the candidate on a copy so `params` is never left holding the
    # last values tried instead of the best ones.
    trial_params = dict(params, max_depth=max_depth, min_child_weight=min_child_weight)
    # Run CV
    cv_results = xgboost.cv(
        trial_params,
        data_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=10,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Best mean test RMSE for this candidate. argmin() is a 0-based row index,
    # so the number of boosting rounds is one more than that.
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin() + 1
    print("\trmse {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))
# Carry the winning combination forward into the shared parameter dict.
params['max_depth'], params['min_child_weight'] = best_params

CV with max_depth=3, min_child_weight=1


In [None]:
# Fix the tree-shape parameters (presumably the best values from the search above — confirm).
params['max_depth'] = 3
# The key must match xgboost's parameter name exactly; a misspelt key is not
# applied as min_child_weight.
params['min_child_weight'] = 3

In [None]:
#Parameters subsample and colsample_bytree

In [None]:
# Every (subsample, colsample_bytree) pair to evaluate: 0.7..1.0 in steps of
# 0.1 for both, subsample varying slowest.
gridsearch_params = [
    (row_tenths / 10., col_tenths / 10.)
    for row_tenths in range(7, 11)
    for col_tenths in range(7, 11)
]

In [None]:
# Grid-search subsample and colsample_bytree with 5-fold CV, keeping the
# combination with the lowest mean test RMSE.
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # Evaluate the candidate on a copy so `params` is never left holding the
    # last values tried instead of the best ones.
    trial_params = dict(params, subsample=subsample, colsample_bytree=colsample)
    # Run CV on the training DMatrix (data_train).
    cv_results = xgboost.cv(
        trial_params,
        data_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Best mean test RMSE for this candidate. argmin() is a 0-based row index,
    # so the number of boosting rounds is one more than that.
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin() + 1
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))
# Carry the winning combination forward into the shared parameter dict.
params['subsample'], params['colsample_bytree'] = best_params

In [None]:
# Fix the row/column sampling ratios used from here on.
params.update(subsample=0.9, colsample_bytree=0.8)

In [None]:
#Parameter ETA

In [None]:
# Grid-search the learning rate (eta) with 5-fold CV, timing every candidate.
# This can take some time…
search_start_time = time.time()
min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # Evaluate the candidate on a copy so `params` only ever receives the best eta.
    trial_params = dict(params, eta=eta)
    # Run and time CV on the training DMatrix (data_train).
    start_time = time.time()
    cv_results = xgboost.cv(
        trial_params,
        data_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['rmse'],
        early_stopping_rounds=10
    )
    print('elapsed time = ', time.time() - start_time)
    # Best mean test RMSE for this candidate. argmin() is a 0-based row index,
    # so the number of boosting rounds is one more than that.
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin() + 1
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))
print('total elapsed time = ', time.time() - search_start_time)
# Carry the winning learning rate forward into the shared parameter dict.
params['eta'] = best_params


In [None]:
# NOTE(review): 0.03 is not one of the eta values tried in the search above
# (.3, .2, .1, .05, .01, .005) — confirm this is the intended learning rate.
params['eta'] = .03

In [None]:
# Let's look at our final parameters
params

In [None]:
# Cross-validate the tuned parameters (5-fold) on the training DMatrix.
cv_results = xgboost.cv(
    params,
    data_train,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)

In [None]:
# Per-round CV metrics for the tuned parameters.
cv_results

In [None]:
# Lowest mean test RMSE across the boosting rounds.
cv_results['test-rmse-mean'].min()



In [None]:
## Training the final model

In [None]:
# Train with the tuned parameters on the training DMatrix, using early stopping
# to find the best number of boosting rounds.
# NOTE(review): early stopping is monitored on the test split, so the test data
# influences the chosen round count — consider a separate validation split.
model = xgboost.train(
    params,
    data_train,
    num_boost_round=num_boost_round,
    evals=[(data_test, "Test")],
    early_stopping_rounds=10
)

In [None]:
# best_iteration is 0-based, so the number of rounds to keep is one more.
# A separate name is used so the global round budget (num_boost_round) is not
# overwritten and the earlier cells give the same result when re-run.
best_num_boost_round = model.best_iteration + 1
# Retrain for exactly that many rounds (no early stopping needed).
best_model = xgboost.train(
    params,
    data_train,
    num_boost_round=best_num_boost_round,
    evals=[(data_test, "Test")]
)

In [None]:
# Test-set MSE of the final model, with arguments in (y_true, y_pred) order.
mean_squared_error(y_test, best_model.predict(data_test))