### Number of clicks prediction
---

In [1]:
import pandas as pd
import numpy as np
import requests
from io import StringIO

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor,  BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.base import clone, TransformerMixin
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import KFold
import time
import pickle
from pandas.plotting import scatter_matrix
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor




In [2]:
hotel_data = pd.read_pickle('cleaned.pkl')
hotel_data.head()

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,n_clicks,...,city_id_878630.0,city_id_878634.0,city_id_878644.0,city_id_878652.0,city_id_878668.0,city_id_878678.0,city_id_878696.0,city_id_878704.0,city_id_878708.0,city_id_878736.0
0,70.0,2.0,1199.0,77.0,4.0,861.0,17.55,81.64,18.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,189.38,28.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,59.0,8.0,3291.0,73.0,2.0,3084.0,7.0,72.16,2.0,4.0,...,0,0,0,0,0,0,0,0,0,0
4,66.0,1.0,288.0,80.0,0.0,603.0,12.564,173.25,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
5,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,96.7,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
hotel_data = hotel_data.iloc[:,:10]

#### Initial Modeling

In [4]:
# for our initial model we will just drop the rows that have nans, this is initial approach, we will later deal with how to fill those nans
initial_model_data = hotel_data.copy()

y_inital_model= initial_model_data['n_clicks']
X_initial_model = initial_model_data.drop(columns = ['n_clicks'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_initial_model,y_inital_model,test_size = 0.2, random_state = 2020)

In [None]:
# To create a pipeline we need to specify that columns we want to prepoces

In [6]:
numerical_cols = list(X_train.columns[:9])
numerical_cols

['content_score',
 'n_images',
 'distance_to_center',
 'avg_rating',
 'stars',
 'n_reviews',
 'avg_rank',
 'avg_price',
 'avg_saving_percent']

In [None]:
numerical_transformer = Pipeline(steps = [('imputer',SimpleImputer()),
                                         ('scaler',StandardScaler())                                                     
                                        ])

In [None]:
preprocessor = ColumnTransformer(transformers = [('num',numerical_transformer,numerical_cols)])

### Baseline: Linear Model :
Linear model are fast, and easy to build and interpret, we will start with Ridge model whicha apply linear regression with regularization


In [None]:
reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RidgeCV(alphas=[1e-3, 1e-2, 1e-1,0.2,0.4,0.6,0.8, 1]))])

In [None]:
start_time = time.time()
reg.fit(X_train,y_train)
print('model score on training set = ',reg.score(X_train, y_train))
print('model score on test set = ',reg.score(X_test, y_test))
y_pred = reg.predict(X_test)
elapsed_time = time.time() - start_time
print('elapsed time = ',elapsed_time )
print('MSE = ',mean_squared_error(y_test, y_pred))

### Testing other Models:

In [None]:
def get_models():
    sgd = SGDRegressor(random_state=2020)
    grad = GradientBoostingRegressor(random_state=2020)
    rf =RandomForestRegressor(random_state=2020)
    xgb = XGBRegressor()
    classifier_list = [sgd, grad,rf,xgb]
    classifier_name_list = ['SGDRegressor','Gradient Boost','Random Forest','xgb']
    return classifier_list,classifier_name_list
    
classifier_list, classifier_name_list = get_models()

for classifier,classifier_name in zip(classifier_list,classifier_name_list):
    regressor_pipeline = Pipeline(steps = [('preprocessor',preprocessor),
                                      ('regressor',classifier)])
    start_time = time.time()
    regressor_pipeline.fit(X_train,y_train)
    print('Results for ',classifier_name)
    print('Training score = ',regressor_pipeline.score(X_train,y_train))
    print('Test score = ',regressor_pipeline.score(X_test,y_test))
    y_pred = regressor_pipeline.predict(X_test)
    elapsed_time = time.time() - start_time
    print('elapsed time = ',elapsed_time )
    print('MSE = ',mean_squared_error(y_test, y_pred))
    print('______________________________________________________')


 #### According to this results,we have the following observations :
- Linear Regression SGDRegressor have the fastest models but also have the lowest accuracy
- Gradient Boost model has the a bias issues, while random forest has a variance issues.
- XGBRegressor seems to be the most promsing in terms of MSE.

### Model Improvements : Hyperparamters optimization

In [None]:
regressor_pipeline = Pipeline(steps = [('preprocessor',preprocessor),
                                      ('regressor',XGBRegressor())])
#XGBRegressor()
params = {
            "regressor__n_estimators" :[100,150],
    
        }




In [None]:
regressor_pipeline

In [None]:
gs = RandomizedSearchCV(regressor_pipeline,params,cv = 2)
start_time = time.time()
gs.fit(X_train,y_train)
print('model score on training set = ',gs.score(X_train,y_train))
print('model score on test set = ',gs.score(X_test,y_test))
elapsed_time = time.time() - start_time
print('elapsed time = ',elapsed_time )

In [7]:
import xgboost

In [8]:
dtrain = xgboost.DMatrix(X_train, label=y_train)
dtest = xgboost.DMatrix(X_test, label=y_test)

In [9]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

In [11]:
num_boost_round = 9

In [14]:
cv_results = xgboost.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)




In [15]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,5.561122,0.007799,5.564794,0.034563
1,5.337218,0.008214,5.345545,0.03449
2,5.215035,0.008878,5.227078,0.033832
3,5.14007,0.007863,5.155901,0.033633
4,5.093607,0.008268,5.114673,0.032957
5,5.062695,0.008143,5.089759,0.032866
6,5.041385,0.008616,5.073365,0.032439
7,5.023303,0.012695,5.059328,0.028617
8,5.011074,0.013032,5.052267,0.027478


In [16]:
cv_results['test-rmse-mean'].min()


5.0522666

In [23]:
#Parameters max_depth and min_child_weight

In [17]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [18]:
#Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=10,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5
	MAE 5.0177492 for 8 rounds
CV with max_depth=9, min_child_weight=6
	MAE 5.012842400000001 for 8 rounds
CV with max_depth=9, min_child_weight=7
	MAE 5.016737600000001 for 8 rounds
CV with max_depth=10, min_child_weight=5
	MAE 5.0179414 for 8 rounds
CV with max_depth=10, min_child_weight=6
	MAE 5.0173586 for 8 rounds
CV with max_depth=10, min_child_weight=7
	MAE 5.0152201000000005 for 8 rounds
CV with max_depth=11, min_child_weight=5
	MAE 5.031211 for 8 rounds
CV with max_depth=11, min_child_weight=6
	MAE 5.0304305000000005 for 8 rounds
CV with max_depth=11, min_child_weight=7
	MAE 5.023995 for 8 rounds
Best params: 9, 6, RMSE: 5.012842400000001


In [19]:
params['max_depth'] = 9
params['min_child_weight'] = 6

In [22]:
#Parameters subsample and colsample_bytree

In [20]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [21]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 5.023483000000001 for 8 rounds
CV with subsample=1.0, colsample=0.9
	MAE 5.0281944 for 8 rounds
CV with subsample=1.0, colsample=0.8
	MAE 5.034265400000001 for 8 rounds
CV with subsample=1.0, colsample=0.7
	MAE 5.0464702 for 8 rounds
CV with subsample=0.9, colsample=1.0
	MAE 5.017424 for 8 rounds
CV with subsample=0.9, colsample=0.9
	MAE 5.0194548 for 8 rounds
CV with subsample=0.9, colsample=0.8
	MAE 5.017242999999999 for 8 rounds
CV with subsample=0.9, colsample=0.7
	MAE 5.030288199999999 for 8 rounds
CV with subsample=0.8, colsample=1.0
	MAE 5.0260853999999995 for 8 rounds
CV with subsample=0.8, colsample=0.9
	MAE 5.0219624000000005 for 8 rounds
CV with subsample=0.8, colsample=0.8
	MAE 5.0266076 for 8 rounds
CV with subsample=0.8, colsample=0.7
	MAE 5.0316994 for 8 rounds
CV with subsample=0.7, colsample=1.0
	MAE 5.0316936 for 8 rounds
CV with subsample=0.7, colsample=0.9
	MAE 5.0292766 for 8 rounds
CV with subsample=0.7, colsample=0.8
	MAE

In [24]:
params['subsample'] = 0.9
params['colsample_bytree'] = 0.8

In [25]:
#Parameter ETA

In [26]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgboost.cv(params,dtrain,num_boost_round=num_boost_round,seed=42,nfold=5,metrics=['rmse'],early_stopping_rounds=10)
    # Update best score
    mean_mae = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))


CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 4.77 µs
CV with eta=0.3
CPU times: user 42.6 s, sys: 1.57 s, total: 44.2 s
Wall time: 4.88 s
	MAE 5.017242999999999 for 8 rounds

CV with eta=0.2
CPU times: user 45.7 s, sys: 902 ms, total: 46.6 s
Wall time: 4.64 s
	MAE 5.0649472 for 8 rounds

CV with eta=0.1
CPU times: user 48.6 s, sys: 966 ms, total: 49.6 s
Wall time: 4.92 s
	MAE 5.227490999999999 for 8 rounds

CV with eta=0.05
CPU times: user 49.5 s, sys: 869 ms, total: 50.4 s
Wall time: 4.96 s
	MAE 5.4661838000000005 for 8 rounds

CV with eta=0.01
CPU times: user 50.7 s, sys: 1.41 s, total: 52.1 s
Wall time: 5.34 s
	MAE 5.832947 for 8 rounds

CV with eta=0.005
CPU times: user 51.8 s, sys: 1.68 s, total: 53.5 s
Wall time: 5.44 s
	MAE 5.894923799999999 for 8 rounds

Best params: 0.3, MAE: 5.017242999999999


In [27]:
params['eta'] = .03

In [28]:
# Let's look at out final parameters
params

{'max_depth': 9,
 'min_child_weight': 6,
 'eta': 0.03,
 'subsample': 0.9,
 'colsample_bytree': 0.8,
 'objective': 'reg:linear'}

In [29]:
cv_results = xgboost.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)



In [30]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,5.913422,0.00842,5.915222,0.034277
1,5.867158,0.008223,5.870719,0.034316
2,5.825013,0.009274,5.830807,0.033195
3,5.783758,0.009962,5.79158,0.032483
4,5.74581,0.009879,5.755746,0.032315
5,5.707881,0.009452,5.719818,0.032821
6,5.671789,0.008772,5.685879,0.033396
7,5.638101,0.007942,5.654564,0.034432
8,5.605409,0.007793,5.623817,0.034468


In [31]:
cv_results['test-rmse-mean'].min()



5.6238171999999995

In [32]:
## Training the final model

In [35]:
model = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:5.92719
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:5.88268
[2]	Test-rmse:5.84406
[3]	Test-rmse:5.80358
[4]	Test-rmse:5.76412
[5]	Test-rmse:5.72865
[6]	Test-rmse:5.69319
[7]	Test-rmse:5.66376
[8]	Test-rmse:5.63184


In [36]:
num_boost_round = model.best_iteration + 1
best_model = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-rmse:5.92719
[1]	Test-rmse:5.88268
[2]	Test-rmse:5.84406
[3]	Test-rmse:5.80358
[4]	Test-rmse:5.76412
[5]	Test-rmse:5.72865
[6]	Test-rmse:5.69319
[7]	Test-rmse:5.66376
[8]	Test-rmse:5.63184


In [37]:
mean_squared_error(best_model.predict(dtest), y_test)

31.71761236212424