### Number of clicks prediction
---

In [1]:
import pandas as pd
import numpy as np
import requests
from io import StringIO

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor,  BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.base import clone, TransformerMixin
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import KFold
import time
import pickle
from pandas.plotting import scatter_matrix
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
import xgboost
from xgboost import XGBRegressor






In [2]:
# First, let's read our cleaned data

In [3]:
# Load the cleaned data; hotel_id is dropped right after loading, so it never reaches the models.
hotel_data = pd.read_csv('cleaned.csv').drop(columns=['hotel_id'])
hotel_data.head()

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,n_clicks,count
0,70.0,2.0,1199.0,77.0,4.0,861.0,17.55,81.64,18.0,0.0,80.0
1,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,189.38,28.0,4.0,751.0
2,59.0,8.0,3291.0,73.0,2.0,3084.0,7.0,72.16,2.0,4.0,5.0
3,66.0,1.0,288.0,80.0,0.0,603.0,12.564,173.25,0.0,10.0,73.0
4,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,96.7,0.0,0.0,68.0


#### Initial Modeling

In [4]:
# Work on an independent copy so the loaded frame stays untouched for the later experiments
initial_model_data = hotel_data.copy(deep=True)

In [5]:
# now let's split the data into train and test sets

In [6]:
def split_data(data, target='n_clicks', test_size=0.2, random_state=2020):
    """
    Split the data into X and y and then into train and test set

    input:
    -----
    data: dataframe holding the feature columns and the target column
    target: name of the column to predict (default 'n_clicks')
    test_size: share of the rows held out for the test set (default 0.2)
    random_state: seed forwarded to train_test_split so the split is
                  reproducible (default 2020)

    returns:
    --------
    X_train,X_test,y_train,y_test
    """
    y = data[target]
    X = data.drop(columns=[target])

    # train_test_split returns a list; hand back a tuple as before
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [7]:
# Baseline experiment (suffix _0): split the frame that has no engineered features
(X_train_0, X_test_0, y_train_0, y_test_0) = split_data(data=initial_model_data)

### Pipeline

In [8]:
# To create a pipeline we need to specify the columns we want to preprocess

In [9]:
# Every feature column of the baseline split is handled as a numerical column
numerical_cols = X_train_0.columns.tolist()
numerical_cols

['content_score',
 'n_images',
 'distance_to_center',
 'avg_rating',
 'stars',
 'n_reviews',
 'avg_rank',
 'avg_price',
 'avg_saving_percent',
 'count']

In [10]:
# Numerical preprocessing: impute first (SimpleImputer defaults), then standardize
imputer_step = ('imputer', SimpleImputer())
scaler_step = ('scaler', StandardScaler())
numerical_transformer = Pipeline(steps=[imputer_step, scaler_step])

In [11]:
# Send the columns listed in numerical_cols through the numerical transformer
numerical_spec = ('num', numerical_transformer, numerical_cols)
preprocessor = ColumnTransformer(transformers=[numerical_spec])

### Baseline: Linear Model :
Linear models are fast and easy to build and interpret. We will start with a Ridge model, which applies linear regression with regularization.


In [12]:
# RidgeCV selects the regularization strength from this list of candidates
ridge_alphas = [1e-3, 1e-2, 1e-1, 0.2, 0.4, 0.6, 0.8, 1]
ridge = RidgeCV(alphas=ridge_alphas)
reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge),
])

In [24]:
# Fit the baseline and report its scores; the timer covers fit, scoring and prediction
start_time = time.time()
reg_pipeline.fit(X_train_0, y_train_0)
train_score = reg_pipeline.score(X_train_0, y_train_0)
print('model score on training set = ', train_score)
test_score = reg_pipeline.score(X_test_0, y_test_0)
print('model score on test set = ', test_score)
y_pred_0 = reg_pipeline.predict(X_test_0)
elapsed_time = time.time() - start_time
print('elapsed time = ', elapsed_time)
test_mse = mean_squared_error(y_test_0, y_pred_0)
print('MSE = ', test_mse)

model score on training set =  0.11949700423537135
model score on test set =  0.11691446442852249
elapsed time =  0.48073792457580566
MSE =  28.51637863610852


In [25]:
def evaluate_model(model,X_test,y_test):
    """
    Print the test score, the elapsed scoring/prediction time and the MSE
    of a fitted model on a held-out set

    input:
    -----

    model : fitted model exposing score() and predict()
    X_test : dataframe with the test features
    y_test : true target values for X_test

    returns:
    --------
    None, the results are only printed
    """
    tic = time.time()
    test_score = model.score(X_test, y_test)
    print('model score on test set = ',test_score)
    predictions = model.predict(X_test)
    # the timer covers both the scoring and the prediction call
    duration = time.time() - tic
    print('elapsed time = ',duration )
    test_mse = mean_squared_error(y_test, predictions)
    print('MSE = ',test_mse)
    
    

> The model is fast, but it performs very poorly. Can we improve the performance?

In [26]:
# let's create a function for the pipeline since we will need it many times later

In [27]:
def create_pipeline(X_train,y_train, model):
    """
    Build the preprocessing + regression pipeline and fit it on the training data

    inputs:
    -----
    X_train: dataframe, all of its columns are treated as numerical features
    y_train: target values used to fit the pipeline
    model : sklearn model, used as the final 'regressor' step

    returns:
    --------
    the fitted pipeline
    """
    feature_names = X_train.columns.tolist()
    numeric_steps = [('imputer', SimpleImputer()), ('scaler', StandardScaler())]
    column_preprocessing = ColumnTransformer(
        transformers=[('num', Pipeline(steps=numeric_steps), feature_names)]
    )
    fitted_pipeline = Pipeline(steps=[
        ('preprocessor', column_preprocessing),
        ('regressor', model),
    ])
    fitted_pipeline.fit(X_train, y_train)

    return fitted_pipeline


### Feature Engineering

One fast way to improve the performance is to create more meaningful features. For example, we have avg_price and avg_saving_percent, but people often think in terms of how much money they save. Therefore, we can create a new feature, avg_saving_cash, by multiplying avg_price by avg_saving_percent.



In [28]:
# Feature engineering happens on a separate copy, the original dataset stays as loaded
hotel_data_fengineering = hotel_data.copy(deep=True)

- Now let's create our new feature

In [29]:
# avg_saving_cash = avg_price * avg_saving_percent
# NOTE(review): the percentage is not divided by 100; if avg_saving_percent is on a
# 0-100 scale the new column is 100x the cash amount — confirm this is intended
price_col = hotel_data_fengineering['avg_price']
saving_percent_col = hotel_data_fengineering['avg_saving_percent']
hotel_data_fengineering['avg_saving_cash'] = price_col.mul(saving_percent_col)

In [30]:
# let's drop the two source columns to avoid collinearity with the new feature.

In [31]:
# avg_saving_cash was derived from these two columns, so they are removed
source_cols = ['avg_price', 'avg_saving_percent']
hotel_data_fengineering = hotel_data_fengineering.drop(columns=source_cols)
hotel_data_fengineering

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,n_clicks,count,avg_saving_cash
0,70.0,2.0,1199.0,77.0,4.0,861.0,17.550,0.0,80.0,1469.52
1,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,4.0,751.0,5302.64
2,59.0,8.0,3291.0,73.0,2.0,3084.0,7.000,4.0,5.0,144.32
3,66.0,1.0,288.0,80.0,0.0,603.0,12.564,10.0,73.0,0.00
4,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,0.0,68.0,0.00
...,...,...,...,...,...,...,...,...,...,...
372925,48.0,1.0,469.0,83.0,0.0,0.0,21.000,0.0,16.0,0.00
372926,42.0,0.0,689.0,83.0,0.0,72.0,14.174,0.0,66.0,466.24
372927,70.0,1.0,1424.0,83.0,0.0,0.0,16.211,4.0,11.0,0.00
372928,42.0,0.0,164.0,84.0,0.0,222.0,20.000,0.0,5.0,0.00


In [32]:
# Feature-engineering experiment (suffix _1): same split routine, engineered frame
(X_train_1, X_test_1, y_train_1, y_test_1) = split_data(data=hotel_data_fengineering)

In [33]:
# Build and fit the pipeline: a fresh RidgeCV with the same alpha candidates, on the engineered features
ridge_alphas = [1e-3, 1e-2, 1e-1, 0.2, 0.4, 0.6, 0.8, 1]
ridge = RidgeCV(alphas=ridge_alphas)
reg_pipeline2 = create_pipeline(X_train=X_train_1, y_train=y_train_1, model=ridge)

In [34]:
# Score the engineered-feature pipeline on its own held-out split
evaluate_model(model=reg_pipeline2, X_test=X_test_1, y_test=y_test_1)

model score on test set =  0.12239552717826341
elapsed time =  0.0450739860534668
MSE =  28.339385520035417


> Indeed, we notice a small improvement for the model that uses the new feature.

### Testing other Models:

We will now test several models, including SGD, Gradient Boost, Random Forest and XGB, and evaluate them based on mean squared error and performance.

In [None]:
def build_models():
    """
    Create the selected regression models with their default hyperparameters

    input :
    ----
    None
    returns:
    -------
    Two lists, one for models and the second is for the name of the models
    (both lists are in the same order)
    """
    # One shared seed for every estimator, XGBRegressor included, so that
    # repeated runs of the comparison are reproducible.
    seed = 2020
    named_regressors = [
        ('SGDRegressor', SGDRegressor(random_state=seed)),
        ('Gradient Boost', GradientBoostingRegressor(random_state=seed)),
        ('Random Forest', RandomForestRegressor(random_state=seed)),
        ('xgb', XGBRegressor(random_state=seed)),
    ]
    regressors = [regressor for _, regressor in named_regressors]
    regressors_names = [name for name, _ in named_regressors]
    return  regressors,regressors_names

In [None]:
# now let's test these models and evaluate their performance

    
regressors,regressor_names = build_models()

# The comparison runs on the feature-engineered split (suffix _1).
for regressor,regressor_name in zip(regressors,regressor_names):
    start_time = time.time()
    # create_pipeline builds the preprocessor from the columns of the frame it
    # receives and fits the pipeline. The module-level `preprocessor` cannot be
    # reused here: it is bound to the columns of X_train_0, which include
    # avg_price and avg_saving_percent, and those were dropped from the
    # engineered frame.
    regressor_pipeline = create_pipeline(X_train_1, y_train_1, regressor)
    print('Results for ',regressor_name)
    print('Training score = ',regressor_pipeline.score(X_train_1,y_train_1))
    print('Test score = ',regressor_pipeline.score(X_test_1,y_test_1))
    y_pred = regressor_pipeline.predict(X_test_1)
    # the timer covers fitting, scoring and prediction
    elapsed_time = time.time() - start_time
    print('elapsed time = ',elapsed_time )
    print('MSE = ',mean_squared_error(y_test_1, y_pred))
    print('______________________________________________________')

#### According to these results, we have the following observations:
- The linear SGDRegressor is the fastest model but also has the lowest accuracy.
- The Gradient Boost model has a bias issue, while Random Forest has a variance issue.
- XGBRegressor seems to be the most promising in terms of MSE. We will adopt this model and fine-tune its parameters to improve the MSE.

### Model Improvements: Hyperparameter optimization

In [None]:
# xgboost's native API (xgboost.train / xgboost.cv) works on DMatrix objects, so
# wrap the feature-engineered train and test splits (suffix _1) with their labels.
data_train = xgboost.DMatrix(X_train_1, label=y_train_1)
data_test = xgboost.DMatrix(X_test_1, label=y_test_1)

In [None]:
# we will now create our initial set of parameters, which are the defaults

In [None]:
# Starting point for the search; the entries are overwritten one by one while tuning
params = dict(
    # Parameters that we are going to tune.
    max_depth=6,
    min_child_weight=1,
    eta=.3,
    subsample=1,
    colsample_bytree=1,
    # Other parameters
    objective='reg:squarederror',
)

In [None]:
# Evaluate with RMSE
params.update(eval_metric="rmse")


In [None]:
# Upper bound on the boosting rounds passed to xgboost.train / xgboost.cv below
num_boost_round = 999

In [None]:
# First fit with the starting parameters; the test DMatrix is the evaluation
# set, and early stopping is set to 10 rounds.
watchlist = [(data_test, "Test")]
model = xgboost.train(
    params=params,
    dtrain=data_train,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=10,
)

In [None]:
## Tuning parameters

In [None]:
#Parameters max_depth and min_child_weight

In [None]:
# All (max_depth, min_child_weight) pairs to try: depth 6..9, child weight 1..3
gridsearch_params = []
for depth_candidate in range(6, 10):
    for weight_candidate in range(1, 4):
        gridsearch_params.append((depth_candidate, weight_candidate))

In [None]:
# Define initial best params and RMSE
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgboost.cv(
        params,
        data_train,
        num_boost_round=num_boost_round,
        seed=2020,
        nfold=10,
        metrics={'rmse'},
        early_stopping_rounds=20
    )
    # Update best rmse
    rmse_per_round = cv_results['test-rmse-mean']
    mean_rmse = rmse_per_round.min()
    # argmin() is a 0-based row position: row i holds the score after i + 1 rounds
    boost_rounds = rmse_per_round.argmin() + 1
    print("\trmse {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

In [None]:
# Values used for the remaining tuning steps.
# NOTE(review): max_depth=4 lies outside the grid searched above (range(6,10)) —
# confirm these values against the search output
params.update(max_depth=4, min_child_weight=2)

In [None]:
#Parameters subsample and colsample_bytree

In [None]:
# All (subsample, colsample) pairs to try: both run over 0.7, 0.8, 0.9 and 1.0
sampling_fractions = [tenths / 10. for tenths in range(7, 11)]
gridsearch_params = []
for row_fraction in sampling_fractions:
    for column_fraction in sampling_fractions:
        gridsearch_params.append((row_fraction, column_fraction))

In [None]:
# Track the best (subsample, colsample_bytree) pair by cross-validated RMSE
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV (10 folds, like the max_depth / min_child_weight search)
    cv_results = xgboost.cv(
        params,
        data_train,
        num_boost_round=num_boost_round,
        seed=2020,
        nfold=10,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    rmse_per_round = cv_results['test-rmse-mean']
    mean_rmse = rmse_per_round.min()
    # argmin() is a 0-based row position: row i holds the score after i + 1 rounds
    boost_rounds = rmse_per_round.argmin() + 1
    print("\trmse {} for {} rounds".format(mean_rmse, boost_rounds))
    # compare against min_rmse, the variable initialised above and reported below
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, rmse: {}".format(best_params[0], best_params[1], min_rmse))

In [None]:
# Sampling fractions used for the remaining tuning steps
params.update(subsample=0.9, colsample_bytree=0.8)

In [None]:
#Parameter ETA

In [None]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgboost.cv(params,dtrain,num_boost_round=num_boost_round,seed=42,nfold=5,metrics=['rmse'],early_stopping_rounds=10)
    # Update best score
    mean_mae = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))


In [None]:
# Learning rate used for the final model.
# NOTE(review): .03 is not one of the eta candidates searched above
# ([.3, .2, .1, .05, .01, .005]) — confirm the value
params.update(eta=.03)

In [None]:
# Let's look at our final parameters
params

In [None]:
# Cross-validate the final parameter set on the training DMatrix
cv_results = xgboost.cv(
    params,
    data_train,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)

In [None]:
# Show the cross-validation results returned by xgboost.cv
cv_results

In [None]:
# Lowest mean test RMSE reached during cross-validation
cv_results['test-rmse-mean'].min()



In [None]:
## Training the final model

In [None]:
# Train with the tuned parameters; the test DMatrix is the evaluation set, and
# early stopping (10 rounds) determines the best iteration used for the refit.
model = xgboost.train(
    params,
    data_train,
    num_boost_round=num_boost_round,
    evals=[(data_test, "Test")],
    early_stopping_rounds=10
)

In [None]:
# Refit for exactly the number of rounds early stopping selected:
# best_iteration + 1 rounds.
num_boost_round = model.best_iteration + 1
best_model = xgboost.train(
    params,
    data_train,
    num_boost_round=num_boost_round,
    evals=[(data_test, "Test")]
)

In [None]:
# Test-set MSE of the final model, with arguments in (y_true, y_pred) order;
# y_test_1 holds the labels that data_test was built from
mean_squared_error(y_test_1, best_model.predict(data_test))