### Number of clicks prediction
---

In [1]:
import pandas as pd
import numpy as np
import requests
from io import StringIO

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor,  BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.base import clone, TransformerMixin
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import KFold
import time
import pickle
from pandas.plotting import scatter_matrix
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor



In [3]:
hotel_data = pd.read_pickle('cleaned.pkl')
hotel_data.head()

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,n_clicks,...,city_id_878630.0,city_id_878634.0,city_id_878644.0,city_id_878652.0,city_id_878668.0,city_id_878678.0,city_id_878696.0,city_id_878704.0,city_id_878708.0,city_id_878736.0
0,70.0,2.0,1199.0,77.0,4.0,861.0,17.55,81.64,18.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,189.38,28.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,59.0,8.0,3291.0,73.0,2.0,3084.0,7.0,72.16,2.0,4.0,...,0,0,0,0,0,0,0,0,0,0
4,66.0,1.0,288.0,80.0,0.0,603.0,12.564,173.25,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
5,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,96.7,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### Initial Modeling

In [4]:
# for our initial model we will just drop the rows that have nans, this is initial approach, we will later deal with how to fill those nans
initial_model_data = hotel_data.copy()

y_inital_model= initial_model_data['n_clicks']
X_initial_model = initial_model_data.drop(columns = ['n_clicks'])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_initial_model,y_inital_model,test_size = 0.2, random_state = 2020)

In [7]:
# To create a pipeline we need to specify that columns we want to prepoces

In [5]:
numerical_cols = list(X_train.columns[:9])
numerical_cols

NameError: name 'X_train' is not defined

In [9]:
numerical_transformer = Pipeline(steps = [('imputer',SimpleImputer()),
                                         ('scaler',StandardScaler())
                                         
                                         
                                        ])

In [10]:
preprocessor = ColumnTransformer(transformers = [('num',numerical_transformer,numerical_cols)])



### Baseline: Linear Model :
Linear model are fast, and easy to build and interpret, we will start with Ridge model whicha apply linear regression with regularization


In [13]:
reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RidgeCV(alphas=[1e-3, 1e-2, 1e-1,0.2,0.4,0.6,0.8, 1]))])

In [14]:
start_time = time.time()
reg.fit(X_train,y_train)
print('model score on training set = ',reg.score(X_train, y_train))
print('model score on test set = ',reg.score(X_test, y_test))
y_pred = reg.predict(X_test)
elapsed_time = time.time() - start_time
print('elapsed time = ',elapsed_time )
print('MSE = ',mean_squared_error(y_test, y_pred))

model score on training set =  0.1140293197510267
model score on test set =  0.11193137467656533
elapsed time =  0.4928619861602783
MSE =  28.67729133190142


### Testing Models

In [9]:
def get_models():
    sgd = SGDRegressor(random_state=2020)
    grad = GradientBoostingRegressor(random_state=2020)
    rf =RandomForestRegressor(random_state=2020)
    xgb = XGBRegressor()
    classifier_list = [sgd, grad,rf,xgb]
    classifier_name_list = ['SGDRegressor','Gradient Boost','Random Forest','xgb']
    return classifier_list,classifier_name_list
    
classifier_list, classifier_name_list = get_models()

for classifier,classifier_name in zip(classifier_list,classifier_name_list):
    regressor_pipeline = Pipeline(steps = [('preprocessor',preprocessor),
                                      ('regressor',classifier)])
    start_time = time.time()
    regressor_pipeline.fit(X_train,y_train)
    print('Results for ',classifier_name)
    print('Training score = ',regressor_pipeline.score(X_train,y_train))
    print('Test score = ',regressor_pipeline.score(X_test,y_test))
    y_pred = regressor_pipeline.predict(X_test)
    elapsed_time = time.time() - start_time
    print('elapsed time = ',elapsed_time )
    print('MSE = ',mean_squared_error(y_test, y_pred))
    print('______________________________________________________')


Results for  SGDRegressor
Training score =  0.1129600916843917
Test score =  0.11090880631948974
elapsed time for fitting  =  1.0046007633209229
MSE =  28.710311855143033
______________________________________________________
Results for  Gradient Boost
Training score =  0.21398156141352231
Test score =  0.2109019065342218
elapsed time for fitting  =  35.49871516227722
MSE =  25.481359514896205
______________________________________________________
Results for  Random Forest
Training score =  0.885254852220597
Test score =  0.18821664050586462
elapsed time for fitting  =  160.0541341304779
MSE =  26.213906487378154
______________________________________________________
Results for  xgb
Training score =  0.33790573991777506
Test score =  0.2627092981409206
elapsed time for fitting  =  6.909333944320679
MSE =  23.808408101135683
______________________________________________________


 #### According to this results,we have the following observations :
- Linear Regression SGDRegressor have the fastest models but also have the lowest accuracy
- Gradient Boost model has the a bias issues, while random forest has a variance issues.
- XGBRegressor seems to be the most promsing in terms of MSE.

### Model Improvements