### Number of clicks prediction
---

In [352]:
import pandas as pd
import numpy as np
import requests
from io import StringIO

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor,  BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler,MinMaxScaler,PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.base import clone, TransformerMixin
from sklearn.model_selection import KFold
import time
import pickle
from pandas.plotting import scatter_matrix
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedStratifiedKFold





In [353]:
def loading_data(file_url):
    """
    Reading data from a given url
    
    inputs:
    ------
    url : link to the data
    returns
    -----:
    data : data frame  
    """
    
    file_id = file_url.split('/')[-2]
    dwn_url='https://drive.google.com/uc?export=download&id=' + file_id
    url = requests.get(dwn_url).text
    csv_raw = StringIO(url)
    df = pd.read_csv(csv_raw)
    
    
    return df

### Modeling
---

In [230]:
# First, let's read our cleaned data

In [405]:
hotel_data = pd.read_csv('cleaned.csv')
#hotel_data = hotel_data.drop(columns = ['hotel_id']) # dropping the hotel_id column as we can not use it as predictor!
hotel_data

Unnamed: 0,hotel_id,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,n_clicks,count
0,9.767406e+10,70.0,2.0,1199.0,77.0,4.0,861.0,17.550,81.64,18.0,0.0,80.0
1,9.768889e+10,67.0,3.0,12585.0,90.0,4.0,4371.0,17.383,189.38,28.0,4.0,751.0
2,9.824279e+10,59.0,8.0,3291.0,73.0,2.0,3084.0,7.000,72.16,2.0,4.0,5.0
3,9.833438e+10,66.0,1.0,288.0,80.0,0.0,603.0,12.564,173.25,0.0,10.0,73.0
4,9.839326e+10,58.0,2.0,1249.0,87.0,0.0,1683.0,18.391,96.70,0.0,0.0,68.0
...,...,...,...,...,...,...,...,...,...,...,...,...
372925,2.288169e+11,48.0,1.0,469.0,0.0,0.0,0.0,21.000,78.86,0.0,0.0,16.0
372926,2.289822e+11,42.0,0.0,689.0,83.0,0.0,72.0,14.174,116.56,4.0,0.0,66.0
372927,2.290205e+11,70.0,1.0,1424.0,0.0,0.0,0.0,16.211,16.11,0.0,4.0,11.0
372928,2.292421e+11,42.0,0.0,164.0,84.0,0.0,222.0,20.000,253.81,0.0,0.0,5.0


#### Initial Modeling

In [406]:
# First let's create our X and y for the model
initial_model_data = hotel_data.copy()

In [407]:
# now let's split the data into train and test sets
# since we have seen in the EDA that our target y has a skwed distribtion we will use stratified splitting

In [408]:
def split_data(data, test_size = 0.2):
    """
    Split the data into X and y and then into train and test set
    
    input:
    -----
    data: dataframe 
    
    returns:
    --------
    X_train,X_test,y_train,y_test
    """
    y = data['n_clicks']
    X = data.drop(columns = ['n_clicks'])
    # splitting with 20% test data for validation and using stratification paramter
    
    (X_train, X_test, y_train, y_test)  = train_test_split(X,y, stratify = y, test_size = test_size, random_state = 2020)
    return X_train, X_test, y_train, y_test

In [417]:
X_train_0, X_test_0, y_train_0, y_test_0 = split_data(initial_model_data) # creating out initial split

### Pipeline : MinMax scaling

In [418]:
# To create a pipeline we need to specify that columns we want to prepoces

In [419]:
numerical_cols = list(X_train_0.columns)
numerical_cols

['hotel_id',
 'content_score',
 'n_images',
 'distance_to_center',
 'avg_rating',
 'stars',
 'n_reviews',
 'avg_rank',
 'avg_price',
 'avg_saving_percent',
 'count']

In [420]:
numerical_transformer = Pipeline(steps = [('scaler',MinMaxScaler(feature_range=(1, 2))),
                                        ])

In [421]:
preprocessor = ColumnTransformer(transformers = [('num',numerical_transformer,numerical_cols)])

### Baseline: Linear Model with MinMax scaling and cross validation:
Our base model will be a Linear model, why? because linear models are fast, and easy to build and interpret,if we can fit a linear model with a good performance then it will be fast when deployed in produciton. Also we would understand exactly how each predictor contribute to our tarrget variable `n_clicks`. We will start with Ridge models which apply linear regression with regularization.

In [422]:
# cross validation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=2020)

In [423]:
# building lineaer model
ridge=  RidgeCV(alphas=[1e-3, 1e-2, 1e-1,0.2,0.4,0.6,0.8, 1], cv =cv)
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor',ridge)])

In [424]:
start_time = time.time()
reg_pipeline.fit(X_train_0,y_train_0)
print('model score on training set = ',reg_pipeline.score(X_train_0, y_train_0))
print('model score on test set = ',reg_pipeline.score(X_test_0, y_test_0))
y_pred_0 = reg_pipeline.predict(X_test_0)
elapsed_time = time.time() - start_time
print('elapsed time = ',elapsed_time )
print('MSE = ',mean_squared_error(y_test_0, y_pred_0))

model score on training set =  0.12249691119190387
model score on test set =  0.12126171905468575
elapsed time =  10.228202819824219
MSE =  28.290494949386147


> The model is fas but it has very poor performance. Could we improve the performance.

### Base model +  Power Transformation:
EDA has revealed that some features has a heavy tail distributon issues. As we know linear models such as Ridge assumes that features have Gaussian probability distribution.  We will try to see if power transformation helps in this regards:

In [271]:
numerical_transformer = Pipeline(steps = [('scaler',MinMaxScaler(feature_range=(1, 2))),
                                         ('power',PowerTransformer(method='box-cox'))
                                        ])
preprocessor = ColumnTransformer(transformers = [('num',numerical_transformer,numerical_cols)])

In [272]:
ridge=  RidgeCV(alphas=[1e-3, 1e-2, 1e-1,0.2,0.4,0.6,0.8, 1], cv =cv)
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor',ridge)])
start_time = time.time()
reg_pipeline.fit(X_train_0,y_train_0)
print('model score on training set = ',reg_pipeline.score(X_train_0, y_train_0))
print('model score on test set = ',reg_pipeline.score(X_test_0, y_test_0))
y_pred_0 = reg_pipeline.predict(X_test_0)
elapsed_time = time.time() - start_time
print('elapsed time = ',elapsed_time )
print('MSE = ',mean_squared_error(y_test_0, y_pred_0))

model score on training set =  0.1310143590969931
model score on test set =  0.13141377109306662
elapsed time =  10.577090740203857
MSE =  27.963655225721492


> Indeed there is a considerable improvement!

In [137]:
# We will create a reusable funciton to evaluate future models

In [273]:
def evaluate_model(model,X_train,y_train,X_test,y_test):
    """
    evaluate the performance of a given model on the X_test
    
    input:
    -----
    
    model : fitted model
    X_test : dataframe
    y_test : dataframe
    
        """

    start_time = time.time()
    print('model score on training set = ',model.score(X_train, y_train))
    print('model score on test set = ',model.score(X_test, y_test))
    y_pred = model.predict(X_test)
    elapsed_time = time.time() - start_time
    print('elapsed time = ',elapsed_time )
    print('MSE = ',mean_squared_error(y_test, y_pred))
    
    

In [204]:
# let's create a function for the pipeline since we will need many times later

In [274]:
def create_pipeline(X_train,y_train, model):
    
    """
    Create a pipeline for preprocessing and modeling
    
    inputs:
    -----
    X_train: dataframe
    model : sklearn model
    """
    
    numerical_cols = list(X_train.columns)
    numerical_transformer = Pipeline(steps = [('scaler',(MinMaxScaler(feature_range=(1, 2)))),
                                        ('power',PowerTransformer(method='box-cox'))
                                        ])
    preprocessor = ColumnTransformer(transformers = [('num',numerical_transformer,numerical_cols)])
    reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', model)])
    reg_pipeline.fit(X_train,y_train)
    
    return reg_pipeline


### Base model +  Power Transformation + Feature Engineering

One fast way to improve the peformance is to create more meaningful feature. For example, we have the avg_price and the avg_saving_percent, but often people think in terms of how much money the save. Therefore, we can create new feature, let's call it avg_saving_cash by multiplying avg_price and avg_saving_percent



In [275]:
# first we will create a copy of the original dataset
hotel_data_fengineering = hotel_data.copy()

- Now let's create our new feature

In [276]:
hotel_data_fengineering['avg_saving_cash'] =hotel_data_fengineering['avg_price']*hotel_data_fengineering['avg_saving_percent']

In [430]:
# now let's split the data
X_train_1, X_test_1, y_train_1, y_test_1 = split_data(hotel_data_fengineering)

In [431]:
# making a pipleline
ridge=  RidgeCV(alphas=[1e-3, 1e-2, 1e-1,0.2,0.4,0.6,0.8, 1], cv = cv)
reg_pipeline2 = create_pipeline(X_train_1,y_train_1,ridge )

In [279]:
# evaluating the performance of the model
evaluate_model(reg_pipeline2,X_train_1,y_train_1,X_test_1,y_test_1)

model score on training set =  0.13603752259575297
model score on test set =  0.13647822979195234
elapsed time =  0.18760395050048828
MSE =  27.800607767395142


> Again we notice some improvement for the model which has the new feature.

### Testing other Models:

We will now test several models including SGD, Gradient Boost, Random Forest and XGB and evaluate them base on mean squared error and performance.

In [186]:
def build_models():
    """
    creates regression a selected models with the defualt values
    
    input :
    ----
    None
    returns:
    -------
    Two lists, one for models and the second is for the name of the models
    """
    sgd = SGDRegressor(random_state=2020)
    grad = GradientBoostingRegressor(random_state=2020)
    rf = RandomForestRegressor(random_state=2020)
    xgb = XGBRegressor()
    regressors = [sgd, grad,rf,xgb]
    regressors_names = ['SGDRegressor','Gradient Boost','Random Forest','xgb']
    return  regressors,regressors_names

In [213]:
# now let's test these models and evaluate their performance
regressors,regressor_names = build_models()

for regressor,regressor_name in zip(regressors,regressor_names):
    print("Results for",regressor_name)
    print("__________________________________")
    reg_pipeline = create_pipeline(X_train_1,y_train_1,regressor)
    
    evaluate_model(reg_pipeline,X_train_1,y_train_1,X_test_1,y_test_1)
    print("        ")

Results for SGDRegressor
__________________________________
model score on training set =  0.13475556961054513
model score on test set =  0.13585433426047933
elapsed time =  0.17861604690551758
MSE =  27.904777066734866
        
Results for Gradient Boost
__________________________________
model score on training set =  0.23360363173886733
model score on test set =  0.23050581747363796
elapsed time =  0.7015597820281982
MSE =  24.848314895118612
        
Results for Random Forest
__________________________________
model score on training set =  0.8911539648440476
model score on test set =  0.22292319630546065
elapsed time =  14.00472903251648
MSE =  25.093171013326895
        
Results for xgb
__________________________________
model score on training set =  0.37751259491939926
model score on test set =  0.29432473231221057
elapsed time =  0.4946439266204834
MSE =  22.787490358450615
        


 #### According to this results,we have the following observations :
- Linear Regression SGDRegressor have the fastest models but also have the lowest accuracy,
- Gradient Boost model has bias issues, while random forest suffers from variance issues. We can reduce ovefitting of random forest by reducing max depth, n_estimotros or max_features.
- XGBRegressor seems to be the most promsing in terms of MSE. We will adopt this model and try to fine tune the paramters to improve the mse.

### Random Forest + hyperparamter optimization
let's try to reduce the ovefitting of random forest:

In [215]:
rf_regularized  = RandomForestRegressor(random_state=2020, n_estimators =100, max_depth = 13, max_features = 7)
reg_pipeline = create_pipeline(X_train_1,y_train_1,rf_regularized )
evaluate_model(reg_pipeline,X_train_1,y_train_1,X_test_1,y_test_1)
print("        ")

model score on training set =  0.38457126280535037
model score on test set =  0.2390538758264933
elapsed time =  5.826947927474976
MSE =  24.572283119288624
        


> We have reduced overfitting, but the training accuracy also decreased indicating less poweful model.

### XGBOOST : Model Improvements Using Hyperparamters optimization

In [280]:
# let's create our X_train and X_test since it has it's own cross-validation.
data_train = xgboost.DMatrix(X_train_1, label=y_train_1)
data_test = xgboost.DMatrix(X_test_1, label=y_test_1)

In [None]:
# we will now create our initial set of paramters, which are the defual

In [217]:
params = {
    # These are the hyper-parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

In [218]:
params['eval_metric'] = "rmse"


In [219]:
num_boost_round = 999

In [220]:
model = xgboost.train(
    params,
    data_train,
    num_boost_round=num_boost_round,
    evals=[(data_test, "Test")],
    early_stopping_rounds=5
)

[0]	Test-rmse:5.55423
Will train until Test-rmse hasn't improved in 5 rounds.
[1]	Test-rmse:5.32266
[2]	Test-rmse:5.18647
[3]	Test-rmse:5.11143
[4]	Test-rmse:5.06422
[5]	Test-rmse:5.03301
[6]	Test-rmse:5.01100
[7]	Test-rmse:4.99705
[8]	Test-rmse:4.98685
[9]	Test-rmse:4.97750
[10]	Test-rmse:4.97253
[11]	Test-rmse:4.95962
[12]	Test-rmse:4.95271
[13]	Test-rmse:4.94316
[14]	Test-rmse:4.92579
[15]	Test-rmse:4.92473
[16]	Test-rmse:4.92256
[17]	Test-rmse:4.91504
[18]	Test-rmse:4.91186
[19]	Test-rmse:4.90840
[20]	Test-rmse:4.90479
[21]	Test-rmse:4.89885
[22]	Test-rmse:4.89806
[23]	Test-rmse:4.89620
[24]	Test-rmse:4.89077
[25]	Test-rmse:4.88590
[26]	Test-rmse:4.88399
[27]	Test-rmse:4.87280
[28]	Test-rmse:4.86342
[29]	Test-rmse:4.86124
[30]	Test-rmse:4.85963
[31]	Test-rmse:4.85641
[32]	Test-rmse:4.85170
[33]	Test-rmse:4.85256
[34]	Test-rmse:4.85049
[35]	Test-rmse:4.84743
[36]	Test-rmse:4.84534
[37]	Test-rmse:4.83770
[38]	Test-rmse:4.83612
[39]	Test-rmse:4.83461
[40]	Test-rmse:4.83428
[41]	Test-r

- 1. Parameters `max_depth` and `min_child_weight`:

In [105]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,10)
    for min_child_weight in range(1,4)
]

In [119]:
#Define initial best params and MAE
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgboost.cv(
        params,
        data_train,
        num_boost_round=num_boost_round,
        seed=2020,
        nfold=10,
        metrics={'rmse'},
        early_stopping_rounds=20
    )
    # Update best rmse
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\trmse {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with max_depth=6, min_child_weight=1
	rmse 4.7532733 for 299 rounds
CV with max_depth=6, min_child_weight=2
	rmse 4.7547477 for 292 rounds
CV with max_depth=6, min_child_weight=3
	rmse 4.7499567 for 387 rounds
CV with max_depth=7, min_child_weight=1
	rmse 4.767374600000001 for 204 rounds
CV with max_depth=7, min_child_weight=2
	rmse 4.765694900000001 for 184 rounds
CV with max_depth=7, min_child_weight=3
	rmse 4.7645952 for 183 rounds
CV with max_depth=8, min_child_weight=1
	rmse 4.7831422 for 147 rounds
CV with max_depth=8, min_child_weight=2
	rmse 4.7878861 for 151 rounds
CV with max_depth=8, min_child_weight=3
	rmse 4.7794752 for 165 rounds
CV with max_depth=9, min_child_weight=1
	rmse 4.7935383 for 116 rounds
CV with max_depth=9, min_child_weight=2
	rmse 4.7994258 for 102 rounds
CV with max_depth=9, min_child_weight=3
	rmse 4.7920015000000005 for 122 rounds
Best params: 6, 3, RMSE: 4.7499567


In [221]:
params['max_depth'] = 6
params['min_child_weight'] = 3

In [None]:
#Parameters subsample and colsample_bytree

In [107]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [111]:
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgboost.cv(
        params,
        data_train,
        num_boost_round=num_boost_round,
        seed=2020,
        nfold=10,
        metrics={'rmse'},
        early_stopping_rounds=20
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\trmse {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, rmse: {}".format(best_params[0], best_params[1], min_rmse))

CV with subsample=1.0, colsample=1.0
	rmse 4.7440166 for 265 rounds
CV with subsample=1.0, colsample=0.9
	rmse 4.7440166 for 362 rounds
CV with subsample=1.0, colsample=0.8
	rmse 4.7440166 for 256 rounds
CV with subsample=1.0, colsample=0.7
	rmse 4.7440166 for 369 rounds
CV with subsample=0.9, colsample=1.0
	rmse 4.7440166 for 284 rounds
CV with subsample=0.9, colsample=0.9
	rmse 4.7440166 for 281 rounds
CV with subsample=0.9, colsample=0.8
	rmse 4.7440166 for 301 rounds
CV with subsample=0.9, colsample=0.7
	rmse 4.7440166 for 274 rounds
CV with subsample=0.8, colsample=1.0
	rmse 4.7440166 for 237 rounds
CV with subsample=0.8, colsample=0.9
	rmse 4.7440166 for 232 rounds
CV with subsample=0.8, colsample=0.8
	rmse 4.7440166 for 255 rounds
CV with subsample=0.8, colsample=0.7
	rmse 4.7440166 for 282 rounds
CV with subsample=0.7, colsample=1.0
	rmse 4.7440166 for 204 rounds
CV with subsample=0.7, colsample=0.9
	rmse 4.7440166 for 194 rounds
CV with subsample=0.7, colsample=0.8
	rmse 4.744

In [222]:
params['subsample'] = 1
params['colsample_bytree'] = 0.7

In [None]:
#Parameter ETA

In [227]:
# This can take some time…
min_rmse = float("Inf")
best_params = None
for eta in [.1, .08, .07, .06]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgboost.cv(
        params,
        data_train,
        num_boost_round=num_boost_round,
        seed=2020,
        nfold=10
        ,metrics=['rmse'],
        early_stopping_rounds=20)
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\trmse {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, rmsee: {}".format(best_params, min_rmse))


CV with eta=0.1
	rmse 4.6809021 for 998 rounds

CV with eta=0.08
	rmse 4.683446600000001 for 998 rounds

CV with eta=0.07
	rmse 4.689411300000001 for 998 rounds

CV with eta=0.06
	rmse 4.6942932 for 998 rounds

Best params: 0.1, rmsee: 4.6809021


In [281]:
params['eta'] = .1

In [294]:
# Let's look at out final parameters
params

{'max_depth': 6,
 'min_child_weight': 3,
 'eta': 0.1,
 'subsample': 1,
 'colsample_bytree': 0.7,
 'objective': 'reg:squarederror',
 'eval_metric': 'rmse'}

In [283]:
cv_results = xgboost.cv(
    params,
    data_train,
    num_boost_round=num_boost_round,
    seed=2020,
    nfold=10,
    metrics={'rmse'},
    early_stopping_rounds=20
)

In [284]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,5.812903,0.004810,5.814218,0.043816
1,5.691555,0.007500,5.694149,0.043112
2,5.589881,0.009826,5.594039,0.044018
3,5.502194,0.011535,5.507373,0.042961
4,5.426643,0.010272,5.432920,0.043116
...,...,...,...,...
994,4.004000,0.007651,4.673532,0.038993
995,4.003556,0.007467,4.673502,0.039005
996,4.002755,0.007493,4.673307,0.039087
997,4.002323,0.007520,4.673268,0.039092


In [388]:
cv_results['test-rmse-mean'].min()



4.6732428

In [296]:
## Training the final model in the whole dataset available for training

In [434]:
X = hotel_data.drop(columns = ['n_clicks'])
# let's also add the new featture avg_saving_cash
X['avg_saving_cash'] =X['avg_price']*X['avg_saving_percent']
y = hotel_data['n_clicks']

In [436]:
# let's prepare our new training data for xgboost
dtrain = xgboost.DMatrix(X, label=y)

In [437]:
num_boost_round = model.best_iteration + 1
best_model = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
)

In [438]:
### let's save the model

best_model.save_model("best_model.model")

In [439]:
loaded_model = xgboost.Booster()
loaded_model.load_model("best_model.model")

In [440]:
X_test = pd.read_csv('cleaned_test.csv')

In [441]:
X_test.head()

Unnamed: 0,hotel_id,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,count,avg_saving_cash
0,14942256073,58.0,2.0,11503.0,89.0,0.0,168.0,13.5,90.19,32.0,12.0,2886.08
1,16036037903,68.0,4.0,938.0,81.0,5.0,735.0,13.667,98.27,19.0,2.0,1867.13
2,288585940112,19.0,0.0,38982.0,0.0,0.0,0.0,20.462,48.77,0.0,49.0,0.0
3,129041645070,47.0,0.0,1683.0,80.0,0.0,69.0,15.0,72.32,0.0,231.0,0.0
4,12460296563,59.0,3.0,1299.0,80.0,3.0,1470.0,10.0,24.54,19.0,7.0,466.26


In [442]:
X_test_0 = X_test.drop(columns = ['hotel_id'])

In [443]:
dtest = xgboost.DMatrix(X_test)

In [444]:
loaded_model.predict(dtest)

array([-0.6757989,  4.7302675,  0.4595676, ...,  1.7641987,  2.7218823,
        4.3619   ], dtype=float32)

In [400]:
X_train_1

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,count,avg_saving_cash
22972,72.0,4.0,1937.0,67.0,4.0,2580.0,16.664,34.48,39.0,188.0,1344.72
143098,48.0,2.0,3486.0,88.0,0.0,144.0,14.500,177.19,0.0,133.0,0.00
136759,64.0,4.0,6283.0,69.0,0.0,3012.0,17.095,122.47,25.0,545.0,3061.75
48928,28.0,0.0,82803.0,0.0,0.0,0.0,15.471,26.82,0.0,15.0,0.00
230282,52.0,1.0,1970.0,0.0,0.0,0.0,24.000,159.54,0.0,71.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...
372378,60.0,0.0,6521.0,88.0,4.0,366.0,15.200,246.82,22.0,97.0,5430.04
4771,49.0,1.0,186.0,80.0,3.0,519.0,20.071,73.71,0.0,3.0,0.00
284107,18.0,0.0,24905.0,0.0,0.0,0.0,13.600,12.70,0.0,36.0,0.00
165832,46.0,1.0,2421.0,84.0,0.0,90.0,24.000,44.00,0.0,4.0,0.00


In [404]:
X_test.drop(columns= ['hotel_id'])

Unnamed: 0,content_score,n_images,distance_to_center,avg_rating,stars,n_reviews,avg_rank,avg_price,avg_saving_percent,count
0,58.0,2.0,11503.0,89.0,0.0,168.0,13.500,90.19,32.0,12.0
1,68.0,4.0,938.0,81.0,5.0,735.0,13.667,98.27,19.0,2.0
2,19.0,0.0,38982.0,0.0,0.0,0.0,20.462,48.77,0.0,49.0
3,47.0,0.0,1683.0,80.0,0.0,69.0,15.000,72.32,0.0,231.0
4,59.0,3.0,1299.0,80.0,3.0,1470.0,10.000,24.54,19.0,7.0
...,...,...,...,...,...,...,...,...,...,...
130813,47.0,0.0,3259.0,84.0,0.0,138.0,14.280,48.83,0.0,9.0
130814,50.0,0.0,5032.0,86.0,0.0,996.0,7.000,32.46,4.0,5.0
130815,62.0,0.0,1666.0,79.0,0.0,102.0,17.600,113.39,19.0,5.0
130816,51.0,1.0,521.0,85.0,3.0,324.0,18.925,85.63,0.0,119.0
