In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
def get_ensemble_models():
    rf =RandomForestClassifier(n_estimators=51,min_samples_leaf=5,min_samples_split=3)
    bagg = BaggingClassifier(n_estimators=51,random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51,random_state=42)
    ada = AdaBoostClassifier(n_estimators=51,random_state=42)
    grad = GradientBoostingClassifier(n_estimators=51,random_state=42)
    classifier_list = [rf,bagg,extra,ada,grad]
    classifier_name_list = ['Random Forests','Bagging','Extra Trees','AdaBoost','Gradient Boost']
    return classifier_list,classifier_name_list
    
def print_evaluation_metrics(trained_model,trained_model_name,X_test,y_test):
    print('--------- Model : ', trained_model_name, ' ---------------\n')
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test,predicted_values))
    print("Accuracy Score : ",metrics.accuracy_score(y_test,predicted_values))
    print("---------------------------------------\n")    

In [3]:
rank = pd.read_csv("./rank.csv")
rank = rank[-rank['latitude'].isna()]

In [4]:
rank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5218 entries, 0 to 5223
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel_id                        5218 non-null   int64  
 1   name                            5218 non-null   object 
 2   address                         5218 non-null   object 
 3   overall_score_OTA               5218 non-null   float64
 4   overall_score                   5218 non-null   float64
 5   score_mean                      5218 non-null   float64
 6   counts_search                   5218 non-null   int64  
 7   counts_click                    5218 non-null   int64  
 8   counts_book                     5218 non-null   int64  
 9   review_score                    5218 non-null   float64
 10  SCORE                           5218 non-null   float64
 11  province_id                     5218 non-null   int64  
 12  district_id                     43

In [5]:
test = rank[['counts_search','tours','score_mean','review_score',
 'relax_spa',
 'currency_exchange',
 'safely_deposit_boxed',
 'luggage_storage',
 'restaurants',
 'concierge',
 'front_desk_24_hour',
 'bar',
 'laundry_service',
 'price_mean']]

In [6]:
rank['front_desk_24_hour'].value_counts()

 1    2444
-1    1764
 0    1010
Name: front_desk_24_hour, dtype: int64

In [35]:
mask1 = test['counts_search'] > 0
test.loc[mask1,'counts_search'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [36]:
test.head(3)

Unnamed: 0,counts_search,tours,score_mean,review_score,relax_spa,currency_exchange,safely_deposit_boxed,luggage_storage,restaurants,concierge,front_desk_24_hour,bar,laundry_service,price_mean
0,1,-1,87.85417,362.0,-1,-1,-1,-1,-1,-1,1,-1,-1,2012733.0
1,1,1,90.5,8101.899995,1,1,1,1,1,1,1,1,1,1537731.0
2,1,1,86.0,216.818182,1,1,1,1,1,1,1,1,1,486794.7


In [37]:
test.groupby('counts_search').size()

counts_search
0    4187
1    1031
dtype: int64

In [38]:
X = test.loc[:, test.columns != 'counts_search']
y = test.loc[:, test.columns == 'counts_search']

In [39]:
X.head(3)

Unnamed: 0,tours,score_mean,review_score,relax_spa,currency_exchange,safely_deposit_boxed,luggage_storage,restaurants,concierge,front_desk_24_hour,bar,laundry_service,price_mean
0,-1,87.85417,362.0,-1,-1,-1,-1,-1,-1,1,-1,-1,2012733.0
1,1,90.5,8101.899995,1,1,1,1,1,1,1,1,1,1537731.0
2,1,86.0,216.818182,1,1,1,1,1,1,1,1,1,486794.7


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [41]:
classifier_list, classifier_name_list = get_ensemble_models()
for classifier,classifier_name in zip(classifier_list,classifier_name_list):
    classifier.fit(X_train,y_train)
    print_evaluation_metrics(classifier,classifier_name,X_test,y_test)

  classifier.fit(X_train,y_train)


--------- Model :  Random Forests  ---------------

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1232
           1       0.64      0.22      0.32       334

    accuracy                           0.81      1566
   macro avg       0.73      0.59      0.60      1566
weighted avg       0.78      0.81      0.77      1566

Accuracy Score :  0.8065134099616859
---------------------------------------



  return f(**kwargs)


--------- Model :  Bagging  ---------------

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      1232
           1       0.55      0.27      0.36       334

    accuracy                           0.80      1566
   macro avg       0.69      0.60      0.62      1566
weighted avg       0.77      0.80      0.77      1566

Accuracy Score :  0.7969348659003831
---------------------------------------



  classifier.fit(X_train,y_train)


--------- Model :  Extra Trees  ---------------

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1232
           1       0.49      0.30      0.37       334

    accuracy                           0.78      1566
   macro avg       0.66      0.61      0.62      1566
weighted avg       0.76      0.78      0.76      1566

Accuracy Score :  0.7848020434227331
---------------------------------------



  return f(**kwargs)


--------- Model :  AdaBoost  ---------------

              precision    recall  f1-score   support

           0       0.81      0.96      0.88      1232
           1       0.55      0.17      0.26       334

    accuracy                           0.79      1566
   macro avg       0.68      0.57      0.57      1566
weighted avg       0.76      0.79      0.75      1566

Accuracy Score :  0.7937420178799489
---------------------------------------



  return f(**kwargs)


--------- Model :  Gradient Boost  ---------------

              precision    recall  f1-score   support

           0       0.82      0.96      0.88      1232
           1       0.60      0.21      0.31       334

    accuracy                           0.80      1566
   macro avg       0.71      0.59      0.60      1566
weighted avg       0.77      0.80      0.76      1566

Accuracy Score :  0.8020434227330779
---------------------------------------

