In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
def models():
    #linear = LinearRegression()
    lg = LogisticRegression()
    xgb_model = xgb.XGBClassifier( objective="binary:logistic",random_state=42)
    rf =RandomForestClassifier(n_estimators=51,min_samples_leaf=5,min_samples_split=3)
    bagg = BaggingClassifier(n_estimators=51,random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51,random_state=42)
    ada = AdaBoostClassifier(n_estimators=51,random_state=42)
    grad = GradientBoostingClassifier(n_estimators=51,random_state=42)
    classifier_list = [lg, xgb_model,rf,bagg,extra,ada,grad]
    classifier_name_list = ['Logistic Regression','XGBoost','Random Forests','Bagging','Extra Trees','AdaBoost','Gradient Boost']
    return classifier_list,classifier_name_list
    
def print_evaluation_metrics(trained_model,trained_model_name,X_test,y_test):
    print('--------- Model : ', trained_model_name, ' ---------------\n')
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test,predicted_values))
    print("Accuracy Score : ",metrics.accuracy_score(y_test,predicted_values))
    print("---------------------------------------\n")    

In [3]:
rank = pd.read_csv('./Train_11k/rank_11.csv')
rank = rank[-rank['latitude'].isna()]

In [4]:
rank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10955 entries, 0 to 10968
Data columns (total 43 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel_id                        10955 non-null  float64
 1   name                            10955 non-null  object 
 2   address                         10955 non-null  object 
 3   overall_score_OTA               9179 non-null   float64
 4   overall_score                   9686 non-null   float64
 5   counts_search                   10955 non-null  float64
 6   counts_click                    10955 non-null  float64
 7   counts_book                     10955 non-null  float64
 8   province_id                     10955 non-null  float64
 9   district_id                     9169 non-null   float64
 10  latitude                        10955 non-null  float64
 11  longitude                       10955 non-null  float64
 12  description                     

In [5]:
cols = ['tours', 'night_club', 'relax_spa', 'relax_massage', 'relax_steam_room', 'relax_outdoor_room',
        'relax_outdoor_pool', 'relax_sauna', 'relax_pool', 'currency_exchange',
       'room_service_24_hour', 'elevator', 'safely_deposit_boxed',
       'luggage_storage', 'poolside_bar', 'airport_transfer', 'restaurants',
       'concierge', 'shops', 'meeting_facilities', 'baby_sitting',
       'facilities_for_disabled_guests', 'private_beach', 'front_desk_24_hour',
       'bar', 'laundry_service', 'shuttle_room', 'price_mean']

In [6]:
arr = []
new_cols = []
for col in cols:
    x = rank[[col]].shape[0]
    y = rank[rank[col] == -1][col].count()
    percent = 100*y // x
    string = str(percent) + '% not info of ' + col
    arr.append(string)
    
    if percent <= 40:
        new_cols.append(col)
        
arr.sort()
arr

['0% not info of price_mean',
 '100% not info of relax_outdoor_room',
 '100% not info of relax_pool',
 '22% not info of laundry_service',
 '24% not info of luggage_storage',
 '27% not info of tours',
 '29% not info of concierge',
 '31% not info of currency_exchange',
 '32% not info of front_desk_24_hour',
 '33% not info of safely_deposit_boxed',
 '35% not info of bar',
 '39% not info of restaurants',
 '40% not info of relax_spa',
 '46% not info of airport_transfer',
 '46% not info of elevator',
 '46% not info of meeting_facilities',
 '46% not info of private_beach',
 '50% not info of room_service_24_hour',
 '52% not info of relax_massage',
 '60% not info of facilities_for_disabled_guests',
 '62% not info of relax_outdoor_pool',
 '68% not info of relax_steam_room',
 '69% not info of baby_sitting',
 '71% not info of relax_sauna',
 '74% not info of shops',
 '94% not info of poolside_bar',
 '97% not info of night_club',
 '98% not info of shuttle_room']

In [7]:
new_cols

['tours',
 'relax_spa',
 'currency_exchange',
 'safely_deposit_boxed',
 'luggage_storage',
 'restaurants',
 'concierge',
 'front_desk_24_hour',
 'bar',
 'laundry_service',
 'price_mean']

In [39]:
df = rank[['counts_click','tours', 'relax_spa','currency_exchange',
 'safely_deposit_boxed', 'luggage_storage', 'restaurants', 'concierge',
 'front_desk_24_hour', 'bar', 'laundry_service', 'price_mean']]

In [40]:
df['price_mean'] = df['price_mean'].fillna(df['price_mean'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_mean'] = df['price_mean'].fillna(df['price_mean'].mean())


In [41]:
mask1 = df['counts_click'] > 0
df.loc[mask1,'counts_click'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [42]:
df.groupby('counts_click').size()

counts_click
0.0    8370
1.0    2585
dtype: int64

In [46]:
X = df.loc[:, df.columns != 'counts_click']
y = df.loc[:, df.columns == 'counts_click']

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [48]:
classifier_list, classifier_name_list = models()
for classifier,classifier_name in zip(classifier_list,classifier_name_list):
    classifier.fit(X_train,y_train)
    print_evaluation_metrics(classifier,classifier_name,X_test,y_test)

  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)


--------- Model :  Logistic Regression  ---------------

              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86      2471
         1.0       0.00      0.00      0.00       816

    accuracy                           0.75      3287
   macro avg       0.38      0.50      0.43      3287
weighted avg       0.57      0.75      0.65      3287

Accuracy Score :  0.7517493154852449
---------------------------------------

--------- Model :  XGBoost  ---------------

              precision    recall  f1-score   support

         0.0       0.79      0.94      0.86      2471
         1.0       0.56      0.24      0.34       816

    accuracy                           0.76      3287
   macro avg       0.67      0.59      0.60      3287
weighted avg       0.73      0.76      0.73      3287

Accuracy Score :  0.7645269242470337
---------------------------------------



  classifier.fit(X_train,y_train)


--------- Model :  Random Forests  ---------------

              precision    recall  f1-score   support

         0.0       0.78      0.96      0.86      2471
         1.0       0.60      0.20      0.30       816

    accuracy                           0.77      3287
   macro avg       0.69      0.58      0.58      3287
weighted avg       0.74      0.77      0.72      3287

Accuracy Score :  0.7681776696075449
---------------------------------------



  return f(**kwargs)


--------- Model :  Bagging  ---------------

              precision    recall  f1-score   support

         0.0       0.79      0.90      0.84      2471
         1.0       0.47      0.27      0.34       816

    accuracy                           0.74      3287
   macro avg       0.63      0.58      0.59      3287
weighted avg       0.71      0.74      0.72      3287

Accuracy Score :  0.7417097657438394
---------------------------------------



  classifier.fit(X_train,y_train)


--------- Model :  Extra Trees  ---------------

              precision    recall  f1-score   support

         0.0       0.79      0.83      0.81      2471
         1.0       0.39      0.32      0.35       816

    accuracy                           0.70      3287
   macro avg       0.59      0.58      0.58      3287
weighted avg       0.69      0.70      0.70      3287

Accuracy Score :  0.7042896257986005
---------------------------------------



  return f(**kwargs)


--------- Model :  AdaBoost  ---------------

              precision    recall  f1-score   support

         0.0       0.77      0.98      0.86      2471
         1.0       0.64      0.13      0.21       816

    accuracy                           0.77      3287
   macro avg       0.71      0.55      0.54      3287
weighted avg       0.74      0.77      0.70      3287

Accuracy Score :  0.7657438393672041
---------------------------------------



  return f(**kwargs)


--------- Model :  Gradient Boost  ---------------

              precision    recall  f1-score   support

         0.0       0.78      0.97      0.86      2471
         1.0       0.63      0.18      0.28       816

    accuracy                           0.77      3287
   macro avg       0.71      0.57      0.57      3287
weighted avg       0.74      0.77      0.72      3287

Accuracy Score :  0.7706114998478856
---------------------------------------

