In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [9]:
# Load the hotel ranking table.
rank = pd.read_csv("./Demo_4k/rank.csv")
# Keep only rows that have a latitude. notna() states the intent directly
# instead of relying on unary minus applied to a boolean mask.
rank = rank[rank['latitude'].notna()]

In [3]:
def get_ensemble_models():
    """Build the ensemble regressors that are compared against each other.

    Returns:
        tuple: ``(regressor_list, regressor_name_list)`` -- two parallel lists
        with the unfitted estimators and their display names, in the same order.
    """
    # Every estimator is seeded so that repeated runs give comparable scores;
    # the random forest is seeded like the others for that reason.
    rf = RandomForestRegressor(n_estimators=51,min_samples_leaf=5,min_samples_split=3,random_state=42)
    bagg = BaggingRegressor(n_estimators=51,random_state=42)
    extra = ExtraTreesRegressor(n_estimators=51,random_state=42)
    ada = AdaBoostRegressor(n_estimators=51,random_state=42)
    grad = GradientBoostingRegressor(n_estimators=51,random_state=42)
    regressor_list = [rf,bagg,extra,ada,grad]
    regressor_name_list = ['Random Forests','Bagging','Extra Trees','AdaBoost','Gradient Boost']
    return regressor_list,regressor_name_list
    
def print_evaluation_metrics(trained_model,trained_model_name,X_test,y_test):
    """Print MAE, MSE, RMSE and R2 of a fitted regressor on held-out data.

    Args:
        trained_model: fitted estimator exposing ``predict(X_test)``.
        trained_model_name: label printed in the report header.
        X_test: features passed to ``trained_model.predict``.
        y_test: true target values the predictions are compared against.

    Returns:
        None. The report is written to stdout.
    """
    print('--------- Model : ', trained_model_name, ' ---------------\n')
    predicted_values = trained_model.predict(X_test)
    # Compute each metric once; RMSE is derived from the MSE computed here.
    mae = metrics.mean_absolute_error(y_test,predicted_values)
    mse = metrics.mean_squared_error(y_test,predicted_values)
    print('Mean Absolute Error (MAE):', mae)
    print('Mean Squared Error (MSE):', mse)
    print('Root Mean Squared Error (RMSE):', np.sqrt(mse))
    print('R2_score:', metrics.r2_score(y_test,predicted_values))
    print("---------------------------------------\n")
    

# Fill missing values of counts_search (a count of 0 is treated as missing)

In [32]:
# Working table for the counts_search step: hotel id, the target column
# (counts_search) and the columns used as model features.
search = rank.loc[:, [
    'hotel_id', 'score_mean', 'counts_search', 'review_score',
    'tours', 'relax_spa', 'currency_exchange',
    'safely_deposit_boxed', 'luggage_storage', 'restaurants', 'concierge',
    'front_desk_24_hour', 'bar', 'laundry_service', 'price_mean',
]]

In [22]:
# Training rows: hotels whose counts_search is not 0.
df1 = search.loc[search['counts_search'].ne(0)]
# hotel_id is an identifier, not a feature, so it is removed before modelling.
df = df1.drop(columns='hotel_id')

In [23]:
# Features are every column except the target.
X = df.drop(columns='counts_search')
# The target is kept as a one-column DataFrame (2-D), not a Series.
y = df[['counts_search']]

In [24]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [25]:
# Gradient-boosted baseline for counts_search.
# "reg:squarederror" is the current name of the squared-error objective;
# "reg:linear" is its deprecated alias.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

# Mean absolute error on the held-out split (named for what it holds).
mae = metrics.mean_absolute_error(y_test, y_pred)

print(mae)
print(metrics.r2_score(y_test,y_pred))

731.3794803527094
-0.30881839652420107


In [26]:
# Linear baseline for counts_search; fit() returns the estimator itself.
li_r = LinearRegression().fit(X_train, y_train)
y_pred = li_r.predict(X_test)
# Report MAE, then R2, one value per line.
print(
    metrics.mean_absolute_error(y_test, y_pred),
    metrics.r2_score(y_test, y_pred),
    sep='\n',
)

660.0925066722499
0.01136580245116392


In [27]:
# Rows to impute: hotels whose counts_search equals 0.
temp = search.loc[search['counts_search'].eq(0)]
# X_1 keeps hotel_id so the rows can be identified later; X_2 is the pure
# feature matrix handed to the model.
X_1 = temp.drop(columns='counts_search')
X_2 = X_1.drop(columns='hotel_id')
y_pred = li_r.predict(X_2)
# First (only) prediction column becomes the new counts_search column.
y_1 = pd.DataFrame(y_pred[:, :1], columns=['counts_search'])
# reset_index() moves the old row labels into an 'index' column and gives X_1
# a 0..n-1 index, which lines it up with y_1 for the column-wise concat.
X_1 = X_1.reset_index()
temp_modeled = pd.concat((X_1, y_1), axis='columns')

In [28]:
# Stack the rows with a non-zero counts_search on top of the imputed rows, then
# discard the helper 'index' column carried by temp_modeled.
rank1 = pd.concat(
    [search.loc[search['counts_search'].ne(0)], temp_modeled],
    axis='index',
).drop(columns='index')

In [29]:
# Attach counts_click from the full table, matching rows on hotel_id.
rank2 = pd.merge(
    rank1,
    rank.loc[:, ['hotel_id', 'counts_click']],
    on='hotel_id',
)

In [30]:
#lo_r = LogisticRegression()
#lo_r.fit(X_train,y_train)
#print_evaluation_metrics(lo_r,LogisticRegression,X_test,y_test)

In [31]:
#regressor_list,regressor_name_list = get_ensemble_models()
#for regressor,regressor_name in zip(regressor_list,regressor_name_list):
    #regressor.fit(X_train,y_train)
    #print_evaluation_metrics(regressor,regressor_name,X_test,y_test)

# Fill missing values of counts_click (a count of 0 is treated as missing)

In [33]:
# Working table for the counts_click step: hotel id, the target column
# (counts_click) and the feature columns, which include counts_search.
click = rank2.loc[:, [
    'hotel_id', 'score_mean', 'counts_search', 'counts_click', 'review_score',
    'tours', 'relax_spa', 'currency_exchange',
    'safely_deposit_boxed', 'luggage_storage', 'restaurants', 'concierge',
    'front_desk_24_hour', 'bar', 'laundry_service', 'price_mean',
]]

In [34]:
# Training rows: hotels whose counts_click is not 0.
df = click.loc[click['counts_click'].ne(0)]
# hotel_id is an identifier, not a feature.
df = df.drop(columns='hotel_id')

In [35]:
# Features are every column except the target.
X = df.drop(columns='counts_click')
# The target is kept as a one-column DataFrame (2-D), not a Series.
y = df[['counts_click']]

In [36]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [37]:
# Linear model for counts_click; fit() returns the estimator itself.
li_r = LinearRegression().fit(X_train, y_train)
y_pred = li_r.predict(X_test)
# Report MAE, then R2, one value per line.
print(
    metrics.mean_absolute_error(y_test, y_pred),
    metrics.r2_score(y_test, y_pred),
    sep='\n',
)

25.872035374868915
0.005293145506928054


In [38]:
# Rows to impute: hotels whose counts_click equals 0.
temp = click.loc[click['counts_click'].eq(0)]
# X_1 keeps hotel_id so the rows can be identified later; X_2 is the pure
# feature matrix handed to the model.
X_1 = temp.drop(columns='counts_click')
X_2 = X_1.drop(columns='hotel_id')
y_pred = li_r.predict(X_2)
# First (only) prediction column becomes the new counts_click column.
y_1 = pd.DataFrame(y_pred[:, :1], columns=['counts_click'])
# reset_index() moves the old row labels into an 'index' column and gives X_1
# a 0..n-1 index, which lines it up with y_1 for the column-wise concat.
X_1 = X_1.reset_index()
temp_modeled = pd.concat((X_1, y_1), axis='columns')

In [39]:
# Stack the rows with a non-zero counts_click on top of the imputed rows, then
# discard the helper 'index' column carried by temp_modeled.
rank2 = pd.concat(
    [click.loc[click['counts_click'].ne(0)], temp_modeled],
    axis='index',
).drop(columns='index')

In [40]:
# Attach counts_book from the full table, matching rows on hotel_id.
rank3 = pd.merge(
    rank2,
    rank.loc[:, ['hotel_id', 'counts_book']],
    on='hotel_id',
)

# Fill missing values of counts_book (a count of 0 is treated as missing)

In [42]:
# Working table for the counts_book step: hotel id, the target column
# (counts_book) and the feature columns, which include counts_search and
# counts_click.
book = rank3.loc[:, [
    'hotel_id', 'score_mean', 'counts_search', 'counts_click', 'counts_book',
    'review_score', 'tours', 'relax_spa', 'currency_exchange',
    'safely_deposit_boxed', 'luggage_storage', 'restaurants', 'concierge',
    'front_desk_24_hour', 'bar', 'laundry_service', 'price_mean',
]]

In [43]:
# Training rows: hotels whose counts_book is not 0.
df = book.loc[book['counts_book'].ne(0)]
# hotel_id is an identifier, not a feature.
df = df.drop(columns='hotel_id')

In [44]:
# Features are every column except the target.
X = df.drop(columns='counts_book')
# The target is kept as a one-column DataFrame (2-D), not a Series.
y = df[['counts_book']]

In [45]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [46]:
# Linear model for counts_book; fit() returns the estimator itself.
li_r = LinearRegression().fit(X_train, y_train)
y_pred = li_r.predict(X_test)
# Only the mean absolute error is reported for this target.
print(metrics.mean_absolute_error(y_test, y_pred))

4.0192214431893465


In [47]:
# Rows to impute: hotels whose counts_book equals 0.
temp = book.loc[book['counts_book'].eq(0)]
# X_1 keeps hotel_id so the rows can be identified later; X_2 is the pure
# feature matrix handed to the model.
X_1 = temp.drop(columns='counts_book')
X_2 = X_1.drop(columns='hotel_id')
y_pred = li_r.predict(X_2)
# First (only) prediction column becomes the new counts_book column.
y_1 = pd.DataFrame(y_pred[:, :1], columns=['counts_book'])
# reset_index() moves the old row labels into an 'index' column and gives X_1
# a 0..n-1 index, which lines it up with y_1 for the column-wise concat.
X_1 = X_1.reset_index()
temp_modeled = pd.concat((X_1, y_1), axis='columns')

In [48]:
# Stack the rows with a non-zero counts_book on top of the imputed rows, then
# discard the helper 'index' column carried by temp_modeled.
rank3 = pd.concat(
    [book.loc[book['counts_book'].ne(0)], temp_modeled],
    axis='index',
).drop(columns='index')

In [51]:
# Swap the original count columns in the full table for the versions held in
# rank3, matching rows on hotel_id.
rank = pd.merge(
    rank.drop(columns=['counts_click', 'counts_search', 'counts_book']),
    rank3.loc[:, ['hotel_id', 'counts_click', 'counts_search', 'counts_book']],
    on='hotel_id',
)

In [52]:
# Show the merged table transposed (one column per hotel) to inspect the fields.
rank.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5208,5209,5210,5211,5212,5213,5214,5215,5216,5217
hotel_id,40415,40412,38408,45545,45572,38420,30436,2101,38437,1680,...,47188,37088,38770,34969,46068,46752,45959,6185,37034,47608
name,Premier Residences Phu Quoc Emerald Bay Manage...,Seashells Hotel & Spa Phú Quốc (Seashells Phu ...,Aaron Hotel,Green Tree Hotel Phú Quốc,Golden Lotus Hotel,Smile Hotel Nha Trang,Cozrum Homes Trương Định,Sun River Hotel,Ibis Styles Nha Trang,Khách Sạn Senriver (Senriver Hotel),...,OYO 592 An Nhien Hotel & Cafe,Nancy Sweet Apartment - A2205,Hoang Ky,Nam Phuong Home - Lily room in the heart of Hanoi,Our Houses Hạ Long,Homestay Des Amis - Bạn Hữu,Touch Da Nang Hostel,Phuong Ngan Hotel Ninh Binh,Nancy Thuy Tien Apartment 1009,Anh Đào Homestay
address,"Khem Beach An Thới, Phú Quốc, Kiên Giang","Vo Thi Sau, Dương Đông, Đảo Phú Quốc, Việt Nam","Số 6 Trần Quang Khải, Thành Phố Nha Trang, Khá...","Khu tổ hợp du lịch Sonasea Villas & Resort, Bã...","96B4 Trần Phú, Lộc Thọ, Thành Phố Nha Trang, K...","45/1 Hùng Vương, Thành Phố Nha Trang, Khánh Hòa","73 Trương Định, Quận 3, Hồ Chí Minh","Số 132-134-136 Bạch Đằng, Quận Hải Châu, Đà Nẵng","86 Hùng Vương, Thành Phố Nha Trang, Khánh Hòa","238 Bạch Đằng, Quận Hải Châu, Đà Nẵng",...,"71 Hà Huy Tập, P.3, Lâm Đồng, Đà Lạt","OSC Land Residence, 110 Vo Thi Sau, Thắng Tam,...","3 Pham Van Dong, Vĩnh Hòa, Nha Trang, Việt Nam","Quận Hoàn Kiếm, Hà Nội, Việt Nam","2A/26A, Anh Đào, Bãi Cháy, Quảng Ninh, Hạ Long","18A Đinh Tiên Hoàng, Tân An, Hội An","K134/26 Lê Hữu Trác, An Hải Đông, Q.Sơn Trà, Đ...","Số 6, Ngõ 48 phố 5, đường Chiến Thắng, phường ...","Thuy Tien Building, 84 Tran Phu, Phường 5, Vũn...","Lử Chô 2, Lâu Thí Ngài, Bắc Hà, Lào Cai, Bắc Hà"
overall_score_OTA,87.7083,90,86,94.5,32.5,90,84,78.0972,88,80,...,20,20,20,20,20,20,20,20,20,10
overall_score,88,91,86,94,32.5,86,88,80,85,83,...,20,20,20,20,20,20,20,20,20,10
score_mean,87.8542,90.5,86,94.25,32.5,88,86,79.0486,86.5,81.5,...,20,20,20,20,20,20,20,20,20,10
review_score,362,8101.9,216.818,419.083,-14,766,448.833,695,348,2367.6,...,-3,5,0,-3,-3,-3,-3,-3,-3,0
SCORE,366396,285112,253527,250369,238066,236996,217779,213044,203098,197408,...,20017,20005,20000,19997,19997,19997,19997,19997,19997,10000
province_id,2,2,43,2,43,43,33,50,43,50,...,20,15,43,11,10,6,50,60,15,21
district_id,446,446,414,446,414,414,481,238,414,238,...,,686,414,259,,,,420,686,


In [58]:
# Floor non-positive counts at 0, one count column at a time.
# The assignment is scoped to the count column: assigning 0 to a bare boolean
# row mask (rank[mask] = 0) would overwrite every column of the matching rows,
# including hotel_id, name and the score columns.
for count_col in ['counts_click', 'counts_search', 'counts_book']:
    rank.loc[rank[count_col] <= 0, count_col] = 0