## import libraries

In [1]:
# mount drive 
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf


from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score

  import pandas.util.testing as tm


In [3]:
os.chdir('/content/gdrive/My Drive/IMT574_grader')

## load data

### description
- additional_info - Special considerations regarding this property.
- amenities - Pipe (|) delimited list of amenities offered at the property.
- check_in_date
- check_out_date
- city
- country
- crawl_date
- description - Textual description of the property, as entered into the - site by the lister.
- highlight_value - Property highlights, as entered into the site by the - lister.
- hotel_star_rating - In case the property is a hotel, its out-of-five - star rating. Not all hotels have ratings.
- image_count - Number of images posted to the site by the lister.
- image_urls
- internet - Does this property have Internet access yes/no.
- landmark
- latitude
- longitude
- occupancy - How many adults and children may book the listing.
- pageurl
- property_address
- property_id
- property_name
- property_type - Home? Hotel? Resort? Etc.
- qts - Crawler timestamp.
- query_time_stamp - Copy of qts.
- room_price
- room_types - Number of beds and baths for the room.
- search_term
- service_value - Whether or not the property is verified with StayZilla - (plus some junk entries).
- similar_hotel - Some similar listings by name.
- sitename
- things_to_do - Nearby activities as entered by the lister.
- things_to_note - Special notes entered by the lister.

In [4]:
# unzip the file
from zipfile import ZipFile
ZipFile("./dataset/properties-on-stayzilla.zip").extractall("./dataset")

In [5]:
df = pd.read_csv("./dataset/stayzilla_com-travel_sample.csv")
df.head(3)

Unnamed: 0,additional_info,amenities,check_in_date,check_out_date,city,country,crawl_date,description,highlight_value,hotel_star_rating,image_count,image_urls,internet,landmark,latitude,longitude,occupancy,pageurl,property_address,property_id,property_name,property_type,qts,query_time_stamp,room_price,room_types,search_term,service_value,similar_hotel,sitename,things_to_do,things_to_note,uniq_id
0,Acceptance Rate:100 percent|Response Time:< 6 ...,Newspaper,04-08-2016,05-08-2016,Kalimpong,India,2016-07-20,Sangsey Valley Resort is Located at the wonder...,,,3.0,http://stay-imgs.stayzilla.com/resize/1200x800...,,,27.087429,88.531357,3 Adults 2 Kids,https://www.stayzilla.com/stays/sangsey-valley...,"South Sikkim, Kalimpong",67368,Sangsey Valley Resort,Resort,2016-07-20 10:59:55 +0000,2016-07-20 10:59:55 +0000,3167per night incl. tax,3 Bedded Room,,Not Verified,,www.stayzilla.com,,"All taxes.|Complimentary Breakfast, Lunch and ...",52a63c632d90923f7f0c394a79dce191
1,Acceptance Rate:100 percent|Response Time:< 6 ...,Parking|AC,04-08-2016,05-08-2016,Kanpur,India,2016-07-20,What should you know? Enjoy unmatched service ...,,,0.0,,,,26.466461,80.34745,2 Adults 2 Kids,https://www.stayzilla.com/stays/hotel-amantran...,"Ratanlal nagar, Kanpur",28733,Hotel Amantran Kanpur,Hotel,2016-07-20 10:59:55 +0000,2016-07-20 10:59:55 +0000,815per night incl. tax,Double Non-A/C Rooms,,Not Verified,Hotel Mangalam|Hotel Maha Roopa Palace,www.stayzilla.com,,Kindly Note:|This Tariff is not applicable for...,49ad3256fc231a4c9fdb6f6f2cc00d16
2,Acceptance Rate:100 percent|Response Time:< 6 ...,Pickup & Drop|Parking|AC,04-08-2016,05-08-2016,Jodhpur,India,2016-07-20,"What should you know? A budget hotel, this acc...",,,18.0,http://stay-imgs.stayzilla.com/resize/1200x800...,,,26.27902,73.019074,2 Adults 2 Kids,https://www.stayzilla.com/stays/hotel-krishna-...,"Shakti Nagar, Jodhpur",53742,Hotel Krishna,Hotel,2016-07-20 10:59:55 +0000,2016-07-20 10:59:55 +0000,1624per night incl. tax,Deluxe AC Double,,Not Verified,Gouri Heritage Haveli|The Regent Hotel|Vista R...,www.stayzilla.com,,All taxes.|Complimentary Tea.,8b5952e578717e20b311846f7aab031f


## preprocess data

### data profiling

In [6]:
df.shape

(1207, 33)

In [7]:
df.dtypes

additional_info       object
amenities             object
check_in_date         object
check_out_date        object
city                  object
country               object
crawl_date            object
description           object
highlight_value       object
hotel_star_rating    float64
image_count          float64
image_urls            object
internet              object
landmark              object
latitude             float64
longitude            float64
occupancy             object
pageurl               object
property_address      object
property_id            int64
property_name         object
property_type         object
qts                   object
query_time_stamp      object
room_price            object
room_types            object
search_term           object
service_value         object
similar_hotel         object
sitename              object
things_to_do          object
things_to_note        object
uniq_id               object
dtype: object

In [8]:
df.isna().sum()

additional_info         4
amenities              53
check_in_date           0
check_out_date          0
city                    0
country                68
crawl_date             68
description           288
highlight_value      1155
hotel_star_rating    1139
image_count           222
image_urls            260
internet             1139
landmark             1139
latitude                0
longitude               0
occupancy              68
pageurl                 0
property_address        0
property_id             0
property_name           0
property_type           0
qts                    68
query_time_stamp        0
room_price             68
room_types              0
search_term          1139
service_value           1
similar_hotel         382
sitename                0
things_to_do         1167
things_to_note        230
uniq_id                 0
dtype: int64

### deal with missing value

In [9]:
df.columns

Index(['additional_info', 'amenities', 'check_in_date', 'check_out_date',
       'city', 'country', 'crawl_date', 'description', 'highlight_value',
       'hotel_star_rating', 'image_count', 'image_urls', 'internet',
       'landmark', 'latitude', 'longitude', 'occupancy', 'pageurl',
       'property_address', 'property_id', 'property_name', 'property_type',
       'qts', 'query_time_stamp', 'room_price', 'room_types', 'search_term',
       'service_value', 'similar_hotel', 'sitename', 'things_to_do',
       'things_to_note', 'uniq_id'],
      dtype='object')

In [10]:
# remove columns with many missing value
cols = ['amenities', 'city', 'latitude', 'longitude', 'occupancy', 'property_type','room_price', 'room_types','service_value', 'similar_hotel']
df = df[cols]
df.head()

Unnamed: 0,amenities,city,latitude,longitude,occupancy,property_type,room_price,room_types,service_value,similar_hotel
0,Newspaper,Kalimpong,27.087429,88.531357,3 Adults 2 Kids,Resort,3167per night incl. tax,3 Bedded Room,Not Verified,
1,Parking|AC,Kanpur,26.466461,80.34745,2 Adults 2 Kids,Hotel,815per night incl. tax,Double Non-A/C Rooms,Not Verified,Hotel Mangalam|Hotel Maha Roopa Palace
2,Pickup & Drop|Parking|AC,Jodhpur,26.27902,73.019074,2 Adults 2 Kids,Hotel,1624per night incl. tax,Deluxe AC Double,Not Verified,Gouri Heritage Haveli|The Regent Hotel|Vista R...
3,WiFi|Newspaper|Pickup & Drop|Gym|Parking|AC,Jalpaiguri,26.808519,88.823601,1 Adult 2 Kids,Hotel,3509per night incl. tax,Premium Single A/c,Not Verified,
4,Newspaper|AC,Kanpur,26.488201,80.32663,2 Adults 2 Kids,Hotel,5802per night incl. tax,Deluxe AC Double,Not Verified,Hotel Royal Cliff


In [11]:
df.isna().sum()

amenities         53
city               0
latitude           0
longitude          0
occupancy         68
property_type      0
room_price        68
room_types         0
service_value      1
similar_hotel    382
dtype: int64

In [12]:
#df['similar_hotel'] = df['similar_hotel'].fillna("no similar hotels")
df = df.dropna(subset=['amenities', 'city', 'occupancy', 'property_type','room_price', 'room_types','service_value'])
df.shape

(1086, 10)

In [13]:
df['amenities'].value_counts()

Newspaper|AC                                                              260
Newspaper                                                                 107
Parking|AC                                                                 61
AC                                                                         52
WiFi|AC                                                                    37
                                                                         ... 
WiFi|Card Payment|Gym|Parking|AC                                            1
WiFi|Swimming Pool|Newspaper|Bar|Card Payment|Pickup & Drop|Parking|AC      1
WiFi|Free Breakfast|Newspaper|Pickup & Drop|Parking|AC                      1
WiFi|Free Breakfast|Parking|AC                                              1
Bar|Card Payment|Elevator|AC                                                1
Name: amenities, Length: 221, dtype: int64

In [14]:
df['similar_hotel'].value_counts()

Hotel IP Palace|Hotel Rangoli|Hotel Ganga Maiya|Hotel Monarch Inn                                 2
Luxury Service Apartment|Luxury Service apartment 3|Wanderer Bed And Breakfast|Salam Residency    2
Orchid Hotel|Hotel Buddha Inn|Hotel Republic|Orange Inn                                           2
Orchid Hotel|Hotel Royal Ville|Hotel Buddha Inn|Hotel Vijay Shree Deluxe                          2
Hotel Aryan                                                                                       2
                                                                                                 ..
Heritage Hotel|Hotel Dera Dundlod Kila                                                            1
Rathore Villas|Dhillon Home Stay                                                                  1
Hotel Sunder Palace|Priya Guest House|Baba Haveli Guest House                                     1
Hotel RB International|Hotel Keshari Bhavan|Hotel Nandini|Hotel New International                 1


### prepare features

In [15]:
df["num_similar_hotel"] = df['similar_hotel'].str.split("|").fillna("").apply(list).apply(lambda x: len(x))

In [16]:
# seperate the amenities
temp_amenities = ""
for row in df['amenities']:
    temp_amenities += ("|" + row)
#temp_df = df['amenities'].str.split("|", expand = True)
amenities_set = set(temp_amenities.split("|"))
amenities_set.pop()
print(amenities_set)
for amenity in amenities_set:
    df[amenity] = np.where(df["amenities"].str.contains(amenity), 1, 0)

{'Veg Only', 'Bar', 'Newspaper', 'Elevator', 'Free Breakfast', 'AC', 'Card Payment', 'Swimming Pool', 'Parking', 'Pickup & Drop', 'WiFi', 'Gym'}


In [17]:
df[["occupancy_adult", "occupancy_kid"]] = df["occupancy"].str.split("Adult", expand=True)
df["occupancy_adult"] = df["occupancy_adult"].astype('int')
df["occupancy_kid"] = df["occupancy_kid"].str.replace("s", "").str.split("Kid").str[0].astype('int')

In [18]:
df["room_price"] = df["room_price"].apply(lambda x: re.findall('\d+', x)[0])
df["room_price"].astype(int)

0       3167
1        815
2       1624
3       3509
4       5802
        ... 
1134    1736
1135    2595
1136    2267
1137    2446
1138     463
Name: room_price, Length: 1086, dtype: int64

In [19]:
# 'service value': encode it to 1 if verified,  0 if not or NA
df['service_value'] = df['service_value'].apply(lambda x: 1 if x == 'Verified' else 0)

In [20]:
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
# Encode labels in column 'property_type'. 
df['encode_property_type']= label_encoder.fit_transform(df['property_type']) 

In [21]:
df.head()

Unnamed: 0,amenities,city,latitude,longitude,occupancy,property_type,room_price,room_types,service_value,similar_hotel,num_similar_hotel,Veg Only,Bar,Newspaper,Elevator,Free Breakfast,AC,Card Payment,Swimming Pool,Parking,Pickup & Drop,WiFi,Gym,occupancy_adult,occupancy_kid,encode_property_type
0,Newspaper,Kalimpong,27.087429,88.531357,3 Adults 2 Kids,Resort,3167,3 Bedded Room,0,,0,0,0,1,0,0,0,0,0,0,0,0,0,3,2,5
1,Parking|AC,Kanpur,26.466461,80.34745,2 Adults 2 Kids,Hotel,815,Double Non-A/C Rooms,0,Hotel Mangalam|Hotel Maha Roopa Palace,2,0,0,0,0,0,1,0,0,1,0,0,0,2,2,2
2,Pickup & Drop|Parking|AC,Jodhpur,26.27902,73.019074,2 Adults 2 Kids,Hotel,1624,Deluxe AC Double,0,Gouri Heritage Haveli|The Regent Hotel|Vista R...,4,0,0,0,0,0,1,0,0,1,1,0,0,2,2,2
3,WiFi|Newspaper|Pickup & Drop|Gym|Parking|AC,Jalpaiguri,26.808519,88.823601,1 Adult 2 Kids,Hotel,3509,Premium Single A/c,0,,0,0,0,1,0,0,1,0,0,1,1,1,1,1,2,2
4,Newspaper|AC,Kanpur,26.488201,80.32663,2 Adults 2 Kids,Hotel,5802,Deluxe AC Double,0,Hotel Royal Cliff,1,0,0,1,0,0,1,0,0,0,0,0,0,2,2,2


In [22]:
df.dtypes

amenities                object
city                     object
latitude                float64
longitude               float64
occupancy                object
property_type            object
room_price               object
room_types               object
service_value             int64
similar_hotel            object
num_similar_hotel         int64
Veg Only                  int64
Bar                       int64
Newspaper                 int64
Elevator                  int64
Free Breakfast            int64
AC                        int64
Card Payment              int64
Swimming Pool             int64
Parking                   int64
Pickup & Drop             int64
WiFi                      int64
Gym                       int64
occupancy_adult           int64
occupancy_kid             int64
encode_property_type      int64
dtype: object

## build models

### use property-related and amenity-related features to predict room prices

#### fit the model

In [23]:
X = df[['num_similar_hotel', 'Gym', 'Card Payment', 'Bar', 'Elevator', 'Veg Only', 
        'Pickup & Drop', 'AC', 'Swimming Pool', 'WiFi', 'Parking', 'Newspaper', 
        'Free Breakfast', 'encode_property_type']]
X = sm.add_constant(X)
X = X.astype(float)
y = df[['room_price']].astype(float)
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,room_price,R-squared:,0.074
Model:,OLS,Adj. R-squared:,0.062
Method:,Least Squares,F-statistic:,6.119
Date:,"Mon, 22 Mar 2021",Prob (F-statistic):,8.28e-12
Time:,23:27:47,Log-Likelihood:,-10619.0
No. Observations:,1086,AIC:,21270.0
Df Residuals:,1071,BIC:,21340.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4196.0264,520.693,8.059,0.000,3174.333,5217.720
num_similar_hotel,-256.9209,79.703,-3.223,0.001,-413.312,-100.530
Gym,122.5928,807.641,0.152,0.879,-1462.145,1707.330
Card Payment,-570.2472,380.757,-1.498,0.135,-1317.361,176.866
Bar,-42.6137,604.205,-0.071,0.944,-1228.173,1142.945
Elevator,-171.7018,474.003,-0.362,0.717,-1101.782,758.378
Veg Only,-653.7141,525.854,-1.243,0.214,-1685.535,378.107
Pickup & Drop,-475.1078,476.028,-0.998,0.318,-1409.160,458.945
AC,-1047.5823,311.524,-3.363,0.001,-1658.848,-436.316

0,1,2,3
Omnibus:,2186.055,Durbin-Watson:,1.957
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4585901.495
Skew:,15.453,Prob(JB):,0.0
Kurtosis:,319.845,Cond. No.,27.4


### use location and occumancy to predict property types

#### train test split

In [24]:
df.columns

Index(['amenities', 'city', 'latitude', 'longitude', 'occupancy',
       'property_type', 'room_price', 'room_types', 'service_value',
       'similar_hotel', 'num_similar_hotel', 'Veg Only', 'Bar', 'Newspaper',
       'Elevator', 'Free Breakfast', 'AC', 'Card Payment', 'Swimming Pool',
       'Parking', 'Pickup & Drop', 'WiFi', 'Gym', 'occupancy_adult',
       'occupancy_kid', 'encode_property_type'],
      dtype='object')

In [25]:
#Create X and y
X = df[['latitude', 'longitude', 'occupancy_adult', 'occupancy_kid', 'encode_property_type']]
y = df[['encode_property_type']]

#Create training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### fit the model

- svm

In [26]:
#Building the SVC model and fitting the training data
model = SVC(kernel='linear')
model.fit(X_train,y_train.values.ravel())

#Predicting on the test data
predictions = model.predict(X_test)

#Printing the accuracy
print("Accuracy:", accuracy_score(y_test, predictions))
#Printing the confusion matrix
print(confusion_matrix(y_test,predictions))
#Printing the classification report
print(classification_report(y_test,predictions))

Accuracy: 0.9969325153374233
[[  5   0   0   0   0   0   0   0]
 [  0  13   0   0   0   0   0   0]
 [  0   0 243   0   0   0   0   0]
 [  0   0   0  37   0   0   0   0]
 [  0   0   0   0   8   0   0   0]
 [  0   0   0   0   0  15   0   0]
 [  0   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   0   4]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00       243
           3       1.00      1.00      1.00        37
           4       1.00      1.00      1.00         8
           5       0.94      1.00      0.97        15
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         4

    accuracy                           1.00       326
   macro avg       0.87      0.88      0.87       326
weighted avg       0.99      1.00      1.00       326



  _warn_prf(average, modifier, msg_start, len(result))


- knn

In [27]:
#Creating KNN Classfier model
knn = KNeighborsClassifier(n_neighbors=3)

#Fitting the training data
knn.fit(X_train,y_train)

#Predicting on the test data
predictions = knn.predict(X_test)

print("k=", 3)
#Printing Confusion matrix and accuracy socres 
print(accuracy_score(y_test,predictions))
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test, predictions))

k= 3
0.9386503067484663
[[  3   1   1   0   0   0   0   0]
 [  0  11   2   0   0   0   0   0]
 [  0   2 241   0   0   0   0   0]
 [  0   0   7  30   0   0   0   0]
 [  0   0   1   1   5   1   0   0]
 [  0   0   1   1   0  13   0   0]
 [  0   0   0   0   0   1   0   0]
 [  0   0   0   0   0   1   0   3]]
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.79      0.85      0.81        13
           2       0.95      0.99      0.97       243
           3       0.94      0.81      0.87        37
           4       1.00      0.62      0.77         8
           5       0.81      0.87      0.84        15
           6       0.00      0.00      0.00         1
           7       1.00      0.75      0.86         4

    accuracy                           0.94       326
   macro avg       0.81      0.69      0.73       326
weighted avg       0.94      0.94      0.94       326



  """
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# df display
pd.set_option('display.max_colwidth', 15)
pd.set_option('display.max_columns', 12)  
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings("ignore")