# Hotels in Amsterdam

In [1]:
#importing libraries
import pandas as pd
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import log_loss
import seaborn as sns
from plotnine import ggplot

## Data Work

In [2]:
# Load the two raw CSV tables (hotel features and hotel prices) from the
# current working directory.
hotel_features = pd.read_csv(filepath_or_buffer='hotels-europe_features.csv')
hotel_prices = pd.read_csv(filepath_or_buffer='hotels-europe_price.csv')

In [3]:
# Restrict the features table to rows whose `city` is Amsterdam.
# `hotel_features` is already a DataFrame (it comes straight from pd.read_csv),
# so it is filtered directly instead of being re-wrapped in pd.DataFrame(...).
# .copy() makes df1 an independent frame rather than a slice of hotel_features,
# so later column assignments on it cannot trigger SettingWithCopyWarning.
df1 = hotel_features.loc[hotel_features['city'] == 'Amsterdam'].copy()

In [6]:
# Expose the price table under the name df2 and preview its first five rows.
df2 = pd.DataFrame(data=hotel_prices)
df2.head(n=5)

Unnamed: 0,hotel_id,price,offer,offer_cat,year,month,weekend,holiday,nnights,scarce_room
0,1,172,0,0% no offer,2017,11,1,0,1,0
1,1,122,1,15-50% offer,2018,1,1,0,1,0
2,1,122,1,15-50% offer,2017,12,0,1,1,0
3,1,552,1,1-15% offer,2017,12,0,1,4,0
4,1,122,1,15-50% offer,2018,2,1,0,1,0


In [9]:
# Join the Amsterdam feature rows (df1) with the price rows (df2) on hotel_id.
# This is an inner join (the pandas default), so only hotel_ids present in
# both tables survive.
df = df1.merge(df2, how='inner', on='hotel_id')
df.head(n=5)

Unnamed: 0,hotel_id,city,distance,stars,rating,country,city_actual,rating_reviewcount,center1label,center2label,...,accommodation_type,price,offer,offer_cat,year,month,weekend,holiday,nnights,scarce_room
0,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,Hotel,172,0,0% no offer,2017,11,1,0,1,0
1,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,Hotel,122,1,15-50% offer,2018,1,1,0,1,0
2,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,Hotel,122,1,15-50% offer,2017,12,0,1,1,0
3,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,Hotel,552,1,1-15% offer,2017,12,0,1,4,0
4,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,Hotel,122,1,15-50% offer,2018,2,1,0,1,0


In [10]:
# Task: use hotel user ratings to create a binary variable:
# highly_rated = 1 if rating >= 4, and 0 otherwise.
# Examine how high rating is related to the other hotel features in the data.
# Make sure to document and argue for the data cleaning/filtering decisions you make.
# First step: list the columns available in the merged frame.
df.columns

Index(['hotel_id', 'city', 'distance', 'stars', 'rating', 'country',
       'city_actual', 'rating_reviewcount', 'center1label', 'center2label',
       'neighbourhood', 'ratingta', 'ratingta_count', 'distance_alter',
       'accommodation_type', 'price', 'offer', 'offer_cat', 'year', 'month',
       'weekend', 'holiday', 'nnights', 'scarce_room'],
      dtype='object')

In [11]:
# Build the binary indicator: highly_rated = 1 when rating >= 4, else 0.
# Rows with a missing rating are dropped BEFORE the flag is computed, because
# NaN >= 4 evaluates to False and flagging first would code unrated rows as 0.
# .assign() returns a new frame, so the cell is safe to re-run.
df = (
    df.dropna(subset=['rating'])
      .assign(highly_rated=lambda rated: (rated['rating'] >= 4).astype(int))
)

In [12]:
# Inspect the newly created indicator column.
df['highly_rated']

0       1
1       1
2       1
3       1
4       1
       ..
2053    0
2054    0
2055    0
2056    0
2057    0
Name: highly_rated, Length: 2045, dtype: int64

In [13]:
# Remove every row that is missing either `distance` or `stars`.
df = df.dropna(subset=['distance', 'stars'], how='any')

In [16]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,hotel_id,city,distance,stars,rating,country,city_actual,rating_reviewcount,center1label,center2label,...,price,offer,offer_cat,year,month,weekend,holiday,nnights,scarce_room,highly_rated
0,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,172,0,0% no offer,2017,11,1,0,1,0,1
1,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,122,1,15-50% offer,2018,1,1,0,1,0,1
2,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,122,1,15-50% offer,2017,12,0,1,1,0,1
3,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,552,1,1-15% offer,2017,12,0,1,4,0,1
4,1,Amsterdam,3.1,4.0,4.3,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,...,122,1,15-50% offer,2018,2,1,0,1,0,1
