#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 02
**CH02A Finding a good deal among hotels: data preparation**

using the hotels-vienna dataset

version 1.0 2021-05-05

In [1]:
import os
import sys
import warnings

import pandas as pd

# Ignore every Python warning from here on
# (presumably to keep the displayed cell outputs uncluttered).
warnings.filterwarnings("ignore")

In [2]:
# Repository root: the part of the working directory's path that
# precedes the "da_case_studies" folder
current_path = os.getcwd()
dirname = current_path.partition("da_case_studies")[0]

# Input and output locations, all built from the repository root
data_in = f"{dirname}da_data_repo/hotels-vienna"
data_out = f"{dirname}da_case_studies/ch02-hotels-data-prep/"
output = f"{data_out}output/"
func = f"{dirname}da_case_studies/ch00-tech-prep/"

# Put the ch00-tech-prep folder on the module search path
sys.path.append(func)

In [3]:
# Import the prewritten helper functions
import py_helper_functions as da

In [4]:
# This case study uses both the clean and the raw files,
# so keep a separate input directory for each of them
# (the same holds for options 1 and 2 once data_in has been set)
data_in_clean = f"{data_in}/clean/"
data_in_raw = f"{data_in}/raw/"

### Read data

In [5]:
# Read the clean, tidy hotels file; this DataFrame becomes the workfile.
# Alternative source: pd.read_csv("https://osf.io/y6jvb/download")
data = pd.read_csv(f"{data_in_clean}hotels-vienna.csv")


In [6]:
# Keep only the variables used in the rest of this part
data = data.loc[
    :,
    [
        "hotel_id",
        "accommodation_type",
        "distance",
        "stars",
        "rating",
        "rating_count",
        "price",
    ],
]

In [7]:
# look at accommodation types: number of observations for each type
data["accommodation_type"].value_counts()

accommodation_type
Hotel                  264
Apartment              124
Pension                 16
Guest House              8
Hostel                   6
Bed and breakfast        4
Apart-hotel              4
Vacation home Condo      2
Name: count, dtype: int64

**********************************************
### Table 1.1
**********************************************


In [8]:
# show the first rows of the workfile
data.head()

Unnamed: 0,hotel_id,accommodation_type,distance,stars,rating,rating_count,price
0,21894,Apartment,2.7,4.0,4.4,36.0,81
1,21897,Hotel,1.7,4.0,3.9,189.0,81
2,21901,Hotel,1.4,4.0,3.7,53.0,85
3,21902,Hotel,1.7,3.0,4.0,55.0,83
4,21903,Hotel,1.2,4.0,3.9,33.0,82


**********************************************
### Table 2.2
**********************************************


In [9]:
# storage type (pandas dtype) of each variable in the data file;
# note that this is a different notion of "type" than the one used in the book
data.dtypes

hotel_id                int64
accommodation_type     object
distance              float64
stars                 float64
rating                float64
rating_count          float64
price                   int64
dtype: object

In [10]:
# all variables of a single observation: the row at position 1 (the second row)
data.iloc[1]

hotel_id              21897
accommodation_type    Hotel
distance                1.7
stars                   4.0
rating                  3.9
rating_count          189.0
price                    81
Name: 1, dtype: object

**********************************************
### Table 2.3
**********************************************


In [11]:
# Restrict the workfile to observations whose accommodation type is "Hotel"
data = data[data["accommodation_type"].eq("Hotel")]

In [12]:
# Number of observations in the workfile
len(data)

264

In [13]:
# First three observations, showing the id, price and distance variables only
data.loc[:, ["hotel_id", "price", "distance"]].head(3)

Unnamed: 0,hotel_id,price,distance
1,21897,81,1.7
2,21901,85,1.4
3,21902,83,1.7


## PART B: repeat part of the cleaning code
using the raw csv data file
 includes some additional output
*********************************************************

*IMPORT AND PREPARE DATA*

variables are downloaded as strings, often in a form that is not helpful;
we need to transform them to numbers that we can use

In [14]:
# Read the raw hotel booking file; from here on `data` refers to the raw data.
# Alternative source: pd.read_csv("https://osf.io/g5dmw/download")
data = pd.read_csv(f"{data_in_raw}hotelbookingdata-vienna.csv")

In [15]:
# Distance to the center is entered as a string in miles with one decimal
# (presumably of the form "<number> <unit>" -- confirm against the raw file).
# Split on the space and keep the first token as a float.
# The vectorised .str accessor passes missing values through as NaN, whereas
# indexing each split result inside a lambda raises on a missing entry.
data["distance"] = data["center1distance"].str.split(" ").str[0].astype(float)
data["distance_alter"] = (
    data["center2distance"].str.split(" ").str[0].astype(float)
)

In [16]:
# The raw accommodationtype string carries the type after an "@" separator:
# keep the part following the first "@" and trim surrounding whitespace.
# Using the .str accessor for the element lookup means a missing value (or a
# string without "@") becomes NaN instead of raising inside a per-row lambda.
data["accommodation_type"] = (
    data["accommodationtype"].str.split("@").str[1].str.strip()
)

In [17]:
# Number of nights: the third space-separated token of price_night, as an
# integer (presumably the string reads like "price for 1 night" -- confirm).
# Vectorised element lookup replaces the per-row lambda; the explicit int64
# keeps the integer width the same on every platform.
data["nnight"] = data["price_night"].str.split(" ").str[2].astype("int64")

In [18]:
# Generate a numerical rating variable from the string variable:
# the rating is the first space-separated token of guestreviewsrating.
# Missing reviews stay missing: .str.split and .str[0] propagate NaN, so no
# explicit type check on each split result is needed.
data["rating"] = data["guestreviewsrating"].str.split(" ").str[0].astype(float)

In [19]:
# check: frequency table of all values incl. missing values
da.tabulate(data["rating"])

Unnamed: 0_level_0,Freq.,Perc.,Cum
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,3,0.007,0.007
2.0,4,0.009,0.016
2.2,4,0.009,0.026
2.5,1,0.002,0.028
2.7,2,0.005,0.033
3.0,12,0.028,0.06
3.2,14,0.033,0.093
3.4,6,0.014,0.107
3.5,30,0.07,0.177
3.7,43,0.1,0.277


In [20]:
# check: frequency table of all values incl. missing values
da.tabulate(data["rating_reviewcount"])

Unnamed: 0_level_0,Freq.,Perc.,Cum
rating_reviewcount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,15,0.035,0.035
2.0,9,0.021,0.056
3.0,9,0.021,0.077
4.0,1,0.002,0.079
5.0,5,0.012,0.091
...,...,...,...
923.0,1,0.002,0.912
985.0,1,0.002,0.914
1122.0,1,0.002,0.916
1541.0,1,0.002,0.919


In [21]:
# Store the number of reviews as a float variable,
# then show its summary statistics as a single row
data["rating_count"] = data["rating_reviewcount"].astype(float)
data["rating_count"].describe().to_frame().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rating_count,395.0,155.293671,191.296684,1.0,26.5,84.0,203.0,1541.0


*RENAME VARIABLES*

In [22]:
# Replace the raw column names with the shorter names used from here on
data = data.rename(
    {
        "rating2_ta": "ratingta",
        "rating2_ta_reviewcount": "ratingta_count",
        "addresscountryname": "country",
        "s_city": "city",
        "starrating": "stars",
    },
    axis="columns",
)

In [23]:
# look at key variables: frequency table of the star rating
da.tabulate(data["stars"])

Unnamed: 0_level_0,Freq.,Perc.,Cum
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1,0.002,0.002
2.0,47,0.109,0.112
2.5,5,0.012,0.123
3.0,141,0.328,0.451
3.5,57,0.133,0.584
4.0,144,0.335,0.919
4.5,8,0.019,0.937
5.0,27,0.063,1.0


In [24]:
# frequency table of rating again, this time with drop_missing=True
# (presumably leaves missing values out of the shares -- see the helper module)
da.tabulate(data["rating"],drop_missing=True)

Unnamed: 0_level_0,Freq.,Perc.,Cum
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,3,0.008,0.008
2.0,4,0.01,0.018
2.2,4,0.01,0.028
2.5,1,0.003,0.03
2.7,2,0.005,0.035
3.0,12,0.03,0.066
3.2,14,0.035,0.101
3.4,6,0.015,0.116
3.5,30,0.076,0.192
3.7,43,0.109,0.301


In [25]:
# Remove the raw columns that have been parsed into new variables above
# (distance, distance_alter, nnight, rating, rating_count)
data = data.drop(
    [
        "center2distance",
        "center1distance",
        "price_night",
        "guestreviewsrating",
        "rating_reviewcount",
    ],
    axis="columns",
)

**********************************************
### Table 2.10
**********************************************


In [26]:
# Look for duplicated observations: sort by hotel_id, then list every row
# whose hotel_id occurs more than once (keep=False flags all copies)
data = data.sort_values("hotel_id")
data.loc[
    data["hotel_id"].duplicated(keep=False),
    [
        "hotel_id",
        "accommodation_type",
        "price",
        "distance",
        "stars",
        "rating",
        "rating_count",
    ],
]

Unnamed: 0,hotel_id,accommodation_type,price,distance,stars,rating,rating_count
128,22050,Hotel,242,0.0,4.0,4.8,404.0
129,22050,Hotel,242,0.0,4.0,4.8,404.0
242,22185,Hotel,84,0.8,3.0,2.2,3.0
241,22185,Hotel,84,0.8,3.0,2.2,3.0


In [27]:
# drop the perfect duplicates: rows that are identical in every column
# (the first occurrence of each is kept)
data = data.drop_duplicates()

**********************************************
### Missing values in text
***********************************************

In [28]:
# Summary statistics with one row per variable; a count below the number
# of observations signals missing values
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,428.0,131.366822,91.580545,27.0,83.0,109.5,146.0,1012.0
stars,428.0,3.434579,0.772278,1.0,3.0,3.5,4.0,5.0
ratingta,325.0,3.990769,0.482638,2.0,3.5,4.0,4.5,5.0
ratingta_count,325.0,556.516923,586.874582,2.0,129.0,335.0,811.0,3171.0
scarce_room,428.0,0.598131,0.49085,0.0,0.0,1.0,1.0,1.0
hotel_id,428.0,22153.502336,146.858477,21894.0,22027.75,22155.5,22279.25,22409.0
offer,428.0,0.679907,0.467058,0.0,0.0,1.0,1.0,1.0
year,428.0,2017.0,0.0,2017.0,2017.0,2017.0,2017.0,2017.0
month,428.0,11.0,0.0,11.0,11.0,11.0,11.0,11.0
weekend,428.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Flag the observations with a missing rating and print how many there are
data["misrating"] = data["rating"].isna()
print(data["misrating"].sum())

35


In [30]:
# frequency table of the missing-rating flag
da.tabulate(data["misrating"])

Unnamed: 0_level_0,Freq.,Perc.,Cum
misrating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,393,0.918,0.918
True,35,0.082,1.0


In [31]:
# Number of observations with and without a rating, by accommodation type
# (margins=True adds the "All" totals)
pd.crosstab(
    index=data["accommodation_type"],
    columns=data["misrating"],
    margins=True,
)

misrating,False,True,All
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apart-hotel,4,0,4
Apartment,92,32,124
Bed and breakfast,4,0,4
Guest House,7,1,8
Hostel,6,0,6
Hotel,263,1,264
Pension,16,0,16
Vacation home Condo,1,1,2
All,393,35,428


In [32]:
# mean price by accommodation type and missing-rating flag, rounded to
# two decimals; margins=True adds the "All" row and column
pd.crosstab(
    index=data["accommodation_type"],
    columns=data["misrating"],
    values=data["price"],
    aggfunc="mean",
    margins=True,
).round(2)

misrating,False,True,All
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apart-hotel,121.25,,121.25
Apartment,136.45,179.06,147.44
Bed and breakfast,118.25,,118.25
Guest House,71.0,103.0,75.0
Hostel,53.67,,53.67
Hotel,130.02,106.0,129.93
Pension,96.06,,96.06
Vacation home Condo,107.0,116.0,111.5
All,127.66,173.0,131.37


In [33]:
# List the hotels whose rating is missing, showing the key variables only
data.loc[
    data["misrating"] & data["accommodation_type"].eq("Hotel"),
    [
        "hotel_id",
        "accommodation_type",
        "price",
        "distance",
        "stars",
        "rating",
        "rating_count",
    ],
]

Unnamed: 0,hotel_id,accommodation_type,price,distance,stars,rating,rating_count
14,21916,Hotel,106,0.7,2.5,,
