In [1]:
# Import libraries
from collections import Counter, OrderedDict
from itertools import chain
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 3000)
pd.options.display.max_colwidth = 1000
import random

In [2]:
# Location of the raw scraped hotel listings
hotels_path = "../data/hotels.csv"

# The file carries no header row, so pandas numbers the columns 0, 1, 2, ...
hotels_df = pd.read_csv(filepath_or_buffer=hotels_path, header=None)

# Peek at the first five rows to confirm the load worked
hotels_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,"Mutterstadt, Germany",RheinCity Hotel,4.0,4,"Zollhofstraße 11, Ludwigshafen, RP, 67059, Germany",Ludwigshafen,"5.4 miles to Mutterstadt, 3.3 miles to Mannheim (MHG)",107.0,https://www.hotels.com/ho804141952/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=1&tab=description&JHR=3&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
1,"Mutterstadt, Germany",NYX Hotel Mannheim by Leonardo Hotels,4.0,14,"F4, 4-11, Mannheim, BW, 68159, Germany",Mannheim,"6.2 miles to Mutterstadt, 2.8 miles to Mannheim (MHG)",78.0,https://www.hotels.com/ho408334/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=2&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
2,"Mutterstadt, Germany","Radisson Blu Hotel, Mannheim",4.5,387,"Quadrant Q7, 27, Mannheim, 68161, Germany",Mannheim,"6.4 miles to Mutterstadt, 2.3 miles to Mannheim (MHG)",na,https://www.hotels.com/ho626280/?pa=3&tab=description&q-room-0-adults=2&intlid=SoldOutListing&ZSX=0&SYE=3&q-room-0-children=0
3,"Mutterstadt, Germany",ACHAT Comfort Frankenthal/Pfalz,3.5,64,"Mahlastrasse 18, Frankenthal, RP, 67227, Germany",Frankenthal,"5.9 miles to Mutterstadt, 8.3 miles to Mannheim (MHG)",83.0,https://www.hotels.com/ho180949/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=4&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3
4,"Mutterstadt, Germany",Relax Apartment,na,na,"F4 14-15, Mannheim, 68159, Germany",Mannheim,"6.2 miles to Mutterstadt, 2.7 miles to Mannheim (MHG)",na,https://www.hotels.com/ho685840/?q-check-out=2020-08-29&FPQ=2&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=5&tab=description&JHR=2&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3


In [3]:
# Give the positional columns descriptive names
column_names = {
    0: "location",
    1: "hotel_name",
    2: "rating",
    3: "popularity_rating",
    4: "address",
    5: "locality",
    6: "landmark",
    7: "price",
    8: "URL",
}
hotels_df.rename(columns=column_names, inplace=True)

In [4]:
# Count the missing values in each column
hotels_df.isnull().sum()

location                0
hotel_name              0
rating                  0
popularity_rating       0
address                 0
locality                0
landmark             2215
price                   0
URL                     0
dtype: int64

In [5]:
# Check variable types: prints each column's dtype and non-null count.
# The numeric-looking columns (rating, popularity_rating, price) are still
# text here; they are converted to floats in a later cell.
hotels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480020 entries, 0 to 480019
Data columns (total 9 columns):
location             480020 non-null object
hotel_name           480020 non-null object
rating               480020 non-null object
popularity_rating    480020 non-null object
address              480020 non-null object
locality             480020 non-null object
landmark             477805 non-null object
price                480020 non-null object
URL                  480020 non-null object
dtypes: object(9)
memory usage: 33.0+ MB


In [6]:
# Break each "city, country" location string into one column per
# comma-separated part.
# NOTE(review): a location containing more than one comma yields a third
# column (it shows up as column "2" in later outputs) -- confirm how those
# rows should be handled.
location_df = hotels_df["location"].str.split(pat=",", expand=True)

In [7]:
# Name the first two location parts; any further split columns keep their
# numeric labels
location_df.rename(columns={0: "city", 1: "country"}, inplace=True)

In [8]:
# Attach the city/country columns from location_df to the remaining hotel
# columns, aligning the two frames row by row on their index
hotel_columns = ["hotel_name", "rating", "address", "popularity_rating",
                 "locality", "price", "landmark", "URL"]
hotels_df = location_df.merge(hotels_df[hotel_columns], how="right",
                              left_index=True, right_index=True)

In [9]:
# Convert a single value to float, passing missing values through as np.nan
def f(x):
    """Return ``x`` as a float, or ``np.nan`` when ``x`` is missing.

    The previous ``x != np.nan`` guard was always True (NaN never compares
    equal to anything), so missing values such as ``None`` reached
    ``float()`` and raised ``TypeError``. ``pd.isna`` detects ``None``,
    ``np.nan`` and ``pd.NA`` correctly.

    Parameters
    ----------
    x : scalar
        A number, a numeric string, or a missing-value marker.

    Returns
    -------
    float
        ``float(x)``, or ``np.nan`` for missing input.

    Raises
    ------
    ValueError
        If ``x`` is a non-missing value that ``float()`` cannot parse
        (e.g. the raw "na" placeholder, which callers replace beforehand).
    """
    return np.nan if pd.isna(x) else float(x)

# Turn the scraped text columns into floats. For each column the "na"
# placeholder is swapped for np.nan first, then every value goes through f.
for numeric_column in ("price", "rating", "popularity_rating"):
    without_placeholder = hotels_df[numeric_column].replace("na", np.nan)
    hotels_df[numeric_column] = without_placeholder.map(f)

# Strip surrounding whitespace from a string; leave non-strings untouched
def g(x):
    """Return ``x`` without leading/trailing whitespace.

    Non-string values (``None``, ``np.nan``) are returned unchanged instead
    of raising ``AttributeError``. This matters when a value being stripped
    is missing, e.g. a location that had no comma to split on.

    Parameters
    ----------
    x : str or missing value
        The cell value to clean.

    Returns
    -------
    str or the original object
        The stripped string, or ``x`` itself when it is not a string.
    """
    return x.strip() if isinstance(x, str) else x

# Remove the whitespace left around each country name by the comma split
hotels_df["country"] = hotels_df["country"].apply(g)

In [12]:
# Show the rows that have no landmark
print(hotels_df.loc[hotels_df["landmark"].isna()])

                               city         country     2  \
12529                      Montréal          Canada  None   
12530                      Montréal          Canada  None   
12531                      Montréal          Canada  None   
12532                      Montréal          Canada  None   
12979                       Elbasan         Albania  None   
12980                       Elbasan         Albania  None   
12981                       Elbasan         Albania  None   
13281                Deutschneudorf         Germany  None   
13282                Deutschneudorf         Germany  None   
29059                        Venaco          France  None   
31119                      Silvassa           India  None   
31120                      Silvassa           India  None   
31121                      Silvassa           India  None   
34055                    Persenbeug         Austria  None   
34056                    Persenbeug         Austria  None   
34057                   

In [11]:
# Manually researched landmarks for rows that were scraped without one.
#
# Keys are row POSITIONS in hotels_df (looked up through hotels_df.index
# below, exactly as before); values are the landmark text to store.
# Keeping the data in one table replaces ~170 copy-pasted assignment
# statements and makes typos easy to spot.
#
# Literal fixes relative to the previous hand-written statements:
#   * 18549: "2.4 miles Denia Marina" -> "2.4 miles to Denia Marina"
#     (the "to" was missing, unlike every other entry)
#   * 39323, 29056, 15217: trailing whitespace removed
MANUAL_LANDMARKS = {
    # Austria
    39319: "6.1 miles to Serfaus-Fiss-Ladis",
    39320: "6.2 miles to Arlberg Pass",
    39321: "5.2 miles to Arlberg Pass",
    39322: "5.0 miles to Arlberg Pass",
    39323: "4.2 miles to St. Christoph am Arlberg Ski Area",
    39324: "3.2 miles to Acherkogel Gondola",
    39325: "10.0 miles to Serfaus-Fiss-Ladis",

    # Ecuador
    300016: "21.0 miles to Quilotoa Lagoon",
    300017: "7.3 miles to Plaza Sucre",
    300018: "0.3 miles to Mausoleum of Montalvo",
    300019: "13.9 miles to Plaza Sucre",
    300020: "13.3 miles to Plaza Sucre",
    300021: "21.3 miles to Quilotoa Lagoon",
    300022: "7.3 miles to Mausoleum of Montalvo",
    300023: "13.9 miles to Church of the Virgin of the Holy Water",
    300024: "7.5 miles to Cotopaxi National Park",
    300025: "9.1 miles to Mausoleum of Montalvo",
    300026: "0.2 miles to Col Nac Bolivar Museum",
    300027: "9.0 miles to Quilotoa Lagoon",
    300028: "16.9 miles to Cotopaxi National Park",

    # Australia
    162364: "6.1 miles to Cooper Cove Marina",
    162365: "13.6 miles to Moonta Golf Course",
    162366: "3.2 miles to Wallaroo Golf Course",

    # Italy
    154538: "7.1 miles to Fort of Fortezza",
    154541: "2.4 miles to Kurhaus",
    154542: "0.1 miles to Kurhaus",
    154543: "1.0 miles to Kurhaus",
    154544: "1.7 miles to Tirol Castle",
    154546: "0.4 miles to Kurhaus",
    154547: "0.4 miles to Kurhaus",

    # Turkey
    150970: "2.5 miles to Marmaris Beach",
    150971: "1.0 miles to Marmaris Beach",
    150972: "19.4 miles to Marmaris Beach",
    150973: "2.0 miles to Marmaris Beach",
    150974: "1.5 miles to Marmaris Beach",
    150975: "1.5 miles to Marmaris Beach",
    150976: "0.5 miles to Atlantis Water Park",
    150977: "0.2 miles to Marmaris Beach",
    150978: "0.8 miles to Marmaris Beach",
    150979: "0.5 miles to Atlantis Water Park",
    150980: "4.0 miles to Atlantis Water Park",
    150981: "0.5 miles to Marmaris Beach",
    150982: "0.1 miles to Marmaris Beach",
    150983: "1.2 miles to Atlantis Water Park",
    150984: "0.1 miles to Atlantis Water Park",
    150985: "0.2 miles to Marmaris Beach",
    150986: "0.1 miles to Marmaris Beach",
    150987: "0.1 miles to Marmaris Beach",
    150988: "0.4 miles to Atlantis Water Park",
    150989: "1.3 miles to Atlantis Water Park",
    150990: "0.6 miles to Atlantis Water Park",
    150991: "0.1 miles to Marmaris Beach",
    150992: "0.2 miles to Icmeler Beach",
    150993: "1.2 miles to Icmeler Beach",
    150994: "2.6 miles to Marmaris Beach",
    150995: "1.5 miles to Icmeler Beach",
    150996: "6.9 miles to Icmeler Beach",
    150997: "0.3 miles to Turunc Beach",
    150998: "0.2 miles to Turunc Beach",
    150999: "0.3 miles to Marmaris Beach",
    151000: "6.3 miles to Girl Sand Beach",
    151001: "0.1 miles to Marmaris Beach",
    151002: "13.5 miles to Girl Sand Beach",
    151003: "6.8 miles to Turgut Falls",
    151004: "1.2 miles to Akyaka Beach",
    151005: "0.5 miles to Akyaka Azmak Deresi",
    151006: "0.3 miles to Akyaka Azmak Deresi",
    151007: "11.5 miles to Turgut Falls",
    151008: "3.6 miles to Sulungur Lake",
    151009: "3.8 miles to Sulungur Lake",
    151010: "3.8 miles to Sulungur Lake",
    151011: "4.7 miles to Sulungur Lake",
    151012: "3.9 miles to Sulungur Lake",
    151013: "4.2 miles to Sulungur Lake",
    151014: "0.2 miles to Dalyan Mosque",
    151015: "0.5 miles to Dalyan Mosque",
    151016: "0.7 miles to Dalyan Mosque",
    151017: "3.9 miles to Sulungur Lake",
    151018: "0.1 miles to Atlantis Water Park",

    # France
    29049: "3.8 miles to Gorges de Tavignano",
    29050: "7.4 miles to Punta Di Ciaccone",
    29051: "7.4 miles to Punta Di Ciaccone",
    29052: "2.3 miles to Gaffory Square",
    29053: "2.4 miles to Cascade des Anglais",
    29054: "8.9 miles to Gaffory Square",
    29055: "16.2 miles to Vergio Pass",
    29056: "12.9 miles to Vergio Pass",
    29057: "13.1 miles to Pascal Paoli Museum",
    29058: "8.9 miles to Cascade des Anglais",
    29060: "8.3 miles to Cervione Church",
    29061: "11.1 miles to Plage de Solenzara",
    29062: "2.0 miles to Pascal Paoli Museum",
    29063: "3.1 miles to Parc Galea",
    29064: "4.6 miles to Parc Galea",
    29065: "2.4 miles to Church of San Nicolao",
    29066: "2.7 miles to Church of San Nicolao",
    29067: "10.3 miles to Ostriconi Beach",
    29068: "3.0 miles to Parc Galea",
    29069: "3.3 miles to Church of Canonica",
    29070: "7.0 miles to Parc Galea",
    29071: "5.7 miles to Calanches",
    29072: "3.5 miles to Church of Canonica",
    29073: "10.6 miles to Place Paoli",
    29074: "4.4 miles to Genoese Tower",
    29075: "5.6 miles to Plage de Lozari",
    29076: "4.0 miles to Cupulatta",
    29077: "3.9 miles to Calanques de Piana",
    29078: "8.1 miles to Calanques de Piana",
    29079: "1.9 miles to Calanques de Piana",
    29080: "3.9 miles to Pascal Paoli Museum",

    # Slovakia
    7045: "5.6 miles to Ethnographic Museum",
    7046: "4.4 miles to Liptovsky Mara",
    7047: "4.1 miles to Chopok",
    7048: "5.8 miles to Gothal",
    7049: "3.9 miles to Museum of Nature Protection and Speleology",
    7050: "3.1 miles to Museum of Nature Protection and Speleology",
    7051: "4.9 miles to Namestie Osloboditelov",
    7052: "9.0 miles to Wooden church Hronsek UNESCO",
    7053: "6.0 miles to Archaeological Museum Havránok",

    # Spain
    9123: "3.5 miles to Las Rozas Dam",
    9124: "13.2 miles to Poza Julia Museum",
    9125: "3.2 miles to Muniellos Nature Reserve",
    9126: "6.7 miles to Muniellos Nature Reserve",
    9127: "7.4 miles to Dominio de Tares",
    9128: "7.3 miles to Dominio de Tares",
    9129: "9.1 miles to Saliencia Lakes",

    # Brazil
    10682: "3.4 miles to Itapaiva Castle",
    10683: "6.8 miles to Itapaiva Castle",
    10684: "1.0 miles to Imperial Museum",
    10685: "7.2 miles to Itapaiva Castle",
    10686: "1.2 miles to Imperial Museum",
    10687: "4.7 miles to Itapaiva Castle",
    10688: "8.2 miles to Ferreira da Cunha Museum of Arms",
    10689: "2.3 miles to Judith Fountain",

    # Germany
    13272: "27.6 miles to Ore Mountain Museum",
    13273: "5.7 miles to Gottfried Silbermann Museum",
    13274: "2.4 miles to Altenberg Bobsleigh",
    13275: "3.9 miles to Pferdegöpel auf dem Rudolphschacht",
    13276: "6.6 miles to Ore Mountain Museum",
    13277: "4.5 miles to Saidenbach Dam",
    13278: "3.5 miles to Saidenbach Dam",
    13279: "9.1 miles to Wolkenstein Castle",
    13280: "4.2 miles to Wolkenstein Castle",
    13283: "21.2 miles to Ore Mountain Museum",

    # Brazil
    15161: "2.8 miles to Santana do Riacho Waterfall",
    15162: "13.2 miles to Andorinhas Waterfall",
    15163: "4.0 miles to Santana do Riacho Waterfall",
    15164: "13.2 miles to Andorinhas Waterfall",
    15165: "14.8 miles to Peter Lund Museum",
    15166: "2.4 miles to Santana do Riacho Waterfall",

    # Sri Lanka
    15214: "2.8 miles to Kushtarajagala Statue",
    15215: "5.9 miles to Mirissa Beach",
    15216: "2.0 miles to Mirissa Beach",
    15217: "5.9 miles to Mirissa Beach",
    15218: "2.8 miles to Kushtarajagala Statue",
    15219: "4.8 miles to Mirissa Beach",

    # Russia
    18137: "1.2 miles to City Center",
    18138: "6.8 miles to City Center",
    18139: "4.8 miles to City Center",

    # Spain
    18545: "0.2 miles to Les Platgetes",
    18546: "0.5 miles to Arenal-Bol Beach",
    18547: "8.3 miles to Denia Marina",
    18548: "6.9 miles to Denia Marina",
    18549: "2.4 miles to Denia Marina",
    18550: "2.1 miles to Mirador Cronistas de Espana",
    18551: "2.1 miles to Cova de L'Aigua",
    18552: "3.0 miles to Cova de L'Aigua",
    18553: "2.7 miles to Albir Beach",
    18554: "0.5 miles to Mirador Cronistas de Espana",
    18555: "2.0 miles to Albir Beach",

    # Chile
    20462: "8.2 miles to Lake Frio",
    20463: "15.8 miles to Lake Frio",
    20464: "15.8 miles to Lake Frio",
}

# Write every manual value into the landmark column. hotels_df.index[...]
# turns the stored position into the row's label for .loc.
# NOTE(review): the positions were presumably read off a frame whose index
# was still the default 0..n-1 range -- run this before any rows are dropped.
for row_position, landmark_text in MANUAL_LANDMARKS.items():
    hotels_df.loc[hotels_df.index[row_position], "landmark"] = landmark_text

In [30]:
# Discard every row whose landmark is still missing. The surviving rows keep
# their original index labels (the index is not reset).
hotels_df = hotels_df.dropna(subset=["landmark"], axis="index", how="any")

In [13]:
# Count the missing values in each column
hotels_df.isnull().sum()

city                      0
country                   0
2                    479999
hotel_name                0
rating               170435
address                   0
popularity_rating    170435
locality                  0
price                284676
landmark               2042
URL                       0
dtype: int64

In [14]:
# Make every landmark value a str. Note that str() turns a missing value
# (NaN) into the literal text "nan", so it no longer counts as null.
hotels_df["landmark"] = hotels_df["landmark"].map(str)

In [15]:
# Convert USA to United States.
# The country column was already stripped of surrounding whitespace in an
# earlier cell, so the old pattern " USA" (with a leading space) could never
# match. Match the stripped spelling; the padded one is kept as well so the
# cell also works on unstripped data.
hotels_df["country"] = hotels_df["country"].replace(
    {"USA": "United States", " USA": "United States"}
)

In [16]:
# Check that no "USA" values remain in the country column (expected: empty).
# Compare against the whitespace-stripped value: the country column is
# stripped in an earlier cell, so testing for " USA" with a leading space
# would always come back empty, even if "USA" rows were still present.
print(hotels_df[hotels_df["country"].str.strip() == "USA"])

Empty DataFrame
Columns: [city, country, 2, hotel_name, rating, address, popularity_rating, locality, price, landmark, URL]
Index: []


In [19]:
# Fix the strange values in the country column: the London rows labelled
# 49615-49620 should read "United Kingdom".
#
# Rows are addressed by index LABEL (.loc slices include both ends). The
# previous hotels_df.index[49615] lookup was positional, so once rows have
# been dropped (see the dropna on the landmark column) position 49615 no
# longer holds label 49615 and a different hotel would be overwritten.
hotels_df.loc[49615:49620, "country"] = "United Kingdom"

In [20]:
# Inspect every row whose city is London
print(hotels_df.loc[hotels_df["city"].eq("London")])

         city         country     2  \
1808   London          Canada  None   
1809   London          Canada  None   
1810   London          Canada  None   
1811   London          Canada  None   
1812   London          Canada  None   
1813   London          Canada  None   
6361   London  United Kingdom  None   
6362   London  United Kingdom  None   
6363   London  United Kingdom  None   
6364   London  United Kingdom  None   
6365   London  United Kingdom  None   
6366   London  United Kingdom  None   
6367   London  United Kingdom  None   
49615  London  United Kingdom  None   
49616  London  United Kingdom  None   
49617  London  United Kingdom  None   
49618  London  United Kingdom  None   
49619  London  United Kingdom  None   
49620  London  United Kingdom  None   

                                               hotel_name  rating  \
1808                                 London Extended Stay     2.5   
1809   Country Inn & Suites by Radisson, London South, ON     4.5   
1810        

In [21]:
# Inspect every row whose city is Paris
print(hotels_df.loc[hotels_df["city"].eq("Paris")])

         city country     2  \
217     Paris  France  None   
218     Paris  France  None   
219     Paris  France  None   
220     Paris  France  None   
221     Paris  France  None   
222     Paris  France  None   
223     Paris  France  None   
224     Paris  France  None   
225     Paris  France  None   
226     Paris  France  None   
227     Paris  France  None   
228     Paris  France  None   
229     Paris  France  None   
230     Paris  France  None   
231     Paris  France  None   
232     Paris  France  None   
233     Paris  France  None   
234     Paris  France  None   
235     Paris  France  None   
236     Paris  France  None   
237     Paris  France  None   
238     Paris  France  None   
239     Paris  France  None   
240     Paris  France  None   
241     Paris  France  None   
242     Paris  France  None   
243     Paris  France  None   
244     Paris  France  None   
245     Paris  France  None   
246     Paris  France  None   
247     Paris  France  None   
248     

In [22]:
# Check countries and cities against the scraped details for discrepancies
def discrepancy_1(x):
    """Tell whether row ``x`` looks unrelated to the location it is filed under.

    A row counts as a discrepancy when its country does not occur in the
    address AND its city occurs in none of address, locality and landmark.
    All checks are case-sensitive substring tests.

    Parameters
    ----------
    x : row-like
        Object exposing string attributes ``country``, ``city``, ``address``,
        ``locality`` and ``landmark`` (e.g. a DataFrame row passed by
        ``apply(..., axis=1)``).

    Returns
    -------
    bool
        True for a discrepancy, False as soon as any of the checks matches.
    """
    return not (
        x.country in x.address
        or x.city in x.address
        or x.city in x.locality
        or x.city in x.landmark
    )

# Create a boolean discrepancy_1 column by evaluating the check on every row
discrepancy_flags = hotels_df.apply(discrepancy_1, axis=1)
hotels_df["discrepancy_1"] = discrepancy_flags

In [23]:
# Collect the rows flagged by the discrepancy check
discrepancy = hotels_df.loc[hotels_df["discrepancy_1"]]

In [24]:
# Preview the flagged rows. Only the first ten are shown: the frame is
# large and wide, and rendering it up to the notebook's display.max_rows
# limit would bloat the notebook.
discrepancy.head(10)

Unnamed: 0,city,country,2,hotel_name,rating,address,popularity_rating,locality,price,landmark,URL,discrepancy_1
1299,St. Moritz,Switzerland,,Ayres Suites Mission Viejo,4.5,"28941 Los Alisos Blvd, Mission Viejo, CA, 92692, United States",482.0,Mission Viejo,184.0,"8.3 miles to City center, 20 miles to Disneyland®",https://www.hotels.com/travelads/trackredirect.html?trackingUrl=H4sIAAAAAAAAAD2Tx66raBCE3-ZsmHvgJ3Mla0TOGDDBZjMiG5NzePo5uiPNpj5VtVrqRdd7WYb5NwwvU7zlTZzN33WfZdV32rew9ydjs_mRT1uV5vCGw0q_5M38_4hvqrT--2c5rauuFOIlvvHt8Ov1ueiW9Xw9sbsWW2QmfAhA4M8dNz_OzKcGdb_rS9WwvXN1XU4cHfv-EIX6Qiu5rC88jVmHw6C6WQGwBiCEviRPOWQ_NMU1T0LLHFI6N6A-p2KEroUjqeCUNYmB7wtvpBdSdIlkkIGioTTlmdPnYYO04wVJ8Zdg8YbQHkoU66F9QDaOxuXHdRK0Fx9Ji5l6KMs9oHQSL9uMnjywcgDsXkvOdOiBxowfxROjgieSZ7tCTRaLi6c8c_bzE1sX8zETowsob3SSwp2CQqfSdwfhxYrNaA38WBV1j1VWYV_m5CRGvUBHtmACGZWmSy9UM3zSq-8o1sA9aEQ23pdnj9J9CO5zZJbMuW36vqvum3Hf6MXX1-aVFOntPG00r7lqSlkzIa1Q7345zjv5IaEt9pRopPrZ_TTcnKjJZmRz8Hphu63gNtg8q3kQOEzwpIxNgrSIqXdFAFYH0WKtk4l95nghvX6lUdiRzljGH2dCj2Y_I0asLIrjw7YNN-wswdNYcQZ6tXBCsHtnzJxnmf7n_pzgyp5Erdl9WVtDz8w9tKZjHuvwjwmfQMbLe3Rh4rlb1dA3k3KwHDc9ezqEV_Yd-birdqf1TuaS0RKfqzXGGRJJ5O5Ij6v0W8YYFcV8OG_3i5qdqHrpMgG_nkexiMsx1HgLyAD2zxHItb0pQylsVGLqQm9V1OX4I3KCRHtHssA_KbWO2zI5ISkIOFJj9SqVeaJdzXKPX0NoHlwD-657pPpDTzkIyM09rqpQCM2mhu2TYQ4Epja5u0ta2djKKY7R...,True
1300,St. Moritz,Switzerland,,Atrium Hotel at Orange County Airport,3.0,"18700 Macarthur Blvd, Irvine, CA, 92612, United States",369.0,Irvine,129.0,"5.4 miles to City center, 9.9 miles to Disneyland®",https://www.hotels.com/ho117521/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=2&tab=description&JHR=3&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3,True
1301,St. Moritz,Switzerland,,Anaheim Majestic Garden Hotel,,"900 S Disneyland Dr, Anaheim, CA, 92802, United States",,Anaheim Resort,,"13 miles to City center, 0.7 miles to Disneyland®",https://www.hotels.com/ho105644/?pa=3&tab=description&q-room-0-adults=2&intlid=SoldOutListing&ZSX=0&SYE=3&q-room-0-children=0,True
1302,St. Moritz,Switzerland,,Ayres Hotel & Suites Costa Mesa/Newport Beach,,"325 Bristol Street, Costa Mesa, CA, 92626, United States",,Costa Mesa,182.0,"6.6 miles to City center, 10 miles to Disneyland®",https://www.hotels.com/ho126646/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=4&tab=description&JHR=4&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3,True
1303,St. Moritz,Switzerland,,Ayres Hotel Orange,,"200 N. The City Drive, Orange, CA, 92868, United States",,Anaheim Resort,169.0,"10 miles to City center, 2.3 miles to Disneyland®",https://www.hotels.com/ho471596/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=5&tab=description&JHR=4&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3,True
1304,St. Moritz,Switzerland,,"Country Inn & Suites by Radisson, John Wayne Airport, CA",4.0,"2701 Hotel Terrace, Santa Ana, CA, 92705, United States",445.0,Santa Ana,109.0,"5.3 miles to City center, 7.9 miles to Disneyland®",https://www.hotels.com/ho560780/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=6&tab=description&JHR=3&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3,True
1305,St. Moritz,Switzerland,,Ayres Hotel Fountain Valley/Huntington Beach,4.5,"17550 Brookhurst Street, Fountain Valley, CA, 92708, United States",651.0,Fountain Valley,189.0,"11 miles to City center, 7.3 miles to Disneyland®",https://www.hotels.com/ho442731/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=7&tab=description&JHR=4&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3,True
1537,Lachi,Cyprus,,The St. George Inn,3.5,"135 Franklin Boulevard, St. George Island, FL, 32328, United States",244.0,St. George Island,105.0,"15 miles to Apalachicola, 8.5 miles to Apalachicola Chocalate Company",https://www.hotels.com/ho551028/?q-check-out=2020-08-29&FPQ=3&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=1&tab=description&JHR=3&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3,True
1538,Lachi,Cyprus,,Cape San Blas Inn,5.0,"4950 Cape San Blas Road, Port St. Joe, FL, 32456, United States",349.0,Port St. Joe,205.0,"17 miles to Apalachicola, 24 miles to Apalachicola Chocalate Company",https://www.hotels.com/ho689422016/?q-check-out=2020-08-29&FPQ=4&q-check-in=2020-08-27&WOE=6&WOD=4&q-room-0-children=0&pa=2&tab=description&JHR=5&q-room-0-adults=2&YGF=14&MGT=2&ZSX=0&SYE=3,True
1539,Lachi,Cyprus,,Port Inn,4.0,"501 Monument Ave., Port St. Joe, FL, 32456, United States",316.0,Port St. Joe,,"14 miles to Apalachicola, 20 miles to Apalachicola Chocalate Company",https://www.hotels.com/ho499940/?pa=3&tab=description&q-room-0-adults=2&intlid=SoldOutListing&ZSX=0&SYE=3&q-room-0-children=0,True


In [25]:
# Summarise the flagged rows: row count, column dtypes and non-null counts
discrepancy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57640 entries, 1299 to 479961
Data columns (total 12 columns):
city                 57640 non-null object
country              57640 non-null object
2                    20 non-null object
hotel_name           57640 non-null object
rating               33962 non-null float64
address              57640 non-null object
popularity_rating    33962 non-null float64
locality             57640 non-null object
price                22281 non-null float64
landmark             57640 non-null object
URL                  57640 non-null object
discrepancy_1        57640 non-null bool
dtypes: bool(1), float64(3), object(8)
memory usage: 5.3+ MB


In [26]:
# Drop discrepancies. The explicit .copy() makes the filtered result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
hotels_df = hotels_df[~hotels_df["discrepancy_1"]].copy()

# Convert all landmarks to lowercase
hotels_df["landmark"] = hotels_df["landmark"].str.lower()

# Split each landmark string into a list of entries, one per line
# NOTE(review): the sample rows shown after loading separate landmarks with
# ", " rather than "\n" -- confirm which delimiter the data really uses.
hotels_df["landmark"] = hotels_df["landmark"].str.split("\n")

In [27]:
# Sanity check: row count, column dtypes and non-null counts after the
# discrepancy rows have been removed
hotels_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 422380 entries, 0 to 480019
Data columns (total 12 columns):
city                 422380 non-null object
country              422380 non-null object
2                    1 non-null object
hotel_name           422380 non-null object
rating               275623 non-null float64
address              422380 non-null object
popularity_rating    275623 non-null float64
locality             422380 non-null object
price                173063 non-null float64
landmark             422380 non-null object
URL                  422380 non-null object
discrepancy_1        422380 non-null bool
dtypes: bool(1), float64(3), object(8)
memory usage: 39.1+ MB


In [28]:
# Re-count the missing values in each column after cleaning
hotels_df.isnull().sum()

city                      0
country                   0
2                    422379
hotel_name                0
rating               146757
address                   0
popularity_rating    146757
locality                  0
price                249317
landmark                  0
URL                       0
discrepancy_1             0
dtype: int64

In [29]:
# Summary statistics of the numeric columns; the fill values used in the
# next cell are read off this table
hotels_df.describe()

Unnamed: 0,rating,popularity_rating,price
count,275623.0,275623.0,173063.0
mean,4.113869,174.496555,129.713324
std,0.66881,215.692887,109.072913
min,1.0,1.0,1.0
25%,4.0,22.0,72.0
50%,4.0,84.0,100.0
75%,4.5,244.0,148.0
max,5.0,999.0,998.0


In [30]:
# Fill the remaining missing numeric values with fixed defaults chosen from
# the .describe() summary above
fill_values = {"price": 130, "rating": 4.0, "popularity_rating": 175}
for column_name, default_value in fill_values.items():
    hotels_df[column_name] = hotels_df[column_name].fillna(default_value)

In [31]:
# Save the cleaned frame, index included.
# NOTE(review): every column of hotels_df is written as-is, including the
# discrepancy_1 helper flag -- confirm downstream readers expect that.
hotels_df.to_csv(path_or_buf="../data/clean_hotels_scraped_v2.csv", index=True)