In [1]:
# Import libraries
import datetime
import functools
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd
# Display configuration: show up to 500 rows and up to 1000 characters per
# column value when rendering DataFrames in the notebook.
pd.set_option("display.max_rows", 500)
pd.options.display.max_colwidth = 1000
import seaborn as sns

In [2]:
# Locations of the two input .csv files (relative to the notebook)
hotels_path = "../data/hotels_com_scrape.csv"
trivago_path = "../data/train.csv"

# Load both datasets. The hotels scrape is read without a header row, so its
# columns get integer labels; the trivago file uses its first row as header.
trivago_df = pd.read_csv(trivago_path)
hotels_df = pd.read_csv(hotels_path, header=None)

In [3]:
# Sanity check: preview the first rows of the hotels data. Columns are still
# integer-labelled here because the file was read with header=None.
hotels_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,"Mutterstadt, Germany",RheinCity Hotel,3.5,"Zollhofstraße 11, Ludwigshafen, RP, 67059, Germany",Ludwigshafen,5.4 miles to Mutterstadt\n3.3 miles to Mannheim (MHG),107.0
1,"Mutterstadt, Germany",NYX Hotel Mannheim by Leonardo Hotels,4.0,"F4, 4-11, Mannheim, BW, 68159, Germany",Mannheim,6.2 miles to Mutterstadt\n2.8 miles to Mannheim (MHG),78.0
2,"Mutterstadt, Germany","Radisson Blu Hotel, Mannheim",4.5,"Quadrant Q7, 27, Mannheim, 68161, Germany",Mannheim,6.4 miles to Mutterstadt\n2.3 miles to Mannheim (MHG),na
3,"Mutterstadt, Germany",ACHAT Comfort Frankenthal/Pfalz,3.5,"Mahlastrasse 18, Frankenthal, RP, 67227, Germany",Frankenthal,5.9 miles to Mutterstadt\n8.3 miles to Mannheim (MHG),84.0
4,"Mutterstadt, Germany",Leonardo Royal Hotel Mannheim,3.5,"Augustaanlage 4-8, Mannheim, BW, 68165, Germany",Mannheim,6.5 miles to Mutterstadt\n2.0 miles to Mannheim (MHG),80.0


In [4]:
# Give the integer-labelled columns (0..6) descriptive names
hotels_df = hotels_df.rename(
    columns=dict(enumerate(["location", "hotel_name", "rating", "address",
                            "locality", "landmark", "price"]))
)

In [5]:
# Check variable types and non-null counts per column. Missing values in this
# file can be encoded as the string "na" (handled in the following cells), so
# the non-null counts shown here do not yet reflect them.
hotels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6128 entries, 0 to 6127
Data columns (total 7 columns):
location      6128 non-null object
hotel_name    6128 non-null object
rating        6128 non-null object
address       6128 non-null object
locality      6128 non-null object
landmark      6096 non-null object
price         6128 non-null object
dtypes: object(7)
memory usage: 335.2+ KB


In [6]:
# Turn the "na" string placeholder in price into a real missing value
hotels_df["price"] = hotels_df["price"].replace({"na": np.nan})

In [7]:
# Convert price to float
def f(x):
    """Return ``x`` as a float, or ``np.nan`` when ``x`` is missing.

    The previous guard ``x != np.nan`` was always True (NaN never compares
    equal to anything, including itself), so missing values were only handled
    by accident through ``float(nan)``. ``pd.notna`` is the correct missing
    check and also covers ``None``.
    """
    return float(x) if pd.notna(x) else np.nan


hotels_df["price"] = hotels_df["price"].map(f)

In [8]:
# Treat the "na" placeholder in rating as missing, then cast every value to
# float with the same converter `f` that was used for price.
hotels_df["rating"] = hotels_df["rating"].replace({"na": np.nan}).map(f)

In [9]:
# Summary statistics (count, mean, spread, quartiles) for the columns that
# were converted to float above: rating and price.
hotels_df.describe()

Unnamed: 0,rating,price
count,5180.0,2126.0
mean,4.168243,124.920978
std,0.560257,96.220313
min,1.0,10.0
25%,4.0,68.0
50%,4.0,100.0
75%,4.5,149.0
max,5.0,977.0


In [10]:
# Count missing values per column (after "na" was converted to np.nan for
# price and rating).
hotels_df.isna().sum()

location         0
hotel_name       0
rating         948
address          0
locality         0
landmark        32
price         4002
dtype: int64

In [11]:
# Impute missing prices with 125.0, the hard-coded (rounded) mean price
hotels_df["price"] = hotels_df["price"].fillna(125.0)

In [12]:
# Break the newline-separated landmark text into one column per line
landmarks_df = hotels_df["landmark"].str.split(pat="\n", expand=True)

In [13]:
# Attach the split landmark columns to the remaining hotel columns, aligning
# rows on the index and keeping every row of the hotels frame.
hotels_df = landmarks_df.merge(
    hotels_df[["location", "hotel_name", "rating", "address", "locality", "price"]],
    how="right",
    left_index=True,
    right_index=True,
)

In [14]:
# Name the first two split landmark columns (integer labels 0 and 1)
hotels_df = hotels_df.rename(columns={0: "landmark_one", 1: "landmark_two"})

In [15]:
# The 100 most frequent values of the trivago `city` column, most common first
most_popular_cities = trivago_df["city"].value_counts().head(100).index

In [16]:
# Hotels whose rating is exactly 5
best_reviews = hotels_df.loc[hotels_df["rating"].eq(5)]

In [17]:
# Display the hotels with a rating of 5
best_reviews

Unnamed: 0,landmark_one,landmark_two,location,hotel_name,rating,address,locality,price
49,8.1 miles to Chicago O'Hare International Airport (ORD),7.9 miles to Chicago Midway Airport (MDW),"Chicago, USA",Bishops Hall,5.0,"605 Iowa Street, Oak Park, IL, 60302, United States",Oak Park,175.0
55,19 miles to Chicago O'Hare International Airport (ORD),6.3 miles to Chicago Midway Airport (MDW),"Chicago, USA",Welcome Inn Manor,5.0,"4563 S Michigan Ave, Chicago, IL, 60653, United States",Chicago,125.0
60,3.3 miles to City center,3.7 miles to Whitsand Bay,"Torpoint, United Kingdom",Rame Barton,5.0,"Military Road, Cawsand, Torpoint, England, PL10 1LG, United Kingdom",Torpoint,125.0
75,0.2 miles to City center,4.4 miles to Palma de Mallorca (PMI),"Palma, Spain",Fil Suites Turismo de Interior,5.0,"Carrer de la Ferreria 14, Palma de Mallorca, Illes Balears, 07002, Spain",Palma de Mallorca,153.0
76,0.4 miles to City center,4.2 miles to Palma de Mallorca (PMI),"Palma, Spain",Hotel Es Princep,5.0,"Carrer Bala Roja, 1, Palma de Mallorca, 07001, Spain",Palma de Mallorca,358.0
86,25 miles to Arosa,25 miles to Obersee Park,"Arosa, Switzerland",Hotel Waldhaus,5.0,"Via da Fex 3, Sils im Engadin-Segl, GR, 7514, Switzerland",Sils im Engadin-Segl,410.0
104,4.3 miles to Albuquerque International Sunport (ABQ),4.3 miles to Albuquerque Convention Center,"Albuquerque, USA",Red Horse Vineyard Bed & Breakfast,5.0,"2155 Londene Ln. SW, Albuquerque, NM, 87105, United States",Albuquerque,146.0
111,3.5 miles to Puerto del Rosario (FUE-Fuerteventura),1.0 mile to Caleta de Fuste Marina,"Caleta de Fuste, Spain",Castillo Beach Club35610,5.0,"Calle Virgen de Guadalupe, Green Door nº5, Caleta de Fuste, Antigua, Las Palmas, 35610, Spain",Caleta de Fuste,125.0
126,0.6 miles to City center,15 miles to Rome (FCO-Fiumicino - Leonardo da Vinci Intl.),"Rome, Italy",Relais Donna Lucrezia,5.0,"Via Emilia 88, Rome, 00188, Italy",Rome City Centre,148.0
136,0.3 miles to Vatican City,13 miles to Rome (FCO-Fiumicino - Leonardo da Vinci Intl.),"Vatican City, Vatican City",Bijoux de Rome Apartment Vatican,5.0,"Piazza della Rovere 97, Trastevere, Rome, Rome, 00165, Italy","Rome, Italy",125.0


In [85]:
# Unpersonalized recommendations
def hotel_recommendation_one(cities=None, hotels=None):
    """Return the hotels located in the given cities, in city order.

    Parameters
    ----------
    cities : iterable of str, optional
        City names to look for, in priority order. Defaults to the notebook
        global ``most_popular_cities``.
    hotels : pandas.DataFrame, optional
        Candidate hotels; must have a ``location`` column. Defaults to the
        notebook global ``best_reviews``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``hotels`` whose ``location`` contains one of ``cities``.
        Rows are ordered by the first city that matched them and each index
        label is selected only once, even if several cities match it.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # required when the caller does not pass explicit arguments.
    if cities is None:
        cities = most_popular_cities
    if hotels is None:
        hotels = best_reviews

    matched = []
    for city in cities:
        # regex=False: city names are literal text, not patterns (characters
        # such as "(" or "." must not be interpreted as regex syntax).
        # na=False: a missing location simply does not match.
        mask = hotels["location"].str.contains(city, regex=False, na=False)
        matched.extend(hotels[mask].index)

    # Drop repeated index labels while preserving first-seen (city) order.
    ordered = list(dict.fromkeys(matched))

    return hotels.loc[ordered]

    
    