# Hotel Reviews in Europe

__Guest nationality analysis__

__Import libraries__

In [None]:
import pandas as pd
import numpy as np
import re
import folium

__Load data__

In [None]:
# Read the raw hotel review records into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/Hotel_Reviews.csv")

__Create new column for hotel country__

In [None]:
# Derive the hotel's country from the tail of its address.
# The vectorized .str accessor replaces the row-wise apply and lets missing
# addresses pass through as NaN instead of raising.
hotel_address = df['Hotel_Address']
df['Hotel_Country'] = hotel_address.str.split(" ").str[-1]

# Taking only the last token truncates the two-word country name
# "United Kingdom" to "Kingdom"; restore the full name for those rows.
ends_with_uk = hotel_address.str.endswith("United Kingdom", na=False)
df.loc[ends_with_uk, 'Hotel_Country'] = "United Kingdom"
df.head()

__Filter by Spanish hotels__

In [None]:
# Keep only the reviews of hotels located in Spain.
# .copy() makes df_spain an independent frame: the cells below assign to its
# columns, which on a plain boolean-mask slice triggers SettingWithCopyWarning.
df_spain = df[df['Hotel_Country'] == 'Spain'].copy()
print(df_spain.shape)
df_spain.head()

__Remove leading and trailing spaces from the guest nationality__

In [None]:
# Strip surrounding whitespace from every nationality.
# The vectorized .str accessor leaves NaN entries untouched instead of raising,
# so the null check that follows can actually report them.
df_spain['Reviewer_Nationality'] = df_spain['Reviewer_Nationality'].str.strip()
df_spain['Reviewer_Nationality'].unique()

__Check if any guest nationality is missing__

In [None]:
# True when at least one review has a null (NaN) nationality.
df_spain['Reviewer_Nationality'].isna().to_numpy().any()

In [None]:
# Preview the reviews whose nationality is an empty string.
df_spain.loc[df_spain['Reviewer_Nationality'].eq('')].head()

__Explore the most frequent nationalities for each number of reviews__

In [None]:
# For each value of the reviewer's total review count, count how often each
# nationality occurs (most frequent first within each group).
(
    df_spain
    .groupby('Total_Number_of_Reviews_Reviewer_Has_Given')['Reviewer_Nationality']
    .value_counts()
)

__Fill in the missing nationalities__

In [None]:
# Fill each blank nationality with the most frequent *known* nationality among
# reviewers with the same Total_Number_of_Reviews_Reviewer_Has_Given.
# Blank strings are excluded when picking the most frequent value; otherwise a
# group dominated by blanks would be "filled" with the blank itself.
_known_nationality_counts = (
    df_spain.loc[df_spain['Reviewer_Nationality'] != '', 'Reviewer_Nationality']
    .value_counts()
)
# Fallback for groups that contain no known nationality at all.
_overall_top_nationality = (
    _known_nationality_counts.idxmax() if not _known_nationality_counts.empty else ''
)


def _fill_blank_nationality(group):
    """Return `group` with '' replaced by its most frequent non-blank value.

    Falls back to the overall most frequent nationality when the group has no
    non-blank entries.
    """
    counts = group[group != ''].value_counts()
    fill_value = counts.idxmax() if not counts.empty else _overall_top_nationality
    return group.replace('', fill_value)


df_spain['Reviewer_Nationality'] = (
    df_spain
    .groupby('Total_Number_of_Reviews_Reviewer_Has_Given')['Reviewer_Nationality']
    .transform(_fill_blank_nationality)
)

__Verify there are no longer missing nationalities__

In [None]:
# Should display an empty frame if every blank nationality was filled.
df_spain.loc[df_spain['Reviewer_Nationality'].eq('')]

__Calculate percentage of each guest nationality__

In [None]:
# Count the reviews per guest nationality (one row per nationality, sorted by
# nationality name as produced by groupby).
df_spain_by_guest_nationality = (
    df_spain
    .groupby('Reviewer_Nationality')
    .size()
    .rename_axis('Nationality')
    .reset_index(name='Guest Amount')
)

# Share of all Spanish-hotel reviews, as a percentage rounded to two decimals.
# Computed as a single vectorized expression rather than a row-wise apply.
df_spain_by_guest_nationality['Guest ratio'] = (
    100 * df_spain_by_guest_nationality['Guest Amount'] / len(df_spain)
).round(2)
df_spain_by_guest_nationality.head(10)

__Most popular guest nationalities in Spanish hotels__

In [None]:
# Ten nationalities with the largest share of reviews.
df_spain_by_guest_nationality.sort_values(by='Guest ratio', ascending=False).iloc[:10]

In [None]:
# Ten nationalities with the smallest share of reviews.
df_spain_by_guest_nationality.sort_values(by='Guest ratio').iloc[:10]

__Plot world map__

In [None]:
# GeoJSON file with the world country outlines.
world_geo = r'data/world-countries.json'

# Explicit bin edges for the colour scale, in percent of reviews. The edges are
# unevenly spaced so that the many nationalities with a small share remain
# distinguishable from the few dominant ones.
threshold_scale = [0, 1, 3, 10, 30, 100]

# Start from a whole-world view.
world_map = folium.Map(location=[0, 0], zoom_start=2)

# folium.Choropleth replaces the deprecated Map.choropleth method; `bins` is
# the current name of the former `threshold_scale` argument.
# Countries are matched by name against 'Nationality'.
# NOTE(review): assumes nationality strings equal the GeoJSON country names;
# any that differ are drawn with nan_fill_color. Verify against the data.
folium.Choropleth(
    geo_data=world_geo,
    data=df_spain_by_guest_nationality,
    columns=['Nationality', 'Guest ratio'],
    key_on='feature.properties.name',
    bins=threshold_scale,
    fill_color='YlOrRd',
    nan_fill_color='white',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Nationality of hotel guests in Spain (%)',
).add_to(world_map)
world_map