# Load Data

## Load Properties

In [None]:
import yaml

# Parse the notebook settings from config.yaml into the `config` mapping
with open("config.yaml", mode="r") as config_file:
    config = yaml.safe_load(config_file)

## Set up the environment

In [None]:
import json
import os

filepath = config["kaggle"]["api_filepath"]

# Load the Kaggle API token (a JSON document providing the "username" and "key" entries read below)
try:
    with open(file=filepath) as f:
        api_token = json.load(fp=f)
except FileNotFoundError:
    print(f"File '{filepath}' not found.")
    print(
        f"Download the API key from Kaggle and save to {filepath} or adjust the config.yaml as necessary."
    )
    print("See https://www.kaggle.com/docs/api for more information.")
    # Re-raise the caught exception (instead of a fresh, message-less FileNotFoundError)
    # so execution still stops here and the traceback keeps the offending path.
    raise

# Define environment variables (Kaggle API client expects these)
os.environ["KAGGLE_USERNAME"] = api_token["username"]
os.environ["KAGGLE_KEY"] = api_token["key"]

## Download and unzip the data

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import time

# Create the Kaggle client and authenticate it
api = KaggleApi()
api.authenticate()

# Dataset identifier on Kaggle and the local directory it is extracted into
dataset_path = "jiashenliu/515k-hotel-reviews-data-in-europe"
download_path = config["kaggle"]["download_path"]

# Download and unzip in one call, timing the whole operation
start = time.time()
api.dataset_download_files(
    dataset=dataset_path,
    path=download_path,
    unzip=True,
)
end = time.time()
elapsed_seconds = round(end - start, 2)
print(f"Time to download and extract: {elapsed_seconds} seconds")

## Load the data

In [None]:
import pandas as pd

# The reviews CSV sits directly inside the download directory
filename = f"{download_path}/Hotel_Reviews.csv"

# Read the CSV into a dataframe and report how long the read took
start = time.time()
df = pd.read_csv(filename)
end = time.time()
load_seconds = round(end - start, 2)
print(f"Loading took {load_seconds} seconds")

# Cleaning Data

Check for missing values

In [None]:
# Count the missing values in each column (displayed as the cell output)
df.isna().sum()

We will drop the rows with null values: the missing `lat` and `lng` cannot be replaced with a mean or median value, because that would give the hotel a wrong location.

In [None]:
# Remove, in place, every row (axis=0) that contains at least one missing value
df.dropna(inplace=True,axis=0)

Check for duplicates and drop them

In [None]:
# Count fully duplicated rows, remove them in place, then count again to confirm
duplicates_before = df.duplicated().sum()
print("Duplicated rows before: ", duplicates_before)

df.drop_duplicates(inplace=True)

duplicates_after = df.duplicated().sum()
print("Duplicated rows after: ", duplicates_after)

Replace all the addresses with a shortened, more useful form

In [None]:
def replace_address(row):
    """Return a shortened "City, Country" label for the address stored in ``row``.

    ``row["Hotel_Address"]`` is searched for each keyword below, in order; the first
    keyword found selects the label. An address containing none of the keywords is
    returned unchanged.
    """
    address = row["Hotel_Address"]
    # Order matters: the first matching keyword wins
    keyword_labels = (
        ("Netherlands", "Amsterdam, Netherlands"),
        ("Barcelona", "Barcelona, Spain"),
        ("United Kingdom", "London, United Kingdom"),
        ("Milan", "Milan, Italy"),
        ("France", "Paris, France"),
        ("Vienna", "Vienna, Austria"),
    )
    for keyword, label in keyword_labels:
        if keyword in address:
            return label
    return address


# Overwrite each row's address with the label chosen by replace_address
df["Hotel_Address"] = df.apply(replace_address, axis="columns")

Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values

In [None]:
# Drop the Additional_Number_of_Scoring column
df.drop(columns=["Additional_Number_of_Scoring"], inplace=True)

# Per-hotel statistics computed from the rows present in df and broadcast back to every row:
# the number of rows for the hotel, and its mean Reviewer_Score
review_counts = df.groupby("Hotel_Name")["Hotel_Name"].transform("count")
mean_scores = df.groupby("Hotel_Name")["Reviewer_Score"].transform("mean")

df["Total_Number_of_Reviews"] = review_counts
df["Average_Score"] = mean_scores.round(1)

Get the most useful tags from the `Tags` column

In [None]:
# Strip the surrounding brackets and quotes from the raw Tags text, then split it
# on commas into one column per tag
tag_list_df = (
    df["Tags"]
    .str.strip("[']")
    .str.replace(" ', '", ",", regex=False)
    .str.split(",", expand=True)
)

# Copy the first six tag columns, trimmed of surrounding whitespace, into df as Tag_1 ... Tag_6
tag_columns = [f"Tag_{position}" for position in range(1, 7)]
for column_index, tag_column in enumerate(tag_columns):
    df[tag_column] = tag_list_df[column_index].str.strip()

# Stack the six tag columns into one long "value" column
df_tags = df.melt(value_vars=tag_columns)

# Print the shape of the tags with no filtering
print("The shape of the tags with no filtering:", df_tags.shape)

# Discard every tag containing one of the excluded keywords (case-insensitive),
# then keep only the tags that occur more than 1000 times
excluded_keywords = "Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double"
is_excluded = df_tags["value"].str.contains(excluded_keywords, na=False, case=False)
df_tags = df_tags[~is_excluded]
tag_vc = df_tags["value"].value_counts().reset_index(name="count").query("count > 1000")


def _as_column_name(tag_text):
    """Trim a tag and replace its spaces with underscores; non-strings pass through."""
    if isinstance(tag_text, str):
        return tag_text.strip().replace(" ", "_")
    return tag_text


# Print the top 10 (there should only be 9 and we'll use these in the filtering section)
tag_vc["value"] = tag_vc["value"].apply(_as_column_name)
print(tag_vc[:10])

Process tags into new columns with one-hot encoding

In [None]:
# Process the Tags into new columns: one 0/1 indicator column per frequent tag.
# The names in tag_vc["value"] had their spaces replaced by underscores, but the raw
# Tags text still contains the spaces. Searching for the underscored name would never
# match a multi-word tag, so the name is converted back to its spaced form before the
# (literal, non-regex) substring search.
for tag in tag_vc["value"]:
    tag_text = tag.replace("_", " ")
    df[tag] = df["Tags"].str.contains(tag_text, regex=False).astype(int)

Drop unnecessary columns

In [None]:
# Columns that are no longer needed: review metadata, the raw Tags text and the
# intermediate Tag_1 ... Tag_6 helper columns
columns_to_drop = [
    "Review_Date",
    "Review_Total_Negative_Word_Counts",
    "Review_Total_Positive_Word_Counts",
    "days_since_review",
    "Total_Number_of_Reviews_Reviewer_Has_Given",
    "Tags",
    "Tag_1",
    "Tag_2",
    "Tag_3",
    "Tag_4",
    "Tag_5",
    "Tag_6",
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Preview the first rows of the cleaned dataframe
df.head()

# EDA (Exploratory Data Analysis) 

## Since Lea and I also can't decide which city to choose, the capital of the language of love is a good start, both for some romantic activities and for the analysis

In [None]:
# Select the Paris reviews and keep a single row per hotel name
is_paris = df["Hotel_Address"] == "Paris, France"
paris_df = df.loc[is_paris].drop_duplicates(subset="Hotel_Name")
paris_df.shape

Plot a map of the hotels in Paris

In [None]:
import folium
from tqdm import tqdm


def assign_color(score: float) -> str:
    """Map a score to a marker colour name.

    A score of at least 9.3 gives "green", at least 8.3 "lightgreen" and at least 7.3
    "orange"; anything else gives "red".
    """
    # Thresholds are checked from highest to lowest; the first one reached decides the colour
    colour_thresholds = (
        (9.3, "green"),
        (8.3, "lightgreen"),
        (7.3, "orange"),
    )
    for minimum_score, colour in colour_thresholds:
        if score >= minimum_score:
            return colour
    return "red"


# Centre the map on the mean latitude/longitude of the selected hotels
map_center = [paris_df["lat"].mean(), paris_df["lng"].mean()]
paris_map = folium.Map(location=map_center, zoom_start=12)

# Add one marker per hotel: the popup shows name and average score, the icon colour
# comes from assign_color
for _, hotel in tqdm(iterable=paris_df.iterrows(), total=len(paris_df)):
    marker = folium.Marker(
        location=[hotel["lat"], hotel["lng"]],
        popup=f'{hotel["Hotel_Name"]}: {hotel["Average_Score"]}',
        icon=folium.Icon(color=assign_color(hotel["Average_Score"]), icon="info-sign"),
    )
    marker.add_to(paris_map)

# Show the map
paris_map

## Analysis

In [None]:
import nltk as nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.models import load_model

# Download the NLTK resources used below: the VADER lexicon and the stop-word corpus
for resource_name in ("vader_lexicon", "stopwords"):
    nltk.download(resource_name)

# Load the saved model from the configured path
model_path = config["model"]["model_path"]
loaded_model = load_model(model_path)

Define functions to calculate sentiment and to remove the stopwords

In [None]:
# Single analyzer instance shared by every calc_sentiment call
vader_sentiment = SentimentIntensityAnalyzer()
# English stop words held in a set; remove_stopwords tests each word for membership in it
cache = set(stopwords.words("english"))


def calc_sentiment(review: str) -> float:
    """
    Return the VADER compound sentiment score for ``review``.

    The literal texts "No Negative" and "No Positive" (presumably the dataset's
    placeholders for an absent review; confirm against the data) are scored as a
    neutral 0.0 without running the analyzer.

    Args:
        review: The review text to score.

    Returns:
        The "compound" value from ``vader_sentiment.polarity_scores``, or 0.0 for
        one of the two placeholder texts.
    """
    if review in ("No Negative", "No Positive"):
        # Return a float, as annotated, rather than the int 0
        return 0.0
    return vader_sentiment.polarity_scores(text=review)["compound"]


def remove_stopwords(review: str) -> str:
    """
    Return ``review`` with every word found in the module-level ``cache`` set removed.

    The text is split on whitespace, each word is kept only if it is not an exact
    member of ``cache``, and the kept words are joined back with single spaces.
    """
    kept_words = []
    for word in review.split():
        if word in cache:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

Remove stop words

In [None]:
start = time.time()

# Strip the stop words from both review text columns (negative first, then positive)
for review_column in ("Negative_Review", "Positive_Review"):
    df[review_column] = df[review_column].apply(remove_stopwords)

end = time.time()
print("Removing stop words took " + str(round(end - start, 2)) + " seconds")

Add a negative sentiment and positive sentiment column

In [None]:
print("Calculating sentiment columns for both positive and negative reviews")

# Target sentiment column -> review text column it is computed from
sentiment_sources = {
    "Negative_Sentiment": "Negative_Review",
    "Positive_Sentiment": "Positive_Review",
}

start = time.time()
for sentiment_column, review_column in sentiment_sources.items():
    df[sentiment_column] = df[review_column].apply(calc_sentiment)
end = time.time()
print(f"Calculating sentiment took {(round(end - start, 2))} seconds")

Sort the rows in ascending order by negative sentiment, then by positive sentiment

In [None]:
# Sort the rows by each sentiment score in turn (ascending) and print the matching
# review text next to its score; df ends up ordered by Positive_Sentiment
review_score_pairs = (
    ("Negative_Review", "Negative_Sentiment"),
    ("Positive_Review", "Positive_Sentiment"),
)
for review_column, sentiment_column in review_score_pairs:
    df = df.sort_values(by=sentiment_column)
    print(df[[review_column, sentiment_column]])

In [None]:
# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)
# Note: reindex keeps only the columns listed here, so any other column of df is dropped.
column_order = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
df = df.reindex(columns=column_order)