# Load Data

## Load Properties

In [122]:
import yaml

with open("config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

## Set up the environment

In [123]:
import json
import os

filepath = config["kaggle"]["api_filepath"]

# Load Kaggle API token
try:
    with open(file=filepath) as f:
        api_token = json.load(fp=f)
except FileNotFoundError:
    print(f"File '{filepath}' not found.")
    print(
        f"Download the API key from Kaggle and save to {filepath} or adjust the config.yaml as necessary."
    )
    print(f"See https://www.kaggle.com/docs/api for more information.")
    raise FileNotFoundError  # Stop execution of the script in order to signal the user to fix the issue.

# Define environment variables (Kaggle API client expects these)
os.environ["KAGGLE_USERNAME"] = api_token["username"]
os.environ["KAGGLE_KEY"] = api_token["key"]

## download and unzip the data

In [124]:
from kaggle.api.kaggle_api_extended import KaggleApi
import time

# Initialize the API
api = KaggleApi()
api.authenticate()

# Define the dataset path
dataset_path = "jiashenliu/515k-hotel-reviews-data-in-europe"
download_path = config["kaggle"]["download_path"]


# Download the dataset
start = time.time()
api.dataset_download_files(dataset=dataset_path, path=download_path, unzip=True)
end = time.time()

## load the data

In [125]:
import pandas as pd

filename = download_path + "/Hotel_Reviews.csv"

start = time.time()
df = pd.read_csv(filepath_or_buffer=filename)
end = time.time()
print(f"Loading took {round(end - start, 2)} seconds")

Loading took 1.72 seconds


# Cleaning Data

check for missing values

In [126]:
df.isna().sum()

Hotel_Address                                    0
Additional_Number_of_Scoring                     0
Review_Date                                      0
Average_Score                                    0
Hotel_Name                                       0
Reviewer_Nationality                             0
Negative_Review                                  0
Review_Total_Negative_Word_Counts                0
Total_Number_of_Reviews                          0
Positive_Review                                  0
Review_Total_Positive_Word_Counts                0
Total_Number_of_Reviews_Reviewer_Has_Given       0
Reviewer_Score                                   0
Tags                                             0
days_since_review                                0
lat                                           3268
lng                                           3268
dtype: int64

We will drop the null rows because we can't replace the lat or lng with mean or median value, that will change the right information hotel adderss

In [127]:
df.dropna(inplace=True,axis=0)

check for duplicates and drop them

In [128]:
print("Duplicated rows before: ", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Duplicated rows after: ", df.duplicated().sum())

Duplicated rows before:  526
Duplicated rows after:  0


Replace all the addresses with a shortened, more useful form

In [129]:
def replace_address(row):
    if "Netherlands" in row["Hotel_Address"]:
        return "Amsterdam, Netherlands"
    elif "Barcelona" in row["Hotel_Address"]:
        return "Barcelona, Spain"
    elif "United Kingdom" in row["Hotel_Address"]:
        return "London, United Kingdom"
    elif "Milan" in row["Hotel_Address"]:
        return "Milan, Italy"
    elif "France" in row["Hotel_Address"]:
        return "Paris, France"
    elif "Vienna" in row["Hotel_Address"]:
        return "Vienna, Austria"
    else:
        return row.Hotel_Address


df["Hotel_Address"] = df.apply(func=replace_address, axis=1)

Replace `Total_Number_of_Reviews` and `Average_Score` with own calculated values

In [130]:
df.drop(["Additional_Number_of_Scoring"], axis=1, inplace=True)
df["Total_Number_of_Reviews"] = df.groupby("Hotel_Name")["Hotel_Name"].transform(
    "count"
)
df["Average_Score"] = (
    df.groupby("Hotel_Name")["Reviewer_Score"].transform("mean").round(1)
)

Get the most useful tags from the `Tags` column

In [131]:
# Remove opening and closing brackets and all quotes too, split the strings into a list
tag_list_df = (
    df.Tags.str.strip("[']")
    .str.replace(" ', '", ",", regex=False)
    .str.split(",", expand=True)
)

# Remove leading and trailing spaces from each column and assign them back to the dataframe
for i in range(6):
    df[f"Tag_{i+1}"] = tag_list_df[i].str.strip()

# Merge the 6 columns into one with melt
df_tags = df.melt(value_vars=[f"Tag_{i+1}" for i in range(6)])

# Print the shape of the tags with no filtering
print("The shape of the tags with no filtering:", df_tags.shape)

# Filter the tags, get the value counts
df_tags = df_tags[
    ~df_tags.value.str.contains(
        "Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double",
        na=False,
        case=False,
    )
]
tag_vc = df_tags.value.value_counts().reset_index(name="count").query("count > 1000")

# Print the top 10 (there should only be 9 and we'll use these in the filtering section)
tag_vc["value"] = tag_vc["value"].apply(
    lambda x: x.strip().replace(" ", "_") if isinstance(x, str) else x
)
print(tag_vc[:10])

The shape of the tags with no filtering: (3071664, 2)
                        value   count
0                Leisure_trip  414707
1                      Couple  250467
2               Solo_traveler  107730
3               Business_trip   82341
4                       Group   64890
5  Family_with_young_children   60563
6  Family_with_older_children   26167
7      Travelers_with_friends    2127
8                  With_a_pet    1385


Process tags into new columns with one-hot encoding

In [132]:
# Process the Tags into new columns
for tag in tag_vc["value"]:
    df[tag] = df.Tags.apply(lambda x: 1 if tag in x else 0)

Drop unnecessary columns

In [133]:
df.drop(
    [
        "Review_Date",
        "Review_Total_Negative_Word_Counts",
        "Review_Total_Positive_Word_Counts",
        "days_since_review",
        "Total_Number_of_Reviews_Reviewer_Has_Given",
        "Tags",
        "Tag_1",
        "Tag_2",
        "Tag_3",
        "Tag_4",
        "Tag_5",
        "Tag_6",
    ],
    axis=1,
    inplace=True,
)

In [134]:
df.head()

Unnamed: 0,Hotel_Address,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Total_Number_of_Reviews,Positive_Review,Reviewer_Score,lat,lng,Leisure_trip,Couple,Solo_traveler,Business_trip,Group,Family_with_young_children,Family_with_older_children,Travelers_with_friends,With_a_pet
0,"Amsterdam, Netherlands",7.8,Hotel Arena,Russia,I am so angry that i made this post available...,405,Only the park outside of the hotel was beauti...,2.9,52.360576,4.915968,0,1,0,0,0,0,0,0,0
1,"Amsterdam, Netherlands",7.8,Hotel Arena,Ireland,No Negative,405,No real complaints the hotel was great great ...,7.5,52.360576,4.915968,0,1,0,0,0,0,0,0,0
2,"Amsterdam, Netherlands",7.8,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,405,Location was good and staff were ok It is cut...,7.1,52.360576,4.915968,0,0,0,0,0,0,0,0,0
3,"Amsterdam, Netherlands",7.8,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,405,Great location in nice surroundings the bar a...,3.8,52.360576,4.915968,0,0,0,0,0,0,0,0,0
4,"Amsterdam, Netherlands",7.8,Hotel Arena,New Zealand,You When I booked with your company on line y...,405,Amazing location and building Romantic setting,6.7,52.360576,4.915968,0,1,0,0,0,0,0,0,0


# EDA (Exploratory Data Analysis) 

## Since Lea and I also can't decide which city to choose the capital of the language of love is a good start for some romantic activities but also fot the analysis

In [135]:
paris_df = df[df["Hotel_Address"] == "Paris, France"]
paris_df = paris_df.drop_duplicates(subset="Hotel_Name")
paris_df.shape

(455, 19)

Plot a map of the hotels in Paris

In [136]:
import folium
from tqdm import tqdm


def assign_color(score: float) -> str:
    if score >= 9.3:
        return "green"
    elif score >= 8.3:
        return "lightgreen"
    elif score >= 7.3:
        return "orange"
    else:
        return "red"


# Initialize map with starting location
paris_map = folium.Map(
    location=[paris_df["lat"].mean(), paris_df["lng"].mean()], zoom_start=12
)

for idx, row in tqdm(iterable=paris_df.iterrows(), total=paris_df.shape[0]):
    folium.Marker(
        location=[row["lat"], row["lng"]],
        popup=f'{row["Hotel_Name"]}: {row["Average_Score"]}',
        icon=folium.Icon(color=assign_color(row["Average_Score"]), icon="info-sign"),
    ).add_to(paris_map)

# Show the map
paris_map

100%|██████████| 455/455 [00:00<00:00, 21916.08it/s]


## Analysis

Define functions to calculate sentiment and to remove the stopwords

In [137]:
import nltk as nltk
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras import models
import numpy as np

from nltk import word_tokenize
from nltk import download
from keras.preprocessing import sequence
from keras.datasets import imdb

max_review_length = config["num_words"]
word2index = imdb.get_word_index()

nltk.download("vader_lexicon")
nltk.download("stopwords")
download("punkt")

vader_sentiment = SentimentIntensityAnalyzer()
cache = set(stopwords.words("english"))


def calc_sentiment_vader(review: str) -> float:
    """
    The `calc_sentiment` function takes a string as input and returns a float representing the sentiment of the input text.
    """
    if review == "No Negative" or review == "No Positive":
        return 0
    return vader_sentiment.polarity_scores(text=review)["compound"]


def remove_stopwords(review: str) -> str:
    """
    The `remove_stopwords` function takes a string as input and returns a string where all English stopwords have been removed.
    It does this by splitting the input text into individual words, filtering out the stopwords,
    and then joining the remaining words back together into a single string.
    """
    text = " ".join([word for word in review.split() if word not in cache])
    return text

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/simi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/simi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/simi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [138]:
class Model:
    def __init__(self, name: str, keras_model: models.Sequential):
        self.name = name
        self.keras_model = keras_model
    
    def tokenizee(self, review: str) -> np.ndarray:
        """
        The `tokenizee` function takes a string as input and returns a numpy array of integers.
        It's called this way to avoid a name conflict and confusion with the `tokenize` function from the `nltk` package.
        """
        word2index_list = []
        for word in word_tokenize(review):
            if word.isalpha():  # Only process if word is alphabetical
                word = word.lower()  # convert to lower case
                if word in word2index:  # only add if word exists in word2index
                    word2index_list.append(word2index[word])
                else:
                    word2index_list.append(0)  # or some other value to denote unknown words
        return sequence.pad_sequences([word2index_list], maxlen=max_review_length)

    def calc_sentiment(self, review: str) -> float:
        """
        The `calc_sentiment_model` function takes a string as input and returns a float representing the sentiment of the input text.
        """
        if review == "No Negative" or review == "No Positive":
            return 0
        review = remove_stopwords(review)
        tokenized_review = self.tokenizee(review)
        sentiment=self.keras_model.predict([tokenized_review], verbose=0)[0][0]
        if sentiment == 0:
            sentiment = -1.0
        return sentiment

In [139]:
from typing import List

loaded_models: List[Model] = []

# Load the models
model_config_dir = os.path.join(os.getcwd(), "config", "model")
model_dir = os.path.join(os.getcwd(), "model")
for model_config in os.listdir(model_config_dir):
    model_config_path = os.path.join(model_config_dir, model_config)
    with open(model_config_path, "r") as f:
        model_config = yaml.safe_load(f)
        model_name = model_config["model"]["name"]
        if "zzz" in model_name.lower():
            print(f"Skipping model '{model_name}' in config : {model_config_path}")
            continue
        model_path = os.path.join(model_dir, model_name)
        keras_model = models.load_model(model_path)
        model = Model(model_name, keras_model)
        loaded_models.append(model)

print("all loded models:")
for model in loaded_models:
    print(f"\tloaded model: {model.name}")

Skipping model 'basic_model_zzz' in config : /Users/simi/workarea/vscode/python/ml2/ml2_project/config/model/zzz_ignored_example.yaml
all loded models:
	loaded model: basic_model_many_epochs
	loaded model: basic_model


Remove stop words

In [140]:
start = time.time()

# Remove the stop words from both columns
df.Negative_Review = df.Negative_Review.apply(remove_stopwords)   
df.Positive_Review = df.Positive_Review.apply(remove_stopwords)
end = time.time()
print("Removing stop words took " + str(round(end - start, 2)) + " seconds")

Removing stop words took 1.67 seconds


Add a negative sentiment and positive sentiment column

In [141]:
print("Calculating sentiment columns for both positive and negative reviews")
start = time.time()
df["Negative_Sentiment_Vader"] = df.Negative_Review.apply(calc_sentiment_vader)
df["Positive_Sentiment_Vader"] = df.Positive_Review.apply(calc_sentiment_vader)
end = time.time()
print(f"Calculating sentiment took {(round(end - start, 2))} seconds")

Calculating sentiment columns for both positive and negative reviews
Calculating sentiment took 58.5 seconds


In [143]:
#wip
df = df.sample(frac=1).head(100)

for model in loaded_models:
    print(f"Calculating sentiment columns for both positive and negative reviews using model '{model.name}'")
    start = time.time()
    df[f"Negative_Sentiment_{model.name}"] = df.Negative_Review.apply(model.calc_sentiment)
    df[f"Positive_Sentiment_{model.name}"] = df.Positive_Review.apply(model.calc_sentiment)
    end = time.time()
    print(f"Calculating sentiment took {(round(end - start, 2))} seconds for model '{model.name}'")
    print("....................................................")

Calculating sentiment columns for both positive and negative reviews using model 'basic_model_many_epochs'
Calculating sentiment took 3.91 seconds for model 'basic_model_many_epochs'
....................................................
Calculating sentiment columns for both positive and negative reviews using model 'basic_model'
Calculating sentiment took 3.71 seconds for model 'basic_model'
....................................................
