##### Load data

In [1]:
import pandas as pd

# Read the raw hotel-review table and show summary statistics for the
# numeric columns, transposed so that each column becomes one row.
data = pd.read_csv("../data/Hotel_Reviews.csv")
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Additional_Number_of_Scoring,515738.0,498.081836,500.538467,1.0,169.0,341.0,660.0,2682.0
Average_Score,515738.0,8.397487,0.548048,5.2,8.1,8.4,8.8,9.8
Review_Total_Negative_Word_Counts,515738.0,18.53945,29.690831,0.0,2.0,9.0,23.0,408.0
Total_Number_of_Reviews,515738.0,2743.743944,2317.464868,43.0,1161.0,2134.0,3613.0,16670.0
Review_Total_Positive_Word_Counts,515738.0,17.776458,21.804185,0.0,5.0,11.0,22.0,395.0
Total_Number_of_Reviews_Reviewer_Has_Given,515738.0,7.166001,11.040228,1.0,1.0,3.0,8.0,355.0
Reviewer_Score,515738.0,8.395077,1.637856,2.5,7.5,8.8,9.6,10.0
lat,512470.0,49.442439,3.466325,41.328376,48.214662,51.499981,51.516288,52.400181
lng,512470.0,2.823803,4.579425,-0.369758,-0.143372,0.010607,4.834443,16.429233


In [2]:
# Number of missing values in each column.
data.isnull().sum()

Hotel_Address                                    0
Additional_Number_of_Scoring                     0
Review_Date                                      0
Average_Score                                    0
Hotel_Name                                       0
Reviewer_Nationality                             0
Negative_Review                                  0
Review_Total_Negative_Word_Counts                0
Total_Number_of_Reviews                          0
Positive_Review                                  0
Review_Total_Positive_Word_Counts                0
Total_Number_of_Reviews_Reviewer_Has_Given       0
Reviewer_Score                                   0
Tags                                             0
days_since_review                                0
lat                                           3268
lng                                           3268
dtype: int64

In [3]:
# Report how many rows are exact repeats of an earlier row, then remove
# them from `data` in place (the first occurrence of each row is kept).
print("duplicated rows:", data.duplicated(keep="first").sum())
data.drop_duplicates(keep="first", inplace=True)

duplicated rows: 526


In [4]:
# Select the review texts, skipping the placeholder strings "No Negative" /
# "No Positive" (presumably used when that part of the review is missing).
negative_reviews = data.loc[
    data["Negative_Review"].ne("No Negative"), "Negative_Review"
]
positive_reviews = data.loc[
    data["Positive_Review"].ne("No Positive"), "Positive_Review"
]

# Stack both groups into one long frame (negatives first, then positives)
# with a fresh 0..n-1 index, and label every row with the column it came from.
df = pd.DataFrame(
    {
        "review": pd.concat([negative_reviews, positive_reviews], ignore_index=True),
        "sentiment": ["negative"] * len(negative_reviews)
        + ["positive"] * len(positive_reviews),
    }
)

df

Unnamed: 0,review,sentiment
0,I am so angry that i made this post available...,negative
1,Rooms are nice but for elderly a bit difficul...,negative
2,My room was dirty and I was afraid to walk ba...,negative
3,You When I booked with your company on line y...,negative
4,Backyard of the hotel is total mess shouldn t...,negative
...,...,...
866758,helpful staff allowed me to check in early as...,positive
866759,location,positive
866760,Breakfast was ok and we got earlier check in,positive
866761,The rooms are enormous and really comfortable...,positive


##### Preprocess data

In [6]:
import re
from functools import lru_cache

import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm

# Download the NLTK resources needed by the lemmatizer, tokenizer and
# POS tagger used in this cell.
for resource in ("wordnet", "punkt_tab", "averaged_perceptron_tagger_eng"):
    nltk.download(resource)

# Shared helpers for the preprocessing functions below.
lemmatizer = WordNetLemmatizer()
# Matches every character that is not an ASCII letter.
special_char_pattern = re.compile("[^a-zA-Z]")


@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged on its own (a one-element token list, so no sentence
    context is used) and the first letter of the resulting tag selects the
    WordNet category: J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other tag falls back to noun.

    Because the word is tagged in isolation, the answer depends only on the
    word string. Results are therefore memoized, so the tagger runs once per
    distinct word instead of once per token occurrence across the corpus.

    Args:
        word: A single token (a hashable string).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``.
    """
    # pos_tag returns [(word, tag)]; keep only the first letter of the tag.
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)


def preprocess_text_with_pos(text: str):
    """Normalize a review into a space-separated string of lemmas.

    Steps: lowercase the text, replace every non-letter character with a
    space, tokenize, lemmatize each token using the part of speech returned
    by ``get_wordnet_pos``, and join the lemmas with single spaces. The
    result is an empty string when no token remains.
    """
    cleaned = special_char_pattern.sub(" ", text.lower())
    lemmas = []
    for token in word_tokenize(cleaned):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)


# Preprocess every raw review (tqdm shows progress over the whole column)
# and store the results as a new column next to the original text.
processed_reviews = list(map(preprocess_text_with_pos, tqdm(df["review"].values)))
df["processed_review"] = processed_reviews

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


  0%|          | 0/866763 [00:00<?, ?it/s]

In [7]:
# Reorder the columns so the raw text sits next to its processed form,
# with the label last, then preview the first rows.
df = df[
    [
        "review",
        "processed_review",
        "sentiment",
    ]
]
df.head(n=5)

Unnamed: 0,review,processed_review,sentiment
0,I am so angry that i made this post available...,i be so angry that i make this post available ...,negative
1,Rooms are nice but for elderly a bit difficul...,room be nice but for elderly a bit difficult a...,negative
2,My room was dirty and I was afraid to walk ba...,my room be dirty and i be afraid to walk baref...,negative
3,You When I booked with your company on line y...,you when i book with your company on line you ...,negative
4,Backyard of the hotel is total mess shouldn t...,backyard of the hotel be total mess shouldn t ...,negative


##### Drop low-quality reviews

In [8]:
import numpy as np

# Turn empty strings (e.g. a processed review in which no alphabetic token
# survived) and the literal text "null" into NaN in every column, then list
# the rows whose processed review is now missing.
df.replace(["", "null"], np.nan, inplace=True)
df.loc[df["processed_review"].isnull()]

Unnamed: 0,review,processed_review,sentiment
591,,,negative
1201,,,negative
2623,,,negative
2631,,,negative
2846,,,negative
...,...,...,...
855357,,,positive
862295,10 10,,positive
862730,,,positive
865655,8,,positive


In [9]:
# Drop every row that has a missing value in any column, then write the
# cleaned frame to disk without the index column.
df.dropna(axis=0, how="any", inplace=True)
df.to_csv("../data/processed_reviews.csv", index=False)