In [5]:
import re
from collections import Counter, defaultdict

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Exercício 1

In [2]:
# Load the hotel reviews and keep only the rows that have a review title.
data = pd.read_csv('data/Hotel_Reviews.csv').dropna(subset=['reviews.title'])

# Preview the first rows of the filtered table.
data.head()

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


In [27]:
# Word/context co-occurrence counts: for every occurrence of a target word,
# count each token found up to `window_size` positions to its left and right.
target_words = {"location", "price", "cleaning", "service"}
window_size = 5

context_counts = defaultdict(Counter)

for review in data['reviews.text'].dropna().astype(str):
    tokens = word_tokenize(review.lower())
    n_tokens = len(tokens)
    for position, token in enumerate(tokens):
        if token not in target_words:
            continue
        # Neighbours on each side, clipped to the bounds of the review;
        # the target occurrence itself is excluded.
        left = tokens[max(position - window_size, 0):position]
        right = tokens[position + 1:min(position + window_size + 1, n_tokens)]
        context_counts[token].update(left + right)

# Rows are context tokens, columns are target words; absent pairs count as 0.
matrix_df = pd.DataFrame(context_counts).fillna(0).astype(int)
has_counts = matrix_df.sum(axis=1) > 0
matrix_df = matrix_df.loc[has_counts]

print(matrix_df)

              location  price  service  cleaning
breakfast          133    114      176        14
was               1138    780     1133       105
great             1338    338      619        13
!                  320    158      317        22
and               1462    642     1344       134
...                ...    ...      ...       ...
250+                 0      0        0         1
completed            0      0        0         1
maintainance         0      0        0         1
filter               0      0        0         1
pans                 0      0        0         1

[4978 rows x 4 columns]


In [28]:
# Compare the target words through the cosine similarity of their
# context-count vectors (one row per target word after transposing).
targets = ['location', 'price', 'cleaning', 'service']
target_matrix = matrix_df.loc[:, targets].transpose()

similarity_matrix = cosine_similarity(target_matrix)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=targets,
    columns=targets,
)

# Report the two word pairs of interest.
print("location vs price:", similarity_df.at['location', 'price'])
print("cleaning vs service:", similarity_df.at['cleaning', 'service'])


location vs price: 0.908266619372256
cleaning vs service: 0.908984893420201


### Exercício 2

In [3]:
def tolower(column_data):
    """Return a lower-cased copy of ``column_data`` when it is a string.

    Any non-string value (for example a missing value) is returned unchanged.
    """
    return column_data.lower() if isinstance(column_data, str) else column_data

def remove_special_char(text):
    """Replace every character other than ASCII letters and digits with a space.

    Each removed character becomes exactly one space, so the length of the
    string is preserved. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)

def tokenizer(text):
    """Split ``text`` into sentences and each sentence into word tokens.

    Returns a list holding one list of tokens per sentence produced by
    ``nltk.sent_tokenize``. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    return [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(text)]

# Stop-word sets already loaded from spaCy, keyed by pipeline name, so the
# (slow) pipeline load happens at most once per kernel session.
_SPACY_STOP_WORDS_CACHE = {}


def _load_spacy_stop_words(model_name='en_core_web_sm'):
    """Return the stop-word set of a spaCy pipeline, loading it only once."""
    if model_name not in _SPACY_STOP_WORDS_CACHE:
        _SPACY_STOP_WORDS_CACHE[model_name] = spacy.load(model_name).Defaults.stop_words
    return _SPACY_STOP_WORDS_CACHE[model_name]


def remove_stopwords_spacy(all_sentences, stop_words=None):
    """Remove stop words from tokenized sentences.

    Parameters
    ----------
    all_sentences : list of list of str
        One list of tokens per sentence. Any value that is not a list (for
        example a missing value passed through by the earlier helpers) is
        returned unchanged.
    stop_words : collection of str, optional
        Tokens to drop. When omitted, the stop words of spaCy's
        ``en_core_web_sm`` pipeline are used; they are loaded on the first
        call and reused afterwards instead of reloading the pipeline for
        every row.

    Returns
    -------
    list of list of str
        New lists with the stop words removed. The input is not modified, so
        a column holding the original tokens keeps its content.
    """
    if not isinstance(all_sentences, list):
        return all_sentences

    if stop_words is None:
        stop_words = _load_spacy_stop_words()

    # Matching is exact (case-sensitive), as before.
    return [
        [word for word in sentence if word not in stop_words]
        for sentence in all_sentences
    ]

In [4]:
# Preprocess the review text of the first 1000 reviews that have one.
# Each step is stored in its own column:
#   tolower         -> lower-cased text
#   texto_lower     -> lower-cased text with special characters replaced by spaces
#   tokenizer       -> list of token lists (one per sentence)
#   clean_sentences -> token lists after stop-word removal
data_processed = (
    data[['reviews.text']]
    # Missing texts pass through the string helpers unchanged and then break
    # the list-based steps, so they are dropped before sampling.
    .dropna(subset=['reviews.text'])
    .head(1000)
    .assign(
        tolower=lambda df: df["reviews.text"].apply(tolower),
        texto_lower=lambda df: df["tolower"].apply(remove_special_char),
        tokenizer=lambda df: df["texto_lower"].apply(tokenizer),
        clean_sentences=lambda df: df["tokenizer"].apply(remove_stopwords_spacy),
    )
)

#### BOW

In [9]:
# Flatten each review's token lists back into one space-separated string.
data_processed["clean_text"] = [
    " ".join(word for sentence in sent_list for word in sentence)
    for sent_list in data_processed["clean_sentences"]
]

# Bag of Words, limited to 60 features
vectorizer = CountVectorizer(max_features=60)
bag_of_word_data = vectorizer.fit_transform(data_processed["clean_text"])

bag_of_word_df = pd.DataFrame(
    data=bag_of_word_data.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Save as JSON (one record per review)
bag_of_word_df.to_json('data/bag_of_word_df.json', orient='records', lines=False)

#### TF-IDF

In [12]:
# TF-IDF representation of the cleaned reviews, limited to 60 features.
# TfidfVectorizer is imported in the notebook's import cell.
tfidf_vect = TfidfVectorizer(max_features=60)
tf_idf_data = tfidf_vect.fit_transform(data_processed["clean_text"])

tf_idf_df = pd.DataFrame(tf_idf_data.toarray(), columns=tfidf_vect.get_feature_names_out())

# Save as JSON (one record per review)
tf_idf_df.to_json('data/tf_idf.json', orient='records', lines=False)

#### Bag of N-grams

In [13]:
# Bag of n-grams: counts of 2- and 3-word sequences, limited to 60 features.
# CountVectorizer is already imported in the notebook's import cell.
bag_n_vect = CountVectorizer(ngram_range=(2, 3), max_features=60)
bag_n_data = bag_n_vect.fit_transform(data_processed["clean_text"])

bag_n_df = pd.DataFrame(bag_n_data.toarray(), columns=bag_n_vect.get_feature_names_out())

# Save as JSON (one record per review)
bag_n_df.to_json('data/bag_of_ngrams.json', orient='records', lines=False)