In [1]:
# Write your imports here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ferad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ferad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ferad\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Working with Text Lab
## Information retrieval, preprocessing, and feature extraction

In this lab, you'll be looking at and exploring European restaurant reviews. The dataset is rather tiny, but that's just because it has to run on any machine. In real life, just like with images, texts can be several terabytes long.

The dataset is located [here](https://www.kaggle.com/datasets/gorororororo23/european-restaurant-reviews) and as always, it's been provided to you in the `data/` folder.

### Problem 1. Read the dataset (1 point)
Read the dataset, get acquainted with it. Ensure the data is valid before you proceed.

How many observations are there? Which country is the most represented? What time range does the dataset represent?

Is the sample balanced in terms of restaurants, i.e., do you have an equal number of reviews for each one? Most importantly, is the dataset balanced in terms of **sentiment**?

In [6]:
restaurant_reviews = pd.read_csv("data/European Restaurant Reviews.csv")
restaurant_reviews

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review
0,France,The Frog at Bercy Village,Negative,Rude manager,May 2024 •,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,Negative,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...
...,...,...,...,...,...,...
1497,Cuba,Old Square (Plaza Vieja),Negative,The Tourism Trap,Oct 2016 •,Despite the other reviews saying that this is ...
1498,Cuba,Old Square (Plaza Vieja),Negative,the beer factory,Oct 2016 •,beer is good. food is awfull The only decent...
1499,Cuba,Old Square (Plaza Vieja),Negative,brewery,Oct 2016 •,"for terrible service of a truly comedic level,..."
1500,Cuba,Old Square (Plaza Vieja),Negative,It's nothing exciting over there,Oct 2016 •,We visited the Havana's Club Museum which is l...


In [7]:
restaurant_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1502 entries, 0 to 1501
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          1502 non-null   object
 1   Restaurant Name  1502 non-null   object
 2   Sentiment        1502 non-null   object
 3   Review Title     1502 non-null   object
 4   Review Date      1502 non-null   object
 5   Review           1502 non-null   object
dtypes: object(6)
memory usage: 70.5+ KB


In [8]:
restaurant_reviews.Country.value_counts()

Country
France     512
Italy      318
Morroco    210
Cuba       146
Poland     135
Russia     100
India       81
Name: count, dtype: int64

There are a total of 1502 observations and the most represented country is France. Lets first change the column names to snake case.

In [10]:
column_names = [col.replace(" ", "_").lower() for col in restaurant_reviews.columns]
restaurant_reviews.columns = column_names
restaurant_reviews.columns

Index(['country', 'restaurant_name', 'sentiment', 'review_title',
       'review_date', 'review'],
      dtype='object')

 To see the dates range, I will need to first convert the Review Date column to **datetime** type. There also seems to be a dot after the date, which also has to be removed

In [12]:
restaurant_reviews.review_date = restaurant_reviews.review_date.str.replace(" •", "")

In [13]:
restaurant_reviews.review_date =  pd.to_datetime(restaurant_reviews.review_date, format = "mixed")
restaurant_reviews.review_date

0      2024-05-01
1      2024-02-01
2      2023-11-01
3      2023-03-01
4      2022-11-01
          ...    
1497   2016-10-01
1498   2016-10-01
1499   2016-10-01
1500   2016-10-01
1501   2016-10-01
Name: review_date, Length: 1502, dtype: datetime64[ns]

In [14]:
print("Start date: " + str(restaurant_reviews.review_date.max()))
print("End date: " + str(restaurant_reviews.review_date.min()))

period = restaurant_reviews.review_date.max() - restaurant_reviews.review_date.min()
years = period.days // 365
months = (period.days % 365) // 30
days = (period.days % 365) % 30
print(f"Total period is {years} years, {months} months and {days} days")

Start date: 2024-07-01 00:00:00
End date: 2010-09-01 00:00:00
Total period is 13 years, 10 months and 7 days


In [15]:
restaurant_reviews.restaurant_name.value_counts()

restaurant_name
The Frog at Bercy Village                512
Ad Hoc Ristorante (Piazza del Popolo)    318
The LOFT                                 210
Old Square (Plaza Vieja)                 146
Stara Kamienica                          135
Pelmenya                                 100
Mosaic                                    81
Name: count, dtype: int64

The sample is not balanced in terms of restaurants. There is a big differnce in the number of reviews for each one.

In [17]:
restaurant_reviews.sentiment.value_counts()

sentiment
Positive    1237
Negative     265
Name: count, dtype: int64

It's also not balanced in terms of reviews. The positive reviews are a lot more than the negative ones.

### Problem 2. Getting acquainted with reviews (1 point)
Are positive comments typically shorter or longer? Try to define a good, robust metric for "length" of a text; it's not necessary just the character count. Can you explain your findings?

To see the length of each review, lets try to get the word count of each one, but remove the punctuation symbols. The result will be saved in a new column.

In [21]:
def count_words(sentence):
    tokens = word_tokenize(sentence) #Create a list of all separate words or characters
    filtered_words = [word for word in tokens if word.isalnum()] #Filters out the punctuation symbols
    return len(filtered_words) #Get the length of the filtered words list

restaurant_reviews["review_length"] = restaurant_reviews.review.apply(count_words) 

In [22]:
restaurant_reviews.review_length

0        28
1        57
2        40
3       276
4       246
       ... 
1497    144
1498     30
1499     27
1500     69
1501     14
Name: review_length, Length: 1502, dtype: int64

Now lets check which review type is typically longer - Positive or Negative. To do that I will get the average length of reviews of both types.

In [24]:
average_word_count = restaurant_reviews.groupby('sentiment')['review_length'].mean()
average_word_count

sentiment
Negative    139.358491
Positive     49.666128
Name: review_length, dtype: float64

It looks like the negative reviews on average are *waaay* longer than the positive ones. I think that makes sense, considering that people like to talk more and longer about something when they are unhappy about it, than when they are happy.

### Problem 3. Preprocess the review content (2 points)
You'll likely need to do this while working on the problems below, but try to synthesize (and document!) your preprocessing here. Your tasks will revolve around words and their connection to sentiment. While preprocessing, keep in mind the domain (restaurant reviews) and the task (sentiment analysis).

To process the review content, I will remove the punctuation symbols and the words that carry no information. To do so I will clear the sentences of **stopwords** *(from stopwords in nltk.corpus)*

In [28]:
stop_words = set(stopwords.words('english'))
for word in stop_words:
    print(word, end = " ")

did hadn weren so about below its over until into have how hadn't mustn't out you'll off such those m yourselves should've through itself can i his didn isn't my now of there ours when myself nor at same we it don't above that re mightn why again or for me most hasn't yours what they y ll each just couldn't doing been haven mightn't wouldn't she you've don is where needn't who himself shouldn him was being hers shan't t them you're herself further shan our ain more by and any isn not the ve on mustn does d having both couldn should because to it's while had yourself she's her no than will after ourselves won didn't o whom be am doesn own shouldn't has needn once ma wasn were this too but s during very before few if against in their hasn won't here these he other doesn't haven't weren't you'd up all some aren only an do wasn't which between as aren't that'll your with theirs then are you down themselves from a wouldn under 

In [29]:
def remove_stop_words_and_punctuation(sentence):
    tokens = word_tokenize(sentence)
    filtered_words = [word.lower() for word in tokens if word.lower() not in stop_words and word.isalnum()]
    return ' '.join(filtered_words)

restaurant_reviews["clean_reviews"] = restaurant_reviews.review.apply(remove_stop_words_and_punctuation)

In [30]:
restaurant_reviews["clean_reviews"]

0       manager became agressive said carbonara good r...
1       ordered beef fillet ask done medium got well d...
2       attractive venue welcoming albeit somewhat slo...
3       sadly used high tripadvisor rating literally f...
4       start meal especially given price visited husb...
                              ...                        
1497    despite reviews saying place hang especially b...
1498    beer good food awfull decent thing shish kabob...
1499    terrible service truly comedic level full pint...
1500    visited havana club museum located old havana ...
1501    food service awful pretty stop good photos bad...
Name: clean_reviews, Length: 1502, dtype: object

### Problem 3. Top words (1 point)
Use a simple word tokenization and count the top 10 words in positive reviews; then the top 10 words in negative reviews*. Once again, try to define what "top" words means. Describe and document your process. Explain your results.

\* Okay, you may want to see top N words (with $N \ge 10$).

I will first create lists of all words used in the positive reviews and all words used in the negative reviews. Then count the top values in both of them using **Counter** from **collections** library.

In [33]:
positive_reviews_words = []
for review in restaurant_reviews[restaurant_reviews.sentiment == "Positive"]['clean_reviews']:
    positive_reviews_words.extend(word_tokenize(review))
    
negative_reviews_words = []
for review in restaurant_reviews[restaurant_reviews.sentiment == "Negative"]['clean_reviews']:
    negative_reviews_words.extend(review.split(" "))

In [34]:
positive_words_counter = Counter(positive_reviews_words)
positive_words_counter.most_common(10)

[('food', 739),
 ('great', 571),
 ('service', 538),
 ('good', 512),
 ('place', 373),
 ('restaurant', 352),
 ('nice', 307),
 ('staff', 258),
 ('wine', 239),
 ('menu', 235)]

In [35]:
negative_words_counter = Counter(negative_reviews_words)
negative_words_counter.most_common(100)

[('food', 247),
 ('us', 205),
 ('restaurant', 185),
 ('wine', 175),
 ('good', 153),
 ('table', 147),
 ('service', 146),
 ('one', 134),
 ('menu', 131),
 ('would', 130),
 ('rome', 107),
 ('could', 101),
 ('place', 98),
 ('meal', 95),
 ('nice', 87),
 ('waitress', 86),
 ('really', 86),
 ('like', 86),
 ('time', 85),
 ('ad', 84),
 ('asked', 83),
 ('hoc', 82),
 ('even', 73),
 ('minutes', 72),
 ('tasting', 72),
 ('took', 71),
 ('truffle', 68),
 ('many', 67),
 ('restaurants', 66),
 ('go', 66),
 ('course', 65),
 ('made', 63),
 ('main', 63),
 ('night', 62),
 ('served', 62),
 ('evening', 62),
 ('reviews', 62),
 ('well', 60),
 ('never', 60),
 ('reservation', 60),
 ('glass', 60),
 ('next', 59),
 ('average', 58),
 ('left', 54),
 ('said', 53),
 ('way', 53),
 ('experience', 52),
 ('great', 52),
 ('bill', 52),
 ('told', 51),
 ('pasta', 51),
 ('cold', 51),
 ('back', 50),
 ('also', 50),
 ('price', 48),
 ('without', 48),
 ('came', 48),
 ('ordered', 46),
 ('arrived', 46),
 ('get', 46),
 ('got', 45),
 ('bett

The top words for both review types are pretty much the same and as expected are words like food, wine, service, restaurant etc. However the negative reviews are not what I expected to get. In order to actually get to the *negative* words, I need to showcase the first **100** just to get words like *disappointed', 'terrible','slow'*. There are a lot of seemingly positive words in the top words for the negative reviews, such as *'good', 'nice', 'like'*, but the words are most likely in combinations like *'not good'*, *'not nice'* etc:

In [37]:
restaurant_reviews[(restaurant_reviews['clean_reviews'].str.contains("good", case=False)) & (restaurant_reviews['sentiment'] == "Negative")]

Unnamed: 0,country,restaurant_name,sentiment,review_title,review_date,review,review_length,clean_reviews
0,France,The Frog at Bercy Village,Negative,Rude manager,2024-05-01,The manager became agressive when I said the c...,28,manager became agressive said carbonara good r...
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,2023-03-01,Sadly I used the high TripAdvisor rating too ...,276,sadly used high tripadvisor rating literally f...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,2022-11-01,From the start this meal was bad- especially g...,246,start meal especially given price visited husb...
5,France,The Frog at Bercy Village,Negative,"Shocking management, TERRIBLE service by mum a...",2021-07-01,We visited this restaurant on Saturday night a...,336,visited restaurant saturday night disappointin...
8,France,The Frog at Bercy Village,Negative,Expensive mediocre food and service,2019-06-01,We got the Tasting Menu and wine pairing. Fran...,43,got tasting menu wine pairing frankly food tas...
...,...,...,...,...,...,...,...,...
1485,Cuba,Old Square (Plaza Vieja),Negative,Not much there but expensivwe shops,2016-10-01,We went there but did not stay long as there w...,64,went stay long really nothing expensive clothi...
1489,Cuba,Old Square (Plaza Vieja),Negative,the beer factory,2016-10-01,beer is good. food is awfull The only decent...,30,beer good food awfull decent thing shish kabob...
1493,Cuba,Old Square (Plaza Vieja),Negative,Tourist Trap,2016-10-01,Food and service was awful. Very pretty stop. ...,14,food service awful pretty stop good photos bad...
1498,Cuba,Old Square (Plaza Vieja),Negative,the beer factory,2016-10-01,beer is good. food is awfull The only decent...,30,beer good food awfull decent thing shish kabob...


### Problem 4. Review titles (2 point)
How do the top words you found in the last problem correlate to the review titles? Do the top 10 words (for each sentiment) appear in the titles at all? Do reviews which contain one or more of the top words have the same words in their titles?

Does the title of a comment present a good summary of its content? That is, are the titles descriptive, or are they simply meant to catch the attention of the reader?

In [39]:
#Get sets of the top 10 words
top_10_positive = set(positive_words_counter.most_common(10)[0])
top_10_negative = set(negative_words_counter.most_common(10)[0])

In [40]:
#Create a function to get a set of words
def get_words(text):
    tokens = word_tokenize(text.lower())
    return set(word for word in tokens if word.isalnum())

#Create an additional column, containing a set of the title words
restaurant_reviews['title_words'] = restaurant_reviews.review_title.apply(get_words)

In [41]:
restaurant_reviews.title_words

0                                        {manager, rude}
1                               {disappointment, big, a}
2                     {food, bland, pretty, place, with}
3       {and, great, inedible, food, but, wine, service}
4                {in, ever, meal, rome, worst, possibly}
                              ...                       
1497                                {trap, the, tourism}
1498                                {beer, the, factory}
1499                                           {brewery}
1500                {exciting, it, there, over, nothing}
1501                                     {trap, tourist}
Name: title_words, Length: 1502, dtype: object

In [42]:
#Get lists containing common words between title and review for positive and negative reviews
positive_common_words = (restaurant_reviews[restaurant_reviews.sentiment == "Positive"]
                         .apply(lambda row: len(row.title_words & top_10_positive), axis = 1))
negative_common_words = (restaurant_reviews[restaurant_reviews.sentiment == "Negative"]
                         .apply(lambda row: len(row.title_words & top_10_negative), axis = 1))

In [43]:
#Get only the values that are not 0
positive_common_words = positive_common_words[positive_common_words > 0]
negative_common_words = negative_common_words[negative_common_words > 0]
print(f"Number of titles, containing at least one of the top 10 words for:\nnegative reviews: {len(negative_common_words)}\npositive reviews: {len(positive_common_words)}")

Number of titles, containing at least one of the top 10 words for:
negative reviews: 32
positive reviews: 178


In [44]:
restaurant_reviews.sentiment.value_counts()

sentiment
Positive    1237
Negative     265
Name: count, dtype: int64

We can see that only a small percentage of titles contain one of the top words.

In [46]:
restaurant_reviews.loc[negative_common_words.index]

Unnamed: 0,country,restaurant_name,sentiment,review_title,review_date,review,review_length,clean_reviews,title_words
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,2023-11-01,"This is an attractive venue with welcoming, al...",40,attractive venue welcoming albeit somewhat slo...,"{food, bland, pretty, place, with}"
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,2023-03-01,Sadly I used the high TripAdvisor rating too ...,276,sadly used high tripadvisor rating literally f...,"{and, great, inedible, food, but, wine, service}"
8,France,The Frog at Bercy Village,Negative,Expensive mediocre food and service,2019-06-01,We got the Tasting Menu and wine pairing. Fran...,43,got tasting menu wine pairing frankly food tas...,"{and, food, mediocre, expensive, service}"
14,France,The Frog at Bercy Village,Negative,Great wines but average food,2018-07-01,We both ate the truffle tasting menu with wine...,53,ate truffle tasting menu wines appetizer good ...,"{great, food, average, but, wines}"
22,France,The Frog at Bercy Village,Negative,Regrettable food-- cold pasta carbonara in Rome,2017-06-01,I don't write negative reviews. I'd rather say...,396,write negative reviews rather say nothing ever...,"{in, food, rome, carbonara, regrettable, pasta..."
46,France,The Frog at Bercy Village,Negative,POOR SERVICE. Great atmosphere. Ok food,2015-08-01,Can't understand it is number two of the price...,112,ca understand number two pricey restaurants tr...,"{great, food, poor, service, ok, atmosphere}"
51,France,The Frog at Bercy Village,Negative,Great ambience but the food quality ???,2015-04-01,Ad Hoc restaurant had a top 60 rating in Rome ...,235,ad hoc restaurant top 60 rating rome family ex...,"{great, food, the, but, ambience, quality}"
55,France,The Frog at Bercy Village,Negative,"Great staff, terrible food",2022-02-01,The only reason I'm giving it a poor and not t...,223,reason giving poor terrible staff wonderful at...,"{staff, great, food, terrible}"
59,France,The Frog at Bercy Village,Negative,Overpriced for subpar food,2022-02-01,I saw the amazing reviews on TA so decided to ...,74,saw amazing reviews ta decided book first nigh...,"{overpriced, for, subpar, food}"
60,France,The Frog at Bercy Village,Negative,Bad service and mediocre food,2022-02-01,"Bad service, literally had to flag down the wa...",89,bad service literally flag waiter multiple tim...,"{and, food, mediocre, service, bad}"


In [47]:
restaurant_reviews.loc[positive_common_words.index]

Unnamed: 0,country,restaurant_name,sentiment,review_title,review_date,review,review_length,clean_reviews,title_words
156,France,The Frog at Bercy Village,Positive,Great fast Service and Delicious food,2024-05-01,I tried Frog for the first time since my hotel...,78,tried frog first time since hotel near must sa...,"{delicious, and, great, food, fast, service}"
170,France,The Frog at Bercy Village,Positive,Service and food amazing!,2024-03-01,Ellen made our group feel comfortable and welc...,27,ellen made group feel comfortable welcome rest...,"{and, amazing, service, food}"
173,France,The Frog at Bercy Village,Positive,Amazing food and great service.,2024-02-01,Had a very good service by Ellen who served me...,86,good service ellen served wonderful energy rec...,"{amazing, and, great, food, service}"
178,France,The Frog at Bercy Village,Positive,Excellent pub with great vibe offers a range o...,2023-12-01,"Stopped in for coffee and dessert, wish we had...",66,stopped coffee dessert wish come lunch instead...,"{beers, and, great, of, food, pub, offers, exc..."
179,France,The Frog at Bercy Village,Positive,drinks and food <3,2023-12-01,The service was really good. preeyansh was my ...,21,service really good preeyansh server made sure...,"{3, drinks, food, and}"
...,...,...,...,...,...,...,...,...,...
1346,Morroco,The LOFT,Positive,"Great food, fascinating setting.",2018-09-01,I had walked past it and was interested and sa...,76,walked past interested saw reviews tempted vis...,"{setting, fascinating, great, food}"
1350,Morroco,The LOFT,Positive,Stunning Food,2018-08-01,Absolutely delicious! A nice change from tradi...,35,absolutely delicious nice change traditional m...,"{stunning, food}"
1351,Morroco,The LOFT,Positive,"Very cool decor and music, excellent food",2018-08-01,Sometimes when a restaurant decor and music ar...,115,sometimes restaurant decor music consciously c...,"{and, food, cool, music, very, decor, excellent}"
1354,Morroco,The LOFT,Positive,Amazing food,2018-07-01,We went to this place as a treat night as its ...,86,went place treat night little expensive restau...,"{amazing, food}"


It looks like most titles that contain at least one word from the top 10 list use words like 'food', 'service' and 'wine'. I would say that most of these titles are descriptive about the review.

### Problem 5. Bag of words (1 point)
Based on your findings so far, come up with a good set of settings (hyperparameters) for a bag-of-words model for review titles and contents. It's easiest to treat them separately (so, create two models); but you may also think about a unified representation. I find the simplest way of concatenating the title and content too simplistic to be useful, as it doesn't allow you to treat the title differently (e.g., by giving it more weight).

The documentation for `CountVectorizer` is [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html). Familiarize yourself with all settings; try out different combinations and come up with a final model; or rather - two models :).

I will use bags of one to three words for the reviews and exclude words that are met in less than 5 reviews and ones that appear in more than 90% of reviews.

In [51]:
review_vectorizer = CountVectorizer(ngram_range=(1,3), max_df=0.90, min_df=5)
review_vectorizer.fit(restaurant_reviews.review)

In [52]:
review_vectors = review_vectorizer.transform(restaurant_reviews.review)
review_vectors

<1502x5886 sparse matrix of type '<class 'numpy.int64'>'
	with 111047 stored elements in Compressed Sparse Row format>

Lets do the same for the titles.

In [54]:
title_vectorizer = CountVectorizer(ngram_range=(1,3), max_df = 0.90, min_df = 5)
title_vectorizer.fit(restaurant_reviews.review)

In [55]:
title_vectors = title_vectorizer.transform(restaurant_reviews.review_title)
title_vectors

<1502x5886 sparse matrix of type '<class 'numpy.int64'>'
	with 7219 stored elements in Compressed Sparse Row format>

### Problem 6. Deep sentiment analysis models (1 point)
Find a suitable model for sentiment analysis in English. Without modifying, training, or fine-tuning the model, make it predict all contents (or better, combinations of titles and contents, if you can). Meaure the accuracy of the model compared to the `sentiment` column in the dataset.

I will use the **SentimentIntensityAnalyzer** model from the **NLTK** library and create a function that calculates the sentiment using this model, then apply it to the **reviews** column.

In [58]:
analyzer = SentimentIntensityAnalyzer()
def analyze_sentiment(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

In [59]:
restaurant_reviews["predicted_sentiment"] = restaurant_reviews.review.apply(analyze_sentiment)

In [60]:
restaurant_reviews[["sentiment", "predicted_sentiment"]].sample(30)

Unnamed: 0,sentiment,predicted_sentiment
43,Negative,-0.9162
986,Positive,0.8908
617,Positive,0.9644
1481,Negative,0.8126
557,Negative,0.5538
1190,Positive,0.9657
959,Positive,0.9017
1298,Positive,0.8356
180,Positive,0.9429
1262,Positive,0.9731


The model guesses all the positive reviews correctly, but does mistake some negative reviews as positive.

### Problem 7. Deep features (embeddings) (1 point)
Use the same model to perform feature extraction on the review contents (or contents + titles) instead of direct predictions. You should already be familiar how to do that from your work on images.

Use the cosine similarity between texts to try to cluster them. Are there "similar" reviews (you'll need to find a way to measure similarity) across different restaurants? Are customers generally in agreement for the same restaurant?

In [63]:
def extract_features(text):
    sentiment = analyzer.polarity_scores(text)
    length = len(text)
    word_count = len(text.split())
    pos_count = sum(1 for word in text.split() if analyzer.polarity_scores(word)['compound'] > 0)
    neg_count = sum(1 for word in text.split() if analyzer.polarity_scores(word)['compound'] < 0)
    
    features = {
        'length': length,
        'word_count': word_count,
        'pos_count': pos_count,
        'neg_count': neg_count,
        'neg': sentiment['neg'],
        'neu': sentiment['neu'],
        'pos': sentiment['pos'],
        'compound': sentiment['compound']
    }
    return features

features_df = restaurant_reviews['review'].apply(lambda x: pd.Series(extract_features(x)))
restaurant_reviews = pd.concat([restaurant_reviews, features_df], axis=1)

In [64]:
features_df

Unnamed: 0,length,word_count,pos_count,neg_count,neg,neu,pos,compound
0,146.0,28.0,1.0,3.0,0.377,0.623,0.000,-0.9231
1,281.0,58.0,1.0,2.0,0.107,0.866,0.027,-0.6705
2,260.0,40.0,3.0,1.0,0.158,0.758,0.084,-0.6601
3,1513.0,279.0,25.0,8.0,0.091,0.677,0.232,0.9930
4,1351.0,243.0,7.0,8.0,0.079,0.845,0.076,0.0224
...,...,...,...,...,...,...,...,...
1497,776.0,147.0,6.0,1.0,0.035,0.844,0.121,0.9345
1498,169.0,30.0,1.0,0.0,0.000,0.909,0.091,0.4404
1499,150.0,28.0,2.0,2.0,0.180,0.654,0.166,-0.1189
1500,356.0,70.0,3.0,0.0,0.036,0.879,0.085,0.5493


In [65]:
similarity_matrix = cosine_similarity(features_df)
similarity_matrix

array([[1.        , 0.99979811, 0.99916742, ..., 0.99992622, 0.99974999,
        0.99960636],
       [0.99979811, 1.        , 0.99866869, ..., 0.99975153, 0.9999125 ,
        0.99899692],
       [0.99916742, 0.99866869, 1.        , ..., 0.99944391, 0.99911851,
        0.99883178],
       ...,
       [0.99992622, 0.99975153, 0.99944391, ..., 1.        , 0.99985096,
        0.99957101],
       [0.99974999, 0.9999125 , 0.99911851, ..., 0.99985096, 1.        ,
        0.99898656],
       [0.99960636, 0.99899692, 0.99883178, ..., 0.99957101, 0.99898656,
        1.        ]])

### \* Problem 8. Explore and model at will
In this lab, we focused on preprocessing and feature extraction and we didn't really have a chance to train (or compare) models. The dataset is maybe too small to be conclusive, but feel free to play around with ready-made models, and train your own.