In [131]:
from collections import namedtuple

from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

nltk.download("vader_lexicon")

plt.style.use("ggplot")


def return_regr_score(y_true, y_pred):
    """
    Return regression scores.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    evs = explained_variance_score(y_true, y_pred)

    Scores = namedtuple("Scores", ["mse", "mae", "rmse", "r2", "evs"])
    return Scores(mse, mae, rmse, r2, evs)


def normalize_pred(preds, min_target, max_target):
    """
    Normalize predictions to the given range.
    """
    min_pred = preds.min()
    max_pred = preds.max()
    range_pred = max_pred - min_pred
    range_target = max_target - min_target

    preds_normal = ((preds - min_pred) / range_pred * range_target) + min_target
    return preds_normal

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/anj/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Load Data

In [121]:
df = pd.read_csv("input/ReviewsSmall.csv", usecols=["Text", "Summary", "Score"])
print(df.shape)
df.columns = df.columns.str.lower()
df.head(5)

(1000, 3)


Unnamed: 0,score,summary,text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


In [122]:
np.unique(df["score"], return_counts=True)

(array([1, 2, 3, 4, 5]), array([ 98,  47,  75, 138, 642]))

In [123]:
for score in sorted(df["score"].unique()):
    sample = df.loc[df["score"] == score, "text"].iloc[0]
    print(f"Score {score}: {sample}")

Score 1: Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
Score 2: If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal.
Score 3: This seems a little more wholesome than some of the supermarket brands, but it is somewhat mushy and doesn't have quite as much flavor either.  It didn't pass muster with my kids, so I probably won't buy it again.
Score 4: This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are fam

# VADER

In [124]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()
print(vader.polarity_scores("I love my wife!"))
print(vader.polarity_scores("Nothing in particular."))
print(vader.polarity_scores("I don't love you!"))

{'neg': 0.0, 'neu': 0.308, 'pos': 0.692, 'compound': 0.6696}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.647, 'neu': 0.353, 'pos': 0.0, 'compound': -0.5661}


In [125]:
vader_preds = np.zeros(df.shape[0])

for i, text in enumerate(df["text"].values):
    vader_preds[i] = vader.polarity_scores(text).get("compound")

In [126]:
vader_preds_normal = normalize_pred(vader_preds, 1, 5)
vader_preds_normal.min(), vader_preds_normal.max()

(1.0, 5.0)

In [127]:
return_regr_score(df["score"].values, vader_preds_normal)

Scores(mse=1.312321207809819, mae=0.7465035252345931, rmse=1.1455658897723078, r2=0.2522211585513854, evs=0.25536666650896)

In [128]:
for score in sorted(df["score"].unique()):
    sample_idx = df.loc[df["score"] == score].sample(1).index.values[0]
    text = df.loc[sample_idx, "text"]
    true_score = df.loc[sample_idx, "score"]
    vader_score = vader_preds_normal[sample_idx]
    print(f"True Score/Vader = {true_score:.3f}/{vader_score:.3f}")
    print(f"{text}")
    print()

True Score/Vader = 1.000/3.641
Kettle Branch Potato Chips New York Cheddar:  These are good if you like kettle fried potato chips that are waaaay salty, on the burnt side, and taste rancid, either because the cheese flavoring or the oil it was fried in was already old.  I want to like this brand of chips and try their new and other flavors every now and then.  But, after having tried all sorts of other brands of kettle cooked chips, these just don't hit the spot for me.

True Score/Vader = 2.000/4.868
I was extremely disappionted when I opened my shipment of Enjoy Life On The Go Bars (Very Berry, Carmel Apple, and Cocoa Loco). The boxes are now a plain, generic design..you know, the kind all brands change to when they want to make their products look cheaper. I then compared the ingredients of the new shipment with the older boxes I still had at home. Guess what?! Cheaper ingredients! Next, I taste-tested, and let my 3 year old help. He wouldn't eat the new ones...and I can't blame him

# Roberta Pretarined Model

In [129]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


def combine_scores(scores):
    negative, neutral, positive = scores
    if positive > neutral and positive > negative:
        return positive  # Strong positive sentiment
    elif negative > positive and negative > neutral:
        return -negative  # Strong negative sentiment
    else:
        return neutral  # Neutral sentiment


MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [138]:
sample = "I love Cherrybrook Kitchen and have tried almost all of their products.  I was excited to try the Ready to Spread frosting, as I do not usually keep the needed ingredients on hand for their box frosting.  I am not a picky eater, especially if the item contains sugar... but this was horrible!  The consistency is absolutely nothing like regular canned frosting.  It is not light and fluffy.  It is very gooey.  I cannot even imagine trying to spread it on a cake - it would rip it to pieces because it is so thick and gooey.  I did not even try it on a cake, because the taste and texture were so off-putting.  Seriously, do not waste your money!"

endoded_sample = tokenizer(sample, return_tensors="pt")
output = roberta_model(**endoded_sample)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores = combine_scores(scores)
scores

0.73450536

In [132]:
roberta_preds = np.zeros(df.shape[0])

for i, text in tqdm(enumerate(df["text"].values), total=len(df), desc="Processing"):
    try:
        endoded_text = tokenizer(text, return_tensors="pt")
        output = roberta_model(**endoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        score = combine_scores(scores)
    except RuntimeError:
        score = 0
    roberta_preds[i] = score

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing: 100%|██████████| 1000/1000 [02:11<00:00,  7.63it/s]


In [133]:
roberta_preds_normal = normalize_pred(roberta_preds, 1, 5)
roberta_preds_normal.min(), roberta_preds_normal.max()

(1.0, 5.0)

In [134]:
return_regr_score(df["score"].values, roberta_preds_normal)

Scores(mse=0.7827122513520923, mae=0.5085055407485638, rmse=0.8847102640707252, r2=0.5539996938093183, evs=0.5552486866284492)

In [139]:
for score in sorted(df["score"].unique()):
    sample_idx = df.loc[df["score"] == score].sample(1).index.values[0]
    text = df.loc[sample_idx, "text"]
    true_score = df.loc[sample_idx, "score"]
    vader_score = vader_preds_normal[sample_idx]
    roberta_score = roberta_preds_normal[sample_idx]
    print(f"True Score/Vader/Roberta = {true_score:.3f}/{vader_score:.3f}/{roberta_score:.3f}")
    print(f"{text}")
    print()

True Score/Vader/Roberta = 1.000/1.868/1.008
This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go.

True Score/Vader/Roberta = 2.000/2.187/1.159
I did not care for this product at all, I thought it was the same tea my sisters and I use to drink when we were in elementary school. I found out the tea we use to drink actually came from the root of a particular type of tree.

True Score/Vader/Roberta = 3.000/1.995/2.176
The pork chops from Omaha Steaks were very tasty but at the same time exceptionally dry. Pork is usually dry meat but these were to the extreme. Possibly shipping them frozen and keeping them frozen was partly the problem.

True Score/Vader/Roberta = 4.000/4.409/4.955
This kettle chips taste "Good , Crispy & Crunchy " too ! U will enjoy it also! Moreover,it's thinly cut & sliced!

True Score/Vader/Roberta = 5.000/4.529/4.964
I ordered these for my coffee themed wedding. When they arrived I had to fight off friends because they smelled and