In [62]:
from collections import namedtuple

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Make sure the "vader_lexicon" NLTK data package is available locally;
# the VADER section of this notebook uses nltk.sentiment.vader.
nltk.download("vader_lexicon")

# Use matplotlib's "ggplot" style sheet for any figures drawn in this notebook.
plt.style.use("ggplot")


# Result container for return_regr_score. Defined once at cell level so every
# call returns the same type (previously a new class was built on each call).
Scores = namedtuple("Scores", ["mse", "mae", "rmse", "r2", "evs"])


def return_regr_score(y_true, y_pred):
    """Compute a bundle of regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    Scores
        Named tuple with the fields ``mse`` (mean squared error), ``mae``
        (mean absolute error), ``rmse`` (square root of ``mse``), ``r2``
        (R^2 score) and ``evs`` (explained variance score).
    """
    mse = mean_squared_error(y_true, y_pred)
    return Scores(
        mse=mse,
        mae=mean_absolute_error(y_true, y_pred),
        rmse=np.sqrt(mse),  # derived from mse rather than recomputed
        r2=r2_score(y_true, y_pred),
        evs=explained_variance_score(y_true, y_pred),
    )

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/anj/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Load Data

In [9]:
# Read only the three columns this notebook works with.
review_columns = ["Text", "Summary", "Score"]
df = pd.read_csv("input/ReviewsSmall.csv", usecols=review_columns)
print(df.shape)

# Lower-case the column labels so they can be referenced as "text"/"summary"/"score".
df.columns = [label.lower() for label in df.columns]
df.head(5)

(10000, 3)


Unnamed: 0,score,summary,text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


In [11]:
# Distinct star ratings and how many reviews carry each one.
np.unique(df["score"].values, return_counts=True)

(array([1, 2, 3, 4, 5]), array([ 932,  590,  862, 1433, 6183]))

In [19]:
# Print the first review found for each star rating, lowest rating first.
for rating in sorted(df["score"].unique()):
    has_rating = df["score"] == rating
    first_review = df.loc[has_rating, "text"].iloc[0]
    print(f"Score {rating}: {first_review}")

Score 1: Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
Score 2: If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal.
Score 3: This seems a little more wholesome than some of the supermarket brands, but it is somewhat mushy and doesn't have quite as much flavor either.  It didn't pass muster with my kids, so I probably won't buy it again.
Score 4: This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are fam

# VADER

In [39]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()

# Sanity-check the analyzer on three short example sentences.
demo_sentences = (
    "I love my wife!",
    "Nothing in particular.",
    "I don't love you!",
)
for sentence in demo_sentences:
    print(vader.polarity_scores(sentence))

{'neg': 0.0, 'neu': 0.308, 'pos': 0.692, 'compound': 0.6696}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.647, 'neu': 0.353, 'pos': 0.0, 'compound': -0.5661}


In [45]:
# One VADER "compound" score per review, stored in the same order as df's rows.
vader_preds = np.fromiter(
    (vader.polarity_scores(review).get("compound") for review in df["text"].values),
    dtype=float,
    count=df.shape[0],
)

In [58]:
# Min-max rescale the compound scores so that the smallest observed value maps
# to 1 and the largest to 5, matching the range of the star ratings.
min_pred, max_pred = vader_preds.min(), vader_preds.max()
range_pred = max_pred - min_pred
range_target = 5 - 1

unit_scaled = (vader_preds - min_pred) / range_pred  # now in [0, 1]
vader_preds_normal = unit_scaled * range_target + 1

vader_preds_normal.min(), vader_preds_normal.max()

(1.0, 5.0)

In [63]:
# Score the rescaled VADER predictions against the true star ratings.
return_regr_score(y_true=df["score"].values, y_pred=vader_preds_normal)

Scores(mse=1.4091873332646165, mae=0.7885480273752014, rmse=1.18709196495664, r2=0.19987535086913033, evs=0.21319808103608962)

In [86]:
# Spot-check: for each star rating, show one randomly chosen review next to its
# rescaled VADER score.
# Seeded generator so the same reviews are shown on every run of this cell.
sample_rng = np.random.default_rng(0)

for score in sorted(df["score"].unique()):
    # Draw a *positional* row number. vader_preds_normal is a plain NumPy array
    # filled in df's row order, so it must be indexed by position rather than
    # by df's index labels (the two only coincide for a default RangeIndex).
    candidate_positions = np.flatnonzero(df["score"].values == score)
    sample_idx = sample_rng.choice(candidate_positions)
    text = df["text"].iloc[sample_idx]
    true_score = df["score"].iloc[sample_idx]
    vader_score = vader_preds_normal[sample_idx]
    print(f"True Score/Vader = {true_score:.3f}/{vader_score:.3f}")
    print(f"{text}")
    print()

True Score/Vader = 1.000/1.624
The worst!!! it is just plan awful bitter and strong and you cannot taste the Hazel Nut flavor at all!!!!!  Do not buy this product save your money!!!!!

True Score/Vader = 2.000/4.547
This coffee brews very weak - like a light roast.  W.P. Chef's reserve pods are just over half full.  Put pod on scale and it's 15% less coffee wgt than normal pods.  Stick with normal pods or extra bold if you like coffee that taste like coffee.

True Score/Vader = 3.000/2.876
The cookies came sealed and seem to be high quality in there ingredients.  The only thing is they just don't taste that good. My girlfriend and I thought these might be a good dessert-like option while doing the Paleo diet, but after trying one of each kind over a couple days we just don't find ourselves going back for more.  I would much rather not have a dessert and really check that box on a cheat day than being left wanting more after eating these.  If you have a true gluten allergy, maybe this i