In [1]:
import importlib
from collections import namedtuple

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from tqdm import tqdm

import cleaning
import preprocessing

for module in [cleaning, preprocessing]:
    try:
        importlib.reload(module)  # reload module
    except NameError:
        pass


nltk.download("vader_lexicon")

plt.style.use("ggplot")


def return_regr_score(y_true, y_pred):
    """
    Return regression scores.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    evs = explained_variance_score(y_true, y_pred)

    Scores = namedtuple("Scores", ["mse", "mae", "rmse", "r2", "evs"])
    return Scores(mse, mae, rmse, r2, evs)


def normalize_pred(preds, min_target, max_target):
    """
    Normalize predictions to the given range.
    """
    min_pred = preds.min()
    max_pred = preds.max()
    range_pred = max_pred - min_pred
    range_target = max_target - min_target

    preds_normal = ((preds - min_pred) / range_pred * range_target) + min_target
    return preds_normal

[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 101]
[nltk_data]     Network is unreachable>


# Load Data

In [2]:
def load_data(subset):
    if subset == "train":
        path = "input/ReviewsSmallTrain.csv"
    elif subset == "test":
        path = "input/ReviewsSmallTest.csv"

    df = pd.read_csv(path, usecols=["Text", "Summary", "Score"])
    df.columns = df.columns.str.lower()
    return df

In [3]:
df_train = load_data("train")
print(df_train.shape)
df_train.head(5)

(10000, 3)


Unnamed: 0,score,summary,text
0,4,My dog loves these but....,I am so convinced these are human animal crack...
1,5,She loves them...,I have a whole box of peanut butter dog cookie...
2,3,Not healthy but they taste good,These little animal crackers taste good & my d...
3,5,My dog loves these!!!!,My chihuahua loves these lil snacks. When she ...
4,5,The Puppy Dogs Love Them!,I purchased these little treats as stocking st...


In [4]:
df_test = load_data("test")
print(df_test.shape)
df_test.head(5)

(1000, 3)


Unnamed: 0,score,summary,text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
np.unique(df_train["score"], return_counts=True)

(array([1, 2, 3, 4, 5]), array([ 938,  600,  855, 1419, 6188]))

In [6]:
np.unique(df_test["score"], return_counts=True)

(array([1, 2, 3, 4, 5]), array([ 98,  47,  75, 138, 642]))

In [7]:
for score in sorted(df_train["score"].unique()):
    sample = df_train.loc[df_train["score"] == score, "text"].iloc[0]
    print(f"Score {score}: {sample}")

Score 1: The noodles in the box were all broken.  The sauce was over salted and did not have a good flavor.  I threw out most of the skillet.  I would recommend not purchasing this product.
Score 2: Ive been craving some deer jerky for a while so i gave this one and some others on amazon a try. This one was very sinue and a little too sweet for my taste. Not going to buy again. Im still on my search for the salty dry version im after
Score 3: These little animal crackers taste good & my doggies like them, but I won't be buying again. They are like the human animal crackers & have flour & sugar & other bad stuff for dogs. I will continue to buy the healthy treats for my doggies which they also love without sugar. You might want to try Wellness, Blue Buffalo, Natural Balance, Zukes, just to name a few.
Score 4: I am so convinced these are human animal crackers. They look and taste EXACTLY like human ones (yes, I tasted it) and they contain sugar, which is not one of the last ingredients 

# VADER

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()
print(vader.polarity_scores("I love my wife!"))
print(vader.polarity_scores("Nothing in particular."))
print(vader.polarity_scores("I don't love you!"))

{'neg': 0.0, 'neu': 0.308, 'pos': 0.692, 'compound': 0.6696}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.647, 'neu': 0.353, 'pos': 0.0, 'compound': -0.5661}


In [9]:
vader_preds = np.zeros(df_test.shape[0])

for i, text in enumerate(df_test["text"].values):
    vader_preds[i] = vader.polarity_scores(text).get("compound")

In [10]:
vader_preds_normal = normalize_pred(vader_preds, 1, 5)
vader_preds_normal.min(), vader_preds_normal.max()

(1.0, 5.0)

In [11]:
vader_clf_scores = return_regr_score(df_test["score"].values, vader_preds_normal)
vader_clf_scores

Scores(mse=1.312321207809819, mae=0.7465035252345931, rmse=1.1455658897723078, r2=0.2522211585513854, evs=0.25536666650896)

In [12]:
for score in sorted(df_test["score"].unique()):
    sample_idx = df_test.loc[df_test["score"] == score].sample(1).index.values[0]
    text = df_test.loc[sample_idx, "text"]
    true_score = df_test.loc[sample_idx, "score"]
    vader_score = vader_preds_normal[sample_idx]
    print(f"True Score/Vader = {true_score:.3f}/{vader_score:.3f}")
    print(f"{text}")
    print()

True Score/Vader = 1.000/1.199
These chips are nasty.  I thought someone had spilled a drink in the bag, no the chips were just soaked with grease.  Nasty!!

True Score/Vader = 2.000/2.973
The individual Pocky sticks are not separately packaged and melt and stick together while being shipped.

True Score/Vader = 3.000/4.949
I liked getting this one for my twin 10-month olds because it has such interesting ingredients, particularly the zucchini and the garbanzo beans.  I like the fact that it's vegetarian yet I believe it has a complete protein since it also has brown rice.  This list of ingredients looks excellent and it's a green color naturally.<br /><br />The babies liked it.  They have liked everything that has carrots.<br /><br />I have the subscription for this one as I really like the ingredients and the fact that I'm giving the babies food that is naturally green.<br /><br />Ingredients per the label: Water, organic corn, organic zucchini, organic green beans, organic brown ric

# Classic ML Model Trained from Scratch

In [13]:
from sklearn.linear_model import LogisticRegression

x_train = [cleaning.clean(text) for text in df_train["text"]]
x_test = [cleaning.clean(text) for text in df_test["text"]]
y_train = df_train["score"].values
y_test = df_test["score"].values

  soup = BeautifulSoup(text, "html.parser")


In [14]:
for i in range(5):
    print(f"Before: {df_train.iloc[i]["text"]}")
    print(f"After: {x_train[i]}")
    print()

Before: I am so convinced these are human animal crackers. They look and taste EXACTLY like human ones (yes, I tasted it) and they contain sugar, which is not one of the last ingredients in the nutrition facts either. So ill just give my dog very small amounts.
After: i am so convinced these are human animal crackers they look and taste exactly like human ones yes i tasted it and they contain sugar which is not one of the last ingredients in the nutrition facts either so ill just give my dog very small amounts

Before: I have a whole box of peanut butter dog cookies and she wont touch them. She loves these and begs for them so it was a good buy. She is a little chihuhua and they are not too big for her mouth. About 20 per box.
After: i have a whole box of peanut butter dog cookies and she wont touch them she loves these and begs for them so it was a good buy she is a little chihuhua and they are not too big for her mouth about 20 per box

Before: These little animal crackers taste good

In [15]:
x_train_preprocessed = [preprocessing.preprocess(text) for text in x_train]
x_test_preprocessed = [preprocessing.preprocess(text) for text in x_test]

In [16]:
for i in range(5):
    print(f"Before: {x_train[i]}")
    print(f"After: {x_train_preprocessed[i]}")
    print()

Before: i am so convinced these are human animal crackers they look and taste exactly like human ones yes i tasted it and they contain sugar which is not one of the last ingredients in the nutrition facts either so ill just give my dog very small amounts
After: ['convinced', 'human', 'animal', 'cracker', 'look', 'taste', 'exactly', 'like', 'human', 'one', 'yes', 'tasted', 'contain', 'sugar', 'one', 'last', 'ingredient', 'nutrition', 'fact', 'either', 'ill', 'give', 'dog', 'small', 'amount']

Before: i have a whole box of peanut butter dog cookies and she wont touch them she loves these and begs for them so it was a good buy she is a little chihuhua and they are not too big for her mouth about 20 per box
After: ['whole', 'box', 'peanut', 'butter', 'dog', 'cooky', 'wont', 'touch', 'love', 'begs', 'good', 'buy', 'little', 'chihuhua', 'big', 'mouth', 'per', 'box']

Before: these little animal crackers taste good my doggies like them but i will not be buying again they are like the human an

In [17]:
import gensim

x_train_tagged = []
for i in range(len(x_train_preprocessed)):
    tagged_doc = gensim.models.doc2vec.TaggedDocument(x_train_preprocessed[i], [i])
    x_train_tagged.append(tagged_doc)

In [18]:
x_train_tagged[:3]

[TaggedDocument(words=['convinced', 'human', 'animal', 'cracker', 'look', 'taste', 'exactly', 'like', 'human', 'one', 'yes', 'tasted', 'contain', 'sugar', 'one', 'last', 'ingredient', 'nutrition', 'fact', 'either', 'ill', 'give', 'dog', 'small', 'amount'], tags=[0]),
 TaggedDocument(words=['whole', 'box', 'peanut', 'butter', 'dog', 'cooky', 'wont', 'touch', 'love', 'begs', 'good', 'buy', 'little', 'chihuhua', 'big', 'mouth', 'per', 'box'], tags=[1]),
 TaggedDocument(words=['little', 'animal', 'cracker', 'taste', 'good', 'doggy', 'like', 'buying', 'like', 'human', 'animal', 'cracker', 'flour', 'sugar', 'bad', 'stuff', 'dog', 'continue', 'buy', 'healthy', 'treat', 'doggy', 'also', 'love', 'without', 'sugar', 'might', 'want', 'try', 'wellness', 'blue', 'buffalo', 'natural', 'balance', 'zukes', 'name'], tags=[2])]

In [19]:
doc2vec_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=100)
doc2vec_model.build_vocab(x_train_tagged)

In [20]:
print(
    f"Word 'human' appeared {doc2vec_model.wv.get_vecattr('human', 'count')} times in the training corpus."
)

Word 'human' appeared 46 times in the training corpus.


In [21]:
doc2vec_model.corpus_count == len(x_train)

True

In [22]:
doc2vec_model.train(x_train_tagged, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

sample_1 = x_train_tagged[2]
sample_2 = x_train_tagged[5]
sample_3 = ["vanquish", "that", "which", "kill", "you"]

vector_1 = doc2vec_model.infer_vector(sample_1.words)
vector_2 = doc2vec_model.infer_vector(sample_2.words)
vector_3 = doc2vec_model.infer_vector(sample_3)

print(cosine_similarity(vector_1.reshape(1, -1), vector_2.reshape(1, -1)))
print(cosine_similarity(vector_1.reshape(1, -1), vector_3.reshape(1, -1)))

[[0.44460624]]
[[0.1262174]]


In [24]:
ranks = []
for doc_id in range(len(x_train_tagged)):
    # Infer the vector of training doc
    inferred_vector = doc2vec_model.infer_vector(x_train_tagged[doc_id].words)  
    # Get the most similar document to training doc from training doc itself
    # Ideally all doc should be most similar to itself
    # This is for sanity check
    sims = doc2vec_model.dv.most_similar([inferred_vector], topn=len(doc2vec_model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

In [25]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 9496, 1: 497, 3: 2, 5: 2, 4: 2, 2: 1})


In [26]:
x_train_doc2vec = np.array([doc2vec_model.infer_vector(doc) for doc in x_train_preprocessed])
x_test_doc2vec = np.array([doc2vec_model.infer_vector(doc) for doc in x_test_preprocessed])

In [27]:
x_train_doc2vec.shape, x_test_doc2vec.shape

((10000, 50), (1000, 50))

In [28]:
model = LogisticRegression(random_state=8, max_iter=1_000)
model.fit(x_train_doc2vec, y_train)
logreg_preds = model.predict(x_test_doc2vec)

logreg_clf_scores = return_regr_score(y_test, logreg_preds)
logreg_clf_scores

Scores(mse=1.931, mae=0.703, rmse=1.3896042602122376, r2=-0.10031060554691029, evs=-0.07146377778626145)

In [29]:
for score in sorted(df_test["score"].unique()):
    sample_idx = df_test.loc[df_test["score"] == score].sample(1).index.values[0]
    text = df_test.loc[sample_idx, "text"]
    true_score = df_test.loc[sample_idx, "score"]
    vader_score = vader_preds_normal[sample_idx]
    logreg_score = logreg_preds[sample_idx]
    print(f"True Score/Vader/Logreg = {true_score:.3f}/{vader_score:.3f}/{logreg_score:.3f}")
    print(f"{text}")
    print()

True Score/Vader/Logreg = 1.000/4.826/1.000
Wow, i am shocked to find clear plastic pieces in the jars. The other posters say they notified Earth's Best about this in August and they are still selling it without checking or recalling!! No more earth's best for my little guy. I am taking my business elsewhere and I am very sad that organic company is selling baby food with plastic and bpa in their jars. They gotta do better than this to get my business.

True Score/Vader/Logreg = 2.000/4.680/5.000
I can't eat these oats, they have a funny taste to them.  My kids also think they taste funny.  My husband thinks they are OK, so he has 50 pounds of oats to eat on his own!

True Score/Vader/Logreg = 3.000/4.868/5.000
OK,I did read the reviews which ranged from splendid to awful. I took the plunge as I do like salty, and tangy flavors. No, I had never had these before I took the leap and ordered a case at a super price. They arrived quickly and in great shape with the sell by date  FEB 2011. 

# Roberta Pretarined Model

In [30]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


def combine_roberta_scores(scores):
    negative, neutral, positive = scores
    if positive > neutral and positive > negative:
        return positive  # Strong positive sentiment
    elif negative > positive and negative > neutral:
        return -negative  # Strong negative sentiment
    else:
        return neutral  # Neutral sentiment


MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
sample = "If it were possible to give this product zero stars, I would have done so.  I am a huge fan of Twinings teas, and was very excited to try this new blend.  After arriving home from the grocery, I immediately started the kettle to enjoy a cup while prepping dinner.  Between sorting, washing, and chopping items, the hot water was added to the bag to steep.  Preparation continued, until a distinctly off odor hit me.  I thought one of my ingredients must be rancid!  I hunted around for almost four minutes before I realized it was the tea.  Even after cleaning up, and airing out the kitchen, my roommate later came in and asked about the smell.  Save your money to purchase ANY other Twinings product."

endoded_sample = tokenizer(sample, return_tensors="pt")
output = roberta_model(**endoded_sample)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores = combine_roberta_scores(scores)
scores

0.83730406

In [32]:
roberta_preds = np.zeros(df_test.shape[0])

for i, text in tqdm(enumerate(df_test["text"].values), total=len(df_test), desc="Processing"):
    try:
        endoded_text = tokenizer(text, return_tensors="pt")
        output = roberta_model(**endoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        score = combine_roberta_scores(scores)
    except RuntimeError:
        score = 0
    roberta_preds[i] = score

Processing: 100%|██████████| 1000/1000 [02:00<00:00,  8.33it/s]


In [33]:
roberta_preds_normal = normalize_pred(roberta_preds, 1, 5)
roberta_preds_normal.min(), roberta_preds_normal.max()

(1.0, 5.0)

In [34]:
roberta_clf_scores = return_regr_score(df_test["score"].values, roberta_preds_normal)
roberta_clf_scores

Scores(mse=0.7827122513520923, mae=0.5085055407485638, rmse=0.8847102640707252, r2=0.5539996938093183, evs=0.5552486866284492)

In [35]:
for score in sorted(df_test["score"].unique()):
    sample_idx = df_test.loc[df_test["score"] == score].sample(1).index.values[0]
    text = df_test.loc[sample_idx, "text"]
    true_score = df_test.loc[sample_idx, "score"]
    vader_score = vader_preds_normal[sample_idx]
    logreg_score = logreg_preds[sample_idx]
    roberta_score = roberta_preds_normal[sample_idx]
    print(
        f"True Score/Vader/Logreg/Roberta = {true_score:.3f}/{vader_score:.3f}/{logreg_score:.3f}/{roberta_score:.3f}"
    )
    print(f"{text}")
    print()

True Score/Vader/Logreg/Roberta = 1.000/1.941/1.000/1.293
I bought this brand as a trial since I am tired of the Pingos.<br /><br />It claims that it is natural. I have no argument on this. But the point is that more than 50% in the bag is over-fried and in brown color. I really suffer eating the over-fried chips. I open some other bags and it looks like the same. So I just throw away all of them. I don't know if I was with bad luck or every bag they are selling is the same. But for sure I will never buy this brand any more.

True Score/Vader/Logreg/Roberta = 2.000/2.218/5.000/1.124
I really love Kettle brand chips, but these are rather disappointing. There is very little sour cream or onion flavor.

True Score/Vader/Logreg/Roberta = 3.000/4.833/5.000/1.353
Yes, they are good. However, they are absolutely inundated with the flavoring stuff, which is initially satisfying then becomes "too much". And certainly isn't good for the waistline or general nutritional issues.<br /><br />AND the

# Compare Scores

In [36]:
vader_clf_scores, logreg_clf_scores, roberta_clf_scores

(Scores(mse=1.312321207809819, mae=0.7465035252345931, rmse=1.1455658897723078, r2=0.2522211585513854, evs=0.25536666650896),
 Scores(mse=1.931, mae=0.703, rmse=1.3896042602122376, r2=-0.10031060554691029, evs=-0.07146377778626145),
 Scores(mse=0.7827122513520923, mae=0.5085055407485638, rmse=0.8847102640707252, r2=0.5539996938093183, evs=0.5552486866284492))

In [37]:
pd.DataFrame(
    [vader_clf_scores, logreg_clf_scores, roberta_clf_scores], index=["vader", "logreg", "roberta"]
).transpose()

Unnamed: 0,vader,logreg,roberta
mse,1.312321,1.931,0.782712
mae,0.746504,0.703,0.508506
rmse,1.145566,1.389604,0.88471
r2,0.252221,-0.100311,0.554
evs,0.255367,-0.071464,0.555249


# Review: Where Models are "Wrong" the Most

In [38]:
sorted_idx = np.argsort(np.abs(df_test["score"].values - vader_preds_normal))
messages = df_test["text"].values[sorted_idx][-5:]
correct_scores = df_test["score"].values[sorted_idx][-5:]
vader_scores = vader_preds_normal[sorted_idx][-5:]

for i in range(len(messages)):
    print(
        f"Correct score vs vader score = {correct_scores[i]: .3f} vs {vader_scores[i]: .3f}"
    )
    print(messages[i])
    print()

Correct score vs vader score =  1.000 vs  4.845
My daughter had extensive food allergies as an infant/toddler and we bought these bars consistently.  She still has the nut and egg allergy so I like that I can trust the manufacturer and know the food is safe.  Unfortnately the last few times we have purchased the bars (all varieties) they have been hard as a rock and my daughter won't eat them.  I have tried them myself and thought I might break a tooth. I had them on subscribe and save; which I have now cancelled and have several boxes which no one in my family will eat.  I agree with a previous post. I want to love these bars and I love everything the company stands for (and actually really like their cookies), but something has changed with these bars and they are just not even appetizing.  I hope they address the issue and I will certainly try them again if they do something to enhance the taste and address the issue with the texture. A total bummer -  they were better in the past b

In [39]:
sorted_idx = np.argsort(np.abs(df_test["score"].values - logreg_preds))
messages = df_test["text"].values[sorted_idx][-5:]
correct_scores = df_test["score"].values[sorted_idx][-5:]
logreg_scores = logreg_preds[sorted_idx][-5:]

for i in range(len(messages)):
    print(
        f"Correct score vs logreg score = {correct_scores[i]: .3f} vs {logreg_scores[i]: .3f}"
    )
    print(messages[i])
    print()

Correct score vs logreg score =  1.000 vs  5.000
I am very disappointed with this product because I could not see how many calories it has at the time I ordered it. I received it and now I can see it has 60 calories per tablespoon. Granulated sugar has only 45 calories per tablespoon. How could Agave Nectar be better than sugar?

Correct score vs logreg score =  1.000 vs  5.000
To me, these are nothing like the regular Altoids and are not breath mints. They are pleasant-tasting little candies in a cute convenient tin, and that's as far as it goes. The mintiness is just not strong, and the wintergreens are definitely weaker than the peppermint minis. I'm not a dragon-breath person, but still, one of these mints is too small to have any effect on my breath. Four or five will freshen my breath for a short while - maybe 15 minutes. At this point, I think the Icebreakers Frost mints are the best as sugar free breath mints.

Correct score vs logreg score =  1.000 vs  5.000
I was so excited t

In [40]:
sorted_idx = np.argsort(np.abs(df_test["score"].values - roberta_preds_normal))
messages = df_test["text"].values[sorted_idx][-5:]
correct_scores = df_test["score"].values[sorted_idx][-5:]
roberta_scores = roberta_preds_normal[sorted_idx][-5:]

for i in range(len(messages)):
    print(
        f"Correct score vs roberta score = {correct_scores[i]: .3f} vs {roberta_scores[i]: .3f}"
    )
    print(messages[i])
    print()

Correct score vs roberta score =  5.000 vs  1.218
The product is all that it says it is which is why I gave it 5 stars.  However, I do have issues with the product, namely, me and my family members can't take the smell.  The taste is horrible so I put it in a lotion cream and rubbed it on my skin.  This was a few weeks ago and though I've since changed my sheets, the scent is still in my bed.  Ultimately, I stopped using it because it made me smell in a way I did not like.  It smells like an Indian spice and while it didn't work for me, scent-wise, it may work fine with someone else.

Correct score vs roberta score =  5.000 vs  1.217
If this flavor lasted longer than it does, I'd probably die of starvation for lack of wanting to take it out of my mouth.

Correct score vs roberta score =  5.000 vs  1.203
this gum is super sick.tatooes are killin.flavor is a spankin'.this brings back bomb diggity memories yo.peace out.

Correct score vs roberta score =  5.000 vs  1.190
I've eaten other b