In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk

In [None]:
# Load the Amazon Fine Food Reviews CSV into a DataFrame (df) and keep only
# the first rows so the remaining cells run quickly.
REVIEWS_CSV = '/kaggle/input/amazon-fine-food-reviews/Reviews.csv/Reviews.csv'
N_REVIEWS = 500  # number of leading rows kept for the analysis

df = pd.read_csv(REVIEWS_CSV)
print(df.shape)  # shape of the full file
df = df.iloc[:N_REVIEWS]
print(df.shape)  # shape after truncation

In [None]:
# Preview the first five rows of the truncated review frame
df.head(n=5)

## EDA

In [None]:
# value_counts() tallies how many reviews carry each star rating;
# sort_index() orders the bars by star value rather than by frequency.
axis = (
    df['Score']
    .value_counts()
    .sort_index()
    .plot(kind='bar', title='Count of Reviews by Stars', figsize=(10, 5))
)
axis.set_xlabel('Review Stars')
axis.set_ylabel('Number of Reviews')
# Render explicitly so the cell shows the figure instead of the repr of the
# last set_* call.
plt.show()

## BASIC NLTK

In [None]:
# Pick one review (row label 50) to use as the running example below
example = df.loc[50, 'Text']
print(example)

In [None]:
# Break the example review into word and punctuation tokens.
# NLTK's tokenizer handles punctuation and contractions, which a plain
# split on spaces would leave attached to neighbouring words.
tokens = nltk.tokenize.word_tokenize(example)

# Show only the first ten tokens
tokens[0:10]

In [None]:
# pos_tag attaches a part-of-speech tag to every token.
# A few common Penn Treebank tags:
#   DT  --> Determiner
#   NN  --> Singular noun
#   JJ  --> Adjective
#   PRP --> Personal pronoun
#   IN  --> Preposition / subordinating conjunction
tagged = nltk.tag.pos_tag(tokens)

# Show only the first ten (token, tag) pairs
tagged[0:10]

In [None]:
# Group the tagged tokens into named-entity chunks (returned as a tree),
# then pretty-print the tree so the nested structure is readable.
entities = nltk.ne_chunk(tagged)
entities.pprint()

## VADER
Valence Aware Dictionary (and) sEntiment Reasoner

Takes each word and assigns it a positive, negative, or neutral value. The values for all words are then combined to indicate whether the sentence as a whole has a positive, negative, or neutral sentiment.

NOTES:
Does not account for relationships between words, which is very important in human speech

Stop words (e.g. "and", "the") are typically absent from VADER's sentiment lexicon, so they carry no positive or negative weight.
 


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Progress bar for loops over the data. tqdm.auto uses the notebook widget
# when it is available and falls back to a plain text bar otherwise.
from tqdm.auto import tqdm

# The analyzer loads the VADER lexicon from NLTK's data directory and raises
# LookupError when it is missing, so fetch it once on a fresh environment.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Sentiment Intensity Analyzer object reused by every scoring cell below
sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity check on an obviously positive sentence.
# Expected reading: no negative share, some neutral, mostly positive.
# 'compound' ranges from -1 (most negative) to 1 (most positive) and
# summarises the overall sentiment in a single number.
sia.polarity_scores("I am so happy!")

In [None]:
# Sanity check on an obviously negative sentence.
# Expected reading: a fair negative share, a large neutral share, no positive,
# and a negative compound score overall.
# NOTE(review): the large neutral share is presumably due to most of the words
# carrying no sentiment weight, not to the "." vs "!" punctuation -- confirm.
sia.polarity_scores("This is the worst thing ever.")

In [None]:
# Score the example review chosen earlier.
# Its compound score comes out negative overall.
sia.polarity_scores(text=example)

In [None]:
# Run the polarity scorer over every review in the dataset.
# res maps each review Id to its VADER score dict (neg / neu / pos / compound).
# Only the 'Id' and 'Text' columns are needed, so iterate over those two
# columns directly instead of building a full row Series with iterrows().
res = {}
for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
    res[myid] = sia.polarity_scores(text)

In [None]:
# Turn the {Id: scores} dict into a table and attach the review metadata.
#   pd.DataFrame(res) -> one column per review Id, one row per score name
#   .T                -> transposes, giving one row per review Id
#   rename_axis/reset_index -> moves the Id index into a regular 'Id' column
#   merge(on='Id')    -> left-joins the original review columns; the join key
#                        is named explicitly rather than inferred from
#                        whichever column names the two frames share
vaders = (
    pd.DataFrame(res)
    .T
    .rename_axis('Id')
    .reset_index()
    .merge(df, how='left', on='Id')
)

In [None]:
# The merged table now holds both the sentiment scores and the review
# metadata, so sentiment can be compared against each review's star rating.
vaders.head(n=5)

## Plot VADER Results

In [None]:
# The column name is lowercase 'compound', matching the key that
# sia.polarity_scores emits.
axis = sns.barplot(x='Score', y='compound', data=vaders)
axis.set(title='Compound Score by Amazon Star Review')
plt.show()

# The plot shows lower-star reviews with lower compound scores and
# higher-star reviews with higher compound scores.


In [None]:
# One bar chart per VADER component (pos, neu, neg), side by side.

# 1x3 grid of axes; figsize is the size of the whole figure in inches.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))

# (score column, panel title) for each axis, left to right
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=score_col, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()


Graphs validate VADER

For positive, the more stars a review has, the more positive the sentiment is. The fewer stars a review has, the less positive the sentiment is.

For neutral, the scores are roughly even across star ratings.

For negative, the more stars a review has, the less negative the sentiment is. The fewer stars a review has, the more negative the sentiment is.