First, we load the dataset and clean it up for text mining and sentiment analysis. We also write the cleaned data to a new CSV file.

In [None]:
# Dataset source: https://jmcauley.ucsd.edu/data/amazon/
import pandas as pd
import nltk
import math
import numpy as np
from numpy import dot
from numpy.linalg import norm
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared CountVectorizer instance; the bag-of-words cell further down fits it on df['review'].
vectorizer = CountVectorizer()
# Download the NLTK data packages named below.
# NOTE(review): presumably 'stopwords' backs stopwords.words('english') and 'punkt' backs
# word_tokenize; newer NLTK releases may also need 'punkt_tab' — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
#Json structure
'''
reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
asin - ID of the product, e.g. 0000013714
reviewerName - name of the reviewer
vote - helpful votes of the review
style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"
reviewText - text of the review
overall - rating of the product
summary - summary of the review
unixReviewTime - time of the review (unix time)
reviewTime - time of the review (raw)
image - images that users post after they have received the product
'''

In [None]:
# Convert reviews to all lowercase
def to_lower(text):
    """Return ``text`` with every cased character converted to lowercase."""
    lowered = text.lower()
    return lowered


# Remove special characters from reviews
def remove_special_char(text):
    """Replace every non-alphanumeric character of ``text`` with a single space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept unchanged,
    so the result always has the same length as the input.

    Built with ``str.join`` rather than repeated ``+`` concatenation, which
    copies the growing string on every character of a long review.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

# Remove stopwords: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
_ENGLISH_STOPWORDS = None  # built on first use, then reused by every rem_stopwords call


def rem_stopwords(text, stop_words=None):
    """Return ``text`` with stopwords removed.

    The text is tokenized with ``word_tokenize``; every token that is not in
    ``stop_words`` is kept, and each kept token is followed by one space (so a
    non-empty result ends with a trailing space).

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to NLTK's English stopword list, which is
        loaded once and cached instead of being rebuilt for every review.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return "".join(w + " " for w in word_tokenize(text) if w not in stop_words)

In [None]:
def getDataFromJson(filename) :
    """Load a JSON-lines review file and return a cleaned DataFrame.

    Steps, in order:
      1. read ``filename`` as line-delimited JSON;
      2. drop the metadata columns that are not used by the analysis;
      3. drop rows with no ``reviewText`` and lowercase the remaining text;
      4. keep only reviews with more than 5 whitespace-separated words;
      5. replace special characters with spaces and remove stopwords.

    The original row index of the surviving reviews is preserved.

    Parameters
    ----------
    filename : str
        Path of the JSON-lines file to read.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, with ``reviewText`` cleaned.
    """
    unused_columns = ['verified', 'reviewTime', 'reviewerID', 'asin', 'vote',
                      'unixReviewTime', 'reviewerName']

    reviews = pd.read_json(filename, lines=True)
    # errors='ignore' keeps this working for files that lack an optional column.
    reviews = reviews.drop(columns=unused_columns, errors='ignore')

    # A missing review is read as NaN (a float), which has no .lower(); such rows
    # could never pass the word-count filter below, so drop them up front.
    reviews = reviews.dropna(subset=['reviewText']).copy()
    reviews['reviewText'] = reviews['reviewText'].astype(str).apply(to_lower)

    # Reviews must have greater than 5 words to perform sentiment analysis.
    # .copy() makes the filtered frame independent so the assignment below does
    # not write into a slice of the unfiltered frame.
    long_enough = reviews['reviewText'].str.split().str.len().gt(5)
    reviews = reviews[long_enough].copy()

    reviews['reviewText'] = (
        reviews['reviewText'].apply(remove_special_char).apply(rem_stopwords)
    )

    return reviews

In [None]:
# Load the raw review dump and run the cleaning pipeline defined in getDataFromJson.
df = getDataFromJson('Movies_and_TV_5.json')

# End of data cleaning, clean data set
# Persist the cleaned frame (to_csv's default also writes the index as the first column).
df.to_csv("clean_data.csv")
# Bare expression so the notebook renders the cleaned frame as the cell output.
df

In [None]:
# Use overall to classify as positive or negative: 3+ is positive, 1 and 2 are negative
# reviewText is full review, will use for sentiment analysis
# Summary might be useful for EDA? Length of summary with score maybe?
# Style also for EDA

Next, we perform exploratory data analysis (EDA) to better understand the data and look for interesting patterns in it.

In [None]:
# EDA
# Count reviews per star rating and order the bars by rating (1..5) rather than
# by frequency. sort_index() replaces reindex(sorted(...), axis=1): a Series has
# no axis 1, so that argument was at best ignored.
pdf = df['overall'].value_counts().sort_index()
pdf.plot.bar(title='Number of reviews per rating', xlabel='overall rating', ylabel='number of reviews')

In [None]:
# 'style' holds one metadata dictionary per review. Dictionaries are unhashable,
# so they are converted to their string form before counting (the same conversion
# the crosstab cell applies). Missing styles therefore show up as the label 'nan'.
format_df = df['style'].astype(str).value_counts()
format_df.plot.pie()

In [None]:
# Rating-by-format contingency table, drawn as grouped bars (one group per rating).
style_labels = df['style'].astype(str)
rating_by_style = pd.crosstab(df.overall, style_labels)
rating_by_style.plot.bar()

In [None]:
# END OF EDA

In [None]:
# VECTORIZING BAG OF WORDS
# Combine the summary and the cleaned review body into one document per review.
# A missing summary is NaN; NaN + str stays NaN, and CountVectorizer rejects NaN
# documents, so treat a missing summary as empty text.
df['review'] = df['summary'].fillna('').astype(str) + " " + df['reviewText']

# Learn the vocabulary and build the document-term count matrix in a single pass
# (previously the vectorizer was fitted three times on the same data).
bag_of_words = vectorizer.fit_transform(df['review'])
names = vectorizer.get_feature_names_out()

In [None]:
# Print the first review's row of the bag-of-words matrix.
print(bag_of_words[0])

In [None]:
# USEFUL FUNCTIONS FOR BAG OF WORDS
# Scratch cell: only the last expression is rendered as the cell output.
# NOTE(review): assumes bag_of_words is a SciPy CSR matrix, where .indices/.data are the
# column ids and counts of the stored entries — confirm.
bag_of_words[0].indices
bag_of_words[0].data
# Column index the fitted vocabulary assigns to the token "learn" (None if the token is absent).
vectorizer.vocabulary_.get("learn")
# Reverse lookup: the token stored at column 1361.
# NOTE(review): 1361 is a hard-coded index whose meaning depends on the fitted vocabulary.
names[1361]


In [None]:
#TFIDF TESTING, PROBABLY NOT USEFUL?
tfidfVect = TfidfVectorizer()
# Fit the vectorizer on the combined review text and build the TF-IDF matrix in one call.
tfidf = tfidfVect.fit_transform(df['review'])
print(tfidf[2])

In [None]:
# Cosine similarity function seen in class
def getSimilarity(review1, review2) :
    """Cosine similarity between two review vectors.

    Parameters
    ----------
    review1, review2
        Single-row vectors exposing ``.todense()`` (e.g. one row of the
        bag-of-words matrix).

    Returns
    -------
    float
        ``dot(r1, r2) / (|r1| * |r2|)``, or ``0.0`` when either vector has
        zero norm. A review with no vocabulary terms shares nothing with any
        other review; returning 0.0 avoids the 0/0 division that previously
        produced NaN together with a RuntimeWarning.
    """
    r1 = np.asarray(review1.todense()).ravel()
    r2 = np.asarray(review2.todense()).ravel()

    denominator = norm(r1) * norm(r2)
    if denominator == 0:
        return 0.0
    return dot(r1, r2) / denominator

In [None]:
getSimilarity(bag_of_words[0],bag_of_words[0])

In [35]:
# Hand-built sentiment lexicons: words taken as evidence of a negative / positive review.
# The number words at the end of each list mirror the rating split used elsewhere in the
# notebook (1-2 negative, 3-5 positive).
negWords = ['abysmal','adverse','alarming','angry','annoy','annoying','anxious','apathy','appalling','atrocious','awful','ass','bad',
            'banal','barbed','belligerent','bemoan','beneath','boring','broken','callous','clumsy','coarse','cold','cold-hearted',
            'collapse','confused','confusing','contradictory','contrary','corrosive','corrupt','creepy','criminal','cruel',
            'damage','damaging','dastardly','dead','decaying','deformed','dumpster','deny','deplorable','depressed','deprived','despicable',
            'detrimental','dirty','disease','disgusting','disheveled', 'disagree','dishonest','dishonorable','dismal','distress','dreadful',
            'dreary','enraged','eroding','evil','fail', 'faulty', 'fear', 'feeble', 'filthy', 'foul', 'garbage','grave', 'greed', 'grim', 
            'gross', 'grotesque', 'gruesome', 'guilty','haggard', 'hard', 'harmful', 'hate', 'hideous', 'horrendous', 'horrible',
            'hostile', 'hurt', 'hurtful','icky', 'ignorant', 'ignore', 'ill', 'immature', 'imperfect', 'inelegant', 'infernal',
            'insidious', 'insipid','junky','lousy','mean','messy','monstrous','monstrosity','naive','nasty','negative','never','nonsense',
           'offensive','oppressive','pain','painful','plain','poor','poison','poisonous','prejudice','questionable','quit','repulsive',
           'rotten','rude','ruthless','sad','scary','shoddy','sick','sickening','sorry','stressful','suspicious','terrible','terrifying',
           'ugly','undermine','unfair','unfavorable','unhappy','unhealthy','unjust','unlucky','unpleasant','unsatisfactory','unwanted',
           'unwelcome','upset','vicious','vile','worthless','trash','trashcan', 'yuck', 'one','two']
# Spelling fixed for 'enchanting' and 'pleasurable'; the misspelled entries could never
# match a correctly spelled review token.
posWords = ['accepted','acclaimed','accomplishment','achievement','admire','adorable','agree','agreeable','amazing','angelic','appealing',
           'approve','attractive','awesome','beautiful','best','bliss','brilliant','bubbly','celebrate','charming','cheery','classic','commend',
           'cool','creative','cute','dazzling','delight','delightful','distinguished','ecstatic','electrifying','elegant','effective','enchanting',
           'encouraging','engaging','essential','esteemed','ethical','excellent','exciting','exquisite','fabulous','fair','fantastic','favorable',
           'fine','fresh','friendly','fun','funny','genius','genuine','glamorous','good','gorgeous','great','happy','healthy','heavenly',
           'ideal','innovative','intelligent','joy','joyful','laugh','laughing','legendary','lovely','love','marvelous','masterful','motivational',
            'nice','perfect','phenomenal','pleasant','pleasurable','polished','positive','powerful','proud','refined','refreshing','rejoice','remarkable',
           'respected','rewarding','right','safe','skilled','skillful','smile','soulful','special','stunning','success','successful','super','superb','terrific',
            'thrilling','thriving','unreal','upbeat','valued','welcome','wholesome','wonderful','wondrous','worthy','wow', 'yay', 'three', 'four', 'five']

def classify(index, k=201, min_similarity=0.10) :
    """Predict the sentiment of review ``index`` from its most similar reviews.

    The cosine similarity between row ``index`` of the global ``bag_of_words``
    matrix and every other row is computed; rows whose similarity exceeds
    ``min_similarity`` are candidates, and the ``k`` MOST similar candidates
    vote with their ``overall`` rating taken from the global ``df``.

    A neighbour with a rating below 3 counts as a negative vote, any other
    rating as a positive vote. The prediction combines the mean neighbour
    rating with the vote counts:

    * mean >= 2.5: positive unless negatives lead by more than one vote;
    * mean <  2.5: negative unless positives lead by more than one vote.

    Parameters
    ----------
    index : int
        Positional row of the review to classify.
    k : int, optional
        Maximum number of neighbours that vote (default 201).
    min_similarity : float, optional
        Candidates must have a cosine similarity strictly above this value
        (default 0.10).

    Returns
    -------
    int
        ``1`` for a positive prediction, ``-1`` for a negative one. When no
        review passes the similarity threshold the result is ``-1``, which is
        what the vote rule yields for zero votes.

    Notes
    -----
    Prints the true rating, the vote counts and the mean neighbour rating.
    """
    # Cosine similarity of the target row against all rows with one sparse
    # product, instead of densifying two vocabulary-sized vectors per pair.
    target = bag_of_words[index]
    dots = np.asarray((bag_of_words @ target.T).toarray(), dtype=float).ravel()
    row_norms = np.sqrt(
        np.asarray(bag_of_words.multiply(bag_of_words).sum(axis=1), dtype=float).ravel()
    )
    denominators = row_norms * row_norms[index]
    similarities = np.zeros_like(dots)
    # Rows without any vocabulary term have zero norm; their similarity stays 0.
    np.divide(dots, denominators, out=similarities, where=denominators > 0)
    # Leave-one-out: the review must never vote for itself.
    similarities[index] = -np.inf

    # Keep candidates above the threshold and rank them most-similar first.
    # (Sorting ascending would hand the vote to the k LEAST similar candidates.)
    candidates = np.flatnonzero(similarities > min_similarity)
    ranked = candidates[np.argsort(-similarities[candidates], kind='stable')]
    nearest = ranked[:k]

    ratings = df['overall'].to_numpy()[nearest]
    # Weighting/amount of negative and positive scores
    negScores = int(np.count_nonzero(ratings < 3.0))
    posScores = len(ratings) - negScores

    print("True rating ", df.iloc[index]['overall'])
    print("Pos scores: ", posScores)
    print("Neg scores: ", negScores)
    # the mean rating of the k nearest neighbors (NaN when there are none)
    avgRating = np.mean(ratings) if len(ratings) else float('nan')
    print("Avg rating : ", avgRating)

    if len(ratings) == 0 :
        return -1

    # Algorithm for determining classification, given the mean score and the
    # ratio of positive to negative similar reviews
    closeVote = abs(posScores - negScores) <= 1
    if avgRating >= 2.5 :
        return 1 if (posScores > negScores or closeVote) else -1
    return -1 if (posScores < negScores or closeVote) else 1

SyntaxError: unterminated string literal (detected at line 19) (1958712002.py, line 19)

In [None]:
# Smoke test: classify the first review (prints the vote summary; the cell output is the returned 1 or -1).
classify(0)

In [None]:
# Leave one out validator
# Every review is classified from the other reviews and the prediction is
# compared with the label derived from its own rating (below 3 is negative).
correct = 0
incorrect = 0
true_ratings = df['overall'].to_numpy()
for i in range(len(df)) :
    print("Index ", i)
    ratingType = -1 if true_ratings[i] < 3.0 else 1
    predicted = classify(i)
    print("Predicted: ", predicted)
    if ratingType == predicted :
        correct += 1
    else :
        print("Wrong classification")
        incorrect += 1
print("Correct: ", correct, " Incorrect: ", incorrect)
# Guard the ratio so an empty data set reports NaN instead of raising ZeroDivisionError.
total = correct + incorrect
accuracy = correct / total if total else float('nan')
print("Accuracy: ",  accuracy)
        