In [1]:
import math, time, os, nltk
import multiprocessing as mp
import matplotlib as plt
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download("all")
analyzer = SentimentIntensityAnalyzer()


  from pandas.core.computation.check import NUMEXPR_INSTALLED
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data

[nltk_data]    |   Package nombank.1.0 is already up-to-date!
[nltk_data]    | Downloading package nonbreaking_prefixes to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package nonbreaking_prefixes is already up-to-date!
[nltk_data]    | Downloading package nps_chat to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package nps_chat is already up-to-date!
[nltk_data]    | Downloading package omw to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package omw is already up-to-date!
[nltk_data]    | Downloading package omw-1.4 to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package omw-1.4 is already up-to-date!
[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package opinion_lexicon is already up-to-date!
[nltk_data]    | Downloading package panlex_swadesh to
[nltk_data]    |     /Users/nicoletaylor/nltk_d

[nltk_data]    |   Package word2vec_sample is already up-to-date!
[nltk_data]    | Downloading package wordnet to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package wordnet is already up-to-date!
[nltk_data]    | Downloading package wordnet2021 to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package wordnet2021 is already up-to-date!
[nltk_data]    | Downloading package wordnet2022 to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package wordnet2022 is already up-to-date!
[nltk_data]    | Downloading package wordnet31 to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package wordnet31 is already up-to-date!
[nltk_data]    | Downloading package wordnet_ic to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_data]    |   Package wordnet_ic is already up-to-date!
[nltk_data]    | Downloading package words to
[nltk_data]    |     /Users/nicoletaylor/nltk_data...
[nltk_da

In [2]:
def setup(path_review_train, path_product_train):
    df_reviews = pd.read_json(path_review_train)[["asin","unixReviewTime","verified", "vote",
                                             "reviewText","summary"]]
    df_products = pd.read_json(path_product_train)
    df_train = pd.merge(df_reviews, df_products, on = "asin") #asin = amazon id of product being reviewed

    del df_products
    del df_reviews
    df_train = df_train[df_train['summary'].notna() & df_train['reviewText'].notna()]
    df_train["summary_sentiment"] = 0
    df_train["reviewText_sentiment"] = 0
    
    return df_train.head(10000)

In [3]:
def calcWeights(reviewSentiment, summarySentiment, upvotes, reviewTime, verifiedReview): 
    sentiment = float((reviewSentiment + summarySentiment) / 2)

    if reviewTime is not None:
        sentiment = scalebyTime(float(reviewTime), sentiment)
    if upvotes is not None:
        upvotes = int(upvotes.replace(",",""))
        sentiment = scalebyUpvotes(upvotes, sentiment)
    if verifiedReview is not None: 
        sentiment = scalebyVerifiedReview(verifiedReview, sentiment)
    else:
        sentiment = scalebyVerifiedReview(False, sentiment)
    return sentiment

In [4]:
def scalebyVerifiedReview(verifiedReview, sentiment):
    sentimentScaleVerified = 1 if sentiment > 0 else - 1
    sentimentScaleUnverified = -.5 if sentiment > 0 else .5
    if (verifiedReview):
        sentiment += sentimentScaleVerified
    else:
        sentiment += sentimentScaleUnverified
    return float(sentiment)

In [5]:
def scalebyUpvotes(upvotes, sentiment):
    
    if sentiment > 0:
        sentiment += math.log(upvotes, 10) 
    else: 
        sentiment -= math.log(upvotes, 10)
    return float(sentiment)

In [6]:
def scalebyTime(reviewTime, sentiment):
    currentTime = time.time()
    timeWeight = 1 / (currentTime - reviewTime + 1)
    sentiment = (timeWeight + 1) * sentiment
    return float(sentiment)

In [7]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [8]:
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']

In [9]:
def sentiment_analysis(text):
    #pre process
    processed_text = preprocess_text(text)

    #get sentiment
    sentiment = get_sentiment(processed_text)
    
    #return scores
    return float(sentiment)

In [10]:
def run_nlp(data, output_dict):
    star_scores = {"Five Stars": 0.75, "Four Stars": 0.5, "Three Stars": 0.25, "Two Stars": -0.5, "One Star": -0.75}
        
    for index,row in df_train.iterrows():
        print(index)
        #get data from rows
        asin = row["asin"]
        summary_text = row["summary"]
        review_text = row["reviewText"]
        time = row["unixReviewTime"]
        verified = row["verified"]
        
        #get sentiment analysis score for summary
        if summary_text in star_scores:
            summary_sentiment = star_scores[summary_text]
        else:
            summary_sentiment = sentiment_analysis(summary_text)
    
        #get sentiment analysis score for review text
        review_sentiment = sentiment_analysis(review_text)
    
        #weight based on review time, votes, verification
        weight = calcWeights(review_sentiment, summary_sentiment, vote, time, verified)
    
        #append sentiment scores to output DATAFRAME
        if asin in output_dict:
            output_dict[asin].append(weight)
        else:
            output_dict[asin] = [weight]

In [11]:
#setup 
wd = os.getcwd()
path_review_train = "devided_dataset_v2/Toys_and_Games/train/review_training.json"
path_product_train = "devided_dataset_v2/Toys_and_Games/train/product_training.json"

#setup training data
df_train = setup(path_review_train, path_product_train)

In [12]:
#run NLP on products
#list is vaid shared memory between threads, we will store output here
shared_process_list = []
#keep track of which element of list maps to which asin
index2asin = {}

with mp.Manager() as manager:
    output_dict = manager.dict()
    
    asin_ids = df_train["asin"].tolist()
    asin_p1 = asin_ids[0:(len(asin_ids)//2)]
    asin_p2 = asin_ids[(len(asin_ids)//2):]
    df_p1 = df_train[df_train["asin"].isin(asin_p1)]
    df_p2 = df_train[df_train["asin"].isin(asin_p2)]
    #delete old df
    #del df_train
    
    #start process 1
    p1 = mp.Process(target=run_nlp, args=(df_p1,output_dict))
    p1.start()
    
    #start process 2
    p2 = mp.Process(target=run_nlp, args=(df_p2,output_dict))
    p2.start()
    
    p1.join()
    p2.join()
    
    df_sentiments = pd.DataFrame.from_dict({"asin": list(output_dict.keys()), 
                                            "sentiment_scores": list(output_dict.values())})

#save intermediate in case
df_sentiments.to_csv("training_with_sentiment_scores.csv")

KeyboardInterrupt: 