In [22]:
import json
import csv
import pandas as pd
from pathlib import Path
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Shared VADER analyzer instance; the scoring loop further down calls
# sid.polarity_scores() on each review's text.
# NOTE(review): presumably needs NLTK's vader_lexicon resource to be available
# locally before this line runs — confirm.
sid = SentimentIntensityAnalyzer()

In [13]:
# Run the VADER analysis on every review and keep only the compound score
# (-1 to 1). One dict per review is collected in output_rows, carrying the
# review id, the raw text, the compound sentiment and the original star value.
output_rows = []
# newline='' is what the csv module asks for when reading: it hands line
# breaks inside quoted review text to the parser untranslated.
with open('reviews_restaurants.csv', 'r', newline='') as file:
    reader = csv.DictReader(file)

    for row in reader:
        text = row['text']
        output_rows.append({
            'review_id': row['review_id'],
            'text': text,
            'sentiment': sid.polarity_scores(text)['compound'],
            'stars': row['stars'],
        })

KeyboardInterrupt: 

In [6]:
# Build the results frame from the collected records. The columns are named
# explicitly (same order as the keys of each record) so the frame keeps its
# schema even when output_rows is empty, e.g. if the scoring cell was
# interrupted before any row was read; without this, later lookups such as
# output_df['stars'] would raise KeyError.
output_df = pd.DataFrame(output_rows, columns=['review_id', 'text', 'sentiment', 'stars'])

In [7]:
# Drop incomplete rows and change the dtype of 'stars' to integer.
# Only the first character of each star value is used, so a value such as
# "4.0" becomes 4.
# - astype(str) keeps this cell re-runnable: once 'stars' is already int, the
#   .str accessor on its own would raise AttributeError on a second run.
# - errors='coerce' turns blank or non-numeric star values into NaN so that
#   dropna() removes those rows instead of astype(int) raising ValueError.
stars_first_digit = pd.to_numeric(
    output_df['stars'].astype(str).str.slice(0, 1),
    errors='coerce',
)
output_df = (
    output_df
    .assign(stars=stars_first_digit)
    .dropna()
    .astype({'stars': int})
)

In [8]:
# Split the scored reviews into five frames, one per band of the VADER
# compound score. Both ends of every band are inclusive, so a score that sits
# exactly on a shared boundary is selected into both neighbouring frames.
_band_columns = ['review_id', 'text', 'sentiment', 'stars']
_compound = output_df['sentiment']

n_one_staradj = output_df.loc[_compound.between(-1, -0.3), _band_columns]

# NOTE(review): the lower bound here is -0.5 while the one-star band ends at
# -0.3, so scores in [-0.5, -0.3] land in both frames — confirm whether -0.3
# was intended.
n_two_staradj = output_df.loc[_compound.between(-0.5, 0.3), _band_columns]

n_three_staradj = output_df.loc[_compound.between(0.3, 0.75), _band_columns]

n_four_staradj = output_df.loc[_compound.between(0.75, 0.95), _band_columns]

n_five_staradj = output_df.loc[_compound.between(0.95, 1), _band_columns]

In [24]:
# Add a 'newstar' column to each band frame and merge them into newstar_df.
# The band frames are .loc selections of output_df, so assigning a column on
# them directly can trigger SettingWithCopyWarning; .assign() returns a fresh
# frame instead, and each name is rebound so it still carries 'newstar'.
n_one_staradj = n_one_staradj.assign(newstar=1)
n_two_staradj = n_two_staradj.assign(newstar=2)
n_three_staradj = n_three_staradj.assign(newstar=3)
n_four_staradj = n_four_staradj.assign(newstar=4)
n_five_staradj = n_five_staradj.assign(newstar=5)

# Frames are concatenated from the lowest to the highest star, and only the
# first occurrence of each review_id is kept, so a review selected into two
# bands ends up with the lower of the two star values.
newstar_df = pd.concat(
    [n_one_staradj, n_two_staradj, n_three_staradj, n_four_staradj, n_five_staradj],
    axis=0,
).set_index('review_id')
newstar_df = newstar_df[~newstar_df.index.duplicated(keep='first')]

In [26]:
# Write the re-scored reviews to data/ as both CSV and JSON.
# The directory is created first so that the export does not fail with
# OSError on a fresh checkout where data/ does not exist yet.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)
newstar_df.to_csv(output_dir / 'vader_newstar_df.csv')
newstar_df.to_json(output_dir / 'vader_newstar_df.json')