# Combining the data

In [None]:
import pandas as pd
import numpy as np

In [139]:
data = pd.read_csv("amazon/Reviews.csv")

In [141]:
amzn = data[["Score", "Text"]]
amzn.columns = ["stars", "text"]
amzn["source"] = "Amazon"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
dataY = pd.read_csv("yelp.csv")

In [6]:
yelp = dataY[["stars", "text"]]
yelp["source"] = "Yelp"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
df = amzn.append(yelp).reset_index(drop = True)

This is the dataframe that will be used which has both Yelp and Amazon sourced data.

In [9]:
display(df)

Unnamed: 0,stars,text,source
0,5,I have bought several of the Vitality canned d...,Amazon
1,1,Product arrived labeled as Jumbo Salted Peanut...,Amazon
2,4,This is a confection that has been around a fe...,Amazon
3,2,If you are looking for the secret ingredient i...,Amazon
4,5,Great taffy at a great price. There was a wid...,Amazon
...,...,...,...
578449,3,First visit...Had lunch here today - used my G...,Yelp
578450,4,Should be called house of deliciousness!\n\nI ...,Yelp
578451,4,I recently visited Olive and Ivy for business ...,Yelp
578452,2,My nephew just moved to Scottsdale recently so...,Yelp


# Getting the VADER compound sentiment score

In [10]:
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [11]:
tokens = df['text'].apply(lambda x: wordpunct_tokenize(x))
df['tokens'] = tokens

In [14]:
sw = stopwords.words('english')

In [16]:
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in sw])

In [18]:
sid = SentimentIntensityAnalyzer()

In [20]:
df['scores'] = df['tokens'].apply(lambda x: sid.polarity_scores(" ".join(x))['compound'])

Now that I have calculated the sentiment score, this what the dataframe looks like.

In [None]:
display(df)

# Models to predict star rating

In [23]:
from sklearn.model_selection import train_test_split

In [59]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [94]:
train, test = train_test_split(df, test_size=.3, stratify=df.stars, random_state=315)

## Multinomial Naive Bayes

In [123]:
nb = MultinomialNB()

nb.fit(np.array(train['scores'] + 1).reshape(-1, 1), train['stars'])

yhat = nb.predict(np.array(test['scores'] + 1).reshape(-1, 1))

#match = np.where(yhat == test['stars'], 1, 0)
#np.mean(match) # ACCURACY

print("Multinomial Naive Bayes")
print("F1 Score:\t", f1_score(test['stars'], yhat, average = 'weighted'))
print("Accuracy Score:\t", accuracy_score(test['stars'], yhat))
#print("Precision Score:", precision_score(test['stars'], yhat, average = 'weighted'))
print("Recall Score:\t", recall_score(test['stars'], yhat, average = 'weighted'))

Multinomial Naive Bayes
F1 Score:	 0.4913815097099383
Accuracy Score:	 0.6335133141635502
Recall Score:	 0.6335133141635502


## Random forest classifier

In [119]:
rfc = RandomForestClassifier()

rfc.fit(np.array(train['scores']).reshape(-1, 1), train['stars'])

yhat = rfc.predict(np.array(test['scores']).reshape(-1, 1))

#match = np.where(yhat == test['stars'], 1, 0)
#np.mean(match) # ACCURACY

print("Random Forest Classifier")
print("F1 Score:\t", f1_score(test['stars'], yhat, average = 'weighted'))
print("Accuracy Score:\t", accuracy_score(test['stars'], yhat))
print("Precision Score:", precision_score(test['stars'], yhat, average = 'weighted'))
print("Recall Score:\t", recall_score(test['stars'], yhat, average = 'weighted'))

Random Forest Classifier
F1 Score:	 0.5573812923671442
Accuracy Score:	 0.6544483309034961
Precision Score: 0.5715898267542256
Recall Score:	 0.6544483309034961


# Models to detect a positive star rating

In this section I use a different target variable - 'positive' to train models to differentiate between negative reviews (1&2 stars) and positive reviews (4&5 stars).

In [131]:
df_no_3 = df.loc[df['stars'] != 3,:].reset_index()

In [132]:
df_no_3['positive'] = df_no_3['stars'].apply(lambda x: 1 if x > 3 else 0)

In [135]:
train_2, test_2 = train_test_split(df_no_3, test_size=.3, stratify=df_no_3['positive'], random_state=754)

## Multinomial Naive Bayes

In [148]:
nb = MultinomialNB()

nb.fit(np.array(train_2['scores'] + 1).reshape(-1, 1), train_2['positive'])

yhat = nb.predict(np.array(test_2['positive'] + 1).reshape(-1, 1))

#match = np.where(yhat == test['stars'], 1, 0)
#np.mean(match) # ACCURACY

print("Multinomial Naive Bayes")
print("F1 Score:\t", f1_score(test_2['positive'], yhat, average = 'weighted'))
print("Accuracy Score:\t", accuracy_score(test_2['positive'], yhat))
#print("Precision Score:", precision_score(test_2['positive'], yhat, average = 'weighted'))
print("Recall Score:\t", recall_score(test_2['positive'], yhat, average = 'weighted'))

Multinomial Naive Bayes
F1 Score:	 0.7716629664127549
Accuracy Score:	 0.8433371177622796
Recall Score:	 0.8433371177622796


## Random forest classifier

In [149]:
rfc = RandomForestClassifier()

rfc.fit(np.array(train_2['scores'] + 1).reshape(-1, 1), train_2['positive'])

yhat = rfc.predict(np.array(test_2['positive'] + 1).reshape(-1, 1))

#match = np.where(yhat == test['stars'], 1, 0)
#np.mean(match) # ACCURACY


In [147]:
print("Random Forest Classifier")
print("F1 Score:\t", f1_score(test_2['positive'], yhat, average = 'weighted'))
print("Accuracy Score:\t", accuracy_score(test_2['positive'], yhat))
print("Precision Score:", precision_score(test_2['positive'], yhat, average = 'weighted'))
print("Recall Score:\t", recall_score(test_2['positive'], yhat, average = 'weighted'))

Random Forest Classifier
F1 Score:	 0.7716629664127549
Accuracy Score:	 0.8433371177622796
Precision Score: 0.711217494195589
Recall Score:	 0.8433371177622796
