## NLP - Yelp Reviews Implemented using NaiveBayes

---

In [1]:
#Import Sparksession
from pyspark.sql import SparkSession

In [2]:
#Create a SparkSession
spark = SparkSession.builder.appName('YelpReview').getOrCreate()

In [3]:
#Load the Yelp Review TSV file
dataframe = spark.read.format('csv').\
            option('header', 'true').\
            option('delimiter', '\t').\
            load('data/yelp_reviews.tsv')

dataframe.show()

+--------+--------------------+
|   class|                text|
+--------+--------------------+
|positive|Wow... Loved this...|
|negative|  Crust is not good.|
|negative|Not tasty and the...|
|positive|Stopped by during...|
|positive|The selection on ...|
|negative|Now I am getting ...|
|negative|Honeslty it didn'...|
|negative|The potatoes were...|
|positive|The fries were gr...|
|positive|      A great touch.|
|positive|Service was very ...|
|negative|  Would not go back.|
|negative|The cashier had n...|
|positive|I tried the Cape ...|
|negative|I was disgusted b...|
|negative|I was shocked bec...|
|positive| Highly recommended.|
|negative|Waitress was a li...|
|negative|This place is not...|
|negative|did not like at all.|
+--------+--------------------+
only showing top 20 rows



In [4]:
#Create a length column to be used as a future feature
from pyspark.sql.functions import length
data = dataframe.withColumn('length', length(dataframe['text']))
data.show()

+--------+--------------------+------+
|   class|                text|length|
+--------+--------------------+------+
|positive|Wow... Loved this...|    24|
|negative|  Crust is not good.|    18|
|negative|Not tasty and the...|    41|
|positive|Stopped by during...|    87|
|positive|The selection on ...|    59|
|negative|Now I am getting ...|    46|
|negative|Honeslty it didn'...|    37|
|negative|The potatoes were...|   111|
|positive|The fries were gr...|    25|
|positive|      A great touch.|    14|
|positive|Service was very ...|    24|
|negative|  Would not go back.|    18|
|negative|The cashier had n...|    99|
|positive|I tried the Cape ...|    59|
|negative|I was disgusted b...|    62|
|negative|I was shocked bec...|    50|
|positive| Highly recommended.|    19|
|negative|Waitress was a li...|    38|
|negative|This place is not...|    51|
|negative|did not like at all.|    20|
+--------+--------------------+------+
only showing top 20 rows



---

## Feature Trasnformations

In [5]:
#Import Pyspark Libraries needed for feature transformations
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [6]:
#Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='class', outputCol='label')
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
hashingTF = HashingTF(inputCol='token_text', outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [7]:
#Import Pyspark Libraries needed for Vectorization

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [8]:
#Create Feature Vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [9]:
#Create and run a data processing pipeline
from pyspark.ml import Pipeline

data_prep_pipeline = Pipeline(stages=[pos_neg_to_num,tokenizer,stopremove,hashingTF,idf,clean_up])

In [10]:
#Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)

In [11]:
#Show label of Yelp Reviews and resulting features

cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262145,[33933,69...|
|  1.0|(262145,[15889,13...|
|  1.0|(262145,[25570,63...|
|  0.0|(262145,[6286,272...|
|  0.0|(262145,[6979,255...|
|  1.0|(262145,[24417,24...|
|  1.0|(262145,[12084,48...|
|  1.0|(262145,[3645,963...|
|  0.0|(262145,[53777,10...|
|  0.0|(262145,[138356,2...|
|  0.0|(262145,[24113,25...|
|  1.0|(262145,[68867,13...|
|  1.0|(262145,[24417,36...|
|  0.0|(262145,[18098,24...|
|  1.0|(262145,[24417,25...|
|  1.0|(262145,[24417,25...|
|  0.0|(262145,[31704,21...|
|  1.0|(262145,[25570,27...|
|  1.0|(262145,[12329,15...|
|  1.0|(262145,[8287,139...|
+-----+--------------------+
only showing top 20 rows



In [16]:
#Break data down into a training set and a testing set
(training, testing) = cleaned.randomSplit([0.7,0.3])

In [17]:
#Import ML NaiveBayes Algo
from pyspark.ml.classification import NaiveBayes

#Create a Naive Bayes model and fit training data
nb = NaiveBayes(smoothing=1.0, modelType='multinomial')
review_predictor = nb.fit(training)

In [19]:
#Transform the model with the testing data
test_results = review_predictor.transform(testing)
test_results.show(5)

+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   class|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|negative|"It was extremely...|    51|  1.0|["it, was, extrem...|["it, extremely, ...|(262144,[7388,255...|(262144,[7388,255...|(262145,[7388,255...|[-491.65086547036...|[3.64210374914195...|       1.0|
|negative|"The servers went...|    97|  1.0|["the, servers, w...|["the, servers, w...|(262144,[50940,67...|(262144,[50940,67...|(262145,[50940,67...|[-1085.6720073040...|[4.22302768101216.

In [20]:
#Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f'Accuracy of model at predicting reviews was: {acc}')

Accuracy of model at predicting reviews was: 0.7578631895185322


In [21]:
spark.stop()