## Create Spark Session and read data into spark dataframe

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
  
# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Every review field is declared as a nullable string; the columns that are
# actually used get cast to their working types in a later cell.
review_fields = [
    "reviewerID",
    "asin",
    "reviewerName",
    "helpful",
    "reviewText",
    "overall",
    "summary",
    "unixReviewTime",
    "reviewTime",
]
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in review_fields]
)

# Read the reviews with the explicit schema and preview the first five rows,
# truncating long cells to 70 characters.
reviews_reader = spark.read.schema(schema)
df = reviews_reader.json('data/Software.json')
df.show(5,truncate=70)

AnalysisException: Path does not exist: file:/Users/deepali/Desktop/DATA -228-22/PROJECT/CODE/data/Software.json

In [None]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import rand

# Keep only the review body and its star rating; the rating is cast from
# string to int so it can be filtered and bucketed numerically.
kept_column_exprs = [
    "cast(reviewText as string) reviewText",
    "cast(overall as int) overall",
]
df = df.selectExpr(*kept_column_exprs)

## Cleaning data

In [None]:
# Remove every row that has a null in at least one column, then display how
# many rows are left.
df = df.dropna(how="any")
df.count()

## Create a table from data to work with sql

In [None]:
from pyspark.sql import SparkSession

# A session was already obtained in the first cell; getOrCreate() is called
# again here with an application name.
session_builder = SparkSession.builder.appName('sparkdf')
spark = session_builder.getOrCreate()

# Expose the DataFrame to Spark SQL under the view name "df".
df.createOrReplaceTempView("df")

# Number of reviews for each star rating.
reviews_per_rating = spark.sql("select count((overall)),overall from df group by overall")
reviews_per_rating.show()

In [None]:
# Keep ratings strictly between 0 and 6, and drop the neutral rating 3 so
# that only the ratings 1, 2, 4 and 5 remain.
df = (
    df
    .filter("overall<6 and overall!=3")
    .filter("overall>0")
)
df.count()

In [None]:
from pyspark.ml.feature import Bucketizer

# Turn the star rating into a two-class label. With splits 1, 4, 5 the
# buckets are [1, 4) -> 0.0 and [4, 5] -> 1.0; handleInvalid="keep" puts
# invalid values in an extra bucket instead of raising an error.
bucketizer = Bucketizer(
    splits=[ 1, 4, 5 ],
    inputCol="overall",
    outputCol="label",
    handleInvalid="keep",
)
df = bucketizer.transform(df)

df.show()

In [None]:
from pyspark.sql import SparkSession

# Re-register the view so that SQL queries see the current DataFrame
# (filtered, with the "label" column added).
df.createOrReplaceTempView("df")

# Number of reviews for each remaining star rating.
remaining_rating_counts = spark.sql("select count((overall)),overall from df group by overall")
remaining_rating_counts.show()

In [None]:
# Preview the labelled DataFrame before it is narrowed to the model columns.
df.show()

In [None]:
# Only the review text and its binary label are needed from here on.
df = df.select("reviewText", "label")

In [None]:
# Shuffle the rows by sorting on a random column. The seed is fixed so the
# random column is generated from the same seed on every run; an unseeded
# rand() picks a new seed each time the cell is executed.
df = df.orderBy(rand(seed=42))

In [None]:
# Check the class balance: refresh the SQL view, then count the reviews that
# fall under each label.
df.createOrReplaceTempView("df")

reviews_per_label = spark.sql("select count((label)),label from df group by label")
reviews_per_label.show()

In [None]:
#df.show()

In [None]:
import pyspark.sql.functions as sq
from pyspark.sql.functions import lower, col

# Step 1: lower-case the review text into a new column.
lowered_text = lower(col('reviewText')).alias("lower_text")
df = df.select("*", lowered_text)

# Steps 2-4: each step reads the previous column and writes a new one,
# replacing every match of its pattern with a single space:
#   newlines -> digits -> runs of punctuation other than the hyphen.
cleanup_steps = [
    ("no_line_text", "lower_text", r"\n"),
    ("no_digit_text", "no_line_text", r"[0-9]"),
    ("text_ready", "no_digit_text", r"[^\P{P}-]+"),
]
for target_col, source_col, regex in cleanup_steps:
    df = df.withColumn(target_col, sq.regexp_replace(source_col, regex, " "))


In [None]:
# Remove rows that are identical across all columns.
df = df.distinct()

In [None]:
import pyspark.ml.feature as ft
from pyspark.ml.classification import LogisticRegression

# Split the cleaned text into tokens on non-word characters.
regexTokenizer = ft.RegexTokenizer(
    inputCol="text_ready",
    outputCol="words",
    pattern="\\W",
)

# Drop stop words from the token list.
stopwordsRemover = ft.StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# Trigram generator over the filtered tokens.
ngram = ft.NGram(
    n=3,
    inputCol="filtered",
    outputCol="nGrams",
)

# Two alternative featurisers. Both write to "features", so each one is
# meant to be the last stage of its own pipeline.
# Bag-of-words counts, limited to 10000 terms that occur in at least 5 documents.
countVectors = ft.CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

# 10-dimensional Word2Vec embedding with a fixed seed.
word2Vec = ft.Word2Vec(
    vectorSize=10,
    seed=42,
    inputCol="filtered",
    outputCol="features",
)


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# The two pipelines share the tokeniser and stop-word stages and differ only
# in the final featuriser ("_w" = Word2Vec, no suffix = CountVectorizer).
text_stages = [regexTokenizer, stopwordsRemover]
pipeline_w = Pipeline(stages=text_stages + [word2Vec])
pipeline = Pipeline(stages=text_stages + [countVectors])

# NOTE(review): both pipelines are fitted on the full DataFrame, i.e. before
# the train/test split, so the vocabulary and embeddings also see the rows
# that later become test data. Consider splitting first and fitting on the
# training part only.
pipelineFit_w = pipeline_w.fit(df)
pipelineFit = pipeline.fit(df)

# Apply the fitted pipelines to produce the two feature sets.
dataset_w = pipelineFit_w.transform(df)
dataset = pipelineFit.transform(df)
dataset.show(5, truncate=50)

In [None]:
# Keep only the columns the models and the result previews need.
model_columns = ["text_ready", "features", "label"]
dataset_w = dataset_w.select(*model_columns)
dataset = dataset.select(*model_columns)

# 70/30 train/test split of each feature set, with a fixed seed for
# reproducibility.
split_weights = [0.7, 0.3]
trainingData_w, testData_w = dataset_w.randomSplit(split_weights, seed=100)
trainingData, testData = dataset.randomSplit(split_weights, seed=100)

train_rows = trainingData.count()
print("Training Dataset Count: " + str(train_rows))
test_rows = testData.count()
print("Test Dataset Count: " + str(test_rows))

## LogisticRegression model

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the Word2Vec features; elasticNetParam=0 selects a
# pure L2 penalty.
lr = LogisticRegression(
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
lrModel = lr.fit(trainingData_w)
lrw_predictions = lrModel.transform(testData_w)

# Preview ten test reviews predicted as class 0, ordered by the probability
# column in descending order.
lrw_class0 = lrw_predictions.filter(lrw_predictions['prediction'] == 0)
lrw_class0_preview = (
    lrw_class0
    .select("text_ready","probability","label","prediction")
    .orderBy("probability", ascending=False)
)
lrw_class0_preview.show(n = 10, truncate = 30)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName is set explicitly: the evaluator's default metric is F1, so
# without it the value printed as "Accuracy" below would be an F1 score.
lrw_evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    metricName="accuracy",
)
lr_ev = lrw_evaluator.evaluate(lrw_predictions)
print("Logistic Regression Accuracy: \n" + str(lr_ev))

## Logistic Regression with word2vec precision and recall

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import numpy as np

# Collect label and prediction together in a single Spark action. Two
# separate collect() calls execute the plan twice and do not formally
# guarantee that the rows come back in the same order, which would pair
# labels with the wrong predictions.
lrw_rows = lrw_predictions.select("label", "prediction").collect()
y_true = np.array([row["label"] for row in lrw_rows])
y_pred = np.array([row["prediction"] for row in lrw_rows])


print("Logistic Regression model with word2vec Recall score: {}".format(recall_score(y_true,y_pred)))
print("Logistic Regression model with word2ve Precision score: {}".format(precision_score(y_true,y_pred)))


## Naive Bayes model with countVec

In [None]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes on the CountVectorizer features.
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
nb_predictions = model.transform(testData)

# Preview ten test reviews predicted as class 0, ordered by the probability
# column in descending order.
nb_class0 = nb_predictions.filter(nb_predictions['prediction'] == 0)
nb_class0_preview = (
    nb_class0
    .select("text_ready","probability","label","prediction")
    .orderBy("probability", ascending=False)
)
nb_class0_preview.show(n = 10, truncate = 30)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName is set explicitly: the evaluator's default metric is F1, so
# without it the value printed as "Accuracy" below would be an F1 score.
nb_evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    metricName="accuracy",
)
nb_ev = nb_evaluator.evaluate(nb_predictions)
print("Naive Bayes model Accuracy: \n" + str(nb_ev))

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import numpy as np

# Collect label and prediction together in a single Spark action. Two
# separate collect() calls execute the plan twice and do not formally
# guarantee that the rows come back in the same order, which would pair
# labels with the wrong predictions.
nb_rows = nb_predictions.select("label", "prediction").collect()
y_true = np.array([row["label"] for row in nb_rows])
y_pred = np.array([row["prediction"] for row in nb_rows])


print("Naive Bayes model with countVec Recall score: {}".format(recall_score(y_true,y_pred)))
print("Naive Bayes model with countVec Precision score: {}".format(precision_score(y_true,y_pred)))

## Random Forest Classifier with countVec

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the CountVectorizer features: 100 shallow trees.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

# Fit on the training split and score the test split.
rfModel = rf.fit(trainingData)
rf_predictions = rfModel.transform(testData)

# Preview ten test reviews predicted as class 0, ordered by the probability
# column in descending order.
rf_class0 = rf_predictions.filter(rf_predictions['prediction'] == 0)
rf_class0_preview = (
    rf_class0
    .select("text_ready","probability","label","prediction")
    .orderBy("probability", ascending=False)
)
rf_class0_preview.show(n = 10, truncate = 30)

In [None]:
# metricName is set explicitly: the evaluator's default metric is F1, so
# without it the value printed as "Accuracy" below would be an F1 score.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    metricName="accuracy",
)
rf_ev = evaluator.evaluate(rf_predictions)
print("Random Forest model Accuracy: \n" + str(rf_ev))

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import numpy as np

# Collect label and prediction together in a single Spark action. Two
# separate collect() calls execute the plan twice and do not formally
# guarantee that the rows come back in the same order, which would pair
# labels with the wrong predictions.
rf_rows = rf_predictions.select("label", "prediction").collect()
y_true = np.array([row["label"] for row in rf_rows])
y_pred = np.array([row["prediction"] for row in rf_rows])


print("Random Forest model with countVec Recall score: {}".format(recall_score(y_true,y_pred)))
print("Random Forest model with countVec Precision score: {}".format(precision_score(y_true,y_pred)))

## Random Forest Classifier with word2vec

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the Word2Vec features, using the same hyper-parameters as
# the CountVectorizer run.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

# Fit on the Word2Vec training split and score the matching test split.
rfModel = rf.fit(trainingData_w)
rf_predictions_w = rfModel.transform(testData_w)

# Preview ten test reviews predicted as class 0, ordered by the probability
# column in descending order.
rf_w_class0 = rf_predictions_w.filter(rf_predictions_w['prediction'] == 0)
rf_w_class0_preview = (
    rf_w_class0
    .select("text_ready","probability","label","prediction")
    .orderBy("probability", ascending=False)
)
rf_w_class0_preview.show(n = 10, truncate = 30)

In [None]:
# metricName is set explicitly: the evaluator's default metric is F1, so
# without it the value printed as "Accuracy" below would be an F1 score.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    metricName="accuracy",
)
rf_ev_w = evaluator.evaluate(rf_predictions_w)
print("Random Forest with word2vec model Accuracy: \n" + str(rf_ev_w))

In [None]:
# Collect label and prediction together in a single Spark action. Two
# separate collect() calls execute the plan twice and do not formally
# guarantee that the rows come back in the same order, which would pair
# labels with the wrong predictions.
rf_w_rows = rf_predictions_w.select("label", "prediction").collect()
y_true = np.array([row["label"] for row in rf_w_rows])
y_pred = np.array([row["prediction"] for row in rf_w_rows])


print("Random Forest model with w2v Recall score: {}".format(recall_score(y_true,y_pred)))
print("Random Forest model with w2v Precision score: {}".format(precision_score(y_true,y_pred)))