## Importing Libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, DoubleType

import pyspark.ml.feature as feats
# from pyspark.ml.feature import Tokenizer 
from pyspark.ml.feature import CountVectorizer

from pyspark.ml import Pipeline #Build a pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

import pandas as pd




## Creating Spark session

In [0]:
spark = SparkSession.builder.appName('Sentiment140RandomForest').getOrCreate()

## Setting the configuration for Azure Storage Account access

In [0]:
spark.conf.set(
  "fs.azure.account.key.twittergenstorage.blob.core.windows.net",
  "okEjVsoQ+OmK+TNB4/gpnkiDAVNofpG1IxYTOFx+j1JJGQHw9JIk2zakiqyoXm4fmtrAH66vXQB0+AStEZgvtg==")

## Importing the dataset from Azure Storage Account

In [0]:
train = spark.read.format("csv").option("header", "false").load("wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/train-2-3.csv")

In [0]:
display(train)

_c0,_c1,_c2
508566,0,"is ready for tomorrow's hangover. how i wonder how it would be?!? rain, rain, rain. I guess i won't be doing laundry after all"
1372625,4,Going out!
92958,0,I'm so bored. I wish he was here
1325629,4,"@pinktank1 yeah, i was changing it but he seemed excited so i suffered through Barney for him"
149409,0,NOOOOO!!!! I'm wearing sandals and just chipped the paint off my big toenail Lookin' silly now. . .
1378228,4,@willie_day26 how was the show?
207120,0,I can't have a sleep over 2nite
105003,0,@quotergal I'm very sad that you and @cabri and @NYPinTA won't be there this year.
479058,0,@aventuredebz not sure what happened there the line just went dead unless you hung up on me
942660,4,"@BerlyAnne Again, good daughter, this is true!!"


## Renaming the columns

In [0]:
train = (train.withColumnRenamed('_c0','row_num')
        .withColumnRenamed('_c1','target')
        .withColumnRenamed('_c2','text')
)

## Replacing the 4 with 1 denoting the Positive Sentiment, 0 denotes the Negative Sentiment

In [0]:
train = train.withColumn('target', regexp_replace('target', '4', '1'))

## Converting target to Integer and creating a final dataframe for training

In [0]:
train = train.withColumn("target", train["target"].cast(IntegerType()))
end_df = train.select(col('text'), col('target'))

end_df.show()

+--------------------+------+
|                text|target|
+--------------------+------+
|is ready for tomo...|     0|
|         Going out! |     1|
|I'm so bored. I w...|     0|
|@pinktank1 yeah, ...|     1|
|NOOOOO!!!! I'm we...|     0|
|@willie_day26 how...|     1|
|I can't have a sl...|     0|
|@quotergal I'm ve...|     0|
|@aventuredebz not...|     0|
|@BerlyAnne Again,...|     1|
|Oops, to x &quot;...|     0|
|is going to test ...|     1|
|eating nachos wit...|     1|
|@tartsea hehe, cÃ...|     1|
|Making caleb thro...|     1|
|I just want to gi...|     1|
|is going to sleep...|     1|
|@tomkelshaw I'm g...|     1|
|okay, Dear Catast...|     1|
|@parachutesfail I...|     0|
+--------------------+------+
only showing top 20 rows



## Installing and importing Spark NLP dependencies

In [0]:
!pip install spark-nlp==4.0.1
import sparknlp
from sparknlp.base import *

Collecting spark-nlp==4.0.1
  Using cached spark_nlp-4.0.1-py2.py3-none-any.whl (531 kB)
Installing collected packages: spark-nlp
Successfully installed spark-nlp-4.0.1
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-90e11d20-2c29-4ced-9c75-d5a11f850384/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from sparknlp.annotator import *

## Creating the transformers and estimator for the pipeline

In [0]:
# Converting string to Spark NLP document
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Converting Spark NLP document to Spark NLP token in order to perform pre-processing
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

# Removing punctuations
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")\
    .setLowercase(True)\
    .setCleanupPatterns(["[^\w\d\s]"]) # remove punctuations (keep alphanumeric chars)
    # if we don't set CleanupPatterns, it will only keep alphabet letters ([^A-Za-z])

# Removing stop words
stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)\

# Converting back to string
finisher = Finisher().setInputCols("cleanTokens").setOutputCols("output").setOutputAsArray(False).setAnnotationSplitSymbol(' ')

# Creating tokens using Spark ML features
tokenizer2 = feats.Tokenizer().setInputCol("output").setOutputCol("token_tweet")

# Vectorizing the text
vectorizer = CountVectorizer().setInputCol("token_tweet").setOutputCol("features")

# Using the RandomForestClassifier estimator
forest = RandomForestClassifier(labelCol = "target", featuresCol="features", numTrees = 3, maxDepth = 16)

## Creating the pipeline and fitting the model

In [0]:
pipeline = Pipeline(stages=[documentAssembler,tokenizer,normalizer,stopwords_cleaner,finisher,tokenizer2,vectorizer,forest])
pipelineFit = pipeline.fit(end_df)

## Saving the pipeline to the DBFS file system

In [0]:
pipelineFit.save("dbfs:/FileStore/models/RandomForest")

## Copying the model to Azure Storage Account

In [0]:
dbutils.fs.cp("FileStore/models/RandomForest", "wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/models/RandomForest", recurse=True)

Out[38]: True

## Importing the test dataset and performing same pre-processing as the train dataset

In [0]:
test = spark.read.format("csv").option("header", "false").load("wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/test-2-3.csv")

In [0]:
test = (test.withColumnRenamed('_c0','row_num')
        .withColumnRenamed('_c1','target')
        .withColumnRenamed('_c2','text')
)

In [0]:
test = test.withColumn('target', regexp_replace('target', '4', '1'))

In [0]:
test = test.withColumn("target", test["target"].cast(IntegerType()))

## Fitting the model pipeline on test dataset and evaluating the results

In [0]:
result = pipelineFit.transform(test)

row_num,target,text,output,token_tweet,features,rawPrediction,probability,prediction
1235522,1,"@ColorblindFish Can't wait to see the pics, Thanks",colorblindfish cant wait see pics thanks,"List(colorblindfish, cant, wait, see, pics, thanks)","Map(vectorType -> sparse, length -> 81326, indices -> List(11, 20, 25, 64, 406, 3746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.1197072617025055, 1.8802927382974945))","Map(vectorType -> dense, length -> 2, values -> List(0.37323575390083513, 0.6267642460991648))",1.0
966533,1,The Festï¿½s on,fests,List(fests),"Map(vectorType -> sparse, length -> 81326, indices -> List(58701), values -> List(1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5050710858940104, 1.4949289141059896))","Map(vectorType -> dense, length -> 2, values -> List(0.5016903619646701, 0.49830963803532985))",0.0
361764,0,I missed an live chat with Selena Gomez,missed live chat selena gomez,"List(missed, live, chat, selena, gomez)","Map(vectorType -> sparse, length -> 81326, indices -> List(154, 162, 694, 2906, 6762), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5050710858940104, 1.4949289141059896))","Map(vectorType -> dense, length -> 2, values -> List(0.5016903619646701, 0.49830963803532985))",0.0
1443757,1,Goodnight! Thanks Robbie for all of your help today! You are AMAZING!,goodnight thanks robbie help today amazing,"List(goodnight, thanks, robbie, help, today, amazing)","Map(vectorType -> sparse, length -> 81326, indices -> List(8, 25, 148, 179, 353, 18203), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.1197072617025055, 1.8802927382974945))","Map(vectorType -> dense, length -> 2, values -> List(0.37323575390083513, 0.6267642460991648))",1.0
149240,0,It's a hot day and I'm on a crowded bus...this isn't pleasant!,hot day im crowded busthis isnt pleasant,"List(hot, day, im, crowded, busthis, isnt, pleasant)","Map(vectorType -> sparse, length -> 81326, indices -> List(0, 3, 135, 187, 3216, 6543), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5050710858940104, 1.4949289141059896))","Map(vectorType -> dense, length -> 2, values -> List(0.5016903619646701, 0.49830963803532985))",0.0
911918,1,@wolfchild59 Heh... I send dem when I get dem Jen... I was shocked I found so many these last few days... earlier in the week nothing...,wolfchild59 heh send dem get dem jen shocked found many last days earlier week nothing,"List(wolfchild59, heh, send, dem, get, dem, jen, shocked, found, many, last, days, earlier, week, nothing)","Map(vectorType -> sparse, length -> 81326, indices -> List(2, 35, 68, 72, 168, 200, 228, 333, 662, 2018, 3602, 3874, 7711), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5050710858940104, 1.4949289141059896))","Map(vectorType -> dense, length -> 2, values -> List(0.5016903619646701, 0.49830963803532985))",0.0
185841,0,lakers baby !!! sad that I can't watch birdman anymore,lakers baby sad cant watch birdman anymore,"List(lakers, baby, sad, cant, watch, birdman, anymore)","Map(vectorType -> sparse, length -> 81326, indices -> List(11, 46, 98, 208, 294, 943), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.7274970370902205, 1.2725029629097795))","Map(vectorType -> dense, length -> 2, values -> List(0.5758323456967401, 0.4241676543032598))",0.0
1149736,1,@TheUndomestic I think if you put a couch on a dance floor and pump Daft Punk you better expect it wonder if the bouncer is on twitter...,theundomestic think put couch dance floor pump daft punk better expect wonder bouncer twitter,"List(theundomestic, think, put, couch, dance, floor, pump, daft, punk, better, expect, wonder, bouncer, twitter)","Map(vectorType -> sparse, length -> 81326, indices -> List(24, 40, 63, 215, 578, 690, 1256, 1739, 1920, 4839, 5220, 8136), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5050710858940104, 1.4949289141059896))","Map(vectorType -> dense, length -> 2, values -> List(0.5016903619646701, 0.49830963803532985))",0.0
348673,0,Does anybody know somebody who can wipe away all my parking tickets?,anybody know somebody wipe away parking tickets,"List(anybody, know, somebody, wipe, away, parking, tickets)","Map(vectorType -> sparse, length -> 81326, indices -> List(17, 133, 560, 1226, 1423, 2010, 5546), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5050710858940104, 1.4949289141059896))","Map(vectorType -> dense, length -> 2, values -> List(0.5016903619646701, 0.49830963803532985))",0.0
665171,0,I so bombed.,bombed,List(bombed),"Map(vectorType -> sparse, length -> 81326, indices -> List(9523), values -> List(1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5050710858940104, 1.4949289141059896))","Map(vectorType -> dense, length -> 2, values -> List(0.5016903619646701, 0.49830963803532985))",0.0


In [0]:
res = result.select(["target", "prediction"])

In [0]:
result1 = result.withColumn("target", result["target"].cast(DoubleType()))

In [0]:
metrics = MulticlassMetrics(result1.select("prediction","target").rdd)



In [0]:
modelMetrics = pd.DataFrame(columns = ["Metric", "Value"])
modelMetrics.loc[len(modelMetrics.index)] = [ "Model Accuracy", str(round(metrics.accuracy*100,2)) ]
labels = result.rdd.map(lambda l: l.target).distinct().collect()
 
for label in labels:
    modelMetrics.loc[len(modelMetrics.index)] = [str(label) + " Precision and Recall",str(round(metrics.precision(label),2))+" and "+ str(round(metrics.recall(label),2))]
        
modelMetrics.loc[len(modelMetrics.index)] = [ "Weighted Recall", str(round(metrics.weightedRecall,2)) ]
modelMetrics.loc[len(modelMetrics.index)] = [ "Weighted Precision", str(round(metrics.weightedPrecision,2)) ]
modelMetrics.loc[len(modelMetrics.index)] = [ "Weighted False Positive Rate", str(round(metrics.weightedFalsePositiveRate,2)) ]
modelMetrics.loc[len(modelMetrics.index)] = [ "Weighted F 1 Score", str(round(metrics.weightedFMeasure(),2)) ]
modelMetrics.loc[len(modelMetrics.index)] = [ "Weighted F 0.5 Score", str(round(metrics.weightedFMeasure(beta=0.5),2)) ]

In [0]:
display(modelMetrics)

Metric,Value
Model Accuracy,56.33
1 Precision and Recall,0.73 and 0.2
0 Precision and Recall,0.54 and 0.93
Weighted Recall,0.56
Weighted Precision,0.63
Weighted False Positive Rate,0.44
Weighted F 1 Score,0.5
Weighted F 0.5 Score,0.53
