## Importing Libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, DoubleType

import pyspark.ml.feature as feats
# from pyspark.ml.feature import Tokenizer 
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline #Build a pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics

import pandas as pd

## Creating Spark session

In [0]:
# Obtain the Spark session for this notebook under a descriptive application name;
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName('Sentiment140LogisticRegression')
    .getOrCreate()
)

## Setting the configuration for Azure Storage Account access

In [0]:
# Register the Azure Storage account key so Spark can read/write wasbs:// paths
# on the "twittergenstorage" account.
#
# The key is a credential and must not be embedded in the notebook source, so it
# is fetched from a Databricks secret scope at run time.
# NOTE(review): the scope and key names below are placeholders - create them (or
# rename them to match an existing scope) before running, and rotate the account
# key that was previously committed in this cell.
spark.conf.set(
  "fs.azure.account.key.twittergenstorage.blob.core.windows.net",
  dbutils.secrets.get(scope="twitter-sentiment", key="twittergenstorage-account-key"))

## Importing the dataset from Azure Storage Account

In [0]:
# Read the headerless training CSV from the Azure Blob container.
# With header=false Spark names the columns _c0, _c1, _c2 (renamed further down).
train = (
    spark.read
    .format("csv")
    .option("header", "false")
    .load("wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/train-2-3.csv")
)

In [0]:
# Render the raw training frame to inspect its three unnamed columns (_c0, _c1, _c2).
display(train)

_c0,_c1,_c2
762637,0,Probelm with nap: i'm not good at waking up. I JUST got up
1177984,4,btw I made muffins today
1560678,4,rain comes again
797330,0,@bsemaj calll meeeeee
1421115,4,with dafiii the best girl everr! MILEY COME TO ARGENTINAAA PLEASE!
678545,0,my friend just left i had fun tho!
45336,0,"says, ""ah babon goreng Watchmen tiketnya abis."""
1239887,4,@MTVBuzzworthy awesome! i was actually at the show... and it was so fun! i think a lot of it had to do with Twilight madness
365234,0,@usweekly Not gonna lie...pretty tired of seeing Jon & Kate on the cover. It's making me not want to renew my subscription
1287937,4,@secondhandjohn thank u for following me can't wait for the new album <3


## Renaming the columns

In [0]:
# Give the positional CSV columns meaningful names, one rename per statement:
#   _c0 -> row_num, _c1 -> target (sentiment label), _c2 -> text (tweet body)
train = train.withColumnRenamed('_c0', 'row_num')
train = train.withColumnRenamed('_c1', 'target')
train = train.withColumnRenamed('_c2', 'text')

## Replacing the target value 4 with 1 (1 denotes positive sentiment, 0 denotes negative sentiment)

In [0]:
# Recode the positive label "4" as "1" so the labels become 0 (negative) / 1 (positive).
# The pattern is anchored (^4$) so only a value that is exactly "4" is rewritten;
# an unanchored '4' would also rewrite every "4" inside any longer value.
train = train.withColumn('target', regexp_replace('target', '^4$', '1'))

## Converting target to Integer and creating a final dataframe for training

In [0]:
# The label is still a string at this point; turn it into an integer column.
train = train.withColumn("target", col("target").cast(IntegerType()))

# Keep only the two columns the model needs: the tweet text and its label.
end_df = train.select('text', 'target')

# Preview the first rows of the training frame.
end_df.show()

+--------------------+------+
|                text|target|
+--------------------+------+
|Probelm with nap:...|     0|
|btw I made muffin...|     1|
|   rain comes again |     1|
|@bsemaj calll mee...|     0|
|with dafiii the b...|     1|
|my friend just le...|     0|
|says, &quot;ah ba...|     0|
|@MTVBuzzworthy aw...|     1|
|@usweekly Not gon...|     0|
|@secondhandjohn t...|     1|
|@bashley the musi...|     1|
|U love Cookie and...|     1|
|In seaside @ the ...|     1|
|@DonnieWahlberg c...|     1|
|Two words... Yay ...|     1|
|@evanmiles ur soo...|     0|
|@Tomtjok and some...|     1|
|that part I could...|     0|
|don't leave. you'...|     0|
|I cannot open my ...|     0|
+--------------------+------+
only showing top 20 rows



## Installing and importing Spark NLP dependencies

In [0]:
!pip install spark-nlp==4.0.1
import sparknlp
from sparknlp.base import *

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-c026d9fc-e720-46ef-a8fe-37e72d1926cd/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from sparknlp.annotator import *

## Creating the transformers and estimator for the pipeline

In [0]:
# --- Spark NLP pre-processing stages -----------------------------------------
# NOTE: `Tokenizer`, `Normalizer`, etc. below are the Spark NLP annotators that
# are star-imported from sparknlp; the Spark ML tokenizer is reached explicitly
# through the `feats` alias further down.

# Convert the raw "text" string column into a Spark NLP document annotation
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into Spark NLP tokens so they can be cleaned one by one
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

# Lowercase the tokens and strip punctuation.
# The cleanup pattern removes every character that is not a word character,
# digit or whitespace. It is written as a raw string so that \w, \d and \s reach
# the regex engine verbatim instead of being (invalid) Python escape sequences.
# Per the original author's note: if CleanupPatterns is not set, only alphabet
# letters are kept ([^A-Za-z]).
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns([r"[^\w\d\s]"])
)

# Drop stop words (case-insensitive match)
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Convert the cleaned annotations back into a single space-separated string
finisher = (
    Finisher()
    .setInputCols("cleanTokens")
    .setOutputCols("output")
    .setOutputAsArray(False)
    .setAnnotationSplitSymbol(' ')
)

# --- Spark ML feature extraction and model -----------------------------------

# Re-tokenize the cleaned string with the Spark ML tokenizer
tokenizer2 = feats.Tokenizer().setInputCol("output").setOutputCol("token_tweet")

# Turn the token lists into term-count feature vectors
vectorizer = CountVectorizer().setInputCol("token_tweet").setOutputCol("features")

# Logistic regression estimator trained on the count vectors against "target"
lr = LogisticRegression(
    maxIter=100,
    labelCol="target",
    featuresCol="features",
    predictionCol="prediction",
)

## Creating the pipeline and fitting the model

In [0]:
# Chain the stages in execution order: Spark NLP cleaning first, then Spark ML
# tokenization, vectorization and finally the classifier.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        finisher,
        tokenizer2,
        vectorizer,
        lr,
    ]
)

# Fit every estimator in the pipeline on the training frame.
pipelineFit = pipeline.fit(end_df)

## Saving the pipeline to the DBFS file system

In [0]:
# Persist the fitted pipeline to DBFS.
# write().overwrite() replaces a model left behind by an earlier run; a plain
# save() raises when the target path already exists, which would make this
# notebook fail on any re-run.
pipelineFit.write().overwrite().save("dbfs:/FileStore/models/logistic-regression")

## Copying the model to Azure Storage Account

In [0]:
# Copy the saved pipeline directory from DBFS to the Azure Blob container.
# The source uses the explicit dbfs:/ scheme so it is exactly the path the model
# was saved to, rather than relying on how a scheme-less relative path resolves.
# recurse=True is required because a saved pipeline is a directory tree.
dbutils.fs.cp(
    "dbfs:/FileStore/models/logistic-regression",
    "wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/models/logistic-regression",
    recurse=True,
)

Out[71]: True

## Importing the test dataset and performing the same pre-processing as on the train dataset

In [0]:
# Read the headerless test CSV from the same Azure Blob container as the training data.
test = (
    spark.read
    .format("csv")
    .option("header", "false")
    .load("wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/test-2-3.csv")
)

In [0]:
# Apply the same column names as the training frame, one rename per statement:
#   _c0 -> row_num, _c1 -> target, _c2 -> text
test = test.withColumnRenamed('_c0', 'row_num')
test = test.withColumnRenamed('_c1', 'target')
test = test.withColumnRenamed('_c2', 'text')

In [0]:
# Recode the positive label "4" as "1", as done for the training data.
# The pattern is anchored (^4$) so only a value that is exactly "4" is rewritten;
# an unanchored '4' would also rewrite every "4" inside any longer value.
test = test.withColumn('target', regexp_replace('target', '^4$', '1'))

In [0]:
# Turn the string label into an integer column, mirroring the training frame.
test = test.withColumn("target", col("target").cast(IntegerType()))

## Applying the fitted pipeline to the test dataset and evaluating the results

In [0]:
# Run the already-fitted pipeline over the prepared test frame; this call only
# transforms the data (adding the pipeline's output columns), it does not refit.
result = pipelineFit.transform(test)

row_num,target,text,output,token_tweet,features,rawPrediction,probability,prediction
541200,0,@chrishasboobs AHHH I HOPE YOUR OK!!!,chrishasboobs ahhh hope ok,"List(chrishasboobs, ahhh, hope, ok)","Map(vectorType -> sparse, length -> 262144, indices -> List(37, 97, 723, 10000), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-1.161795710513713, 1.161795710513713))","Map(vectorType -> dense, length -> 2, values -> List(0.23834114833367281, 0.7616588516663272))",1.0
750,0,"@misstoriblack cool , i have no tweet apps for my razr 2",misstoriblack cool tweet apps razr 2,"List(misstoriblack, cool, tweet, apps, razr, 2)","Map(vectorType -> sparse, length -> 262144, indices -> List(31, 110, 146, 1682, 30565, 51499), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(2.789974830203041, -2.789974830203041))","Map(vectorType -> dense, length -> 2, values -> List(0.9421316724920458, 0.05786832750795423))",0.0
766711,0,"@TiannaChaos i know just family drama. its lame.hey next time u hang out with kim n u guys like have a sleepover or whatever, ill call u",tiannachaos know family drama lamehey next time u hang kim n u guys like sleepover whatever ill call u,"List(tiannachaos, know, family, drama, lamehey, next, time, u, hang, kim, n, u, guys, like, sleepover, whatever, ill, call, u)","Map(vectorType -> sparse, length -> 262144, indices -> List(4, 13, 16, 18, 40, 76, 113, 189, 227, 261, 585, 900, 1425, 2860, 3238), values -> List(1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-3.5596515005362135, 3.5596515005362135))","Map(vectorType -> dense, length -> 2, values -> List(0.02766179423745162, 0.9723382057625484))",1.0
285055,0,School email won't open and I have geography stuff on there to revise! *Stupid School* :'(,school email wont open geography stuff revise stupid school,"List(school, email, wont, open, geography, stuff, revise, stupid, school)","Map(vectorType -> sparse, length -> 262144, indices -> List(72, 100, 190, 287, 442, 459, 2016, 3368), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(3.765976110718883, -3.765976110718883))","Map(vectorType -> dense, length -> 2, values -> List(0.9773785642231867, 0.022621435776813303))",0.0
705995,0,upper airways problem,upper airways problem,"List(upper, airways, problem)","Map(vectorType -> sparse, length -> 262144, indices -> List(499, 5772, 20569), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-0.8099337552093188, 0.8099337552093188))","Map(vectorType -> dense, length -> 2, values -> List(0.30790461224119925, 0.6920953877588008))",1.0
379611,0,Going to miss Pastor's sermon on Faith...,going miss pastors sermon faith,"List(going, miss, pastors, sermon, faith)","Map(vectorType -> sparse, length -> 262144, indices -> List(8, 33, 2587, 9435, 16746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-1.8138212992364908, 1.8138212992364908))","Map(vectorType -> dense, length -> 2, values -> List(0.14017692106512275, 0.8598230789348773))",1.0
1189018,1,on lunch....dj should come eat with me,lunchdj come eat,"List(lunchdj, come, eat)","Map(vectorType -> sparse, length -> 262144, indices -> List(55, 262), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-0.36557050162074256, 0.36557050162074256))","Map(vectorType -> dense, length -> 2, values -> List(0.409611776569538, 0.590388223430462))",1.0
667030,0,@piginthepoke oh why are you feeling like that?,piginthepoke oh feeling like,"List(piginthepoke, oh, feeling, like)","Map(vectorType -> sparse, length -> 262144, indices -> List(4, 30, 102, 10458), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.38256812320345357, -0.38256812320345357))","Map(vectorType -> dense, length -> 2, values -> List(0.5944923533960224, 0.40550764660397765))",0.0
93541,0,gahh noo!peyton needs to live!this is horrible,gahh noopeyton needs livethis horrible,"List(gahh, noopeyton, needs, livethis, horrible)","Map(vectorType -> sparse, length -> 262144, indices -> List(384, 642, 4961), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(2.9033478814455984, -2.9033478814455984))","Map(vectorType -> dense, length -> 2, values -> List(0.9480116867303519, 0.051988313269648145))",0.0
1097326,1,@mrstessyman thank you glad you like it! There is a product review bit on the site Enjoy knitting it!,mrstessyman thank glad like product review bit site enjoy knitting,"List(mrstessyman, thank, glad, like, product, review, bit, site, enjoy, knitting)","Map(vectorType -> sparse, length -> 262144, indices -> List(4, 79, 154, 167, 267, 455, 1515, 3337, 5341, 45617), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-15.003702062650959, 15.003702062650959))","Map(vectorType -> dense, length -> 2, values -> List(3.047718517125595E-7, 0.9999996952281482))",1.0


In [0]:
# Narrow the scored frame down to the true label and the predicted label.
# NOTE(review): `res` is not referenced by any later cell visible here - confirm it is still needed.
res = result.select("target", "prediction")

In [0]:
# Copy of the scored frame with the label cast to double, so it has the same
# numeric type as the "prediction" column when both are passed to the metrics.
result1 = result.withColumn("target", col("target").cast(DoubleType()))

In [0]:
# Build the evaluator from an RDD of (prediction, label) rows - the column order matters.
metrics = MulticlassMetrics(
    result1.select(col("prediction"), col("target")).rdd
)

In [0]:
# Assemble the evaluation report as a two-column pandas frame (Metric, Value).
# All rows are collected as (name, value) records first and the frame is built
# once at the end, instead of growing it row by row with .loc.
metric_rows = [("Model Accuracy", str(round(metrics.accuracy * 100, 2)))]

# Distinct class labels present in the scored data.
# Only the "target" column is selected, so the feature vectors are not shipped
# through Python just to read the label. Nulls are skipped because the
# per-label metrics need a numeric label, and sorting makes the row order of
# the report deterministic across runs.
labels = sorted(
    row.target
    for row in result.select("target").distinct().collect()
    if row.target is not None
)

# One row per class with its precision and recall.
for label in labels:
    metric_rows.append((
        str(label) + " Precision and Recall",
        str(round(metrics.precision(label), 2)) + " and " + str(round(metrics.recall(label), 2)),
    ))

# Class-frequency-weighted aggregate metrics.
metric_rows.extend([
    ("Weighted Recall", str(round(metrics.weightedRecall, 2))),
    ("Weighted Precision", str(round(metrics.weightedPrecision, 2))),
    ("Weighted False Positive Rate", str(round(metrics.weightedFalsePositiveRate, 2))),
    ("Weighted F 1 Score", str(round(metrics.weightedFMeasure(), 2))),
    ("Weighted F 0.5 Score", str(round(metrics.weightedFMeasure(beta=0.5), 2))),
])

modelMetrics = pd.DataFrame(metric_rows, columns=["Metric", "Value"])

In [0]:
# Render the assembled metrics table (one row per metric) in the notebook.
display(modelMetrics)

Metric,Value
Model Accuracy,75.42
0 Precision and Recall,0.76 and 0.75
1 Precision and Recall,0.75 and 0.76
Weighted Recall,0.75
Weighted Precision,0.75
Weighted False Positive Rate,0.25
Weighted F 1 Score,0.75
Weighted F 0.5 Score,0.75
