## Importing Libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, DoubleType

import pyspark.ml.feature as feats
# from pyspark.ml.feature import Tokenizer 
from pyspark.ml.feature import StringIndexer, IDF, HashingTF
from pyspark.ml import Pipeline #Build a pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics

import pandas as pd


## Creating Spark session

In [0]:
# Reuse the active SparkSession if there is one; otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName('Sentiment140LogisticRegressionHash')
    .getOrCreate()
)

## Setting the configuration for Azure Storage Account access

In [0]:
# SECURITY: the storage account key must never be committed in the notebook.
# It is read from a Databricks secret scope at run time instead. The scope and
# key names below are placeholders -- create them (or adjust the names) before
# running, and rotate the key that was previously embedded here, since it has
# been exposed in the notebook source.
spark.conf.set(
  "fs.azure.account.key.twittergenstorage.blob.core.windows.net",
  dbutils.secrets.get(scope="twittergenstorage", key="storage-account-key"))

## Importing the dataset from Azure Storage Account

In [0]:
# Load the headerless training CSV from blob storage; Spark names the columns _c0, _c1, _c2.
train = (
    spark.read
    .format("csv")
    .option("header", "false")
    .load("wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/train-2-3.csv")
)

In [0]:
# Preview the raw training rows; the columns still carry the default _c0/_c1/_c2 names here.
display(train)

_c0,_c1,_c2
762637,0,Probelm with nap: i'm not good at waking up. I JUST got up
1177984,4,btw I made muffins today
1560678,4,rain comes again
797330,0,@bsemaj calll meeeeee
1421115,4,with dafiii the best girl everr! MILEY COME TO ARGENTINAAA PLEASE!
678545,0,my friend just left i had fun tho!
45336,0,"says, ""ah babon goreng Watchmen tiketnya abis."""
1239887,4,@MTVBuzzworthy awesome! i was actually at the show... and it was so fun! i think a lot of it had to do with Twilight madness
365234,0,@usweekly Not gonna lie...pretty tired of seeing Jon & Kate on the cover. It's making me not want to renew my subscription
1287937,4,@secondhandjohn thank u for following me can't wait for the new album <3


## Renaming the columns

In [0]:
# Replace the positional CSV column names with descriptive ones, one rename per step.
train = train.withColumnRenamed('_c0', 'row_num')
train = train.withColumnRenamed('_c1', 'target')
train = train.withColumnRenamed('_c2', 'text')

## Replacing the target value 4 with 1 (1 denotes positive sentiment, 0 denotes negative sentiment)

In [0]:
# Regex-replace the character '4' with '1' in the target column so positives are encoded as 1.
train = train.withColumn(
    'target',
    regexp_replace(col('target'), '4', '1'),
)

## Converting target to Integer and creating a final dataframe for training

In [0]:
# Turn the target column into integers.
train = train.withColumn("target", col("target").cast(IntegerType()))

# The training frame only needs the tweet text and its sentiment target.
end_df = train.select('text', 'target')

# Print the first 20 rows as a sanity check.
end_df.show()

+--------------------+------+
|                text|target|
+--------------------+------+
|Probelm with nap:...|     0|
|btw I made muffin...|     1|
|   rain comes again |     1|
|@bsemaj calll mee...|     0|
|with dafiii the b...|     1|
|my friend just le...|     0|
|says, &quot;ah ba...|     0|
|@MTVBuzzworthy aw...|     1|
|@usweekly Not gon...|     0|
|@secondhandjohn t...|     1|
|@bashley the musi...|     1|
|U love Cookie and...|     1|
|In seaside @ the ...|     1|
|@DonnieWahlberg c...|     1|
|Two words... Yay ...|     1|
|@evanmiles ur soo...|     0|
|@Tomtjok and some...|     1|
|that part I could...|     0|
|don't leave. you'...|     0|
|I cannot open my ...|     0|
+--------------------+------+
only showing top 20 rows



## Installing and importing Spark NLP dependencies

In [0]:
# Install a pinned Spark NLP release through a shell command, then import the package.
# NOTE(review): `!pip` runs in a subshell -- confirm the package ends up in the
# environment the notebook kernel (and any worker that needs it) actually uses.
!pip install spark-nlp==4.0.1
import sparknlp
# Wildcard import: presumably supplies DocumentAssembler and Finisher used further down -- verify.
from sparknlp.base import *

Collecting spark-nlp==4.0.1
  Using cached spark_nlp-4.0.1-py2.py3-none-any.whl (531 kB)
Installing collected packages: spark-nlp
Successfully installed spark-nlp-4.0.1
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-8172aa49-8972-4552-9fc2-ad1aac56e793/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from sparknlp.annotator import *

## Creating the transformers and estimator for the pipeline

In [0]:
# Stage definitions for the sentiment pipeline. Nothing is fitted here; each
# object is only configured with its input/output columns and parameters.

# Converting string to Spark NLP document
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Converting Spark NLP document to Spark NLP token in order to perform pre-processing
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

# Removing punctuation and lowercasing the tokens.
# The pattern is a raw string so that \w, \d and \s reach the regex engine
# verbatim instead of relying on Python keeping unknown escape sequences.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns([r"[^\w\d\s]"])  # remove punctuation (keep alphanumeric chars)
    # if we don't set CleanupPatterns, it will only keep alphabet letters ([^A-Za-z])
)

# Removing stop words
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Converting back to a single space-separated string
finisher = (
    Finisher()
    .setInputCols("cleanTokens")
    .setOutputCols("output")
    .setOutputAsArray(False)
    .setAnnotationSplitSymbol(' ')
)

# Creating tokens using Spark ML features
tokenizer2 = feats.Tokenizer().setInputCol("output").setOutputCol("token_tweet")

# Vectorizing the text using TF-IDF
hashtf = HashingTF(numFeatures=2**16, inputCol="token_tweet", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms

# Index the target into the label column.
# The default ordering (descending frequency) makes the index of each class
# depend on class balance in the training data. The evaluation cells compare
# `prediction` (label-index space) with the raw `target`, so the mapping is
# pinned alphabetically: target 0 -> label 0.0, target 1 -> label 1.0.
label_stringIdx = StringIndexer(
    inputCol="target",
    outputCol="label",
    stringOrderType="alphabetAsc",
)

lr = LogisticRegression(maxIter=100, labelCol="label", featuresCol="features", predictionCol="prediction")

## Creating the pipeline and fitting the model

In [0]:
# Assemble the stages in execution order: Spark NLP clean-up, Spark ML
# tokenisation and TF-IDF, label indexing, and finally the classifier.
pipeline = Pipeline().setStages([
    documentAssembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    finisher,
    tokenizer2,
    hashtf,
    idf,
    label_stringIdx,
    lr,
])

# Fit the whole pipeline on the training frame.
pipelineFit = pipeline.fit(end_df)

## Saving the pipeline to the DBFS file system

In [0]:
# Persist the fitted pipeline to DBFS. A plain save() raises if the target path
# already exists, which breaks re-running the notebook; overwrite() replaces
# any previously saved model at this location instead.
pipelineFit.write().overwrite().save("dbfs:/FileStore/models/logistic-regression-hash")

## Copying the model to Azure Storage Account

In [0]:
# Copy the saved model directory from DBFS to the storage account.
# The source uses the same explicit dbfs:/ URI the model was saved under, so
# the copy does not depend on how a scheme-less path would be resolved.
dbutils.fs.cp(
    "dbfs:/FileStore/models/logistic-regression-hash",
    "wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/models/logistic-regression-hash",
    recurse=True,
)

Out[15]: True

## Importing the test dataset and applying the same pre-processing as for the train dataset

In [0]:
# Load the headerless test CSV from the same blob container as the training data.
test = (
    spark.read
    .format("csv")
    .option("header", "false")
    .load("wasbs://realtimetwitterdata@twittergenstorage.blob.core.windows.net/test-2-3.csv")
)

In [0]:
# Apply the same column names that were given to the training frame.
test = test.withColumnRenamed('_c0', 'row_num')
test = test.withColumnRenamed('_c1', 'target')
test = test.withColumnRenamed('_c2', 'text')

In [0]:
# Same recoding as for the training data: regex-replace '4' with '1' in the target column.
test = test.withColumn(
    'target',
    regexp_replace(col('target'), '4', '1'),
)

In [0]:
# Cast the recoded target to an integer column, mirroring the training frame.
test = test.withColumn("target", col("target").cast(IntegerType()))

## Applying the fitted pipeline to the test dataset and evaluating the results

In [0]:
# Run the already-fitted pipeline over the test frame (no refitting happens here);
# the output carries the intermediate columns plus label, rawPrediction, probability and prediction.
result = pipelineFit.transform(test)

row_num,target,text,output,token_tweet,tf,features,label,rawPrediction,probability,prediction
541200,0,@chrishasboobs AHHH I HOPE YOUR OK!!!,chrishasboobs ahhh hope ok,"List(chrishasboobs, ahhh, hope, ok)","Map(vectorType -> sparse, length -> 65536, indices -> List(1589, 24792, 39000, 62624), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(1589, 24792, 39000, 62624), values -> List(4.665935675803515, 6.478934172211255, 9.695382811421844, 3.885649086534939))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(-0.6028536319411848, 0.6028536319411848))","Map(vectorType -> dense, length -> 2, values -> List(0.3536910994513451, 0.6463089005486549))",1.0
750,0,"@misstoriblack cool , i have no tweet apps for my razr 2",misstoriblack cool tweet apps razr 2,"List(misstoriblack, cool, tweet, apps, razr, 2)","Map(vectorType -> sparse, length -> 65536, indices -> List(7062, 11155, 12524, 27011, 29597, 58863), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(7062, 11155, 12524, 27011, 29597, 58863), values -> List(7.522009449907805, 10.749543337519121, 3.8591553926985362, 4.736572585190175, 9.503010918774388, 5.001120082240299))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.24754190839069937, -0.24754190839069937))","Map(vectorType -> dense, length -> 2, values -> List(0.5615713885842804, 0.43842861141571965))",0.0
766711,0,"@TiannaChaos i know just family drama. its lame.hey next time u hang out with kim n u guys like have a sleepover or whatever, ill call u",tiannachaos know family drama lamehey next time u hang kim n u guys like sleepover whatever ill call u,"List(tiannachaos, know, family, drama, lamehey, next, time, u, hang, kim, n, u, guys, like, sleepover, whatever, ill, call, u)","Map(vectorType -> sparse, length -> 65536, indices -> List(5015, 6512, 9859, 11650, 16754, 18312, 24409, 35840, 38640, 42899, 44170, 51783, 52303, 52644, 54961, 55981, 58644), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(5015, 6512, 9859, 11650, 16754, 18312, 24409, 35840, 38640, 42899, 44170, 51783, 52303, 52644, 54961, 55981, 58644), values -> List(10.66616172858007, 7.3368184506858976, 3.4629050280940197, 3.0633411098820833, 7.960781756033739, 6.214609031257577, 5.385194000367146, 4.769337585626208, 3.9499544393223913, 10.749543337519121, 5.17557247438491, 10.800825394815586, 6.637244971680424, 5.196246768601053, 4.474863199593502, 3.38232895838436, 4.918043440103816))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.002476507327670352, -0.002476507327670352))","Map(vectorType -> dense, length -> 2, values -> List(0.5006191265154878, 0.4993808734845122))",0.0
285055,0,School email won't open and I have geography stuff on there to revise! *Stupid School* :'(,school email wont open geography stuff revise stupid school,"List(school, email, wont, open, geography, stuff, revise, stupid, school)","Map(vectorType -> sparse, length -> 65536, indices -> List(1696, 14175, 15674, 21671, 38455, 51381, 61031, 63810), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(1696, 14175, 15674, 21671, 38455, 51381, 61031, 63810), values -> List(5.480789121048167, 7.723830231753194, 5.12333078498163, 8.387869328155068, 5.927510151217497, 4.6765989887616835, 8.870415903243583, 6.009918272407978))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(3.4787293291234027, -3.4787293291234027))","Map(vectorType -> dense, length -> 2, values -> List(0.9700764570276881, 0.029923542972311923))",0.0
705995,0,upper airways problem,upper airways problem,"List(upper, airways, problem)","Map(vectorType -> sparse, length -> 65536, indices -> List(14708, 37682, 42718), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(14708, 37682, 42718), values -> List(10.147367935164901, 6.062192263168497, 8.69208070255806))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(-0.2796702216757343, 0.2796702216757343))","Map(vectorType -> dense, length -> 2, values -> List(0.4305346274640952, 0.5694653725359048))",1.0
379611,0,Going to miss Pastor's sermon on Faith...,going miss pastors sermon faith,"List(going, miss, pastors, sermon, faith)","Map(vectorType -> sparse, length -> 65536, indices -> List(16147, 28293, 36127, 36846, 63887), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(16147, 28293, 36127, 36846, 63887), values -> List(9.5809724602441, 8.029965631245842, 3.809952407481163, 3.2373295501681962, 10.358677028832108))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(-0.5593600066833737, 0.5593600066833737))","Map(vectorType -> dense, length -> 2, values -> List(0.3636955547506442, 0.6363044452493558))",1.0
1189018,1,on lunch....dj should come eat with me,lunchdj come eat,"List(lunchdj, come, eat)","Map(vectorType -> sparse, length -> 65536, indices -> List(5964, 7772, 53756), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(5964, 7772, 53756), values -> List(5.376078166959136, 4.191345534706852, 10.626941015426787))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-1.3262410064653782, 1.3262410064653782))","Map(vectorType -> dense, length -> 2, values -> List(0.20978182660268777, 0.7902181733973123))",1.0
667030,0,@piginthepoke oh why are you feeling like that?,piginthepoke oh feeling like,"List(piginthepoke, oh, feeling, like)","Map(vectorType -> sparse, length -> 65536, indices -> List(6661, 11650, 18184, 44648), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(6661, 11650, 18184, 44648), values -> List(4.686061511551137, 3.0633411098820833, 3.762615294371987, 9.79069299122617))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.4087346215767734, -0.4087346215767734))","Map(vectorType -> dense, length -> 2, values -> List(0.6007844260645335, 0.3992155739354665))",0.0
93541,0,gahh noo!peyton needs to live!this is horrible,gahh noopeyton needs livethis horrible,"List(gahh, noopeyton, needs, livethis, horrible)","Map(vectorType -> sparse, length -> 65536, indices -> List(2784, 6270, 39655, 43417, 58839), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(2784, 6270, 39655, 43417, 58839), values -> List(11.320088195986733, 5.783662882219689, 6.3529494099065476, 8.835181546198733, 7.308567984400046))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(3.237346304688515, -3.237346304688515))","Map(vectorType -> dense, length -> 2, values -> List(0.9622157483984197, 0.037784251601580277))",0.0
1097326,1,@mrstessyman thank you glad you like it! There is a product review bit on the site Enjoy knitting it!,mrstessyman thank glad like product review bit site enjoy knitting,"List(mrstessyman, thank, glad, like, product, review, bit, site, enjoy, knitting)","Map(vectorType -> sparse, length -> 65536, indices -> List(2701, 6693, 11650, 16247, 24185, 29315, 41092, 44402, 47808, 52879), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(2701, 6693, 11650, 16247, 24185, 29315, 41092, 44402, 47808, 52879), values -> List(5.388863729256108, 5.006261481740718, 3.0633411098820833, 4.522061935048493, 5.047066062091062, 5.987369402721364, 7.391283713596584, 8.887825279684154, 7.403460424171839, 8.236063315287064))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-5.6649635808248835, 5.6649635808248835))","Map(vectorType -> dense, length -> 2, values -> List(0.0034533073168761808, 0.9965466926831238))",1.0


In [0]:
# Narrow view holding only the ground truth and the predicted class.
# NOTE(review): `res` is not referenced again in the visible part of this notebook.
res = result.select("target", "prediction")

In [0]:
# Cast the integer target to double so it has the same type as the prediction column.
result1 = result.withColumn(
    "target",
    col("target").cast(DoubleType()),
)

In [0]:
# Feed (prediction, target) pairs, in that column order, to the RDD-based metrics helper.
metrics = MulticlassMetrics(
    result1.select(col("prediction"), col("target")).rdd
)

In [0]:
# Build a two-column (Metric, Value) summary table from `metrics`.
# Values are stored as strings rounded to two decimals; accuracy is shown as a percentage.

# Distinct ground-truth classes, fetched through the DataFrame API and sorted so
# the per-class rows always come out in the same order (distinct() alone gives
# no ordering guarantee).
labels = sorted(row.target for row in result.select("target").distinct().collect())

# Collect all rows first and create the DataFrame once, instead of growing it
# one .loc assignment at a time.
metric_rows = [["Model Accuracy", str(round(metrics.accuracy*100,2))]]

for label in labels:
    metric_rows.append([
        str(label) + " Precision and Recall",
        str(round(metrics.precision(label),2))+" and "+ str(round(metrics.recall(label),2)),
    ])

metric_rows.extend([
    ["Weighted Recall", str(round(metrics.weightedRecall,2))],
    ["Weighted Precision", str(round(metrics.weightedPrecision,2))],
    ["Weighted False Positive Rate", str(round(metrics.weightedFalsePositiveRate,2))],
    ["Weighted F 1 Score", str(round(metrics.weightedFMeasure(),2))],
    ["Weighted F 0.5 Score", str(round(metrics.weightedFMeasure(beta=0.5),2))],
])

modelMetrics = pd.DataFrame(metric_rows, columns=["Metric", "Value"])

In [0]:
# Render the metric summary table built in the previous cell.
display(modelMetrics)

Metric,Value
Model Accuracy,76.18
0 Precision and Recall,0.77 and 0.75
1 Precision and Recall,0.76 and 0.77
Weighted Recall,0.76
Weighted Precision,0.76
Weighted False Positive Rate,0.24
Weighted F 1 Score,0.76
Weighted F 0.5 Score,0.76
