# DS 5559 Final Project - Spark NLP Emotion Prediction and Evaluation



## Download Packages and Files

In [1]:
# Download necessary packages
! pip install spark-nlp==3.0.1 pyspark==3.1.1

Collecting spark-nlp==3.0.1
[?25l  Downloading https://files.pythonhosted.org/packages/e5/31/6e0f5cff049aa1f5b9bf06754001d9986211b45ca9165938adc8bed2fdf6/spark_nlp-3.0.1-py2.py3-none-any.whl (146kB)
[K     |████████████████████████████████| 153kB 17.4MB/s 
[?25hCollecting pyspark==3.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 67kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 51.6MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=cbbf5cb4ac5455c3b040a2acb

In [2]:
# Download Manually Labeled Data - 'joined_preds_manual.csv'
! gdown --id '1Z6jP9dvFUhb-UF9SHt_Q06LwgGV9dHBN'

Downloading...
From: https://drive.google.com/uc?id=1Z6jP9dvFUhb-UF9SHt_Q06LwgGV9dHBN
To: /content/joined_preds_manual_tabdelim.txt
  0% 0.00/183k [00:00<?, ?B/s]100% 183k/183k [00:00<00:00, 5.75MB/s]


## Import Modules

In [3]:
import pandas as pd
import numpy as np
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## Data Cleaning and Transformation

### Read In Raw Tweets and Manual Labeled Data

In [4]:
spark = sparknlp.start()
data = spark.read.option("delimiter", "\t") \
                 .option("header", "true") \
                 .csv("joined_preds_manual_tabdelim.txt")
print(data.count())
data.show(5)

400
+-----+------------+-------------------+--------------+-----------+----------+-----------+---------------+----------------+--------------+---------------+--------------------+----------+----------+----------+-------------------+---------------------------+--------------+----------------------+-------------+--------------------+-----------+---------+-----------------------+-------------+--------------+----+-----------+--------------------+--------------------+-------------+--------------------+--------------------+-------+----------+-----------------+------------+---------+--------+---------+-------------+--------------+------------+-------------+----------+--------+--------+--------+-------------+
|index|anger_manual|anticipation_manual|disgust_manual|fear_manual|joy_manual|love_manual|optimism_manual|pessimism_manual|sadness_manual|surprise_manual|                text|neg_manual|neu_manual|pos_manual|main_emotion_manual|main_emotion_reduced_manual|  label_manual|scraped_hashtag_ma

## Spark NLP Emotion Modeling
Downloading the pre-trained Spark NLP Emotion Detection Classifier

In [5]:
MODEL_NAME='classifierdl_use_emotion'

documentAssembler = DocumentAssembler() \
  .setInputCol("text") \
  .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
  .setInputCols(["document"])\
  .setOutputCol("sentence_embeddings")


sentimentdl = ClassifierDLModel.pretrained(name=MODEL_NAME)\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("sentiment")

nlpPipeline = Pipeline(stages = [documentAssembler, use, sentimentdl])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_emotion download started this may take some time.
Approximate size to download 21.3 MB
[OK!]


Making Classifications

In [6]:
# Create an empty Data Frame to store the results
empty_df = spark.createDataFrame([['']]).toDF("text")

# Setting up the pipelien
pipelineModel = nlpPipeline.fit(empty_df)
# df = spark.createDataFrame(pd.DataFrame({"text": text_list}))

# Transform the results
result = pipelineModel.transform(data.select('text'))

# Organize and display the results
output = result.select(F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols")) \
               .select(F.expr("cols['0']").alias("document"),
                       F.expr("cols['1']").alias("sentiment"))
output.show(5)

+--------------------+---------+
|            document|sentiment|
+--------------------+---------+
|This business abo...|     fear|
|Coronavirus world...|      joy|
|Part-2  *Step-by-...|      joy|
|The European Medi...|     fear|
|My #AstraZeneca j...|      joy|
+--------------------+---------+
only showing top 5 rows



## Model Evaluation

In [7]:
output = data.join(output, data.text == output.document, how='left')
print(output.count())
output.show(5)

400
+-----+------------+-------------------+--------------+-----------+----------+-----------+---------------+----------------+--------------+---------------+--------------------+----------+----------+----------+-------------------+---------------------------+--------------+----------------------+-------------+--------------------+-----------+---------+-----------------------+-------------+--------------+----+-----------+--------------------+--------------------+-------------+--------------------+--------------------+-------+----------+-----------------+------------+---------+--------+---------+-------------+--------------+------------+-------------+----------+--------+--------+--------+-------------+--------------------+---------+
|index|anger_manual|anticipation_manual|disgust_manual|fear_manual|joy_manual|love_manual|optimism_manual|pessimism_manual|sadness_manual|surprise_manual|                text|neg_manual|neu_manual|pos_manual|main_emotion_manual|main_emotion_reduced_manual|  

In [8]:
from pyspark.sql.functions import col, expr, when

# Split the predictions
fear_pred_sparknlp = expr("""IF(sentiment == 'fear', 1, 0)""")
surprise_pred_sparknlp = expr("""IF(sentiment == 'surprise', 1, 0)""")
joy_pred_sparknlp = expr("""IF(sentiment == 'joy', 1, 0)""")
sadness_pred_sparknlp = expr("""IF(sentiment == 'sadness', 1, 0)""")
sentiment_pred_sparknlp = when(col("sentiment") == 'joy', 0.0) \
                          .when(col("sentiment") == 'surprise', 1.0) \
                          .when(col("sentiment") == 'fear', 2.0) \
                          .when(col("sentiment") == 'sadness', 3.0)

# Combine the actuals
fear_actual_sparknlp = expr("""IF(fear_manual == 1 OR disgust_manual == 1 OR anticipation_manual == 1, 1, 0)""")
surprise_actual_sparknlp = expr("""IF(surprise_manual == 1 OR neu_manual = 1, 1, 0)""")
joy_actual_sparknlp = expr("""IF(joy_manual == 1 OR optimism_manual == 1 OR love_manual == 1, 1, 0)""")
sadness_actual_sparknlp = expr("""IF(sadness_manual == 1 OR pessimism_manual == 1 OR anger_manual == 1, 1, 0)""")
sentiment_actual_sparknlp = when(col("joy_actual_sparknlp") == 1, 0.0) \
                            .when(col("surprise_actual_sparknlp") == 1, 1.0) \
                            .when(col("fear_actual_sparknlp") == 1, 2.0) \
                            .when(col("sadness_actual_sparknlp") == 1, 3.0)

# Add the additional columns
output = output.withColumn("fear_pred_sparknlp", fear_pred_sparknlp) \
               .withColumn("surprise_pred_sparknlp", surprise_pred_sparknlp) \
               .withColumn("joy_pred_sparknlp", joy_pred_sparknlp) \
               .withColumn("sadness_pred_sparknlp", sadness_pred_sparknlp) \
               .withColumn("fear_actual_sparknlp", fear_actual_sparknlp) \
               .withColumn("surprise_actual_sparknlp", surprise_actual_sparknlp) \
               .withColumn("joy_actual_sparknlp", joy_actual_sparknlp) \
               .withColumn("sadness_actual_sparknlp", sadness_actual_sparknlp) \
               .withColumn("sentiment_pred_sparknlp", sentiment_pred_sparknlp) \
               .withColumn("sentiment_actual_sparknlp", sentiment_actual_sparknlp)
output.show(5)

+-----+------------+-------------------+--------------+-----------+----------+-----------+---------------+----------------+--------------+---------------+--------------------+----------+----------+----------+-------------------+---------------------------+--------------+----------------------+-------------+--------------------+-----------+---------+-----------------------+-------------+--------------+----+-----------+--------------------+--------------------+-------------+--------------------+--------------------+-------+----------+-----------------+------------+---------+--------+---------+-------------+--------------+------------+-------------+----------+--------+--------+--------+-------------+--------------------+---------+------------------+----------------------+-----------------+---------------------+--------------------+------------------------+-------------------+-----------------------+-----------------------+-------------------------+
|index|anger_manual|anticipation_manual|

Multiclass confusion matrix: 
https://spark.apache.org/docs/2.2.0/mllib-evaluation-metrics.html#multilabel-classification

In [9]:
# Confusion Matrix
from pyspark.mllib.evaluation import MulticlassMetrics

pl = output.select(col('sentiment_pred_sparknlp').alias('prediction'), 
                   col('sentiment_actual_sparknlp').alias('label')).rdd
metrics = MulticlassMetrics(pl)

print('joy, surprise, fear, sadness')
print(metrics.confusionMatrix().toArray())

joy, surprise, fear, sadness
[[62. 13. 30.  4.]
 [53. 10. 68. 12.]
 [14.  9. 83.  9.]
 [ 5.  0. 23.  5.]]


In [10]:
# Total Accuracy
print('Overall Accuracy rate is', metrics.accuracy)

Overall Accuracy rate is 0.4


In [11]:
# F1 Score
labels = pl.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    # print("Class %s precision = %s" % (label, metrics.precision(label)))
    # print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

Class 0.0 F1 Measure = 0.5102880658436215
Class 1.0 F1 Measure = 0.1142857142857143
Class 2.0 F1 Measure = 0.5203761755485893
Class 3.0 F1 Measure = 0.15873015873015872


In [12]:
# Precision
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
print("\n")
print('Overall Precision rate is', metrics.precision(1.0))

Class 0.0 precision = 0.4626865671641791
Class 1.0 precision = 0.3125
Class 2.0 precision = 0.4068627450980392
Class 3.0 precision = 0.16666666666666666


Overall Precision rate is 0.3125


In [13]:
output.toPandas().to_csv('3_model_output_all.csv', sep = '\t')

In [14]:
data = spark.read.option("delimiter", "\t") \
                 .option("header", "true") \
                 .csv("3_model_output_all.csv")
print(data.count())
data.show(5)

400
+---+-----+------------+-------------------+--------------+-----------+----------+-----------+---------------+----------------+--------------+---------------+--------------------+----------+----------+----------+-------------------+---------------------------+--------------+----------------------+-------------+--------------------+-----------+---------+-----------------------+-------------+--------------+----+-----------+--------------------+--------------------+-------------+--------------------+--------------------+-------+----------+-----------------+------------+---------+--------+---------+-------------+--------------+------------+-------------+----------+--------+--------+--------+-------------+--------------------+---------+------------------+----------------------+-----------------+---------------------+--------------------+------------------------+-------------------+-----------------------+-----------------------+-------------------------+
|_c0|index|anger_manual|anticipa

In [22]:
data.select('label_manual').distinct().show(truncate = False)

+--------------+
|label_manual  |
+--------------+
|conversation  |
|annoucement   |
|joy           |
|eugenics      |
|shouting      |
|news          |
|statement     |
|fact check    |
|confusion     |
|eugenics      |
|clickbait     |
|conspiracy    |
|fake news     |
|misinformation|
|FYI           |
|vaccinated    |
|pessimism     |
|anger         |
|fear          |
|isinformation |
+--------------+
only showing top 20 rows

