In [1]:
# Import Spark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json

# Import NLTK
import nltk
import sys
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the VADER lexicon needed by SentimentIntensityAnalyzer; the recorded
# output shows this reports "already up-to-date" when the package is present.
nltk.download("vader_lexicon")

# Build (or reuse) the Spark session. The MongoDB connector's input and
# output URIs both point at the same database/collection.
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.Covid19?retryWrites=true")
    .config("spark.mongodb.output.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.Covid19?retryWrites=true")
    .getOrCreate()
)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/emanuele/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Aggregation pipeline selecting English tweets that are NOT retweets
# (retweeted_status is null) and keeping only id_str, created_at, full_text.
#
# The pipeline is built from Python objects and serialised with json.dumps so
# that the connector receives well-formed JSON. The previous hand-written
# string had no commas between the $project fields and a trailing comma after
# the $project body, so it only loaded if the parser happened to tolerate that.
pipeline_noRetweet = json.dumps([
    {
        "$match": {
            "lang": "en",
            "retweeted_status": None,  # serialised as JSON null
        }
    },
    {
        "$project": {
            "id_str": 1,
            "created_at": 1,
            "full_text": 1,
        }
    },
])

# Aggregation pipeline selecting English tweets that ARE retweets
# (retweeted_status is not null) and keeping id_str, created_at and the
# original tweet's text (retweeted_status.full_text).
#
# The pipeline is built from Python objects and serialised with json.dumps so
# that the connector receives well-formed JSON. The previous hand-written
# string had no comma between the two $match conditions or between the
# $project fields, left the $ne operator unquoted, and had a trailing comma
# after the $project body.
pipeline_Retweet = json.dumps([
    {
        "$match": {
            "lang": "en",
            "retweeted_status": {"$ne": None},  # serialised as {"$ne": null}
        }
    },
    {
        "$project": {
            "id_str": 1,
            "created_at": 1,
            "retweeted_status.full_text": 1,
        }
    },
])

# Read both views of the collection through the MongoDB Spark connector,
# handing each aggregation pipeline to the connector's "pipeline" option.
df_ENGNoRetweet = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", pipeline_noRetweet)
    .load()
)
df_ENGRetweet = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", pipeline_Retweet)
    .load()
)

# Print the inferred schemas: non-retweets first, then retweets.
df_ENGNoRetweet.printSchema()
df_ENGRetweet.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- id_str: string (nullable = true)

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id_str: string (nullable = true)
 |-- retweeted_status: struct (nullable = true)
 |    |-- full_text: string (nullable = true)



In [4]:
# Stack retweets (using the original tweet's text) on top of non-retweets,
# then keep only the distinct tweet texts.
df_Tweets = (
    df_ENGRetweet
    .selectExpr("id_str", "retweeted_status.full_text as full_text")
    .union(df_ENGNoRetweet.select("id_str", "full_text"))
    .select("full_text")
    .distinct()
)

# Number of distinct tweet texts.
df_Tweets.count()

43710

In [5]:
# Lazily created analyzer, reused by every call made in the same Python process.
_vader_analyzer = None


def vaderSentimentAnalysis(data_str):
    """Score a piece of text with NLTK's VADER sentiment analyzer.

    Parameters
    ----------
    data_str : str or None
        Text to score. ``None`` (e.g. a null ``full_text`` value) is not
        passed to the analyzer.

    Returns
    -------
    dict or None
        The result of ``SentimentIntensityAnalyzer.polarity_scores(data_str)``,
        or ``None`` when ``data_str`` is ``None``.
    """
    global _vader_analyzer

    # The source column is nullable; return a null score instead of failing
    # inside the analyzer.
    if data_str is None:
        return None

    # Build the analyzer once instead of once per row: constructing it is
    # independent of the input text, so repeating it for every call is wasted work.
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    return _vader_analyzer.polarity_scores(data_str)
# Expose the scorer to Spark as a UDF whose declared result type is a string.
# NOTE(review): the Python function returns a dict, and the recorded output
# shows the column rendered as text like "{neg=0.0, pos=..., ...}" -- confirm
# whether downstream code relies on that string form before changing the type.
vaderSentimentAnalysis_udf = udf(vaderSentimentAnalysis, returnType=StringType())

In [6]:
# Attach a "score" column computed by the VADER UDF from each tweet's text.
df_Tweets = df_Tweets.withColumn(
    "score",
    vaderSentimentAnalysis_udf(F.col("full_text")),
)

# Preview the first rows, truncating each cell to 50 characters.
df_Tweets.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                         full_text|                                             score|
+--------------------------------------------------+--------------------------------------------------+
|"The United States, long accustomed to thinking...|  {neg=0.0, pos=0.299, compound=0.9076, neu=0.701}|
|Coronavirus &amp; climate change demand similar...|  {neg=0.031, pos=0.25, compound=0.8639, neu=0.72}|
|Oh to be a 1998 baby

✔️ first memory is 9/11

...| {neg=0.233, pos=0.0, compound=-0.7184, neu=0.767}|
|From uprising to outbreak: Hong Kong sign langu...|         {neg=0.0, pos=0.0, compound=0.0, neu=1.0}|
|Another little way we can help our neighbors. #...|  {neg=0.0, pos=0.213, compound=0.4019, neu=0.787}|
|“Our country is facing a medical and economic c...|{neg=0.164, pos=0.056, compound=-0.7506, neu=0.78}|
|😷 Dr. Anthony Fauci: "The idea of anybody gett...| {neg=0.065, 

In [7]:
# Re-count the rows after adding the score column; the recorded result
# matches the count taken before scoring.
df_Tweets.count()

43710