In [None]:
# Import Spark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json

# Import NLTK
import nltk
import sys
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

# Import numpy
import numpy as np

# Import time
import time


# Wall-clock reference taken before the Spark session is created.
start_time = time.time()

# Create (or reuse) the Spark session. The MongoDB connector reads from the
# Covid19 collection and writes to the LabeledTweets collection of the
# TwitterSentimentAnalysis database.
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.Covid19?retryWrites=true")
    .config("spark.mongodb.output.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.LabeledTweets?retryWrites=true")
    .getOrCreate()
)

# Aggregation pipeline (as a JSON string) for English tweets whose
# `retweeted_status` is null, keeping only the id, creation date and text.
# Built with json.dumps so the string is always well-formed JSON: the
# hand-written version had no commas between the projected fields and a
# trailing comma after the $project stage.
pipeline_noRetweet = json.dumps([
    {
        "$match": {
            "lang": "en",
            "retweeted_status": None,
        }
    },
    {
        "$project": {
            "id_str": 1,
            "created_at": 1,
            "full_text": 1,
        }
    },
])

# Aggregation pipeline (as a JSON string) for English retweets: the tweet
# is English, `retweeted_status` is not null, and the retweeted tweet is
# English too. Only the id, creation date and the retweeted tweet's text
# are kept.
# Built with json.dumps so the string is always well-formed JSON: the
# hand-written version had no commas between the $match conditions or the
# projected fields, an unquoted `$ne` key, and a trailing comma after the
# $project stage.
pipeline_Retweet = json.dumps([
    {
        "$match": {
            "lang": "en",
            "retweeted_status": {"$ne": None},
            "retweeted_status.lang": "en",
        }
    },
    {
        "$project": {
            "id_str": 1,
            "created_at": 1,
            "retweeted_status.full_text": 1,
        }
    },
])

# Load the two tweet subsets through the MongoDB Spark connector; each read
# passes its own aggregation pipeline via the "pipeline" option.
df_ENGNoRetweet = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", pipeline_noRetweet)
    .load()
)
df_ENGRetweet = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", pipeline_Retweet)
    .load()
)

# Give both subsets the same (id_str, full_text) column layout: for retweets
# the text is taken from the nested retweeted tweet.
retweet_rows = df_ENGRetweet.selectExpr("id_str", "retweeted_status.full_text as full_text")
original_rows = df_ENGNoRetweet.select("id_str", "full_text")

# Stack the two subsets, then keep each distinct text exactly once.
df_Tweets = (
    retweet_rows
    .union(original_rows)
    .select("full_text")
    .distinct()
)

# df_Tweets = df_Tweets.repartition(2)

# VADER 'neu' score at or above which a neutral-dominated text is labelled
# neutral without comparing its positive and negative scores.
_NEUTRAL_THRESHOLD = 0.6

# Analyzer instance shared by all calls; created on first use so that the
# analyzer is not rebuilt for every row.
_vader_analyzer = None


def vaderSentimentAnalysis(data_str):
    """Label a text as neutral (0), positive (1) or negative (2) using VADER.

    The 'neg', 'neu' and 'pos' polarity scores are compared (the 'compound'
    score is ignored):

    * if 'neu' is the dominant score and is at least 0.6, the text is neutral;
    * otherwise the larger of 'pos' and 'neg' decides the label;
    * if 'pos' and 'neg' are equal (including the all-zero scores returned
      for text without any token), the text is neutral.

    Args:
        data_str: The text to classify, or None.

    Returns:
        0 for neutral, 1 for positive, 2 for negative, or None when
        ``data_str`` is None (so a null input yields a null label instead of
        an exception).
    """
    global _vader_analyzer

    # Null column values reach a UDF as None; the analyzer cannot score them.
    if data_str is None:
        return None

    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    ss = _vader_analyzer.polarity_scores(data_str)
    neg, neu, pos = ss['neg'], ss['neu'], ss['pos']

    # 'neu' counts as dominant under the same rule max() applied to the
    # (neg, neu, pos) ordering: strictly above 'neg', at least equal to 'pos'.
    neutral_dominates = neu > neg and neu >= pos
    if neutral_dominates and neu >= _NEUTRAL_THRESHOLD:
        return 0

    if pos > neg:
        return 1
    if neg > pos:
        return 2
    # Exact tie between positive and negative evidence: no polarity wins.
    # (Previously a tie at the top was resolved by dict order, i.e. negative.)
    return 0
    
# Expose the labelling function to Spark as a UDF returning an integer.
vaderSentimentAnalysis_udf = udf(vaderSentimentAnalysis, returnType=IntegerType())

# Add a "label" column computed from each row's text.
df_Tweets = df_Tweets.withColumn(
    "label",
    vaderSentimentAnalysis_udf(F.col("full_text")),
)

In [3]:
# Print the first rows of the labelled tweets and time the call.
%time df_Tweets.show()

+--------------------+-----+
|           full_text|label|
+--------------------+-----+
|"The United State...|    0|
|Coronavirus &amp;...|    0|
|Oh to be a 1998 b...|    0|
|From uprising to ...|    0|
|Another little wa...|    0|
|“Our country is f...|    0|
|😷 Dr. Anthony Fa...|    0|
|Chinese number is...|    0|
|Union Ministry of...|    0|
|MHC &amp; its par...|    0|
|As the government...|    0|
|@kumailn 📣I will...|    0|
|New poll in Italy...|    0|
|To the medical pr...|    0|
|BREAKING: Gov. JB...|    0|
|Together we can d...|    0|
|In short, the Tru...|    2|
|BREAKING: 

“Abou...|    0|
|1st 10 minutes of...|    0|
|Ecuador had the 1...|    0|
+--------------------+-----+
only showing top 20 rows

CPU times: user 9.92 ms, sys: 1.78 ms, total: 11.7 ms
Wall time: 18 s
