In [1]:
# Import Spark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json

# Import NLTK
import nltk
import sys
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn

import os

# MongoDB collection used for both reading and writing. The address can be
# overridden through the MONGO_URI environment variable; when it is not set,
# the original hard-coded value is used.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb://192.168.1.27/TwitterSentimentAnalysis.Covid19?retryWrites=true",
)

spark = SparkSession.builder \
        .config("spark.mongodb.input.uri", MONGO_URI) \
        .config("spark.mongodb.output.uri", MONGO_URI) \
        .getOrCreate()

# Download the NLTK resources needed by the notebook
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/emanuele/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/emanuele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/emanuele/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [5]:
# The aggregation pipelines are written as Python structures and serialized
# with json.dumps, so the connector always receives well-formed JSON.
# The earlier hand-written strings had no commas between several fields and
# trailing commas after '$project'; the recorded schema output suggests the
# connector's parser tolerated that, but it is not valid JSON.

# English tweets whose 'retweeted_status' matches null (i.e. not retweets).
pipeline_noRetweet = json.dumps([
    {"$match": {"lang": "en", "retweeted_status": None}},
    {"$project": {"id_str": 1, "created_at": 1, "full_text": 1}},
])

# English retweets of English tweets; only the text of the original
# (retweeted) tweet is projected.
pipeline_Retweet = json.dumps([
    {"$match": {
        "lang": "en",
        "retweeted_status": {"$ne": None},
        "retweeted_status.lang": "en",
    }},
    {"$project": {
        "id_str": 1,
        "created_at": 1,
        "retweeted_status.full_text": 1,
    }},
])


def load_tweets(pipeline):
    """Read the configured MongoDB collection through an aggregation pipeline.

    Args:
        pipeline: JSON string holding the array of aggregation stages.

    Returns:
        The DataFrame produced by the MongoDB Spark data source.
    """
    return (spark.read
            .format("com.mongodb.spark.sql.DefaultSource")
            .option("pipeline", pipeline)
            .load())


df_ENGNoRetweet = load_tweets(pipeline_noRetweet)
df_ENGRetweet = load_tweets(pipeline_Retweet)

df_ENGNoRetweet.printSchema()
df_ENGRetweet.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- id_str: string (nullable = true)

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id_str: string (nullable = true)
 |-- retweeted_status: struct (nullable = true)
 |    |-- full_text: string (nullable = true)



In [6]:
# Document counts: non-retweets first, then retweets
for tweets_df in (df_ENGNoRetweet, df_ENGRetweet):
    print(tweets_df.count())

19113
59851


In [90]:
# Merge the two result sets; for retweets the text of the original tweet is used
retweet_rows = df_ENGRetweet.select(
    "id_str",
    F.col("retweeted_status.full_text").alias("full_text"),
)
plain_rows = df_ENGNoRetweet.select("id_str", "full_text")
df_Tweets = retweet_rows.union(plain_rows)

# Keep every distinct tweet text exactly once
df_Tweets = df_Tweets.select("full_text").distinct()
df_Tweets.count()

43710

In [91]:
# remove whitespace
def remove_all_space(astring):
    """Collapse each run of whitespace into a single space and trim both ends."""
    pieces = astring.split()
    return " ".join(pieces)

# Patterns used by remove_features, compiled once instead of on every call.
# Raw strings keep the pattern text unchanged while avoiding the
# invalid-escape warnings that '\w' / '\.' trigger in a plain string literal.
_URL_RE = re.compile(r'https?://(www.)?\w+\.\w+(/\w+)*/?')
_PUNC_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NUM_RE = re.compile(r'(\d+)')
_ALPHA_NUM_RE = re.compile(r"^[a-z0-9_.]+$")


# clean the text
def remove_features(data_str):
    """Normalize a raw tweet into a space-separated string of plain words.

    Steps, in order: lowercase, replace hyperlinks with a space, replace
    ASCII punctuation with a space, replace digit runs with a space, then
    keep only the words that consist solely of ``[a-z0-9_.]`` and are longer
    than two characters.

    Args:
        data_str: the tweet text, or None (a null column value passed to the
            UDF).

    Returns:
        The cleaned text with words joined by single spaces; an empty string
        when the input is None or no word survives the filtering.
    """
    # A null input yields an empty text instead of raising AttributeError.
    if data_str is None:
        return ''
    text = data_str.lower()
    # remove hyperlinks
    text = _URL_RE.sub(' ', text)
    # remove punctuation
    text = _PUNC_RE.sub(' ', text)
    # remove numeric 'words'
    text = _NUM_RE.sub(' ', text)
    # drop words with characters outside a-z 0-9 _ . and words shorter than 3
    kept_words = [
        word for word in text.split()
        if len(word) > 2 and _ALPHA_NUM_RE.match(word)
    ]
    return ' '.join(kept_words)


# extract part of speech
def pos(tokenized_text):
    """Tag the tokens and keep only adjectives, nouns, adverbs and verbs.

    Args:
        tokenized_text: list of word strings.

    Returns:
        A list of ``(word, tag)`` tuples where ``tag`` is one of 'a', 'n',
        'r', 'v'; tokens whose tag starts with any other letter are dropped.
    """
    # First letter of the tag produced by pos_tag -> one-letter tag to emit
    short_tag = {
        'J': 'a',  # adjectives
        'N': 'n',  # nouns
        'R': 'r',  # adverbs
        'V': 'v',  # verbs
    }
    return [
        (token, short_tag[full_tag[:1]])
        for token, full_tag in pos_tag(tokenized_text)
        if full_tag[:1] in short_tag
    ]

# WordNet lemmatizer instance, created once and used by lemmatize()
lemmatizer = WordNetLemmatizer()
def lemmatize(array_of_word_for_a_comment):
    """Lemmatize each (word, tag) pair of one text.

    Args:
        array_of_word_for_a_comment: sequence of pairs whose first item is
            the word and whose second item is the tag passed as ``pos`` to
            the lemmatizer.

    Returns:
        A list of ``[lemma, tag]`` lists; pairs with an empty lemma are
        skipped.
    """
    lemma_tag_pairs = (
        [lemmatizer.lemmatize(entry[0], pos=entry[1]), entry[1]]
        for entry in array_of_word_for_a_comment
    )
    return [pair for pair in lemma_tag_pairs if pair[0]]


#calculate the sentiment 
def cal_score(array_of_lemma_tag_for_a_comment):
    """Classify one text from the SentiWordNet scores of its lemmas.

    For every (lemma, tag) pair, the first synset returned by
    ``swn.senti_synsets`` contributes ``pos_score() - neg_score()``; pairs
    without any synset are ignored.

    Returns:
        'N/A' when no pair had a synset, 'Neu' when the total is exactly 0,
        'Neg' / 'Pos' according to the sign of the mean score.
    """
    total = 0
    scored = 0
    for entry in array_of_lemma_tag_for_a_comment:
        candidates = list(swn.senti_synsets(entry[0], entry[1]))
        if candidates:
            first = candidates[0]
            total = total + first.pos_score() - first.neg_score()
            scored += 1

    if scored == 0:
        return 'N/A'
    if total == 0:
        return 'Neu'
    mean = total / scored
    if mean < 0:
        return 'Neg'
    if mean > 0:
        return 'Pos'
    return ''


def _pair_array_type(first_field):
    """Return type array<struct<first_field: string, tag: string>>, fields non-nullable."""
    return ArrayType(StructType([
        StructField(first_field, StringType(), False),
        StructField("tag", StringType(), False),
    ]))


# Spark UDF wrappers around the plain Python helpers
remove_features_udf = udf(remove_features, StringType())
pos_udf = udf(pos, _pair_array_type("word"))
lemmatize_udf = udf(lemmatize, _pair_array_type("lemma"))
cal_score_udf = udf(cal_score, StringType())

In [93]:
# Remove noise from the tweet text
cleaned_text_col = remove_features_udf(F.col("full_text"))
df_TweetsCleaned = df_Tweets.withColumn("cleaned_text", cleaned_text_col)
df_TweetsCleaned.select("cleaned_text").show(truncate=50)

+--------------------------------------------------+
|                                      cleaned_text|
+--------------------------------------------------+
|the united states long accustomed thinking itse...|
|coronavirus amp climate change demand similar r...|
|baby first memory watches parents lose their jo...|
|from uprising outbreak hong kong sign language ...|
|another little way can help our neighbors covid...|
|country facing medical and economic crisis the ...|
|anthony fauci the idea anybody getting test eas...|
|chinese number lie there are many more than thi...|
|union ministry health and family welfare total ...|
|mhc amp its partners are actively monitoring th...|
|the government has just declared state emergenc...|
|kumailn will join you shouting from the rooftop...|
|new poll italy hints the possible political eff...|
|the medical professionals first responders groc...|
|breaking gov pritzker ordering all bars and res...|
|together can defeat corona not let panic cree

In [94]:
# Tokenization: cleaned_text -> words
tkn = Tokenizer(inputCol="cleaned_text", outputCol="words")

# Stop-word removal: words -> words_nsw (default English stop-word list)
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover(
    inputCol="words",
    outputCol="words_nsw",
    stopWords=englishStopWords,
)

pipeline = Pipeline(stages=[tkn, stops])

# Fit and apply the pipeline on the two text columns only
text_columns = df_TweetsCleaned.select("full_text", "cleaned_text")
df_TweetsCleaned = pipeline.fit(text_columns).transform(text_columns)

df_TweetsCleaned.select("full_text", "cleaned_text").show()

+--------------------+--------------------+
|           full_text|        cleaned_text|
+--------------------+--------------------+
|"The United State...|the united states...|
|Coronavirus &amp;...|coronavirus amp c...|
|Oh to be a 1998 b...|baby first memory...|
|From uprising to ...|from uprising out...|
|Another little wa...|another little wa...|
|“Our country is f...|country facing me...|
|😷 Dr. Anthony Fa...|anthony fauci the...|
|Chinese number is...|chinese number li...|
|Union Ministry of...|union ministry he...|
|MHC &amp; its par...|mhc amp its partn...|
|As the government...|the government ha...|
|@kumailn 📣I will...|kumailn will join...|
|New poll in Italy...|new poll italy hi...|
|To the medical pr...|the medical profe...|
|BREAKING: Gov. JB...|breaking gov prit...|
|Together we can d...|together can defe...|
|In short, the Tru...|short the trump p...|
|BREAKING: 

“Abou...|breaking air and ...|
|1st 10 minutes of...|minutes democrati...|
|Ecuador had the 1...|ecuador had 

In [96]:
# Part-of-speech tagging of the tokens left after stop-word removal
tagged_words_col = pos_udf(F.col("words_nsw"))
df_TweetsCleanedTagged = df_TweetsCleaned.withColumn("words_nsw_tag", tagged_words_col)
df_TweetsCleanedTagged.select("words_nsw_tag").show(truncate=50)

+--------------------------------------------------+
|                                     words_nsw_tag|
+--------------------------------------------------+
|[[united, a], [states, n], [long, r], [accustom...|
|[[coronavirus, n], [amp, n], [climate, n], [cha...|
|[[baby, n], [first, r], [memory, n], [watches, ...|
|[[uprising, a], [outbreak, n], [hong, n], [kong...|
|[[little, a], [way, n], [help, n], [neighbors, ...|
|[[country, n], [facing, v], [medical, a], [econ...|
|[[anthony, n], [fauci, n], [idea, n], [anybody,...|
|[[chinese, a], [number, n], [lie, v], [many, a]...|
|[[union, n], [ministry, n], [health, n], [famil...|
|[[mhc, n], [amp, a], [partners, n], [actively, ...|
|[[government, n], [declared, v], [state, n], [e...|
|[[kumailn, n], [join, n], [shouting, v], [rooft...|
|[[new, a], [poll, n], [italy, a], [hints, n], [...|
|[[medical, a], [professionals, n], [first, a], ...|
|[[breaking, v], [gov, n], [pritzker, n], [order...|
|[[together, r], [defeat, n], [corona, a], [le

In [97]:
# Lemmatize the tagged tokens
lemmatized_col = lemmatize_udf(F.col("words_nsw_tag"))
df_Tweet_Lemmatized = df_TweetsCleanedTagged.withColumn("words_lemmatized", lemmatized_col)
df_Tweet_Lemmatized.select("words_lemmatized").show(truncate=50)

+--------------------------------------------------+
|                                  words_lemmatized|
+--------------------------------------------------+
|[[united, a], [state, n], [long, r], [accustom,...|
|[[coronavirus, n], [amp, n], [climate, n], [cha...|
|[[baby, n], [first, r], [memory, n], [watch, n]...|
|[[uprising, a], [outbreak, n], [hong, n], [kong...|
|[[little, a], [way, n], [help, n], [neighbor, n...|
|[[country, n], [face, v], [medical, a], [econom...|
|[[anthony, n], [fauci, n], [idea, n], [anybody,...|
|[[chinese, a], [number, n], [lie, v], [many, a]...|
|[[union, n], [ministry, n], [health, n], [famil...|
|[[mhc, n], [amp, a], [partner, n], [actively, r...|
|[[government, n], [declare, v], [state, n], [em...|
|[[kumailn, n], [join, n], [shout, v], [rooftop,...|
|[[new, a], [poll, n], [italy, a], [hint, n], [p...|
|[[medical, a], [professional, n], [first, a], [...|
|[[break, v], [gov, n], [pritzker, n], [order, v...|
|[[together, r], [defeat, n], [corona, a], [le

In [99]:
# Calculate the sentiment label of every tweet.
RECURSION_LIMIT = 2000


def _with_score(frame):
    """Return `frame` with a 'score' column computed from 'words_lemmatized'."""
    return frame.withColumn("score", cal_score_udf(frame["words_lemmatized"]))


try:
    df_Tweet_Lemmatized = _with_score(df_Tweet_Lemmatized)
except Exception:
    # NOTE(review): presumably applying the UDF can fail with a recursion
    # error while the function and the globals it references are serialized;
    # confirm. Retry once with a higher recursion limit.
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not intercepted; max() never lowers an existing limit.
    sys.setrecursionlimit(max(sys.getrecursionlimit(), RECURSION_LIMIT))
    df_Tweet_Lemmatized = _with_score(df_Tweet_Lemmatized)

df_Tweet_Lemmatized.select("full_text", "score").show(truncate=50)

+--------------------------------------------------+-----+
|                                         full_text|score|
+--------------------------------------------------+-----+
|"The United States, long accustomed to thinking...|  Pos|
|Coronavirus &amp; climate change demand similar...|  Pos|
|Oh to be a 1998 baby

✔️ first memory is 9/11

...|  Neg|
|From uprising to outbreak: Hong Kong sign langu...|  Neu|
|Another little way we can help our neighbors. #...|  Pos|
|“Our country is facing a medical and economic c...|  Neg|
|😷 Dr. Anthony Fauci: "The idea of anybody gett...|  Pos|
|Chinese number is a lie. There are many more th...|  Neg|
|Union Ministry of Health and Family Welfare: A ...|  Pos|
|MHC &amp; its partners are actively monitoring ...|  Pos|
|As the government has just declared state of em...|  Neg|
|@kumailn 📣I will join you in shouting from the...|  Pos|
|New poll in Italy hints at the possible politic...|  Pos|
|To the medical professionals, first responders,...|  Pos|

In [101]:
# Row count after scoring; the recorded value equals the distinct-tweet count above
df_Tweet_Lemmatized.count()

43710