In [1]:
# Import Spark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json

# Import NLTK
import nltk
import sys
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn

# MongoDB Spark connector endpoints, passed below as the connector's default
# input and output URIs.
MONGO_INPUT_URI = "mongodb://192.168.1.27/SentimentAnalysisSpark.Covid19?retryWrites=true"
MONGO_OUTPUT_URI = "mongodb://192.168.1.27/SentimentAnalysisSpark.LabeledTweetSentimentAnalysis?retryWrites=true"

# Reuse an existing session when there is one, otherwise create it.
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", MONGO_INPUT_URI)
    .config("spark.mongodb.output.uri", MONGO_OUTPUT_URI)
    .getOrCreate()
)

# Download the NLTK data packages used below: the POS tagger model, WordNet
# (lemmatizer) and SentiWordNet (sentiment scores).
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/emanuele/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/emanuele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/emanuele/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [2]:
def _mongo_pipeline(match, project_fields):
    """Serialize a two-stage ($match, $project) aggregation pipeline to JSON.

    Building the pipeline from Python objects and serializing it with
    ``json.dumps`` guarantees well-formed JSON (commas between keys, no
    trailing commas, ``None`` rendered as ``null``) instead of relying on the
    parser tolerating a hand-written string.

    Args:
        match: dict used as the body of the ``$match`` stage.
        project_fields: iterable of field names to keep in ``$project``.

    Returns:
        The pipeline as a JSON string.
    """
    stages = [
        {"$match": match},
        {"$project": {field: 1 for field in project_fields}},
    ]
    return json.dumps(stages)


def _load_tweets(pipeline):
    """Read from the configured MongoDB input through the given pipeline string."""
    return (
        spark.read.format("com.mongodb.spark.sql.DefaultSource")
        .option("pipeline", pipeline)
        .load()
    )


# English tweets that are not retweets: keep their own text.
pipeline_noRetweet = _mongo_pipeline(
    {"lang": "en", "retweeted_status": None},
    ["id_str", "created_at", "full_text"],
)

# English retweets of English tweets: keep the text of the original tweet.
pipeline_Retweet = _mongo_pipeline(
    {
        "lang": "en",
        "retweeted_status": {"$ne": None},
        "retweeted_status.lang": "en",
    },
    ["id_str", "created_at", "retweeted_status.full_text"],
)

df_ENGNoRetweet = _load_tweets(pipeline_noRetweet)
df_ENGRetweet = _load_tweets(pipeline_Retweet)

df_ENGNoRetweet.printSchema()
df_ENGRetweet.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- id_str: string (nullable = true)

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id_str: string (nullable = true)
 |-- retweeted_status: struct (nullable = true)
 |    |-- full_text: string (nullable = true)



In [None]:
# Row count of each source frame: non-retweets first, then retweets.
for source_frame in (df_ENGNoRetweet, df_ENGRetweet):
    print(source_frame.count())

In [3]:
# Merge the two result sets into one frame with the column order
# (id_str, full_text, created_at); union() matches columns by position.
retweets_flat = df_ENGRetweet.select(
    "id_str",
    F.col("retweeted_status.full_text").alias("full_text"),
    "created_at",
)
originals = df_ENGNoRetweet.select("id_str", "full_text", "created_at")
df_Tweets = retweets_flat.union(originals)

# Keep a single row per distinct tweet text.
df_Tweets_noDup = df_Tweets.dropDuplicates(["full_text"])
df_Tweets_noDup.count()

50210

In [4]:
# collapse whitespace
def remove_all_space(astring):
    """Return `astring` with each whitespace run reduced to one space and both ends trimmed."""
    pieces = astring.split()
    return " ".join(pieces)

# Regexes used by remove_features, compiled once instead of on every call.
# Raw strings avoid invalid-escape warnings for sequences such as "\w".
# NOTE: the "." after "www" is unescaped and therefore matches any character;
# it is kept as in the original pattern so the cleaning results do not change.
_URL_RE = re.compile(r'https?://(www.)?\w+\.\w+(/\w+)*/?')
_PUNC_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NUM_RE = re.compile(r'(\d+)')
_ALPHA_NUM_RE = re.compile(r"^[a-z0-9_.]+$")
# Words shorter than this many characters are discarded.
_MIN_WORD_LENGTH = 3


# clean the text
def remove_features(data_str):
    """Normalize a tweet into a space-separated string of plain lowercase words.

    Steps, in order: lowercase; replace URLs, punctuation and digit runs with
    spaces; drop every word that is shorter than three characters or contains
    characters outside ``[a-z0-9_.]`` (e.g. accented letters, emoji).

    Args:
        data_str: the raw text, or ``None`` for a missing value.

    Returns:
        The cleaned text with words joined by single spaces; an empty string
        when nothing survives the filters or when `data_str` is ``None``.
    """
    # A null column value reaches a Spark UDF as None; treat it as empty text
    # instead of failing on .lower().
    if data_str is None:
        return ''

    text = data_str.lower()
    text = _URL_RE.sub(' ', text)
    text = _PUNC_RE.sub(' ', text)
    text = _NUM_RE.sub(' ', text)

    kept_words = [
        word
        for word in text.split()
        if len(word) >= _MIN_WORD_LENGTH and _ALPHA_NUM_RE.match(word)
    ]
    return ' '.join(kept_words)


# extract part of speech
def pos(tokenized_text):
    """Tag tokens with nltk's `pos_tag` and keep only the four major word classes.

    The first letter of each tag returned by `pos_tag` is mapped to a
    one-letter code: 'a', 'n', 'r' or 'v'. Tokens whose tag starts with any
    other letter are dropped.

    Returns:
        A list of ``(word, code)`` tuples in the original token order.
    """
    code_by_tag_initial = {
        'J': 'a',  # adjectives
        'N': 'n',  # nouns
        'R': 'r',  # adverbs
        'V': 'v',  # verbs
    }
    tagged_pairs = []
    for token, tagger_tag in pos_tag(tokenized_text):
        code = code_by_tag_initial.get(tagger_tag[:1])
        if code is not None:
            tagged_pairs.append((token, code))
    return tagged_pairs

# lemmatize the tagged words of one text
lemmatizer = WordNetLemmatizer()
def lemmatize(array_of_word_for_a_comment):
    """Lemmatize each (word, tag) entry with the shared WordNet lemmatizer.

    Each entry is indexed as ``entry[0]`` (word) and ``entry[1]`` (tag, passed
    as the lemmatizer's ``pos`` argument). Entries whose lemma is empty are
    skipped.

    Returns:
        A list of ``[lemma, tag]`` lists in input order.
    """
    lemma_tag_pairs = []
    for entry in array_of_word_for_a_comment:
        token, tag = entry[0], entry[1]
        lemma = lemmatizer.lemmatize(token, pos=tag)
        if lemma:
            lemma_tag_pairs.append([lemma, tag])
    return lemma_tag_pairs


#calculate the sentiment
def cal_score(array_of_lemma_tag_for_a_comment):
    """Turn a list of (lemma, tag) entries into an integer sentiment label.

    For every entry that has at least one SentiWordNet synset, the first
    synset's ``pos_score() - neg_score()`` is added to a running total.

    Returns:
        3 if no entry had a synset,
        0 if the total is exactly zero,
        2 if the average score per scored word is negative,
        1 if it is positive.
    """
    net_score = 0
    scored_words = 0
    for entry in array_of_lemma_tag_for_a_comment:
        senses = list(swn.senti_synsets(entry[0], entry[1]))
        if senses:
            # Only the first listed sense contributes to the score.
            first_sense = senses[0]
            net_score = net_score + first_sense.pos_score() - first_sense.neg_score()
            scored_words += 1

    if scored_words == 0:
        return 3
    if net_score == 0:
        return 0
    average = net_score / scored_words
    if average < 0:
        return 2
    if average > 0:
        return 1


# Return types of the array-producing UDFs: arrays of two-string structs.
word_tag_array_type = ArrayType(StructType([
    StructField("word", StringType(), False),
    StructField("tag", StringType(), False),
]))
lemma_tag_array_type = ArrayType(StructType([
    StructField("lemma", StringType(), False),
    StructField("tag", StringType(), False),
]))

# Wrap the plain Python helpers as Spark UDFs.
remove_features_udf = udf(remove_features, StringType())
pos_udf = udf(pos, word_tag_array_type)
lemmatize_udf = udf(lemmatize, lemma_tag_array_type)
cal_score_udf = udf(cal_score, IntegerType())

In [5]:
# Remove noise: add a "cleaned_text" column produced by remove_features_udf.
df_TweetsCleaned = df_Tweets_noDup.withColumn(
    "cleaned_text",
    remove_features_udf(F.col("full_text")),
)
df_TweetsCleaned.select("cleaned_text").show(truncate=50)
#df_TweetsCleaned.printSchema()

+--------------------------------------------------+
|                                      cleaned_text|
+--------------------------------------------------+
|from the beginning have said was matter when no...|
|            this not what president trump said all|
|recent health events across the globe surroundi...|
|trump slump real and not being driven the coron...|
|heck yes president trump taking care business a...|
|new york confirms first coronavirus case govern...|
|like ewarren for hhs crisis think she would exc...|
|coronavirus causing mass hysteria but millions ...|
|eight people have confirmed cases the novel cor...|
|                                     told you plan|
|covid been given small window opportunity manag...|
|all top rok italy iran have coronavirus cases b...|
|been shouting only tacha can save nigeria but a...|
|joshngkamstra gail carson quote from the articl...|
|breaking manila city mayor iskomoreno has order...|
|now that one your constituents dead these lul

In [6]:
# Tokenization: split the cleaned text into a "words" array.
tkn = Tokenizer(inputCol="cleaned_text", outputCol="words")

# Stop-word removal with Spark's default English list.
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover(
    inputCol="words",
    outputCol="words_nsw",
    stopWords=englishStopWords,
)

pipeline = Pipeline(stages=[tkn, stops])

# Only these three columns are carried forward; the same selection is used
# for both fitting and transforming.
pipeline_input = df_TweetsCleaned.select("full_text", "cleaned_text", "created_at")
df_TweetsCleaned = pipeline.fit(pipeline_input).transform(pipeline_input)

#df_TweetsCleaned.select("full_text", "cleaned_text").show()

In [7]:
# Tag the stop-word-free tokens with their part of speech.
df_TweetsCleanedTagged = df_TweetsCleaned.withColumn(
    "words_nsw_tag",
    pos_udf(F.col("words_nsw")),
)


In [8]:
# Lemmatize the tagged tokens.
df_Tweet_Lemmatized = df_TweetsCleanedTagged.withColumn(
    "words_lemmatized",
    lemmatize_udf(F.col("words_nsw_tag")),
)


In [10]:
# Calculate the sentiment label of every tweet.

# Recursion limit to fall back to when the first attempt fails.
RETRY_RECURSION_LIMIT = 2000


def _with_score(tweets):
    """Return `tweets` with a "score" column computed by `cal_score_udf`."""
    return tweets.withColumn("score", cal_score_udf(tweets["words_lemmatized"]))


try:
    df_Tweet_Lemmatized_Score = _with_score(df_Tweet_Lemmatized)
except Exception:
    # NOTE(review): presumably the first attempt can fail with a
    # recursion-depth error while the UDF is being serialized - confirm.
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate. The limit is only ever raised, never lowered.
    sys.setrecursionlimit(max(sys.getrecursionlimit(), RETRY_RECURSION_LIMIT))
    df_Tweet_Lemmatized_Score = _with_score(df_Tweet_Lemmatized)

df_Tweet_Lemmatized_Score.show(truncate=10)
#df_Tweet_Lemmatized.printSchema()

+----------+------------+----------+----------+----------+-------------+----------------+-----+
| full_text|cleaned_text|created_at|     words| words_nsw|words_nsw_tag|words_lemmatized|score|
+----------+------------+----------+----------+----------+-------------+----------------+-----+
|"First ...|  first p...|Mon Mar...|[first,...|[first,...|   [[first...|      [[first...|    1|
|"I feel...|  feel li...|Mon Apr...|[feel, ...|[feel, ...|   [[feel,...|      [[feel,...|    0|
|"Most a...|  most ap...|Mon Apr...|[most, ...|[applic...|   [[appli...|      [[appli...|    1|
|"The Un...|  the uni...|Mon Mar...|[the, u...|[united...|   [[unite...|      [[unite...|    1|
|"To lov...|  love pu...|Mon Mar...|[love, ...|[love, ...|   [[love,...|      [[love,...|    1|
|#29. Mr...|  mrs amp...|Mon Mar...|[mrs, a...|[mrs, a...|   [[mrs, ...|      [[mr, n...|    1|
|#BS 
#p...|  plainan...|Mon Apr...|[plaina...|[plaina...|   [[plain...|      [[plain...|    1|
|#COVID1...|  covid h...|Mon Mar...|[cov

In [11]:
# Number of scored tweets. The steps since the de-duplication only add columns,
# so this should equal the de-duplicated count shown earlier.
df_Tweet_Lemmatized_Score.count()

50210