In [36]:
# Import Spark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json

# Import NLTK
import nltk
import sys
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn

In [37]:
# Download the NLTK resources used by the POS tagger, the lemmatizer and the
# SentiWordNet lookup further down.
NLTK_PACKAGES = ('averaged_perceptron_tagger', 'wordnet', 'sentiwordnet')

# nltk.download() returns a success flag. Every package is attempted (list, not
# generator, so nothing is short-circuited) and the cell evaluates to True only
# when all of them succeeded, instead of reflecting just the last download.
all([nltk.download(package) for package in NLTK_PACKAGES])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/emanuele/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/emanuele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/emanuele/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [38]:
ENGRetweet = "./ENGRetweet.json"
ENGNoRetweet = "./ENGNoRetweet.json"

# Reuse the active session when one exists, otherwise create it, so this cell
# also works on a fresh kernel where `spark` has not been predefined.
spark = SparkSession.builder.getOrCreate()


def _read_multiline_json(path):
    """Load a multi-line JSON file from ``path`` into a Spark DataFrame."""
    return (
        spark.read
        .format("json")
        .option("inferSchema", "true")
        .option("multiLine", "true")
        .load(path)
    )


# Create the DataFrames
df_ENGRetweet = _read_multiline_json(ENGRetweet)
df_ENGNoRetweet = _read_multiline_json(ENGNoRetweet)

In [39]:
# Merge the results

# For retweets the text is taken from the nested retweeted_status; the other
# file already exposes full_text at the top level.
retweet_rows = df_ENGRetweet.selectExpr("id_str", "retweeted_status.full_text as full_text")
plain_rows = df_ENGNoRetweet.select("id_str", "full_text")

# Stack both sources and keep every distinct text once.
df_Tweets = (
    retweet_rows
    .union(plain_rows)
    .select("full_text")
    .distinct()
)
df_Tweets.count()

1050

In [40]:
# collapse whitespace
def remove_all_space(astring):
    """Return ``astring`` with every whitespace run reduced to one space.

    Leading and trailing whitespace is dropped as well.
    """
    pieces = astring.split()
    return " ".join(pieces)

# clean the text
def remove_features(data_str):
    """Normalize a raw tweet into a space-separated string of plain words.

    Steps, in order: lowercase the text, blank out URLs, blank out ASCII
    punctuation, blank out digit runs, then keep only the tokens that consist
    solely of ``a-z``, ``0-9``, ``_`` or ``.`` and are longer than two
    characters.

    Args:
        data_str: Raw text, or None (for example a null column value).

    Returns:
        The cleaned text with single spaces between the kept words; ``''``
        when the input is None/empty or no token survives the filtering.
    """
    # A null value would otherwise fail on .lower().
    if not data_str:
        return ''

    # Raw strings avoid invalid-escape warnings; the patterns are unchanged.
    url_re = re.compile(r'https?://(www.)?\w+\.\w+(/\w+)*/?')
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    num_re = re.compile(r'(\d+)')
    alpha_num_re = re.compile(r"^[a-z0-9_.]+$")

    text = data_str.lower()
    # Order matters: URLs must go before punctuation removal breaks them apart.
    for pattern in (url_re, punc_re, num_re):
        text = pattern.sub(' ', text)

    # Drop tokens with other characters and tokens shorter than 3 characters.
    kept = [word for word in text.split()
            if len(word) > 2 and alpha_num_re.match(word)]
    return ' '.join(kept)


# extract part of speech
def pos(tokenized_text):
    """Tag the tokens and keep only adjectives, nouns, adverbs and verbs.

    Args:
        tokenized_text: List of tokens handed to ``pos_tag``.

    Returns:
        A list of ``(word, tag)`` tuples. ``tag`` is chosen from the first
        letter of the tag returned by ``pos_tag``: J -> 'a' (adjective),
        N -> 'n' (noun), R -> 'r' (adverb), V -> 'v' (verb). Tokens whose tag
        starts with anything else are left out.
    """
    first_letter_to_tag = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    tagged = pos_tag(tokenized_text)
    return [
        (token, first_letter_to_tag[label[:1]])
        for token, label in tagged
        if label[:1] in first_letter_to_tag
    ]

# Lemmatization: one WordNetLemmatizer instance is created here and used by
# lemmatize() below.
lemmatizer = WordNetLemmatizer()
def lemmatize(array_of_word_for_a_comment):
    """Lemmatize every (word, tag) pair with the module-level ``lemmatizer``.

    Args:
        array_of_word_for_a_comment: Sequence of pairs; index 0 is the word
            and index 1 the tag passed as ``pos`` to the lemmatizer.

    Returns:
        A list of ``[lemma, tag]`` lists. Pairs whose lemma is empty are
        skipped.
    """
    lemma_tag_pairs = []
    for pair in array_of_word_for_a_comment:
        token, tag = pair[0], pair[1]
        lemma = lemmatizer.lemmatize(token, pos=tag)
        if lemma:
            lemma_tag_pairs.append([lemma, tag])
    return lemma_tag_pairs


#calculate the sentiment
def cal_score(array_of_lemma_tag_for_a_comment):
    """Classify a list of (lemma, tag) pairs as 'Pos', 'Neg', 'Neu' or 'N/A'.

    For each pair the first entry returned by ``swn.senti_synsets`` is used
    and its positive minus negative score is added to a running total; pairs
    with no entry are ignored.

    Returns:
        'N/A' when no pair could be scored, 'Neu' when the total is exactly
        zero, otherwise 'Neg' or 'Pos' according to the sign of the average
        score per scored word.
    """
    running_total = 0
    scored_words = 0
    for entry in array_of_lemma_tag_for_a_comment:
        candidates = list(swn.senti_synsets(entry[0], entry[1]))
        if candidates:
            first = candidates[0]
            running_total = running_total + first.pos_score() - first.neg_score()
            scored_words += 1

    if scored_words == 0:
        return 'N/A'
    if running_total == 0:
        return 'Neu'

    average = running_total / scored_words
    if average < 0:
        return 'Neg'
    if average > 0:
        return 'Pos'
    return ''


# Spark UDF wrappers for the helper functions defined in this cell.
remove_features_udf = udf(remove_features, StringType())

# pos() yields (word, tag) pairs.
pos_udf = udf(
    pos,
    ArrayType(StructType([
        StructField("word", StringType(), False),
        StructField("tag", StringType(), False),
    ])),
)

# lemmatize() yields [lemma, tag] pairs.
lemmatize_udf = udf(
    lemmatize,
    ArrayType(StructType([
        StructField("lemma", StringType(), False),
        StructField("tag", StringType(), False),
    ])),
)

cal_score_udf = udf(cal_score, StringType())


In [41]:
# Remove noise: derive cleaned_text from full_text
cleaned_text_col = remove_features_udf(df_Tweets['full_text'])
df_TweetsCleaned = df_Tweets.withColumn("cleaned_text", cleaned_text_col)
df_TweetsCleaned.printSchema()

root
 |-- full_text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)



In [42]:
# Tokenization
tkn = Tokenizer(inputCol="cleaned_text", outputCol="words")

# Stop-word removal
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover(
    inputCol="words",
    outputCol="words_nsw",
    stopWords=englishStopWords,
)

pipeline = Pipeline(stages=[tkn, stops])

# Fit and transform on the two text columns only.
text_columns = df_TweetsCleaned.select("full_text", "cleaned_text")
df_TweetsCleaned = pipeline.fit(text_columns).transform(text_columns)

df_TweetsCleaned.select("full_text", "cleaned_text").show()

+--------------------+--------------------+
|           full_text|        cleaned_text|
+--------------------+--------------------+
|"The United State...|the united states...|
|Coronavirus &amp;...|coronavirus amp c...|
|Oh to be a 1998 b...|baby first memory...|
|Hookah should be ...|hookah should ban...|
|@caitrionambalfe ...|caitrionambalfe n...|
|tangled will be T...|tangled will the ...|
|#Epeeps Please ta...|epeeps please tak...|
|Chairman and CEO ...|chairman and ceo ...|
|As an ER doc tryi...|doc trying treat ...|
|Public News Servi...|public news servi...|
|Wash your hands, ...|wash your hands s...|
|Ohhhhh my frick, ...|ohhhhh frick than...|
|@Colonel_Eevee Ur...|colonel eevee exc...|
|@patmcguinness @n...|patmcguinness nev...|
|Exclusive. Access...|exclusive access ...|
|#coronavirus upda...|coronavirus updat...|
|On question what ...|question what can...|
|New: Starting imm...|new starting imme...|
|The safest bet is...|the safest bet us...|
|Me, 2 weeks ago: ...|weeks ago 

In [43]:
# Tag the stop-word-free tokens with their part of speech
tagged_col = pos_udf(df_TweetsCleaned['words_nsw'])
df_TweetsCleanedTagged = df_TweetsCleaned.withColumn("words_nsw_tag", tagged_col)
df_TweetsCleanedTagged.printSchema()

root
 |-- full_text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- words_nsw: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- words_nsw_tag: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- word: string (nullable = false)
 |    |    |-- tag: string (nullable = false)



In [44]:
# Lemmatize the tagged tokens
lemmatized_col = lemmatize_udf(df_TweetsCleanedTagged['words_nsw_tag'])
df_Tweet_Lemmatized = df_TweetsCleanedTagged.withColumn("words_lemmatized", lemmatized_col)
df_Tweet_Lemmatized.printSchema()

root
 |-- full_text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- words_nsw: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- words_nsw_tag: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- word: string (nullable = false)
 |    |    |-- tag: string (nullable = false)
 |-- words_lemmatized: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- lemma: string (nullable = false)
 |    |    |-- tag: string (nullable = false)



In [46]:
# Calculate the sentiment
def _with_sentiment_score(df):
    """Return ``df`` with a ``score`` column computed from ``words_lemmatized``."""
    return df.withColumn("score", cal_score_udf(df["words_lemmatized"]))


try:
    df_Tweet_Lemmatized = _with_sentiment_score(df_Tweet_Lemmatized)
except Exception:
    # NOTE(review): presumably the first attempt can fail with a recursion
    # error while the UDF is being serialized - TODO confirm. Catch Exception
    # rather than everything so Ctrl-C / SystemExit still propagate, and never
    # lower a recursion limit that is already above 2000.
    sys.setrecursionlimit(max(sys.getrecursionlimit(), 2000))
    df_Tweet_Lemmatized = _with_sentiment_score(df_Tweet_Lemmatized)

df_Tweet_Lemmatized.select("full_text", "score").show(truncate=50)

+--------------------------------------------------+-----+
|                                         full_text|score|
+--------------------------------------------------+-----+
|"The United States, long accustomed to thinking...|  Pos|
|Coronavirus &amp; climate change demand similar...|  Pos|
|Oh to be a 1998 baby

✔️ first memory is 9/11

...|  Neg|
|Hookah should be banned at groove and restauran...|  Neu|
|@caitrionambalfe @netflix My husband and I have...|  Pos|
|tangled will be THE movie of self-isolation #CO...|  Neu|
|#Epeeps Please take care  🦠🦠🦠🔥🔥🔥  #COVID1...|  Neu|
|Chairman and CEO of Universal Music Diagnosed W...|  Pos|
|As an ER doc trying to treat patients who may h...|  Pos|
|Public News Service Daily Newscast: Biden and S...|  Neg|
|Wash your hands, self quarantine, and don’t pan...|  Pos|
|Ohhhhh my frick, thank you!!! https://t.co/lQHB...|  Neu|
|@Colonel_Eevee Ur excellent. Remember being sad...|  Pos|
|@patmcguinness @nevancik1 @chicagosmayor @realD...|  Pos|
|Ex