In [149]:
# Import

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json

In [150]:
# Caricamento dati necessari
#
#

tweets = "./iorestoacasa_1_original.json"
# JSON contenente una lista di strutture. Ogni struttura contiene:
# - Parola
# - positive_Score
# - negativeScore
sentix = "./sentix.json"

# Creazione dataFrame
df_Tweets = spark.read.format("json").option("inferSchema", "true").option("multiLine", "true").load(tweets)
df_Tweets.select("extended_tweet.full_text").show(10)

# Creazione JSON Sentix per l'etichettatura
f = open(sentix)
sentix_words = json.load(f) 
#print(sentix_words[0]['lemma'])
f.close()

+--------------------+
|           full_text|
+--------------------+
|CORONAVIRUS IN CA...|
|Torta sfornata
Pe...|
|🇪🇺 Il Parlament...|
|In un #AnfieldRoa...|
|Cosa si può e non...|
|Sono 2.162 le per...|
|@carmelitadurso P...|
|MOTUS-E ha scelto...|
|Il teatro vive, a...|
|#ParigiConsiglia ...|
+--------------------+
only showing top 10 rows



In [156]:
# Definizione funzione per l'eliminazione dei caratteri speciali
#
#

def remove_punct(text):
    
    url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?')
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    mention_re = re.compile('@(\w+)')
    special = re.compile('[\$#,@&%£!=°§*/;-]')
    num_re = re.compile('( \\d+)')

    text = url_re.sub("", text)
    text = punc_re.sub("", text)
    text = mention_re.sub("", text)
    text = special.sub("", text)
    text = num_re.sub("", text)
    
    return text

# setup pyspark udf function
remove_features_udf = udf(lambda x: remove_punct(x), StringType())


In [157]:
# DEFINIZIONE FUNZIONE PER L'ETICHETTATURA DEI TWEETS
#
#

emoticonsPositive = ('😇','😊','❤️','😘','💞','💖','🤗','💕','👏','🎉','👍','🔝')
emoticonsNegative = ('😂','😡','😠','😭','🤦‍','🤷🏼‍','😞','😱','😓','👎', '🇪🇺')

def labeling(tweet):
    val = 0
    for word in tweet:
        if (word in emoticonsPositive):
            val = val + 1
        elif (word in emoticonsNegative):
            val = val - 1
        else:
            js = list(filter(lambda js: js['lemma']==word, sentix_words))
            if(len(js)>0):
                val = val + float(js[0]['positive_score'])
                val = val - float(js[0]['negativeScore'])
                
    if(val>0):
        # Positivo
        return "2"
    elif(val<0):
        # Negativo
        return "1"
    else:
        # Neutro
        return "0"
    
# setup pyspark udf function
label = udf(lambda x: labeling(x), StringType())

In [158]:
# Eliminazione caratteri speciali
df_noHash = df_Tweets.withColumn('words_filtered',remove_features_udf("extended_tweet.full_text"))

# Tokenizzazione
tkn = Tokenizer()\
      .setInputCol("words_filtered")\
      .setOutputCol("words")

# Eliminazione Stopwords
italianStopWords = StopWordsRemover.loadDefaultStopWords("italian")
stops = StopWordsRemover()\
        .setStopWords(italianStopWords)\
        .setInputCol("words")\
        .setOutputCol("words_nsw")

pipeline = Pipeline(stages = [tkn, stops])

df_TweetCleaned = pipeline.fit(df_noHash.select("words_filtered")).transform(df_noHash.select("words_filtered"))

df_TweetCleaned.select("words_nsw").show(10)

+--------------------+
|           words_nsw|
+--------------------+
|[coronavirus, cal...|
|[torta, sfornata,...|
|[🇪🇺, parlamento...|
|[anfieldroad, inc...|
|[cosa, può, può, ...|
|[persone, state, ...|
|[carmelitadurso, ...|
|[motuse, scelto, ...|
|[teatro, vive, si...|
|[parigiconsiglia,...|
+--------------------+
only showing top 10 rows



In [154]:
df_TweetLabeled = df_TweetCleaned.withColumn("label", label("words_nsw"))
# df_TweetLabeled.show(5)
df_TweetLabeled.persist()
#print(df_TweetLabeled.count())
#df_TweetLabeled.show(50)

DataFrame[words_filtered: string, words: array<string>, words_nsw: array<string>, label: string]

In [155]:
# Creazione DataFrame per il training
df_TweetLabeled_toFit = df_TweetLabeled.select("words_nsw", "label")
df_TweetLabeled_toFit.show(5)

# Estrazione delle features
hashingTF = HashingTF(inputCol="words_nsw", outputCol="rawFeatures")
idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")
nb = NaiveBayes()

# Trasformazione label da String a Integer
data_df = df_TweetLabeled_toFit.withColumn("label", df_TweetLabeled["label"].cast(IntegerType()))

# Dichiarazione della pipeline
pipeline = Pipeline(stages=[hashingTF, idf, nb])
model = pipeline.fit(data_df)

# Valutazione del modello con dati di training
predictions = model.transform(data_df)

predictions.show(5)

# Calcolo dell'accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+--------------------+-----+
|           words_nsw|label|
+--------------------+-----+
|[coronavirus, cal...|    2|
|[torta, sfornata,...|    1|
|[🇪🇺, parlamento...|    1|
|[anfieldroad, inc...|    1|
|[cosa, può, può, ...|    2|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|           words_nsw|label|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|[coronavirus, cal...|    2|(262144,[261,4772...|(262144,[261,4772...|[-178.21655521785...|[1.11957656329673...|       2.0|
|[torta, sfornata,...|    1|(262144,[261,1940...|(262144,[261,1940...|[-243.71470617081...|[1.35380722355126...|       1.0|
|[🇪🇺, parlamento...|    1|(262144,[261,904,...|(262144,[261,904,...|[-416.06166977245...|[3.6011

0.7825901439989156

In [159]:
df_TweetLabeled_toFit.persist()

DataFrame[words_nsw: array<string>, label: string]