In [None]:
# Import

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json

In [None]:
# Load the input data
#
# Paths of the two tweet dumps: retweets and non-retweets.
ITRetweet = "./ITRetweet.json"
ITNoRetweet = "./ITNoRetweet.json"

# Reuse the Spark session the kernel already provides, or create one, so the
# notebook also survives "Restart & Run All" on a plain Python kernel where
# `spark` is not predefined.
spark = SparkSession.builder.getOrCreate()

# Build the DataFrames (each file is parsed as multi-line JSON)
df_ITRetweet = (
    spark.read.format("json")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .load(ITRetweet)
)
df_ITNoRetweet = (
    spark.read.format("json")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .load(ITNoRetweet)
)

# JSON file holding a list of records. Each record contains:
# - the word (read through the 'lemma' key by the labeling code)
# - positive_score
# - negativeScore
sentix = "./sentix.json"

# Load the Sentix lexicon used for labeling. The context manager closes the
# file even when parsing fails, and the explicit encoding keeps accented
# lemmas intact regardless of the platform default.
with open(sentix, encoding="utf-8") as f:
    sentix_words = json.load(f)

In [None]:
# ITRetweet: print the schema of the retweet DataFrame.

df_ITRetweet.printSchema()

In [None]:
# ITNoRetweet: print the schema of the non-retweet DataFrame.

df_ITNoRetweet.printSchema()

In [None]:
# MERGE THE TWO SOURCES

# For retweets the text is taken from the nested retweeted status.
retweet_rows = df_ITRetweet.selectExpr("id_str", "retweeted_status.full_text as full_text")
plain_rows = df_ITNoRetweet.select("id_str", "full_text")

# Stack both selections (columns are matched by position), then keep every
# distinct tweet text exactly once.
df_Tweets = (
    retweet_rows
    .union(plain_rows)
    .select("full_text")
    .distinct()
)
df_Tweets.count()

In [None]:
# Text-cleaning helper: strips URLs, mentions, punctuation, special
# characters and numbers from a tweet.
#
# The patterns are compiled once here instead of on every call. Raw strings
# keep regex escapes such as \w from being read as (invalid) Python string
# escapes.
_URL_RE = re.compile(r'https?://(www\.)?\w+\.\w+(/\w+)*/?')
_MENTION_RE = re.compile(r'@(\w+)')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_SPECIAL_RE = re.compile(r'[\$#,@&%£!=°§*/;-]')
_NUM_RE = re.compile(r'( \d+)')


def remove_punct(text):
    """Return `text` without URLs, @mentions, punctuation, special symbols and numbers.

    Args:
        text: Raw tweet text, or None for a missing value.

    Returns:
        The cleaned string. None is mapped to an empty string so a null
        cell does not raise inside the Spark UDF.
    """
    if text is None:
        return ""
    text = _URL_RE.sub("", text)
    # Mentions must go before punctuation: once '@' has been stripped the
    # mention pattern can no longer match and the handle would stay in the text.
    text = _MENTION_RE.sub("", text)
    text = _PUNCT_RE.sub("", text)
    # Catches the symbols string.punctuation does not cover (e.g. £, °, §).
    text = _SPECIAL_RE.sub("", text)
    # Only digit runs preceded by a space are dropped.
    text = _NUM_RE.sub("", text)
    return text

# Expose the cleaner to Spark as a UDF producing a string column
remove_features_udf = udf(remove_punct, returnType=StringType())

In [None]:
# TWEET LABELING FUNCTION
#
#

emoticonsPositive = ('😇','😊','❤️','😘','💞','💖','🤗','💕','👏','🎉','👍','🔝')
emoticonsNegative = ('😂','😡','😠','😭','🤦‍','🤷🏼‍','😞','😱','😓','👎', '🇪🇺')

# Cache of the lemma lookup built from the global lexicon. It is keyed on the
# identity of the list, so reloading `sentix_words` triggers a rebuild.
_sentix_index_cache = {"source": None, "index": None}


def _lemma_index(lexicon):
    """Return a lemma -> entry dict, keeping the first entry of each lemma."""
    use_global = lexicon is None
    if use_global:
        if _sentix_index_cache["source"] is sentix_words:
            return _sentix_index_cache["index"]
        lexicon = sentix_words
    index = {}
    for entry in lexicon:
        # setdefault keeps the earliest entry, matching what a linear
        # first-match scan over the list would return.
        index.setdefault(entry['lemma'], entry)
    if use_global:
        _sentix_index_cache["index"] = index
        _sentix_index_cache["source"] = sentix_words
    return index


def labeling(tweet, lexicon=None):
    """Classify a tokenized tweet as positive, negative or neutral.

    Each token found in `emoticonsPositive` adds 1 to the score and each
    token found in `emoticonsNegative` subtracts 1. Any other token that is
    a lemma of the lexicon adds its 'positive_score' and subtracts its
    'negativeScore'.

    Args:
        tweet: Iterable of tokens; None or an empty sequence is neutral.
        lexicon: Optional list of dicts with the keys 'lemma',
            'positive_score' and 'negativeScore'. Defaults to the global
            `sentix_words`.
            NOTE(review): the data-loading comment spells the positive key
            'positive_Score' -- confirm the spelling against the JSON file.

    Returns:
        "2" when the score is positive, "1" when negative, "0" otherwise.
    """
    if not tweet:
        return "0"

    # One dict lookup per token instead of scanning the whole lexicon each time.
    index = _lemma_index(lexicon)

    val = 0
    for word in tweet:
        if word in emoticonsPositive:
            val = val + 1
        elif word in emoticonsNegative:
            val = val - 1
        else:
            entry = index.get(word)
            if entry is not None:
                val = val + float(entry['positive_score'])
                val = val - float(entry['negativeScore'])

    if val > 0:
        # Positive
        return "2"
    if val < 0:
        # Negative
        return "1"
    # Neutral
    return "0"
    
# Expose the labeling function to Spark as a UDF producing a string column
label = udf(labeling, returnType=StringType())

In [None]:
# Strip special characters from the tweet text.
# df_Tweets only carries the "full_text" column, so that is the UDF input
# (a "text" column does not exist at this point).
df_noHash = df_Tweets.withColumn('words_filtered', remove_features_udf("full_text"))

# Tokenization
tkn = (
    Tokenizer()
    .setInputCol("words_filtered")
    .setOutputCol("words")
)

# Stop-word removal using Spark's default Italian list
italianStopWords = StopWordsRemover.loadDefaultStopWords("italian")
stops = (
    StopWordsRemover()
    .setStopWords(italianStopWords)
    .setInputCol("words")
    .setOutputCol("words_nsw")
)

pipeline = Pipeline(stages=[tkn, stops])

# Only the cleaned text column is fed to the pipeline; select it once and
# use the same frame for both fit and transform.
df_wordsFiltered = df_noHash.select("words_filtered")
df_TweetCleaned = pipeline.fit(df_wordsFiltered).transform(df_wordsFiltered)

df_TweetCleaned.show(10)

In [None]:
# Add a "label" column by applying the `label` UDF to the stop-word-free tokens.
df_TweetLabeled = df_TweetCleaned.withColumn("label", label("words_nsw"))

# Persist the labeled DataFrame so the cells below can reuse it.
df_TweetLabeled.persist()

In [None]:
# Convert the label column from String to Integer
label_as_int = df_TweetLabeled["label"].cast(IntegerType())
df_TweetLabeled = df_TweetLabeled.withColumn("label", label_as_int)

# Split into training (80%) and test (20%) sets with a fixed seed
split_weights = [0.8, 0.2]
train, test = df_TweetLabeled.randomSplit(split_weights, seed=2805)

In [None]:
# Persist both splits; each one is used again by the fit/evaluation cell below.
test.persist()
train.persist()

In [None]:
# Print the schema of the training split.
train.printSchema()

In [None]:
# Feature extraction (term counts -> TF-IDF weights) followed by Naive Bayes
cv  = CountVectorizer(inputCol='words_nsw', outputCol='tf')
idf = IDF().setInputCol('tf').setOutputCol('features')
nb  = NaiveBayes()

pipeline = Pipeline(stages=[cv, idf, nb])

# Fit the whole pipeline on the training split
model = pipeline.fit(train)

# The evaluator's default metric is F1, so accuracy has to be requested
# explicitly. One evaluator serves both splits.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Accuracy on the training data
predictions_train = model.transform(train)
eval_train = evaluator.evaluate(predictions_train)

# Accuracy on the held-out test data
predictions_test = model.transform(test)
eval_test = evaluator.evaluate(predictions_test)

In [None]:
# Show the training score first, then the test score
for score in (eval_train, eval_test):
    print(score)