In [None]:
# Import Spark
from sklearn.metrics import confusion_matrix
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json


# Build (or reuse) the Spark session. The MongoDB connector's input and output
# URIs both point at the same LabeledDataset collection.
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.LabeledDataset?retryWrites=true")
    .config("spark.mongodb.output.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.LabeledDataset?retryWrites=true")
    .getOrCreate()
)



In [None]:
# Load the labeled tweets through the MongoDB Spark connector. No read options
# are passed here, so the connector presumably falls back to the session-level
# spark.mongodb.input.uri setting -- confirm against the connector version used.
df_TweetLabeled = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [None]:
# Normalize whitespace
def remove_all_space(astring):
    """Collapse every run of whitespace in ``astring`` into a single space.

    Leading and trailing whitespace is dropped as well, so an empty or
    whitespace-only input yields an empty string.
    """
    tokens = astring.split()
    return " ".join(tokens)

# Patterns used by remove_features, compiled once instead of on every call.
# Raw strings keep the exact same patterns while avoiding invalid-escape warnings.
_URL_RE = re.compile(r'https?://(www.)?\w+\.\w+(/\w+)*/?')
_PUNC_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NUM_RE = re.compile(r'(\d+)')
_ALPHA_NUM_RE = re.compile(r"^[a-z0-9_.]+$")


# clean the text
def remove_features(data_str):
    """Normalize a raw tweet into a space-separated string of plain words.

    Steps, in order:
      1. lowercase the text;
      2. replace http/https URLs matched by ``_URL_RE`` with a space
         (the pattern covers ``scheme://host.tld/path`` shapes only, so
         fragments of more complex URLs can survive this step);
      3. replace every ``string.punctuation`` character with a space;
      4. replace every digit run with a space;
      5. keep only words that are longer than 2 characters and consist
         solely of characters in ``[a-z0-9_.]`` -- a word containing any
         other character (e.g. an accented letter) is dropped entirely.

    Args:
        data_str: The raw text, or ``None`` for a missing value.

    Returns:
        The kept words joined by single spaces. An empty string is returned
        for ``None``, empty, or fully filtered-out input.
    """
    # A missing value would otherwise raise AttributeError on .lower() and
    # abort the whole job when this function runs as a UDF.
    if data_str is None:
        return ''

    text = data_str.lower()
    text = _URL_RE.sub(' ', text)
    text = _PUNC_RE.sub(' ', text)
    text = _NUM_RE.sub(' ', text)

    # split() already discards all whitespace, so joining the surviving words
    # with one space needs no further whitespace normalization.
    kept_words = [
        word for word in text.split()
        if len(word) > 2 and _ALPHA_NUM_RE.match(word)
    ]
    return ' '.join(kept_words)

# Wrap remove_features as a Spark UDF whose declared return type is a string.
remove_features_udf = udf(f=remove_features, returnType=StringType())

In [None]:
# Strip noise from the raw tweet text into a new "cleaned_text" column
df_TweetsCleaned = df_TweetLabeled.withColumn(
    "cleaned_text",
    remove_features_udf(df_TweetLabeled['full_text']),
)

# Preview the cleaned text, truncating long values to 50 characters
df_TweetsCleaned.select('cleaned_text').show(truncate=50)

# Uncomment to inspect the resulting schema:
#df_TweetsCleaned.printSchema()

In [None]:
# Tokenization: split "cleaned_text" into a "words" array column
tkn = Tokenizer(inputCol="cleaned_text", outputCol="words")

# Stop-word removal using Spark's default English stop-word list
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover(
    inputCol="words",
    outputCol="words_nsw",
    stopWords=englishStopWords,
)

pipeline = Pipeline(stages=[tkn, stops])

# Keep "label" alongside the text columns: the train/test split further down
# selects ("words_nsw", "label"), so dropping it here makes that select fail.
# NOTE(review): assumes the source collection provides a "label" field -- confirm.
tweets_to_tokenize = df_TweetsCleaned.select("full_text", "cleaned_text", "label")

# Bind the projected frame once so fit() and transform() see the same input.
df_TweetsCleaned = pipeline.fit(tweets_to_tokenize).transform(tweets_to_tokenize)

df_TweetsCleaned.select("full_text", "cleaned_text").show()

In [None]:
# Train/test split: 75% / 25%, with a fixed seed
train, test = df_TweetsCleaned.select("words_nsw", "label").randomSplit([0.75, 0.25], seed=2020)

# Pipeline definition: term counts -> IDF weighting -> Naive Bayes classifier
cv  = CountVectorizer(inputCol='words_nsw', outputCol='tf')
idf = IDF().setInputCol('tf').setOutputCol('features')
# NOTE(review): NaiveBayes reads the default "label" column and expects it to
# be numeric -- confirm the dtype coming from the source collection.
nb  = NaiveBayes()

pipeline = Pipeline(stages=[cv, idf, nb])

# Fit the pipeline on the training split
model = pipeline.fit(train)

# A single evaluator is reused for both splits. The evaluator's default metric
# is F1, so accuracy is requested explicitly to match what is reported below.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Accuracy on the training data
predictions_train = model.transform(train)
eval_train = evaluator.evaluate(predictions_train)

# Accuracy on the held-out test data
predictions_test = model.transform(test)
eval_test = evaluator.evaluate(predictions_test)

# Report both figures so the cell shows its result
print("train accuracy: %.4f | test accuracy: %.4f" % (eval_train, eval_test))

In [None]:
# Collect the true label and the prediction together from the scored test set,
# so each label stays paired with its own prediction. collect() returns Row
# objects, so the scalar values are extracted explicitly below.
scored_rows = predictions_test.select(
    F.col("label").cast("int").alias("label"),
    F.col("prediction").cast("int").alias("prediction"),
).collect()

y_true = [row["label"] for row in scored_rows]
y_pred = [row["prediction"] for row in scored_rows]

# Classes are given as integers: the model's predictions are numeric, so string
# labels such as "0" would never match them. Both columns are cast to int above
# so they share one type regardless of how the label was stored.
confusion_matrix(y_true, y_pred, labels=[0, 1, 2])