In [1]:
# Import Spark
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import string, re, json


spark = SparkSession.builder \
        .config("spark.mongodb.input.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.LabeledTweets?retryWrites=true") \
        .config("spark.mongodb.output.uri", "mongodb://192.168.1.27/TwitterSentimentAnalysis.LabeledTweets?retryWrites=true") \
        .getOrCreate()

In [2]:
df_TweetLabeled = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

In [3]:
# remove whitespace
def remove_all_space(astring):
  return " ".join(astring.split())

# clean the text 
def remove_features(data_str):
    # compile regex
    url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?')
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    num_re = re.compile('(\\d+)')
    alpha_num_re = re.compile("^[a-z0-9_.]+$")
    # convert to lowercase
    data_str = data_str.lower()
    # remove hyperlinks
    data_str = url_re.sub(' ', data_str)
    # remove puncuation
    data_str = punc_re.sub(' ', data_str)
    # remove numeric 'words'
    data_str = num_re.sub(' ', data_str)
    # remove non a-z 0-9 characters and words shorter than 3 characters
    list_pos = 0
    cleaned_str = ''
    for word in data_str.split():
        if list_pos == 0:
            if alpha_num_re.match(word) and len(word) > 2:
                cleaned_str = word
            else:
                cleaned_str = ' '
        else:
            if alpha_num_re.match(word) and len(word) > 2:
                cleaned_str = cleaned_str + ' ' + word
            else:
                cleaned_str += ' '
        list_pos += 1
    cleaned_str2 = remove_all_space(cleaned_str)
    return cleaned_str2

remove_features_udf = udf(remove_features, StringType())

In [4]:
#remove noise
df_TweetsCleaned = df_TweetLabeled.withColumn("cleaned_text", remove_features_udf(df_TweetLabeled['full_text']))
#df_TweetsCleaned.select('cleaned_text').show(truncate=50)
df_TweetsCleaned.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- cleaned_text: string (nullable = true)



In [5]:
# Tokenizzazione
tkn = Tokenizer()\
      .setInputCol("cleaned_text")\
      .setOutputCol("words")

# Eliminazione Stopwords
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
        .setStopWords(englishStopWords)\
        .setInputCol("words")\
        .setOutputCol("words_nsw")

pipeline = Pipeline(stages = [tkn, stops])


df_TweetsCleanedNSW = pipeline\
    .fit(df_TweetsCleaned.select("full_text", "cleaned_text", "label"))\
    .transform(df_TweetsCleaned.select("full_text", "cleaned_text", "label"))

# df_TweetsCleaned.select("full_text", "words_nsw").show()
df_TweetsCleanedNSW.show()

+--------------------+--------------------+-----+--------------------+--------------------+
|           full_text|        cleaned_text|label|               words|           words_nsw|
+--------------------+--------------------+-----+--------------------+--------------------+
|"The United State...|the united states...|    0|[the, united, sta...|[united, states, ...|
|Coronavirus &amp;...|coronavirus amp c...|    0|[coronavirus, amp...|[coronavirus, amp...|
|Oh to be a 1998 b...|baby first memory...|    0|[baby, first, mem...|[baby, first, mem...|
|From uprising to ...|from uprising out...|    0|[from, uprising, ...|[uprising, outbre...|
|Another little wa...|another little wa...|    0|[another, little,...|[another, little,...|
|“Our country is f...|country facing me...|    0|[country, facing,...|[country, facing,...|
|😷 Dr. Anthony Fa...|anthony fauci the...|    0|[anthony, fauci, ...|[anthony, fauci, ...|
|Chinese number is...|chinese number li...|    0|[chinese, number,...|[chinese, n

In [6]:
# Divisione Training e Test
train, test = df_TweetsCleanedNSW.randomSplit([0.75,0.25], seed=2020)

cv  = CountVectorizer(inputCol='words_nsw', outputCol='tf')
idf = IDF().setInputCol('tf').setOutputCol('features')
nb  = NaiveBayes()

pipeline = Pipeline(stages=[cv, idf, nb])

# Dichiarazione della pipeline
model = pipeline.fit(train)

# Valutazione del modello con dati di training
predictions_train = model.transform(train)

# Calcolo dell'accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
eval_train = evaluator.evaluate(predictions_train)

# Valutazione del modello con dati di test
predictions_test = model.transform(test)

# Calcolo dell'accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
eval_test = evaluator.evaluate(predictions_test)

In [7]:
predictions_testRDD = predictions_test.select("label","prediction").rdd
predictions_testRDD = predictions_testRDD.map(lambda x: (float(x['label']), x['prediction']))
# predictions_testRDD.take(4)

In [8]:
metrics = MulticlassMetrics(predictions_testRDD)

In [9]:
metrics.confusionMatrix().toArray()

array([[9417.,  200.,  177.],
       [ 295.,  178.,   15.],
       [ 308.,   20.,  156.]])

In [10]:
print eval_test

0.911650544158


In [11]:
print eval_train

0.923920474773


In [12]:
sc

In [13]:
spark
