In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer, Word2Vec, Word2VecModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.sql.functions import col,regexp_replace, udf
from pyspark.sql.types import *

bucket_name = "web-app-project"
spark = SparkSession\
    .builder\
    .appName("sentiment_analysis")\
    .getOrCreate()

tweets_df = spark.read.csv("s3://" + bucket_name + "/tweets.csv", header=False)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7,application_1604386898051_0008,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
tweets_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|  0|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|  0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|  0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  0|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nop

In [3]:
tweets_df = tweets_df.withColumnRenamed("_c0", "sentiment") \
.withColumnRenamed("_c5", "tweets")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
tweets_df = tweets_df.withColumn("cleaned_tweets", regexp_replace(col("tweets"), "http.+|@.|\n|RT|\d+", ' '))
# All words are lowercase and tokenized
tweets_df = RegexTokenizer(inputCol="cleaned_tweets", outputCol="lowercase_tweets", pattern="\\W").transform(tweets_df)
# We remove the StopWords
tweets_df = StopWordsRemover(inputCol="lowercase_tweets", outputCol="processed_tweets").transform(tweets_df)
# We drop the unused columns
tweets_df = tweets_df.drop("cleaned_tweets", "lowercase_tweets", "_c1", "_c2", "_c3", "_c4")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
tweets_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+
|sentiment|              tweets|    processed_tweets|
+---------+--------------------+--------------------+
|        0|@switchfoot http:...|         [witchfoot]|
|        0|is upset that he ...|[upset, update, f...|
|        0|@Kenichan I dived...|[enichan, dived, ...|
|        0|my whole body fee...|[whole, body, fee...|
|        0|@nationwideclass ...|[ationwideclass, ...|
|        0|@Kwesidei not the...|[wesidei, whole, ...|
|        0|         Need a hug |         [need, hug]|
|        0|@LOLTrish hey  lo...|[oltrish, hey, lo...|
|        0|@Tatiana_K nope t...|[atiana_k, nope, ...|
|        0|@twittera que me ...|[wittera, que, mu...|
|        0|spring break in p...|[spring, break, p...|
|        0|I just re-pierced...| [re, pierced, ears]|
|        0|@caregiving I cou...|[aregiving, could...|
|        0|@octolinz16 It it...|[ctolinz, counts,...|
|        0|@smarrison i woul...|[marrison, ve, fi...|
|        0|@iamjazzyfizzle I

In [6]:
# We define the NLP model
word2Vec_tweets = Word2Vec(vectorSize=300, maxIter=5, inputCol="processed_tweets", outputCol="features")

# We train the NLP model
tweets_model = word2Vec_tweets.fit(tweets_df)

# We save the model for the script
model_path = "s3://" + bucket_name + "/Word2Vec"
tweets_model.save(model_path)

# We add the features columns : it is the average of the words' vectors for each tweet
tweets_df = tweets_model.transform(tweets_df)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
tweets_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+--------------------+
|sentiment|              tweets|    processed_tweets|            features|
+---------+--------------------+--------------------+--------------------+
|        0|@switchfoot http:...|         [witchfoot]|[-0.0217530820518...|
|        0|is upset that he ...|[upset, update, f...|[0.01698558167977...|
|        0|@Kenichan I dived...|[enichan, dived, ...|[0.03277167000342...|
|        0|my whole body fee...|[whole, body, fee...|[-8.4388318161169...|
|        0|@nationwideclass ...|[ationwideclass, ...|[0.04142366405576...|
|        0|@Kwesidei not the...|[wesidei, whole, ...|[-0.0357014524439...|
|        0|         Need a hug |         [need, hug]|[-0.0815076828002...|
|        0|@LOLTrish hey  lo...|[oltrish, hey, lo...|[-0.0255299146072...|
|        0|@Tatiana_K nope t...|[atiana_k, nope, ...|[0.07791925345857...|
|        0|@twittera que me ...|[wittera, que, mu...|[0.10993757843971...|
|        0|spring break i

In [8]:
tweets_df = tweets_df.withColumn("sentiment", col("sentiment").cast(FloatType()))
# we replace the 4 sentiment value by 1
zero_or_one = udf(lambda x:"1" if x == 4 else "0", StringType())
tweets_df = tweets_df.withColumn("sentiment", zero_or_one("sentiment"))
tweets_df = tweets_df.withColumn("sentiment", col("sentiment").cast(IntegerType()))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# We train the classification model
clf = MultilayerPerceptronClassifier(labelCol="sentiment", featuresCol="features", layers=[300, 64, 2])
train_data, test_data = tweets_df.randomSplit([0.8, 0.2])
clf_model = clf.fit(train_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
predictions = clf_model.transform(test_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
predictions.select("sentiment", "tweets", "probability", "prediction").show(10)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------------------+----------+
|sentiment|              tweets|         probability|prediction|
+---------+--------------------+--------------------+----------+
|        0|           FUCK YOU!|[0.94792657240952...|       0.0|
|        0|        my head f...|[0.58909755930549...|       0.0|
|        0|      My current ...|[0.88309888891999...|       0.0|
|        0|     &lt;- but mu...|[0.13142297640123...|       1.0|
|        0|     I dont like ...|[0.92396972239584...|       0.0|
|        0|    awhhe man.......|[0.51391362600482...|       0.0|
|        0|   Awwwwwh  i wan...|[0.90860359585344...|       0.0|
|        0|   BoRinG   ): wh...|[0.78419618250192...|       0.0|
|        0|   I am going to ...|[0.79703344108948...|       0.0|
|        0|   I'm thinking o...|[0.27137601371047...|       1.0|
+---------+--------------------+--------------------+----------+
only showing top 10 rows

In [12]:
# We evaluate the model
evaluator = BinaryClassificationEvaluator()
evaluator.setRawPredictionCol("prediction")
evaluator.setLabelCol("sentiment")
accuracy = evaluator.evaluate(predictions)
print(f"The accuracy of the model is {round(accuracy, 2)}%")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

The accuracy of the model is 0.75%

In [13]:
# We save the model
clf_path = "s3://" + bucket_name + "/mpc_model"
clf_model.save(clf_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…