In [None]:
#Import modules and create spark session

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

In [6]:
# Build the SparkSession used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
appName = "Sentiment Analysis in Spark"
spark = (
    SparkSession.builder
    .appName(appName)
    # NOTE(review): this key/value looks like a placeholder option -- confirm
    # whether it is actually needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)


In [None]:
#Read data file into Spark dataFrame
#Role: Read the CSV file containing the tweet data into a Spark DataFrame.
#The inferSchema=True option automatically detects the data types of the columns,
#and header=True indicates that the first row of the CSV file contains column names.

In [7]:
# Load the tweet CSV into a DataFrame: the first row supplies the column
# names (header=True) and column types are inferred from the data
# (inferSchema=True). Then preview the first three rows untruncated.
tweets_csv = spark.read.csv(
    'tweets.csv',
    header=True,
    inferSchema=True,
)
tweets_csv.show(n=3, truncate=False)

+------+---------+---------------+---------------------------------+
|ItemID|Sentiment|SentimentSource|SentimentText                    |
+------+---------+---------------+---------------------------------+
|1038  |1        |Sentiment140   |that film is fantastic #brilliant|
|1804  |1        |Sentiment140   |this music is really bad #myband |
|1693  |0        |Sentiment140   |winter is terrible #thumbs-down  |
+------+---------+---------------+---------------------------------+
only showing top 3 rows



In [None]:
#Select the related data
#Role: Select only the relevant columns (SentimentText and Sentiment). 
#The Sentiment column is cast to an integer type and renamed to label, 
#which is required for the machine learning model.


In [8]:
# Keep only the tweet text and its sentiment. The sentiment is cast to an
# integer and exposed as "label", the column name the classifier is later
# configured to read (labelCol="label").
label_column = col("Sentiment").cast("Int").alias("label")
data = tweets_csv.select("SentimentText", label_column)
data.show(n=5, truncate=False)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |1    |
|winter is terrible #thumbs-down  |0    |
|this game is awful #nightmare    |0    |
|I love jam #loveit               |1    |
+---------------------------------+-----+
only showing top 5 rows



In [None]:
#Divide data into training and testing data
#Role: Randomly split the data into a training set (about 70%) and a
#testing set (about 30%). The model is fit on the training set only, and
#the held-out testing set is used afterwards to measure its accuracy.


In [9]:
#divide data, roughly 70% for training and 30% for testing
# randomSplit samples rows at random, so the split sizes are approximate.
# A fixed seed keeps the split -- and therefore the row counts and the final
# accuracy -- repeatable across runs for the same input data.
dividedData = data.randomSplit([0.7, 0.3], seed=42)
trainingData, testingData = dividedData  # index 0 = training, index 1 = testing
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 1356 ; Testing data rows: 576


In [None]:
#Prepare training data
#Separate "SentimentText" into individual words using tokenizer
#Role: Tokenize the text data by splitting sentences into individual words. 
#This is the first step in converting text data into
#a format suitable for machine learning.

In [11]:

# Split each tweet into a list of word tokens stored in "SentimentWords".
# As the preview shows, tokens come out lower-cased ("I" -> "i").
tokenizer = Tokenizer(
    inputCol="SentimentText",
    outputCol="SentimentWords",
)
tokenizedTrain = tokenizer.transform(trainingData)
tokenizedTrain.show(n=5, truncate=False)

+-------------------------+-----+------------------------------+
|SentimentText            |label|SentimentWords                |
+-------------------------+-----+------------------------------+
|I adore cheese #bestever |1    |[i, adore, cheese, #bestever] |
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|
|I adore cheese #favorite |1    |[i, adore, cheese, #favorite] |
|I adore cheese #loveit   |1    |[i, adore, cheese, #loveit]   |
|I adore cheese #thumbs-up|1    |[i, adore, cheese, #thumbs-up]|
+-------------------------+-----+------------------------------+
only showing top 5 rows



In [None]:
#Remove stop words (common words that are not useful as features)
#Role: Remove common words (stop words) that are typically not useful
#for sentiment analysis. This helps focus on the meaningful words
#that are more likely to indicate sentiment.

In [12]:
# Drop stop words from the token lists. The remover reads whatever column
# the tokenizer writes, so the two stages stay wired together by name.
swr = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="MeaningfulWords",
)
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(n=5, truncate=False)

+-------------------------+-----+------------------------------+---------------------------+
|SentimentText            |label|SentimentWords                |MeaningfulWords            |
+-------------------------+-----+------------------------------+---------------------------+
|I adore cheese #bestever |1    |[i, adore, cheese, #bestever] |[adore, cheese, #bestever] |
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|[adore, cheese, #brilliant]|
|I adore cheese #favorite |1    |[i, adore, cheese, #favorite] |[adore, cheese, #favorite] |
|I adore cheese #loveit   |1    |[i, adore, cheese, #loveit]   |[adore, cheese, #loveit]   |
|I adore cheese #thumbs-up|1    |[i, adore, cheese, #thumbs-up]|[adore, cheese, #thumbs-up]|
+-------------------------+-----+------------------------------+---------------------------+
only showing top 5 rows



In [None]:
#Convert the word features into numerical features. In Spark 2.2.1, this is implemented
#in the HashingTF function using Austin Appleby's MurmurHash 3 algorithm.
#Role: Convert the meaningful words into numerical feature vectors using hashing.
#This step is essential because machine learning models require numerical input, not text.

In [13]:
# Hash the remaining words into a sparse term-frequency vector ("features"),
# then keep only the columns needed for training and inspection.
hashTF = HashingTF(
    inputCol=swr.getOutputCol(),
    outputCol="features",
)
hashedTrain = hashTF.transform(SwRemovedTrain)
numericTrainData = hashedTrain.select('label', 'MeaningfulWords', 'features')
numericTrainData.show(n=3, truncate=False)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[1689,91011,100089],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #brilliant]|(262144,[1689,45361,100089],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #favorite] |(262144,[1689,100089,108624],[1.0,1.0,1.0])|
+-----+---------------------------+-------------------------------------------+
only showing top 3 rows



In [None]:
#Train our classifier model using training data
#Role: Train a Logistic Regression model using the training data. 
#Logistic Regression is a common algorithm for binary classification 
#tasks like sentiment analysis.

In [14]:
# Configure a logistic-regression classifier (at most 10 iterations,
# regularization parameter 0.01) and fit it on the hashed training features.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.01,
)
model = lr.fit(numericTrainData)
print("Training is done!")

Training is done!


In [None]:
#Prepare testing data
#Role: Apply the same preprocessing steps
#(tokenization, stop word removal, and feature hashing) to the testing data.
#This ensures the test data is in the same format as the training data 
#for making predictions.

In [15]:
# Run the testing data through the same tokenizer, stop-word remover and
# hashing stage that were used for the training data, so both sets share
# one feature representation.
tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
# Refer to the column by its real name, 'label'. The previous 'Label' only
# resolved because Spark matches column names case-insensitively by default.
numericTest = hashTF.transform(SwRemovedTest).select('label', 'MeaningfulWords', 'features')
numericTest.show(truncate=False, n=2)

+-----+------------------------------------+-------------------------------------------------------+
|Label|MeaningfulWords                     |features                                               |
+-----+------------------------------------+-------------------------------------------------------+
|1    |[adore, cheese, #toptastic]         |(262144,[1689,42010,100089],[1.0,1.0,1.0])             |
|1    |[adore, classical, music, #bestever]|(262144,[91011,100089,102383,131250],[1.0,1.0,1.0,1.0])|
+-----+------------------------------------+-------------------------------------------------------+
only showing top 2 rows



In [None]:
#Predict on the testing data and calculate the model's accuracy

In [16]:
# Score the testing data with the trained model and report plain accuracy
# (the share of rows whose predicted class equals the true label).
prediction = model.transform(numericTest)
# Use the column's real name, 'label', instead of relying on Spark's default
# case-insensitive column resolution.
predictionFinal = prediction.select("MeaningfulWords", "prediction", "label")
predictionFinal.show(n=10, truncate=False)
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['label']
).count()
totalData = predictionFinal.count()
# A random split can leave the testing set empty on very small inputs;
# report NaN in that case rather than raising ZeroDivisionError.
accuracy = correctPrediction / totalData if totalData else float("nan")
print("correct prediction:", correctPrediction, ", total data:", totalData, ", accuracy:", accuracy)

+-------------------------------------+----------+-----+
|MeaningfulWords                      |prediction|Label|
+-------------------------------------+----------+-----+
|[adore, cheese, #toptastic]          |1.0       |1    |
|[adore, classical, music, #bestever] |1.0       |1    |
|[adore, classical, music, #loveit]   |1.0       |1    |
|[adore, classical, music, #toptastic]|1.0       |1    |
|[adore, coffee, #brilliant]          |1.0       |1    |
|[adore, coffee, #loveit]             |1.0       |1    |
|[adore, coffee, #thumbs-up]          |1.0       |1    |
|[adore, coffee, #toptastic]          |1.0       |1    |
|[adore, pop, music, #loveit]         |1.0       |1    |
|[adore, rock, music, #thumbs-up]     |1.0       |1    |
+-------------------------------------+----------+-----+
only showing top 10 rows

correct prediction: 563 , total data: 576 , accuracy: 0.9774305555555556
