<a href="https://colab.research.google.com/github/daddyojoey/PythonProjects/blob/main/Real_Time_Social_Media_Sentiment_Analysis_using_Apache_Spark_and_Cloud_Infrastructure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install PySpark and dependencies
!pip install -q pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when


In [3]:
# Create the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Sentiment140")
    .getOrCreate()
)


In [5]:
# Column names for the headerless CSV, listed in file order.
columns = ["sentiment", "id", "date", "query", "user", "text"]

# Load the CSV file. There is no header row, so Spark assigns placeholder
# column names; inferSchema=True asks Spark to derive column types from the data.
df = spark.read.csv("sentiment140.csv", header=False, inferSchema=True)

# Rename all columns in a single step. Unlike renaming position by position,
# toDF raises when the number of columns in the file differs from
# len(columns), so a malformed file is caught here instead of silently
# keeping placeholder names.
df = df.toDF(*columns)

# Show a sample of the data
df.show(5, truncate=False)


+---------+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|sentiment|id        |date                        |query   |user           |text                                                                                                               |
+---------+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|0        |1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|0        |1467810672|Mon Apr 06 22:19:49 PDT 2009|NO_QUERY|scotthamilton  |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|0        |1467810917|Mon Apr 06 22

In [6]:
# Map the numeric sentiment code to a readable label: 0 -> Negative, 4 -> Positive.
# There is no otherwise() branch, so any other code yields a null label.
sentiment_label = (
    when(col("sentiment") == 0, "Negative")
    .when(col("sentiment") == 4, "Positive")
    .alias("sentiment_label")
)

# Keep only the derived label and the tweet text.
df = df.select(sentiment_label, "text")

# Preview the cleaned data without truncating the tweet text.
df.show(5, truncate=False)


+---------------+-------------------------------------------------------------------------------------------------------------------+
|sentiment_label|text                                                                                                               |
+---------------+-------------------------------------------------------------------------------------------------------------------+
|Negative       |@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|Negative       |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|Negative       |@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                          |
|Negative       |my whole body feels itchy and like its on fire                                                                     |
|Negative       |@nationwideclass no, it's not behaving at all

In [7]:
# Count the tweets under each sentiment label and print the class distribution.
(
    df.groupBy("sentiment_label")
    .count()
    .show()
)


+---------------+------+
|sentiment_label| count|
+---------------+------+
|       Positive|800000|
|       Negative|800000|
+---------------+------+

