In [0]:
# pyspark functions for real time streaming
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions
from pyspark.streaming import StreamingContext

# textblob for sentiment analysis
!pip install --upgrade pip
!pip install textblob==0.17.1
from textblob import TextBlob

Collecting pip
  Using cached pip-23.0.1-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.2.4
    Uninstalling pip-21.2.4:
      Successfully uninstalled pip-21.2.4
Successfully installed pip-23.0.1


In [0]:
# text cleaning

def preprocessing(lines):
    """Split the raw stream into individual tweets and normalise their text.

    Parameters
    ----------
    lines : DataFrame
        Streaming DataFrame exposing a string column ``value``; each value may
        hold several tweets separated by the literal marker ``"t_end"``.

    Returns
    -------
    DataFrame
        A single-column DataFrame (``word``) with one cleaned tweet per row.
        Rows that are empty, or contain only whitespace once cleaning is done,
        are removed.
    """
    # One output row per "t_end"-delimited fragment.
    words = lines.select(explode(split(lines.value, "t_end")).alias("word"))

    # Discard fragments that are already empty before doing any regex work.
    words = words.na.replace("", None)
    words = words.na.drop()

    words = words.withColumn("word", functions.lower("word"))
    # Strip URLs, then @mentions, then punctuation, then non-ASCII characters.
    # All patterns are raw strings so the backslashes reach the regex engine
    # untouched (the former "@\w+" relied on an invalid Python escape).
    words = words.withColumn("word", functions.regexp_replace("word", r"http\S+", ""))
    words = words.withColumn("word", functions.regexp_replace("word", r"@\w+", ""))
    words = words.withColumn("word", functions.regexp_replace("word", r"[^\w\s]", ""))
    words = words.withColumn("word", functions.regexp_replace("word", r"[^\x00-\x7F]+", ""))

    # A tweet made only of links/mentions/punctuation/emoji is blank by now;
    # keep only rows that still contain at least one non-whitespace character.
    words = words.filter(functions.col("word").rlike(r"\S"))

    return words


# text classification

def sentiment_detection(text):
    """Map *text* to a sentiment label based on its TextBlob polarity.

    Returns "negative" when the polarity is below zero, "positive" when it is
    above zero, and "neutral" otherwise.
    """
    score = float(TextBlob(text).sentiment.polarity)
    if score < 0:
        return "negative"
    if score > 0:
        return "positive"
    return "neutral"

def polarity_detection(text):
    """Return the polarity component of TextBlob's sentiment for *text*."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.polarity

def subjectivity_detection(text):
    """Return the subjectivity component of TextBlob's sentiment for *text*."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.subjectivity

def text_classification(words):
    """Append TextBlob-based sentiment columns to a DataFrame of tweets.

    Parameters
    ----------
    words : DataFrame
        DataFrame with a string column ``word`` holding the tweet text.

    Returns
    -------
    DataFrame
        The input plus three columns:
        ``sentiment`` (string label from ``sentiment_detection``),
        ``polarity`` (double) and ``subjectivity`` (double).
    """
    # Categorical label: a string column is the right type here.
    sentiment_detection_udf = udf(sentiment_detection, StringType())
    words = words.withColumn("sentiment", sentiment_detection_udf("word"))

    # Polarity and subjectivity are numeric scores, so declare them as doubles
    # instead of strings; otherwise numeric comparisons and aggregations on
    # these columns need an explicit cast. float() makes sure the UDF hands
    # Spark a Python float, the type DoubleType expects.
    polarity_detection_udf = udf(
        lambda text: float(polarity_detection(text)), DoubleType()
    )
    words = words.withColumn("polarity", polarity_detection_udf("word"))

    subjectivity_detection_udf = udf(
        lambda text: float(subjectivity_detection(text)), DoubleType()
    )
    words = words.withColumn("subjectivity", subjectivity_detection_udf("word"))
    return words

In [0]:
# --- Real-time streaming pipeline -------------------------------------------
# NOTE(review): `ssc` is created with a 1-second batch interval but is not
# referenced again in this cell; the reader below uses `spark_s.readStream`.
ssc = StreamingContext(sc, 1)

# Spark session used by the structured-streaming reader
spark_s = SparkSession.builder.appName("Real_time_Dwayne_Johnson_tweets").getOrCreate()

# Read the tweet data from a local socket.
# NOTE(review): despite its name, `host` holds the socket *port* number.
host = 3360
lines = (
    spark_s.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", host)
    .load()
)
print("Streaming in progress...")

# Clean the incoming text
words = preprocessing(lines)
print("Processing: OK")
print(words)

# Attach sentiment / polarity / subjectivity columns
words = text_classification(words)
print("Classification of tweets sentiment: OK")

words = words.repartition(1)

# Append results to the in-memory table "tweets_list"
query = (
    words.writeStream
    .queryName("tweets_list")
    .outputMode("append")
    .format("memory")
    .start()
)

#query.awaitTermination()

Streaming in progress...
Processing: OK
DataFrame[word: string]
Classification of tweets sentiment: OK


In [0]:
%sql

-- Show every row currently held in the `tweets_list` table
SELECT * FROM tweets_list

word,sentiment,polarity,subjectivity
whos dwayne johnson,neutral,0.0,0.0
ya do not want to miss this one,neutral,0.0,0.0
all i want at wrestlemania is another 3 second squash match,neutral,0.0,0.0
success isnt always about greatness its about consistency consistent hard work leads to success greatness will come dwayne johnson,positive,0.1395833333333333,0.1979166666666666
disney sues dwayne johnson wins 30 million judgment in illegal streaming case the hollywood reporter,negative,-0.1,0.35
theres still time,neutral,0.0,0.0
cost the title with getting the win setting up wm match next year my prediction,positive,0.4,0.2
bro lives under,neutral,0.0,0.0
do you think trans women and their allies are incapable of having a societydestructing hillthatidieon argument over a 30 second piss in a mens toilet,negative,-0.0166666666666666,0.0
or even acknowledging that joanna cherry _might_ have even an essence of a legitimate concern,neutral,0.0,0.0


Output can only be rendered in Databricks