In [1]:
import os 
# Maven coordinates of the Kafka source connector handed to spark-submit.
# NOTE(review): the _2.12 / 3.0.1 suffixes presumably have to match the
# cluster's Scala and Spark versions - confirm against the deployment.
packages = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1"

# Both variables are exported before the SparkSession is built further down.
os.environ.update({
    "PYSPARK_PYTHON": '/usr/bin/python3',
    "PYSPARK_SUBMIT_ARGS": f"--packages {packages} pyspark-shell",
})
from pyspark.sql.functions import *
import json
import sys
import re

from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession


from textblob import TextBlob

# from model import preprocess_tweet

# Attach to the master at spark-master:7077, limiting this application to
# one core in total and 1g of memory per executor.
spark = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .config('spark.cores.max', '1')
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [2]:
def _read_kafka_topic(topic):
    """Return a streaming DataFrame subscribed to one Kafka topic.

    Args:
        topic: Name of the Kafka topic to subscribe to.

    Returns:
        The unparsed streaming DataFrame produced by the Kafka source on the
        ``kafka:9092`` broker.
    """
    return (
        spark.readStream.format("kafka")
        .option("kafka.bootstrap.servers", "kafka:9092")
        .option("subscribe", topic)
        # failOnDataLoss is a Kafka *source* option. The notebook only sets it
        # on the CSV writers, where the source never sees it, so it is set
        # here to actually stop the query from failing when offsets are lost.
        .option("failOnDataLoss", "false")
        .load()
    )


trumpDF = _read_kafka_topic("Trump")
bidenDF = _read_kafka_topic("Biden")

In [3]:
import preprocessor as p
# Select the URL and EMOJI options for p.clean(); the sample call further down
# shows both a URL and an emoji being stripped from the text.
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess(text):
    """Normalize raw tweet text before sentiment scoring.

    Applies, in order: decoding of a handful of HTML entities, replacement of
    any remaining ';' and literal backslash-u sequences with a space, removal
    of '#', conversion of '_' to a space, and finally ``p.clean`` with the
    options chosen via ``p.set_options``.

    Args:
        text: Tweet text, or None. castData runs this function as a Spark UDF,
            and Spark passes None for null cells (for example when the JSON
            payload has no ``text`` field).

    Returns:
        The cleaned text, or None when ``text`` is None.
    """
    # A null cell must stay null rather than raise AttributeError and take
    # the whole streaming query down.
    if text is None:
        return None

    # Order matters and is kept exactly as before: entities first, then the
    # catch-all ';' replacement, then hashtag/underscore normalization.
    substitutions = (
        ('&amp;', '&'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&#39;', "'"),
        (';', " "),
        (r'\u', " "),
        ("#", ""),
        ("_", " "),
    )
    tweet_text = text
    for old, new in substitutions:
        tweet_text = tweet_text.replace(old, new)

    return p.clean(tweet_text)
    
    
    
    

In [4]:
# Smoke test: the '#' marker, the emoji and the URL should all be gone from the output.
print(preprocess('Preprocessor is #awesome 👍 https://github.com/s/preprocessor'))

Preprocessor is awesome


In [5]:
def predict_sentiment(tweet_text, threshold=0.1):
    """Label a tweet as Positive, Negative or Neutral from its TextBlob polarity.

    Args:
        tweet_text: Tweet text to score, or None. castData runs this function
            as a Spark UDF, and Spark passes None for null cells.
        threshold: Polarity must be strictly above ``threshold`` for
            "Positive" or strictly below ``-threshold`` for "Negative".
            Defaults to 0.1, the value previously hard-coded.

    Returns:
        "Positive", "Negative" or "Neutral"; None when ``tweet_text`` is None,
        so a missing tweet is not reported as a neutral one.
    """
    # TextBlob rejects non-string input; keep nulls as nulls instead of
    # failing the streaming query.
    if tweet_text is None:
        return None

    # Evaluate the polarity once and reuse it for both comparisons.
    polarity = TextBlob(tweet_text).sentiment.polarity
    if polarity > threshold:
        return "Positive"
    if polarity < -threshold:
        return "Negative"
    return "Neutral"

In [6]:
# Layout of the JSON document carried in each Kafka message value.
# Every field is nullable.
# NOTE(review): user_followers_count is read as a string while the other
# counters are doubles - confirm this matches what the producer emits.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("time", StringType()),
        ("text", StringType()),
        ("retweet_count", DoubleType()),
        ("location", StringType()),
        ("favorite_count", DoubleType()),
        ("user_id", StringType()),
        ("place", StringType()),
        ("user_followers_count", StringType()),
    ]
])


def castData(schema, df):
    """Turn a raw Kafka stream into parsed, cleaned and sentiment-labelled rows.

    Args:
        schema: StructType describing the JSON held in the Kafka ``value`` column.
        df: Streaming DataFrame exposing a ``value`` column.

    Returns:
        A DataFrame with one column per schema field, where ``text`` has been
        passed through ``preprocess`` and a new ``sentiment`` column holds the
        result of ``predict_sentiment`` on that cleaned text.
    """
    clean_text = udf(preprocess, StringType())
    label_sentiment = udf(predict_sentiment, StringType())

    # Decode the value column to a string, parse it as JSON, then flatten the
    # resulting struct into top-level columns.
    parsed = (
        df.selectExpr("CAST(value AS STRING)")
        .select(from_json(col("value"), schema).alias("data"))
        .select("data.*")
    )

    # Sentiment is computed on the already-cleaned text, so the order matters.
    cleaned = parsed.withColumn('text', clean_text(col('text')))
    return cleaned.withColumn('sentiment', label_sentiment(col('text')))
    
# Replace each raw Kafka stream with its parsed and labelled counterpart.
# The names are rebound in place, so this cell cannot be re-run on its own:
# the second pass would no longer find the `value` column castData selects.
trumpDF = castData(schema=schema, df=trumpDF)
bidenDF = castData(schema=schema, df=bidenDF)


In [7]:
# Debug sink for the Trump stream. The memory sink stores the streamed rows in
# an in-memory table named after queryName, which is what makes
# `SELECT ... FROM device_counts` resolvable. The console sink used before
# only printed each batch and registered no table.
# Intended for inspection only: the table is held in driver memory.
query = (
    trumpDF.writeStream
    .queryName("device_counts")
    .format("memory")
    .start()
)

In [None]:
# Count the rows visible under the name of the "device_counts" streaming query.
# NOTE(review): this only resolves if that query writes to a sink that
# registers a table under its query name - confirm before running.
device_counts_total = spark.sql('SELECT count(*) FROM device_counts')
device_counts_total.show(n=20, truncate=False)

In [7]:
# Append the labelled Trump tweets to HDFS as CSV, triggering every 5 seconds.
# NOTE(review): failOnDataLoss is documented as a Kafka source option -
# confirm it has any effect when set on this CSV writer.
(
    trumpDF.writeStream
    .trigger(processingTime='5 seconds')
    .queryName("trump_tweets")
    .format("csv")
    .outputMode("append")
    .option('failOnDataLoss', 'false')
    .option("checkpointLocation", "hdfs://namenode:9000/checkpoints_Trump")
    .option('path', 'hdfs://namenode:9000/data/trump.csv')
    .start()
)


<pyspark.sql.streaming.StreamingQuery at 0x7f6777eeca50>

In [8]:
# Append the labelled Biden tweets to HDFS as CSV, triggering every 5 seconds.
# NOTE(review): failOnDataLoss is documented as a Kafka source option -
# confirm it has any effect when set on this CSV writer.
(
    bidenDF.writeStream
    .trigger(processingTime='5 seconds')
    .queryName("biden_tweets")
    .format("csv")
    .outputMode("append")
    .option("checkpointLocation", "hdfs://namenode:9000/checkpoints_Biden")
    .option('failOnDataLoss', 'false')
    .option('path', 'hdfs://namenode:9000/data/biden.csv')
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f67ad2e0f50>