In [1]:
import os 
# Maven coordinate of the Kafka connector package handed to spark-submit.
packages = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1"

# Expose the package list through PYSPARK_SUBMIT_ARGS; this is done before the
# SparkContext is created further down in this cell.
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
    ("--packages", packages, "pyspark-shell")
)
from pyspark.sql.functions import *
import json
import sys
from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession


from torchtext import data
from torchtext import datasets
import pickle
import spacy

from model import RNN, preprocess_tweet
# spaCy pipeline; in this notebook only its tokenizer is used (predict_sentiment).
# NOTE(review): the 'en' shortcut name depends on the installed spaCy version and
# model links -- confirm it resolves in the target environment.
nlp = spacy.load('en')

# Build the session with getOrCreate() so this cell can be re-run in a live
# kernel: constructing SparkContext('local') a second time raises because only
# one SparkContext may be active per process.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

In [2]:
# Kafka broker(s) every stream in this notebook connects to.
KAFKA_BOOTSTRAP_SERVERS = "kafka:9092"


def read_kafka_topic(topic, bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS):
    """Open a streaming DataFrame subscribed to a single Kafka topic.

    Args:
        topic: name of the Kafka topic to subscribe to.
        bootstrap_servers: value for the `kafka.bootstrap.servers` option.

    Returns:
        The streaming DataFrame returned by `spark.readStream...load()`.
    """
    return (
        spark.readStream.format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("subscribe", topic)
        .load()
    )


# One raw stream per candidate topic; both are built by the same helper so the
# connection settings cannot drift apart.
trumpDF = read_kafka_topic("Trump")
bidenDF = read_kafka_topic("Biden")

In [3]:
import torch

# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load a vocab.pkl that comes from a trusted source.
with open('./model/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)


# Constructor arguments for RNN(...).
# NOTE(review): presumably these must match the configuration the checkpoint
# was trained with -- confirm against the training code.
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# NOTE(review): confirm this is the index of the padding token in `vocab`.
PAD_IDX = 0

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

# predict_sentiment moves its input tensor to `device`, so the model has to
# live on that same device; otherwise a CUDA-capable host fails with a device
# mismatch between inputs (cuda) and weights (cpu).
model.to(device)

# map_location=device loads the saved weights directly onto the chosen device.
# Kept as the last expression so the cell still displays the load result.
model.load_state_dict(torch.load('./model/tut2-model.pt', map_location=device))

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


  return torch._C._cuda_getDeviceCount() > 0


<All keys matched successfully>

In [4]:
def predict_sentiment(sentence):
    """Label a tweet's text as 'Positive' or 'Negative' with the loaded RNN.

    Args:
        sentence: raw tweet text. May be None or empty, e.g. when the `text`
            field of a parsed record is null.

    Returns:
        'Positive' if sigmoid(model output) >= 0.5, otherwise 'Negative'.
        None when there is nothing to score (null/empty input, or no tokens
        left after preprocessing).
    """
    # Nothing to classify; returning None avoids handing a null value to
    # preprocess_tweet and the tokenizer.
    if not sentence:
        return None

    sentence = preprocess_tweet(sentence)
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [vocab.stoi[t] for t in tokenized]
    # A zero-length sequence carries no sentiment.
    # NOTE(review): the model is also likely to reject length 0 -- confirm
    # against model.py.
    if not indexed:
        return None

    model.eval()
    # Use the device the weights actually live on rather than a global, so the
    # input and the model can never disagree.
    model_device = next(model.parameters()).device
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(model_device)
    length_tensor = torch.LongTensor([len(indexed)])

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))

    return 'Positive' if prediction.item() >= 0.5 else 'Negative'

In [5]:
# Schema handed to from_json for the tweet payload; every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("time", StringType()),
        ("text", StringType()),
        ("retweet_count", DoubleType()),
        ("location", StringType()),
        ("favorite_count", DoubleType()),
        ("user_id", StringType()),
        ("place", StringType()),
        ("user_followers_count", StringType()),
    )
])


def castData(schema, df):
    """Parse the `value` column as JSON and append a `sentiment` column.

    Args:
        schema: StructType passed to from_json to parse `value`.
        df: DataFrame that has a `value` column.

    Returns:
        DataFrame whose columns are the parsed fields of `schema` plus
        `sentiment`, the result of predict_sentiment applied to `text`.
    """
    sentiment_udf = udf(predict_sentiment, StringType())
    parsed = from_json(col("value"), schema).alias("data")
    return (
        df.selectExpr("CAST(value AS STRING)")
        .select(parsed)
        .select("data.*")
        .withColumn('sentiment', sentiment_udf(col('text')))
    )
    
# Swap each raw Kafka stream for its parsed, sentiment-scored form.
# NOTE: the same names are rebound, so this cell is not re-runnable on its own:
# the cast frames no longer have the `value` column castData selects from.
trumpDF, bidenDF = castData(schema, trumpDF), castData(schema, bidenDF)


In [6]:
# Mirror the Trump stream into an in-memory table named "device_counts" so it
# can be inspected with spark.sql; `query` is the handle of the running stream.
query = (
    trumpDF.writeStream
    .format("memory")
    .queryName("device_counts")
    .start()
)

In [10]:
# Print the rows currently held in the "device_counts" in-memory table.
(
    spark
    .sql('SELECT * FROM device_counts')
    .show()
)

+--------------------+--------------------+-------------+----------------+--------------+-------------------+-----+--------------------+---------+
|                time|                text|retweet_count|        location|favorite_count|            user_id|place|user_followers_count|sentiment|
+--------------------+--------------------+-------------+----------------+--------------+-------------------+-----+--------------------+---------+
|Sun Dec 20 16:16:...|@TheRightMelissa ...|          0.0|        Virginia|           0.0|           43531918| null|                 319| Negative|
|Sun Dec 20 16:16:...|High standing #Hi...|          0.0|            null|           0.0|1071784494326263800| null|                3732| Negative|
|Sun Dec 20 16:16:...|@Mike_Pence @Perd...|          0.0|            null|           0.0|1040326859228102700| null|                   7| Negative|
|Sun Dec 20 16:16:...|@chrissaccoccia1 ...|          0.0|            null|           0.0|1337382347260440600| null|   

In [8]:
# Append the scored Trump stream to HDFS as CSV every 5 seconds.
# The checkpoint directory holds this query's progress, so it must not be
# shared with any other streaming query; this one is dedicated to the Trump
# CSV sink.
trump_csv_query = (
    trumpDF.writeStream
    .trigger(processingTime='5 seconds')
    .format("csv")
    .outputMode("append")
    .option("checkpointLocation", "hdfs://namenode:9000/checkpoints/trump")
    .option('path', 'hdfs://namenode:9000/raw_data/trump.csv')
    .start()
)
# Keep the handle (so the query can be stopped or inspected) and display it.
trump_csv_query


<pyspark.sql.streaming.StreamingQuery at 0x7fe5483d18b0>

In [9]:
# Append the scored Biden stream to HDFS as CSV every 5 seconds.
# The checkpoint directory holds this query's progress, so it must not be
# shared with any other streaming query; this one is dedicated to the Biden
# CSV sink.
biden_csv_query = (
    bidenDF.writeStream
    .trigger(processingTime='5 seconds')
    .format("csv")
    .outputMode("append")
    .option("checkpointLocation", "hdfs://namenode:9000/checkpoints/biden")
    .option('path', 'hdfs://namenode:9000/raw_data/biden.csv')
    .start()
)
# Keep the handle (so the query can be stopped or inspected) and display it.
biden_csv_query

<pyspark.sql.streaming.StreamingQuery at 0x7fe51e8c7bb0>