In [1]:
import os 
# Maven coordinates of the Kafka source for Structured Streaming. The suffixes
# look like the Scala (2.12) and Spark (3.0.1) versions — keep them in step
# with the installed Spark when upgrading.
packages = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1"

# Has to be in the environment before the SparkContext further down is
# created, since that is when the submit arguments are read.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages {0} pyspark-shell".format(packages)
from pyspark.sql.functions import *
import json
import sys
from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession


from torchtext import data
from torchtext import datasets
import pickle
import spacy

from model import RNN, preprocess_tweet
# spaCy pipeline; only its tokenizer is used (by predict_sentiment).
# The 'en' shortcut name is rejected by spaCy 3 with an OSError, so fall back
# to the full package name when the shortcut cannot be resolved.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# getOrCreate reuses an already-running context, so re-running this cell in a
# live kernel no longer fails with "Cannot run multiple SparkContexts at once".
# On a fresh kernel this still starts a single-threaded 'local' master.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

In [2]:
# Streaming DataFrame over the "Trump" topic on the kafka:9092 broker.
# This only declares the source; a writeStream query has to be started
# before any records are read.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "Trump")
    .load()
)

In [3]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only load a vocab.pkl that was produced by a trusted training run.
with open('./model/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)


# Model hyperparameters. These are passed positionally to RNN below and are
# expected to match the values used when the checkpoint was trained;
# load_state_dict would otherwise complain about mismatched keys or shapes.
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = 0  # presumably the padding token's index in vocab — TODO confirm

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

# Keep the model on the same device that predict_sentiment moves its input
# tensor to. Previously the weights were pinned to the CPU while `device`
# could resolve to CUDA, which makes the forward pass fail on a GPU host.
model.to(device)

# This notebook only runs inference, so switch to evaluation mode once here.
model.eval()

# Kept as the final expression so the cell still displays the
# "<All keys matched successfully>" confirmation.
model.load_state_dict(torch.load('./model/tut2-model.pt', map_location=device))

  return torch._C._cuda_getDeviceCount() > 0


<All keys matched successfully>

In [4]:
def predict_sentiment(sentence):
    """Classify the text of a tweet as 'Positive' or 'Negative'.

    The text is cleaned with ``preprocess_tweet``, tokenized with the spaCy
    tokenizer, mapped to vocabulary indices and fed to ``model`` as a batch
    containing a single sequence.

    Args:
        sentence: Raw tweet text. May be ``None`` (the ``text`` column is
            nullable, e.g. when the incoming JSON could not be parsed).

    Returns:
        ``'Positive'`` when the sigmoid of the model output is >= 0.5,
        ``'Negative'`` otherwise. ``None`` when there is no text or the text
        yields no tokens, so the streaming query gets a null sentiment
        instead of failing on that row.
    """
    if sentence is None:
        return None

    sentence = preprocess_tweet(sentence)
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        # A zero-length sequence cannot be run through the model.
        return None

    indexed = [vocab.stoi[t] for t in tokenized]

    # Follow the model's own device rather than a separately computed one so
    # the input and the weights can never end up on different devices.
    model_device = next(model.parameters()).device
    # unsqueeze(1) adds a batch dimension of size 1 after the sequence axis.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(model_device)
    # Lengths are deliberately left on the CPU, as in the original code.
    length_tensor = torch.LongTensor([len(indexed)])

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))

    return 'Positive' if prediction.item() >= 0.5 else 'Negative'

In [5]:
# Layout of the JSON document expected in each Kafka message value.
# Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("text", StringType()),
        ("retweet_count", DoubleType()),
        ("location", StringType()),
        ("favorite_count", DoubleType()),
        ("user_id", StringType()),
        ("place", StringType()),
        ("user_followers_count", StringType()),
    )
])

# Cast the message value to a string so from_json can parse it.
kafka_df_string = kafka_df.selectExpr("CAST(value AS STRING)")

# Row-at-a-time Python UDF wrapping the sentiment classifier.
one_row_udf = udf(predict_sentiment, StringType())

# Parse the JSON, flatten the struct into top-level columns, then append the
# predicted sentiment of the tweet text.
tweets_table = (
    kafka_df_string
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
    .withColumn('sentiment', one_row_udf(col('text')))
)

In [None]:
# Start a streaming query that writes to the in-memory sink; its results are
# exposed as a table named "device_counts" that spark.sql can read.
query = (
    tweets_table.writeStream
    .format("memory")
    .queryName("device_counts")
    .start()
)

In [None]:
# Print whatever the in-memory sink has collected so far (show's defaults
# spelled out: first 20 rows, long values truncated). Re-run to refresh.
(
    spark
    .sql('SELECT * FROM device_counts')
    .show(n=20, truncate=True)
)

In [None]:
# Persist the scored tweets to HDFS as CSV, appending new rows every
# 5 seconds. The checkpoint location lets the query keep track of progress.
# Left as a bare expression so the cell displays the started StreamingQuery.
(
    tweets_table.writeStream
    .format("csv")
    .outputMode("append")
    .option('path', 'hdfs://namenode:9000/data/trump.csv')
    .option("checkpointLocation", "hdfs://namenode:9000/checkpoints")
    .trigger(processingTime='5 seconds')
    .start()
)
