<a href="https://colab.research.google.com/github/ekaratnida/Data_Streaming_and_Realtime_Analytics/blob/main/Week7_twitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.8-bin-hadoop2.7.tgz
!pip install findspark
!wget "https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-8-assembly_2.11/2.4.8/spark-streaming-kafka-0-8-assembly_2.11-2.4.8.jar"
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /content/spark-streaming-kafka-0-8-assembly_2.11-2.4.8.jar pyspark-shell'

In [24]:
# importing required libraries
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row, Column
import sys
from pyspark import SparkFiles

In [16]:
# define the function to get the predicted sentiment on the data received
def get_prediction(tweet_text):
  """Print the predicted sentiment for every tweet of one micro-batch.

  Meant to be passed to ``DStream.foreachRDD``. Uses the notebook globals
  ``spark`` (to build the DataFrame) and ``pipelineFit`` (the fitted
  pipeline), which must exist before the stream is started.

  Args:
    tweet_text: RDD-like object holding the tweet strings of the batch; it
      must support ``filter``, ``isEmpty`` and ``map``.

  Returns:
    None. The result is printed: the ``tweet``/``prediction`` table, or
    ``No data`` when the batch holds no non-blank tweet. Failures are
    printed instead of raised so one bad batch does not stop the stream.
  """
  try:

    # remove the blank tweets
    tweet_text = tweet_text.filter(lambda x: len(x) > 0)

    # An empty batch is the normal idle case: report it explicitly instead of
    # relying on an exception raised further down.
    if tweet_text.isEmpty():
      print('No data')
      return

    # create the dataframe with each row contains a tweet text
    rowRdd = tweet_text.map(lambda w: Row(tweet=w))
    wordsDataFrame = spark.createDataFrame(rowRdd)

    # get the sentiments for each row
    pipelineFit.transform(wordsDataFrame).select('tweet','prediction').show()

  except Exception as exc:
    # Real failures (missing model, schema problems, worker errors) must not
    # be reported as "No data"; show what went wrong but keep the stream alive.
    print('Prediction failed:', repr(exc))



In [None]:
# Reuse the live session when this cell is executed again: constructing a
# second SparkContext while one is running raises an error, whereas
# getOrCreate() returns the existing session or starts a new one.
spark = SparkSession.builder.appName("PySparkShell").getOrCreate()
sc = spark.sparkContext

# Download the labelled training tweets through Spark's file distribution.
url = "https://raw.githubusercontent.com/lakshay-arora/PySpark/master/spark_streaming/datasets/twitter_sentiments.csv"
spark.sparkContext.addFile(url)

# define the schema
my_schema = tp.StructType([
        tp.StructField(name= 'id',          dataType= tp.IntegerType(),  nullable= True),
        tp.StructField(name= 'label',       dataType= tp.IntegerType(),  nullable= True),
        tp.StructField(name= 'tweet',       dataType= tp.StringType(),   nullable= True)    
            ])
# reading the data set
print('\n\nReading the dataset...........................\n')
# SparkFiles.get resolves the local path of the file registered with addFile.
my_data = spark.read.csv("file://"+SparkFiles.get("twitter_sentiments.csv"), schema=my_schema, header=True)
my_data.show(2)

my_data.printSchema()
print('\n\nDefining the pipeline stages.................\n')
# stage 1: split each tweet into tokens on non-word characters
stage_1 = RegexTokenizer(inputCol= 'tweet' , outputCol= 'tokens', pattern= '\\W')

# stage 2: drop stop words from the token list
stage_2 = StopWordsRemover(inputCol= 'tokens', outputCol= 'filtered_words')

# stage 3: embed the remaining words as a 100-dimensional vector
stage_3 = Word2Vec(inputCol= 'filtered_words', outputCol= 'vector', vectorSize= 100)

# final stage: logistic regression on the vector, trained against 'label'
model = LogisticRegression(featuresCol= 'vector', labelCol= 'label') 

print('\n\nStages Defined................................\n')
pipeline = Pipeline(stages= [stage_1, stage_2, stage_3, model])

print('\n\nFit the pipeline with the training data.......\n')
# pipelineFit is the fitted model that get_prediction applies to each batch.
pipelineFit = pipeline.fit(my_data)

print('\n\nModel Trained....Waiting for the Data!!!!!!!!\n')


In [27]:
from pyspark.streaming.kafka import KafkaUtils
# Kafka topic the streaming cell subscribes to.
kafka_topic_name = "my-first-topic"
# Broker address as host:port. It can be overridden with the
# KAFKA_BOOTSTRAP_SERVERS environment variable so the notebook can be pointed
# at another cluster without editing it; the default is the original host.
# (`os` is imported in the setup cell at the top of the notebook.)
kafka_bootstrap_servers = os.environ.get(
    'KAFKA_BOOTSTRAP_SERVERS',
    'ec2-13-229-46-113.ap-southeast-1.compute.amazonaws.com:9092')

In [None]:
# Micro-batches of 3 seconds on top of the existing SparkContext.
ssc = StreamingContext(sc, batchDuration= 3)

# Create exactly one input stream. Every stream created on a context stays
# registered with it even when its Python variable is overwritten, so only
# the direct stream that is actually consumed below is created.
kvs = KafkaUtils.createDirectStream(ssc, [kafka_topic_name], {
                        'bootstrap.servers':kafka_bootstrap_servers,
                        'group.id':'test-group',
                        'auto.offset.reset':'largest'})

# Records arrive as pairs; keep the second element (the message payload).
lines = kvs.map(lambda x: x[1])
# One message may carry several tweets separated by the 'TWEET_APP' marker.
words = lines.flatMap(lambda line : line.split('TWEET_APP'))

# Score every micro-batch with the trained pipeline.
words.foreachRDD(get_prediction)

ssc.start()             # Start the computation
# Blocks this cell until the stream terminates; interrupt the kernel to
# regain control and run the shutdown cell.
ssc.awaitTermination()  # Wait for the computation to terminate

In [None]:
# Shut the streaming context down first, spelling out the defaults of stop():
# the SparkContext is stopped as well and in-flight batches are not awaited.
ssc.stop(stopSparkContext=True, stopGraceFully=False)
# Explicit stop of the SparkContext, kept from the original teardown.
sc.stop()