In [1]:
# findspark.init() locates the local Spark installation and makes its bundled
# pyspark importable, so it has to run before `import pyspark`.
import findspark
findspark.init()

import pyspark

# Last expression of the cell: displays the Spark home that findspark resolved.
findspark.find()

'C:\\Apps\\spark-2.4.7-bin-hadoop2.7'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, desc
from pyspark.sql.types import *
import os

In [3]:
## Download Jars for Spark
# PYSPARK_SUBMIT_ARGS has to be set before the SparkSession is created so that
# the listed packages are resolved and downloaded when the JVM starts.
# The connector version is kept in line with the Spark installation used by this
# notebook (spark-2.4.7-bin-hadoop2.7, Scala 2.11 build).

os.environ['PYSPARK_SUBMIT_ARGS'] = ' --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.7 pyspark-shell'
# Alternative package (Elasticsearch connector). Assigning the variable twice would
# overwrite the first value; to load both, pass a comma-separated list to --packages.
#os.environ['PYSPARK_SUBMIT_ARGS'] = ' --packages org.elasticsearch:elasticsearch-hadoop:7.12.1 pyspark-shell'

In [4]:
# Create the SparkSession used by every cell below, or reuse one that already exists.
# No master is set explicitly here.
spark = (
    SparkSession.builder
    .appName('Spark Structured Streaming')
    .getOrCreate()
)

# Variant that targets an explicit standalone master instead:
# spark = SparkSession.builder.appName('Spark Structured Streaming') \
#         .master("spark://0.0.0.0:7077") \
#         .getOrCreate()

In [5]:
# Raise Spark's log level to DEBUG for troubleshooting.
# DEBUG output is very verbose; skip this cell when you are not debugging.
spark.sparkContext.setLogLevel("DEBUG")

## Kafka Input Streaming

In [5]:
# Streaming source: subscribe to the "twitter" topic on the Kafka broker at localhost:9092.
df = (
    spark.readStream
    .format('kafka')
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "twitter")
    .load()
)

In [6]:
# JSON schema of the message payload. from_json needs it to map the JSON string
# onto typed DataFrame columns (the type classes come from pyspark.sql.types).

# One element of the "hashtags" array.
hashtag_schema = StructType([
    StructField("text", StringType(), True),
    StructField("indices", ArrayType(IntegerType(), True)),
])

# The nested "geo" object.
# NOTE(review): coordinate elements are declared as long; if the producer sends
# fractional latitude/longitude values they would need DoubleType - confirm
# against the actual messages.
geo_schema = StructType([
    StructField("type", StringType(), True),
    StructField("coordinates", ArrayType(LongType(), True)),
])

# Top-level record; every field is nullable.
jsonSchema = StructType([
    StructField("created_at", DoubleType(), True),
    StructField("hashtags", ArrayType(hashtag_schema), True),
    StructField("favorite_count", DoubleType(), True),
    StructField("retweet_count", DoubleType(), True),
    StructField("text", StringType(), True),
    StructField("id", StringType(), True),
    StructField("geo", geo_schema, True),
    StructField("lang", StringType(), True),
])

In [7]:
# Kafka delivers key and value as binary: cast both to strings, then parse the
# value string with jsonSchema into a struct column named "json".
parsed_value = from_json(col('value').cast("string"), jsonSchema)

df = (
    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .withColumn("json", parsed_value)
)

In [8]:
# Print the resulting schema: key, value and the parsed "json" struct.
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)
 |-- json: struct (nullable = true)
 |    |-- created_at: double (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: integer (containsNull = true)
 |    |-- favorite_count: double (nullable = true)
 |    |-- retweet_count: double (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- coordinates: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |-- lang: string (nullable = true)



## Exercises

In [9]:
# Running record count per language, most frequent first, printed to the console.
# "complete" output mode writes the whole aggregated table on every trigger.
lang_totals = df.groupBy("json.lang").count().sort(desc("count"))

lang_count = (
    lang_totals.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [10]:
# Stop the query - this is for testing purposes; in production we use awaitTermination()
lang_count.stop()

In [9]:
# Running total of all records received so far: groupBy() with no keys
# aggregates over the whole stream. The result is printed to the console.
count = (
    df.groupBy()
    .count()
    .writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [15]:
# Stop the total-count streaming query.
count.stop()

In [9]:
# Print the incoming records (key, value and parsed json) to the console, untruncated.
query = (
    df.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [10]:
# Stop the console query that prints the entries.
query.stop()

## Kafka Output Streaming

In [9]:
# Streaming sink: publish the stream to the "twitter-result" topic on the Kafka
# broker at localhost:9092. checkpointLocation is the directory where the query
# stores its progress.
# NOTE(review): "kafka_checpoint/" looks like a typo for "checkpoint"; it is kept
# as-is because pointing at a different directory would start from a fresh checkpoint.
# NOTE(review): failOnDataLoss is documented as a Kafka *source* option - confirm
# whether it was meant for the readStream side instead.
kafka = (
    df.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("checkpointLocation", "kafka_checpoint/")
    .option("failOnDataLoss", "false")
    .option("topic", "twitter-result")
    .start()
)

In [10]:
# Stop the Kafka output streaming query.
kafka.stop()