In [1]:
# Bootstrap: let findspark locate the local Spark installation.
import findspark
findspark.init()
# The value returned by this call is discarded: a notebook cell only displays
# its last expression.
findspark.find()

# pyspark is imported only after findspark.init() has run — presumably because
# init() is what makes pyspark importable (see the findspark docs). Keep this order.
import pyspark
# Last expression of the cell: its return value is what the cell displays.
findspark.find()

'C:\\Apps\\spark-2.4.7-bin-hadoop2.7'

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, desc, upper
from pyspark.sql.types import *
import os

In [3]:
# Ask spark-submit to download the Kafka source for Structured Streaming.
# This must run before the SparkSession is created, because the arguments are
# read when the JVM is launched.
#
# The connector version is kept equal to the Spark version in use (the
# installation reported by findspark is spark-2.4.7, Scala 2.11 build); the
# Kafka integration guide asks for matching versions.
# NOTE(review): update this coordinate whenever the Spark installation changes.
KAFKA_CONNECTOR_PACKAGE = 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.7'

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages {} pyspark-shell'.format(KAFKA_CONNECTOR_PACKAGE)

In [4]:
# Create the SparkSession used by every later cell, or reuse the one that is
# already running.
spark = (
    SparkSession.builder
    .appName('Spark Structured Streaming')
    .getOrCreate()
)

In [5]:
# Open a streaming reader subscribed to the "twitter" Kafka topic on the
# broker at localhost:9092.
df = (
    spark.readStream
    .format('kafka')
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "twitter")
    .load()
)

In [6]:
# JSON schema passed to from_json to map each message (a JSON string) onto
# typed DataFrame columns. The type classes come from pyspark.sql.types.

# One entry of the "hashtags" array.
hashtag_schema = StructType([
    StructField("text", StringType(), True),
    StructField("indices", ArrayType(IntegerType(), True)),
])

# The "geo" object.
# Coordinates are declared as doubles, not longs: geographic coordinates are
# decimal latitude/longitude values, and on Spark 2.4 a single field that
# cannot be converted to its declared type makes from_json return nulls for
# the whole record. DoubleType also accepts integer JSON numbers.
# NOTE(review): assumes the producer forwards Twitter's geo object unchanged —
# confirm against the producer code.
geo_schema = StructType([
    StructField("type", StringType(), True),
    StructField("coordinates", ArrayType(DoubleType(), True)),
])

jsonSchema = StructType([
    StructField("created_at", DoubleType(), True),
    StructField("hashtags", ArrayType(hashtag_schema), True),
    StructField("favorite_count", DoubleType(), True),
    StructField("retweet_count", DoubleType(), True),
    StructField("text", StringType(), True),
    StructField("id", StringType(), True),
    StructField("geo", geo_schema, True),
    StructField("lang", StringType(), True),
])

In [7]:
# Cast the Kafka key and value to strings, then parse the value with
# jsonSchema into a struct column named "json".
kafka_as_text = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
parsed_payload = from_json(col('value').cast("string"), jsonSchema)
df = kafka_as_text.withColumn("json", parsed_payload)

In [8]:
# Row count per json.lang, largest count first, written to the console in
# "complete" output mode with column truncation disabled.
rows_per_lang = df.groupBy("json.lang").count().sort(desc("count"))
lang_count = (
    rows_per_lang.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [9]:

# Stop the query. This is for testing purposes only; in production we would
# use awaitTermination() instead.
lang_count.stop()

### Exercício 1

Some os valores do campo *favorite_count* por *lang*

In [16]:
# Exercise 1: sum of json.favorite_count per json.lang, written to the console
# in "complete" output mode with column truncation disabled.
favorites_by_lang = df.groupBy("json.lang").sum("json.favorite_count")
favorite_sum = (
    favorites_by_lang.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [17]:
# Stop the streaming query held in favorite_sum.
favorite_sum.stop()

### Exercício 2

Deixe os valores do campo *lang* em caixa alta

Dica: upper()

In [26]:
# Exercise 2: json.lang converted to upper case, written to the console in
# "update" output mode with column truncation disabled.
lang_upper = df.select(upper(col("json.lang")))
lang_alta = (
    lang_upper.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [27]:
# Stop the streaming query held in lang_alta.
lang_alta.stop()