In [1]:
# Point this kernel at the local Spark installation so that the
# `import pyspark` below resolves against it.
import findspark
findspark.init()

import pyspark

# Last expression of the cell: display the Spark home that was picked up.
# (Called once; an earlier duplicate call whose result was discarded is gone.)
findspark.find()

'C:\\Apps\\spark-2.4.7-bin-hadoop2.7'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, desc, trim, col, sum, avg, max, to_json

from pyspark.sql.types import *
import os

In [3]:
## Download Jars for Spark

# Ask spark-submit to pull the Structured Streaming Kafka connector.
# The connector release is kept in step with the Spark installation used by
# this notebook (2.4.7, Scala 2.11 build); update both together.
# This is set before the SparkSession is created in a later cell.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.7 pyspark-shell'

In [4]:
# Initialize Spark Session

# Build the session (or reuse one that already exists). No master is set
# here, so whatever default the environment provides is used.
spark = (
    SparkSession.builder
    .appName('Spark Structured Streaming')
    .getOrCreate()
)

# To target a standalone cluster instead, add a master URL to the builder
# before getOrCreate(), e.g.:
#     .master("spark://0.0.0.0:7077")

## Kafka source

In [5]:
# Subscribe to Kafka topic

# Streaming DataFrame over the "log" topic served by the local broker.
df = (
    spark.readStream
    .format('kafka')
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "log")
    .load()
)

In [6]:
# Schema of the JSON payload: every field is declared as a nullable string.
# Field order below is the order of the resulting struct.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "Date first seen",
        "Duration",
        "Proto",
        "Src IP Addr",
        "Src Pt",
        "Packets",
        "Bytes",
        "Flows",
        "Flags",
        "Tos",
        "class",
        "attackType",
        "attackID",
        "attackDescription",
    )
])

In [7]:
# Decode the Kafka key/value columns to strings, then parse the JSON payload
# into a struct column named "json" using `schema`.
# `value` is already a string after the selectExpr, so it is passed to
# from_json directly. The chain is parenthesised so no line-continuation
# backslash is left dangling at the end of the statement.
df = (
    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .withColumn("json", from_json(col('value'), schema))
)

In [8]:
# Cast columns

# Add three typed top-level columns derived from the parsed payload:
# a timestamp, a double, and an integer (whitespace-trimmed before casting).
# The nested json.* fields themselves are not modified.
df = (
    df
    .withColumn("Date first seen", col("json.Date first seen").cast(TimestampType()))
    .withColumn("Duration", col("json.Duration").cast("double"))
    .withColumn("Packets", trim(col("json.Packets")).cast(IntegerType()))
)

### Append Mode

In [11]:
query = df.select("json.*").writeStream \
            .outputMode("append") \
            .format("console") \
            .option("truncate", "false") \
            .start()

In [None]:
query.stop()

### Update Mode

In [13]:
# Avg of the numeric types

avg = df.groupBy("json.class").mean() \
        .writeStream \
        .outputMode("update") \
        .format("console") \
        .option("truncate", "false") \
        .start()

In [14]:
avg.stop()

### Complete Mode

In [16]:
# Filter and GroupBy by Class

class_query = df.select("json.*").where(col("Class") != "normal") \
                .groupBy("Class").count() \
                .writeStream \
                .outputMode("complete") \
                .format("console") \
                .option("truncate", "false") \
                .start()

In [17]:
class_query.stop()

In [9]:
# Sum, Count and AVG by class

class_query_2 = df.groupBy("json.class") \
                .agg({'json.class':'count', 'Packets': 'sum', 'Duration': 'avg' }) \
                .writeStream \
                .outputMode("complete") \
                .format("console") \
                .option("truncate", "false") \
                .start()

In [10]:
class_query_2.stop()

In [11]:
# Print the column layout of `df`: the decoded key/value strings, the parsed
# `json` struct, and the typed top-level columns added by the cast cell.
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)
 |-- json: struct (nullable = true)
 |    |-- Date first seen: string (nullable = true)
 |    |-- Duration: string (nullable = true)
 |    |-- Proto: string (nullable = true)
 |    |-- Src IP Addr: string (nullable = true)
 |    |-- Src Pt: string (nullable = true)
 |    |-- Packets: string (nullable = true)
 |    |-- Bytes: string (nullable = true)
 |    |-- Flows: string (nullable = true)
 |    |-- Flags: string (nullable = true)
 |    |-- Tos: string (nullable = true)
 |    |-- class: string (nullable = true)
 |    |-- attackType: string (nullable = true)
 |    |-- attackID: string (nullable = true)
 |    |-- attackDescription: string (nullable = true)
 |-- Date first seen: timestamp (nullable = true)
 |-- Duration: double (nullable = true)
 |-- Packets: integer (nullable = true)



## Kafka sink

In [14]:
kafka_query = df.select(col("key"), col("value")).withColumn("value", col("value").cast(StringType())) \
        .writeStream.format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("checkpointLocation", "kafka_checpoint/") \
        .option("failOnDataLoss", "false") \
        .option("topic", "log-result") \
        .start()

In [15]:
kafka_query.stop()