In [2]:
#%%capture
#!wget "https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.5.0/spark-sql-kafka-0-10_2.12-3.5.0.jar"
#!wget "https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10_2.12/3.5.0/spark-streaming-kafka-0-10_2.12-3.5.0.jar"

In [3]:
import os
# Kafka connector artifacts (Scala 2.12 builds, version 3.5.0) handed to
# spark-submit through its --packages option.
kafka_packages = [
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.0',
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0',
]
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages ' + ','.join(kafka_packages) + ' pyspark-shell'
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the SparkSession for this notebook under the application name
# "StreamingMLPrediction".
session_builder = SparkSession.builder.appName("StreamingMLPrediction")
spark = session_builder.getOrCreate()

# Only ERROR-level log messages are kept, which cuts down console noise.
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

# Mock sentiment corpus used to fit the model.
# Label convention: 0.0 = negative, 1.0 = positive.
negative_texts = [
    "The stream is failing due to error",
    "This is slow and bad performance",
]
positive_texts = [
    "PySpark is excellent and fast",
    "I love this streaming architecture",
]

# Negative rows first, then positive rows, as (label, text) tuples.
training_rows = [(0.0, sentence) for sentence in negative_texts]
training_rows += [(1.0, sentence) for sentence in positive_texts]

training_data = spark.createDataFrame(training_rows, ["label", "text"])

# ML pipeline stages, declared in the order they are executed.

# Stage 1 - split the "text" column into a "words" token array.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2 - hash the tokens into a fixed-size (20-slot) term-frequency vector.
hashingTF = HashingTF(
    numFeatures=20,
    inputCol="words",
    outputCol="rawFeatures",
)

# Stage 3 - re-weight the raw term frequencies with inverse document frequency.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

# Stage 4 - logistic-regression classifier; no column names are passed, so it
# uses its default feature/label columns.
lr = LogisticRegression(
    regParam=0.001,
    maxIter=10,
)

# Assemble the stages into one Pipeline (an Estimator) ...
pipeline_stages = [tokenizer, hashingTF, idf, lr]
pipeline = Pipeline(stages=pipeline_stages)

# ... and fit it on the mock data to obtain a PipelineModel (a Transformer).
model = pipeline.fit(training_data)

print(">>> Sentiment Model trained and ready for streaming inference.")

In [None]:
# Streaming source: the Kafka topic "input" on broker 10.224.177.98:8097,
# with startingOffsets set to "latest".
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "10.224.177.98:8097")
kafka_reader = kafka_reader.option("subscribe", "input")
kafka_reader = kafka_reader.option("startingOffsets", "latest")

lines = kafka_reader.load()

In [6]:
# Show the columns of the Kafka source DataFrame; the message payload is in
# the binary `value` column.
lines.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [None]:
# Kafka delivers the payload as binary, so decode it into the "text" column
# that the pipeline's Tokenizer reads. Records with a null value (e.g.
# tombstones) are dropped first: they would become null text, and Spark ML's
# Tokenizer fails on null input, which would terminate the streaming query.
streaming_text = (
    lines
    .where("value IS NOT NULL")
    .selectExpr("CAST(value AS STRING) as text")
)

# Apply the trained model to the incoming Kafka messages
predictions = model.transform(streaming_text)

# Select relevant columns for output
output_df = predictions.select("text", "prediction")

# Write the results to the console
query = (
    output_df.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# Block this cell until the query ends. If the cell is interrupted (kernel
# interrupt) or the query fails, stop the query explicitly so it does not keep
# running in the background after the cell has returned.
try:
    query.awaitTermination()
finally:
    query.stop()

In [38]:
# Cleanup: stop every streaming query this session still reports as active.
running_queries = spark.streams.active
for running_query in running_queries:
    running_query.stop()