In [2]:
#%%capture
#!wget "https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.5.0/spark-sql-kafka-0-10_2.12-3.5.0.jar"
#!wget "https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10_2.12/3.5.0/spark-streaming-kafka-0-10_2.12-3.5.0.jar"

In [1]:
import os
# Submit arguments for PySpark: request the two Kafka connector artifacts
# (spark-streaming-kafka and spark-sql-kafka, both 3.5.0) via --packages.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 pyspark-shell',
})

In [23]:

#!pip install river

In [24]:
import pandas as pd
import os
import pickle
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from river import linear_model, feature_extraction, compose

# --- Configuration ---
# Location of the pickled River pipeline; it is read at the start of every
# micro-batch and rewritten at the end of it.
MODEL_STATE_PATH = "/tmp/river_sentiment_model_text.pkl"
# ---

# Get (or create) the Spark session for this notebook and limit Spark's own
# log output to warnings and above.
spark = (
    SparkSession.builder
    .appName("RiverTextStream")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")

In [25]:
def get_or_create_model():
    """Return the persisted River pipeline, or build a fresh one if none is saved.

    If a file exists at MODEL_STATE_PATH it is unpickled and returned unchanged.
    Otherwise a new two-step pipeline is assembled:

    * 'vectorizer' - BagOfWords over lowercased unigrams and bigrams
    * 'lr'         - River's LogisticRegression, updated one sample at a time
    """
    if not os.path.exists(MODEL_STATE_PATH):
        print("Initializing new River Logistic Regression Pipeline...")
        # Transformer first, estimator second: text -> token counts -> classifier.
        return compose.Pipeline(
            ('vectorizer', feature_extraction.BagOfWords(lowercase=True, ngram_range=(1, 2))),
            ('lr', linear_model.LogisticRegression()),
        )

    print("Loading existing River model state...")
    with open(MODEL_STATE_PATH, 'rb') as state_file:
        return pickle.load(state_file)

In [26]:
def save_model(model):
    """Persist the current state of the River model to MODEL_STATE_PATH.

    The pickle is first written to a sibling temporary file and then moved over
    the real path with os.replace. An interruption in the middle of the write
    (for example a kernel interrupt) therefore cannot leave a truncated model
    file behind for the next load to fail on.

    Saving is best effort: any error is reported on stdout and not re-raised.

    Args:
        model: The picklable River model/pipeline to store.
    """
    tmp_path = f"{MODEL_STATE_PATH}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(model, f)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, MODEL_STATE_PATH)
        print(f"Model state saved to {MODEL_STATE_PATH}")
    except Exception as e:
        print(f"Error saving model state: {e}")
        # Do not leave a partial temp file lying around; it may not exist at
        # all if opening it was what failed, so a failure here is ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

In [27]:
def process_batch_mixed_mode(micro_batch_df: pd.DataFrame) -> pd.DataFrame:
    """Predict every row of a micro-batch and train on the rows that carry a label.

    The model is loaded once at the start of the batch and saved once at the
    end. For each row the prediction is made *before* the model learns from
    that row, so the reported prediction never benefits from the row's own label.

    Args:
        micro_batch_df: Pandas frame with a 'text' column and a 'label' column.
            A missing label (None/NaN) marks the row as prediction-only. A
            missing text is skipped entirely (no prediction, no training).

    Returns:
        A new frame with the columns 'text', 'label' and 'raw_prediction'
        (None for rows without text). The input frame is left unmodified.
    """
    model = get_or_create_model()  # Load the current model state

    predictions = []

    # Walk the two columns in lockstep; cheaper than iterrows and keeps the
    # per-column values as they are.
    for text_sample, true_label in zip(micro_batch_df['text'], micro_batch_df['label']):
        print(text_sample)
        print(true_label)

        # A row without text cannot be vectorized; keep a placeholder so the
        # predictions stay aligned with the rows.
        if pd.isna(text_sample):
            predictions.append(None)
            print(text_sample, " has no text, skipped.")
            continue

        # 1. Prediction (always runs first)
        y_pred = model.predict_one(text_sample)
        predictions.append(y_pred)
        print(text_sample, " predicted as ", y_pred)

        # 2. Conditional training (only when a label is present)
        if pd.notna(true_label):
            # Labels may arrive as floats when the column contains NaN; cast back to int.
            model.learn_one(text_sample, int(true_label))
            print(text_sample, " has y must train.")
        else:
            print(text_sample, " has no y.")

    save_model(model)  # Save the state after processing the entire batch

    # Build the result on a new frame instead of adding a column to the caller's.
    return micro_batch_df.assign(raw_prediction=predictions)[['text', 'label', 'raw_prediction']]

In [28]:
def apply_online_learning(micro_batch_df, batch_id):
    """foreachBatch entry point: run the online model over one micro-batch.

    The Spark micro-batch is collected to Pandas, rows without text are
    discarded, the remaining rows are predicted/learned by
    process_batch_mixed_mode, and the result is shown on the console.

    Args:
        micro_batch_df: Spark DataFrame for the micro-batch; must have 'text'
            and 'label' columns.
        batch_id: Batch identifier supplied by the streaming engine; only used
            in the log line.
    """
    print(f"--- Processing Batch ID: {batch_id} ---")

    # Convert Spark DataFrame to Pandas DataFrame. Rows whose 'text' is null
    # (e.g. a message that did not parse against the expected schema) cannot
    # be fed to the model, so they are dropped before any processing.
    pandas_df = (
        micro_batch_df.toPandas()
        .dropna(subset=["text"])
        .reset_index(drop=True)
    )

    if pandas_df.empty:
        print("Empty batch, skipping ML update.")
        return

    # Process and learn from the batch
    result_df = process_batch_mixed_mode(pandas_df)

    # Write results back to a sink (Console in this case)
    spark_result_df = spark.createDataFrame(result_df)
    spark_result_df.show(truncate=False)

In [29]:
# Schema of the JSON document expected in each Kafka message value.
# Both fields are nullable.
kafka_schema = StructType(fields=[
    StructField(name="text", dataType=StringType(), nullable=True),
    # 0 or 1 for binary classification
    StructField(name="label", dataType=IntegerType(), nullable=True),
])

In [30]:
# 1. Open a streaming read on the Kafka topic "input", with starting offsets
#    set to "latest".
raw_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "10.10.83.206:8097")
    .option("subscribe", "input")
    .option("startingOffsets", "latest")
    .load()
)

In [31]:
# 2. Cast the Kafka value to a string, parse it as JSON against kafka_schema,
#    and flatten the resulting struct into top-level columns.
parsed_stream = (
    raw_stream
    .select(from_json(col("value").cast("string"), kafka_schema).alias("data"))
    .select("data.*")
)

In [32]:
# 3. Apply foreachBatch for Online Learning
query = (
    parsed_stream.writeStream
    .outputMode("update")
    .foreachBatch(apply_online_learning)
    .option("checkpointLocation", "/tmp/spark/river_checkpoint")
    .start()
)

# awaitTermination blocks this cell until the query ends. Interrupting the
# kernel only breaks out of the wait; the streaming query itself would keep
# running in the background. Stop it on the way out so an interrupt (or a
# query failure) always leaves the session without a dangling stream. The
# original exception, if any, still propagates.
try:
    query.awaitTermination()
finally:
    query.stop()

--- Processing Batch ID: 9 ---
Loading existing River model state...
My cat is good.
1
My cat is good.  predicted as  True
My cat is good.  has y.
Model state saved to /tmp/river_sentiment_model_text.pkl
+---------------+-----+--------------+
|text           |label|raw_prediction|
+---------------+-----+--------------+
|My cat is good.|1    |true          |
+---------------+-----+--------------+

--- Processing Batch ID: 10 ---
Loading existing River model state...
My dog is bad.
0
My dog is bad.  predicted as  False
My dog is bad.  has y.
Model state saved to /tmp/river_sentiment_model_text.pkl
+--------------+-----+--------------+
|text          |label|raw_prediction|
+--------------+-----+--------------+
|My dog is bad.|0    |false         |
+--------------+-----+--------------+

--- Processing Batch ID: 11 ---
Loading existing River model state...
My bad is good.
nan
My bad is good.  predicted as  True
My bad is good.  has no y.
Model state saved to /tmp/river_sentiment_model_text.

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [22]:
# Shut down every streaming query that is still active on this Spark session.
for active_query in spark.streams.active:
    active_query.stop()