In [1]:
import os

# Both locations are resolved relative to the parent of the current working directory.
spark_home = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "spark-3.5.5-bin-hadoop3"))
hadoop_home = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "winutils"))
print(f"I am using the following SPARK_HOME: {spark_home}")
if os.name == 'nt':
    # Windows only: expose the winutils directory through HADOOP_HOME and PATH.
    os.environ["HADOOP_HOME"] = hadoop_home
    print(f"Windows detected: set HADOOP_HOME to: {os.environ['HADOOP_HOME']}")
    hadoop_bin = os.path.join(hadoop_home, "bin")
    # Prepend only when the entry is missing, so re-running this cell does not
    # keep stacking duplicate copies of the same directory onto PATH.
    current_path = os.environ.get("PATH", "")
    if hadoop_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = (
            f"{hadoop_bin}{os.pathsep}{current_path}" if current_path else hadoop_bin
        )
    print(f"  Also added Hadoop bin directory to PATH: {hadoop_bin}")

import findspark
import pyspark
from pyspark.streaming import StreamingContext

# Initialise findspark with the Spark distribution located above.
findspark.init(spark_home)
# getOrCreate() reuses a context that is already running, so re-executing this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()

I am using the following SPARK_HOME: C:\Users\Seppe\Desktop\spark\spark-3.5.5-bin-hadoop3
Windows detected: set HADOOP_HOME to: C:\Users\Seppe\Desktop\spark\winutils


In [None]:
import threading

# Helper thread to prevent the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread wrapper that runs a streaming context off the notebook's main thread.

    ``run`` starts the wrapped context and then waits on ``awaitTermination``,
    so the blocking wait happens on this thread. ``stop`` asks the context to
    shut down while leaving the underlying SparkContext alive.
    """

    def __init__(self, ssc):
        """Remember the streaming context ``ssc`` that this thread will drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then block until it terminates."""
        streaming_context = self.ssc
        streaming_context.start()
        streaming_context.awaitTermination()

    def stop(self):
        """Announce the shutdown, then stop the stream but keep the SparkContext."""
        print("----- Stopping... this may take a few seconds -----")
        # "stopGraceFully" (capital F) is the keyword spelling used by PySpark itself.
        stop_options = {"stopSparkContext": False, "stopGraceFully": True}
        self.ssc.stop(**stop_options)
        

In [None]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

import numpy as np
import pickle
import pandas as pd
from pyspark.sql.functions import udf, col, concat_ws, lit
from pyspark.sql.types import ArrayType, FloatType, StringType
from sentence_transformers import SentenceTransformer


In [None]:
# Module-level state read and updated by process(): a flag recording whether the
# model has been loaded yet, and the slot that will hold the loaded model.
models_loaded = False
my_model = None

def predict(df):
    """Toy predict function that returns a random probability.

    The argument is ignored. Normally you'd use your loaded
    globals()['my_model'] here instead of drawing a random number.
    """
    probability = random.random()
    return probability

# predict() draws from random.random(), so the UDF must be flagged as
# non-deterministic; Spark treats UDFs as deterministic by default and may then
# eliminate or duplicate invocations during query optimisation.
# NOTE(review): the return type is declared as StringType although predict()
# returns a float; kept as-is so the "pred" column schema does not change.
predict_udf = udf(predict, StringType()).asNondeterministic()

def process(time, rdd):
    """Handle one streaming batch: display its records and a toy prediction per row.

    time -- batch timestamp; only used for the printed banner.
    rdd  -- the batch's RDD; handed to spark.read.json (presumably JSON strings).

    Returns None. Empty batches are skipped without printing anything.
    """
    # Nothing to do when the batch carries no records
    if rdd.isEmpty():
        return

    banner = "========= %s =========" % str(time)
    print(banner)

    # Turn the batch into a data frame and display it
    df = spark.read.json(rdd)
    df.show()

    # Pack every column into one struct and pass it through our predict function
    all_columns = [df[name] for name in df.columns]
    df_withpreds = df.withColumn("pred", predict_udf(struct(all_columns)))
    df_withpreds.show()

    # Normally, you wouldn't predict with a Python UDF (User Defined Function) as done
    # above (although you can), but with an MLlib model you've built and saved with Spark.
    # In that case, avoid re-loading the model on every call to "process" as follows:

    # Load the model only on the first non-empty batch
    module_state = globals()
    if not module_state['models_loaded']:
        # load in your models here
        module_state['my_model'] = '***' # Replace '***' with e.g.:    [...].load('my_logistic_regression')
        module_state['models_loaded'] = True

    # And then predict using the loaded model (uncomment below):

    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [5]:
# Streaming context on top of the existing SparkContext; batches are cut every 10 seconds
ssc = StreamingContext(sc, batchDuration=10)



+--------------------+----------+-------------+--------------------+--------------------+--------------------+
|                 aid|categories|main_category|           published|             summary|               title|
+--------------------+----------+-------------+--------------------+--------------------+--------------------+
|http://arxiv.org/...|     cs.CV|        cs.CV|2025-03-25T05:00:11Z|Sketch animation,...|Multi-Object Sket...|
+--------------------+----------+-------------+--------------------+--------------------+--------------------+

+--------------------+----------+-------------+--------------------+--------------------+--------------------+------------------+
|                 aid|categories|main_category|           published|             summary|               title|              pred|
+--------------------+----------+-------------+--------------------+--------------------+--------------------+------------------+
|http://arxiv.org/...|     cs.CV|        cs.CV|2025-03

In [6]:
# Read text lines from the socket at seppe.net:7778 and hand every batch to process()
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(process)

In [7]:
# Drive the streaming context from a helper thread so this cell returns immediately
ssc_t = StreamingThread(ssc=ssc)
ssc_t.start()

In [8]:
# Ask the helper thread to stop the streaming context (the SparkContext is left running)
ssc_t.stop()

----- Stopping... this may take a few seconds -----
