In [None]:
import os

import findspark

# Spark distribution and (Windows-only) winutils directories, resolved
# relative to the directory the notebook was started from.
spark_home = os.path.abspath(os.getcwd() + "/spark/spark-3.5.5-bin-hadoop3")
hadoop_home = os.path.abspath(os.getcwd() + "/spark/winutils")
print(f"I am using the following SPARK_HOME: {spark_home}")
if os.name == 'nt':
    os.environ["HADOOP_HOME"] = f"{hadoop_home}"
    print(f"Windows detected: set HADOOP_HOME to: {os.environ['HADOOP_HOME']}")
    hadoop_bin = os.path.join(hadoop_home, "bin")
    # Prepend only once so that re-running this cell does not keep growing PATH.
    if hadoop_bin not in os.environ["PATH"].split(os.pathsep):
        os.environ["PATH"] = f"{hadoop_bin};{os.environ['PATH']}"
    print(f"  Also added Hadoop bin directory to PATH: {hadoop_bin}")

# findspark.init() has to run BEFORE pyspark is imported: it is what makes the
# pyspark package shipped under SPARK_HOME importable (see the findspark docs).
findspark.init(spark_home)

import pyspark
from pyspark.streaming import StreamingContext

# getOrCreate() reuses an already running context, so this cell can be
# re-executed without failing because a SparkContext already exists.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()


In [None]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread wrapper that runs a streaming context off the notebook's main thread.

    The wrapped context is kept on ``self.ssc``. Starting the thread starts the
    context and then blocks (inside the thread) until the context terminates.
    """

    def __init__(self, ssc):
        """Store the streaming context to drive; nothing is started yet."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context and wait for it to terminate."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice, then stop the streaming context.

        The stop is requested with ``stopGraceFully=True`` and
        ``stopSparkContext=False``.
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopGraceFully=True, stopSparkContext=False)
        

In [None]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

import numpy as np
import pickle
import pandas as pd
from pyspark.sql.functions import udf, col, concat_ws, lit
from pyspark.sql.types import ArrayType, FloatType, StringType
from sentence_transformers import SentenceTransformer
from transformers import pipeline


In [None]:
# Run this separately to convert your model to ONNX
from transformers import pipeline
import torch
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

def convert_to_onnx(model_name="gpham/scibert-finetuned-arxiv-42",
                    output_dir="./models/scibert-finetuned-arxiv-42-onnx"):
    """Export a sequence-classification model to ONNX and save it locally.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both
            the model and its tokenizer.
        output_dir: Directory that receives the exported model and the
            tokenizer files.

    Returns:
        None. The function saves files and prints a confirmation message.
    """
    # `export=True` requests the ONNX export. The legacy `from_transformers`
    # flag that used to be passed alongside it is deprecated in Optimum in
    # favour of `export`, so it is no longer supplied.
    model = ORTModelForSequenceClassification.from_pretrained(
        model_name,
        export=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Save model and tokenizer side by side so both can be loaded from one path
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Model converted to ONNX format!")

# Run this once to create the ONNX model
convert_to_onnx()


In [None]:
# def load_model():
#     """Load and configure the model with proper labels"""
#     print("Loading model...")
#     model_name = "gpham/scibert-finetuned-arxiv-42"

#     # Create pipeline
#     classifier = pipeline(
#         "text-classification", 
#         model=model_name,
#         max_length=256)
#     print("Model loaded successfully!")
#     return classifier


# def load_and_broadcast_model():
#     """Load model on driver and broadcast to executors"""
#     print("Loading model on driver...")
#     model_name = "gpham/scibert-finetuned-arxiv-42"
#     classifier = pipeline("text-classification", model=model_name, max_length=256)
    
#     # Broadcast the model to all executors
#     broadcast_model = sc.broadcast(classifier)
#     return broadcast_model


In [None]:
# Toy predict function that returns a random probability. Normally you'd use your loaded globals()['my_model'] here

# import random

# globals()['models_loaded'] = False
# globals()['my_model'] = None

# broadcast_model = load_and_broadcast_model()

# def predict(row):
#     """Predict function using broadcast model"""
#     try:
#         # Access the broadcast model
#         model = broadcast_model.value
#         text = row.title + "\n" + row.summary
#         result = model(text)
        
#         if result and len(result) > 0:
#             return result[0]["label"]
#         else:
#             return "prediction_failed"
#     except Exception as e:
#         print(f"Error in prediction: {e}")
#         return "prediction_failed"

def predict(row):
    """Return a category label for one record, loading the model on first use.

    The classifier is cached under ``'executor_model'`` in this module's
    globals, so it is built at most once per Python process. If building it
    fails, ``None`` is cached instead and a keyword heuristic (with a random
    choice as last resort) is used for every call.

    Args:
        row: Object exposing string attributes ``title`` and ``summary``.

    Returns:
        The predicted label, or ``"prediction_failed"`` when the model returns
        nothing or any error occurs while handling the row.
    """
    module_state = globals()

    # First call in this process: try to build the pipeline and cache the outcome
    if 'executor_model' not in module_state:
        try:
            print("Loading model on executor...")
            model_name = "gpham/scibert-finetuned-arxiv-42"
            module_state['executor_model'] = pipeline(
                "text-classification",
                model=model_name,
                device=-1,  # Force CPU
                max_length=256,
                truncation=True,
            )
            print("Model loaded successfully on executor!")
        except Exception as e:
            print(f"Failed to load model on executor: {e}")
            module_state['executor_model'] = None

    try:
        text = row.title + "\n" + row.summary
        classifier = module_state['executor_model']

        if classifier is not None:
            # Real model available
            result = classifier(text)
            if result and len(result) > 0:
                return result[0]["label"]
            return "prediction_failed"

        # Model could not be loaded: fall back to keyword matching
        print("Using fallback mock prediction")
        lowered = text.lower()
        keyword_rules = (
            (("neural", "deep"), "cs.LG"),
            (("vision", "image"), "cs.CV"),
            (("language", "text"), "cs.CL"),
            (("algorithm",), "cs.AI"),
        )
        for keywords, label in keyword_rules:
            for keyword in keywords:
                if keyword in lowered:
                    return label
        return random.choice(["cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.IT"])

    except Exception as e:
        print(f"Error in prediction: {e}")
        return "prediction_failed"

# def predict(row):
#     """Simple predict function that takes combined text and returns category"""
    
#     # Load model once per executor (lazy loading)
#     if not globals().get('models_loaded', False):
#         globals()['my_model'] = load_model()
#         globals()['models_loaded'] = True

#     text = row.title + "\n" + row.summary
#     result = globals()['my_model'](text)
        
#     if result and len(result) > 0:
#         return result[0]["label"]
#     else:
#         return "prediction_failed"
    
# def predict(row):
#     """Simple predict function that takes combined text and returns category"""
    
#     try:
#         # Simple mock prediction instead of loading heavy ML models
#         # This avoids serialization and memory issues in Spark
#         text = row.title + "\n" + row.summary
        
#         # Mock categories based on simple text analysis
#         categories = ["cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.IT"]
        
#         # Simple heuristic based on text length and content
#         if "neural" in text.lower() or "deep" in text.lower():
#             return "cs.LG"
#         elif "vision" in text.lower() or "image" in text.lower():
#             return "cs.CV"
#         elif "language" in text.lower() or "text" in text.lower():
#             return "cs.CL"
#         elif "algorithm" in text.lower():
#             return "cs.AI"
#         else:
#             # Random selection for other cases
#             return random.choice(categories)
            
#     except Exception as e:
#         print(f"Error in prediction: {e}")
#         return "prediction_failed"

predict_udf = udf(predict, StringType())

def process(time, rdd):
    """Score one micro-batch with ``predict_udf`` and display the result.

    Empty batches are skipped. Otherwise a banner with the batch time is
    printed, the RDD is read as JSON into a DataFrame, a ``pred`` column
    computed from the ``title`` and ``summary`` columns is appended, and the
    resulting frame is shown.

    Note: a later cell defines another function that is also named ``process``.

    Args:
        time: Batch time; only used (via ``str``) in the banner.
        rdd: The batch RDD, passed to ``spark.read.json``.
    """
    if not rdd.isEmpty():
        banner = "========= %s =========" % str(time)
        print(banner)

        # Parse the batch's records into a DataFrame
        df = spark.read.json(rdd)

        # Bundle both text columns into the single struct argument the UDF takes
        features = struct(col("title"), col("summary"))
        df.withColumn("pred", predict_udf(features)).show()


In [None]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql import Row

def predict_partition(iterator):
    """Label every row of one partition, building the classifier once per call.

    The partition is materialised into a list first. If the classifier cannot
    be built, labels come from a keyword heuristic with a random choice as the
    last resort.

    Args:
        iterator: Iterable of rows exposing ``title`` and ``summary``.

    Returns:
        A list with one ``Row(title, summary, pred)`` per input row; ``pred``
        is ``"prediction_failed"`` for rows that raised while being handled.
        An empty partition yields ``[]``.
    """
    rows = list(iterator)
    if len(rows) == 0:
        return []

    # One load attempt for the whole partition
    classifier = None
    try:
        print(f"Loading model for partition with {len(rows)} rows...")
        model_name = "gpham/scibert-finetuned-arxiv-42"
        classifier = pipeline(
            "text-classification",
            model=model_name,
            device=-1,  # Force CPU
            max_length=256,
            truncation=True,
        )
        print("Model loaded successfully for partition!")
    except Exception as e:
        print(f"Failed to load model for partition: {e}")
        classifier = None

    fallback_labels = ["cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.IT"]
    keyword_rules = (
        (("neural", "deep"), "cs.LG"),
        (("vision", "image"), "cs.CV"),
        (("language", "text"), "cs.CL"),
        (("algorithm",), "cs.AI"),
    )

    def label_for(text):
        """Return the model's label, or the heuristic label when no model is loaded."""
        if classifier is not None:
            output = classifier(text)
            if output and len(output) > 0:
                return output[0]["label"]
            return "prediction_failed"
        lowered = text.lower()
        for keywords, label in keyword_rules:
            for keyword in keywords:
                if keyword in lowered:
                    return label
        return random.choice(fallback_labels)

    labelled = []
    for record in rows:
        try:
            pred = label_for(record.title + "\n" + record.summary)
            # Keep the original fields next to the prediction
            labelled.append(Row(title=record.title, summary=record.summary, pred=pred))
        except Exception as e:
            print(f"Error processing row: {e}")
            labelled.append(Row(title=record.title, summary=record.summary, pred="prediction_failed"))

    return labelled

def process(time, rdd):
    """Score one micro-batch partition-by-partition and display the result.

    Empty batches are skipped. Otherwise the batch is read as JSON, its row
    count is printed, ``predict_partition`` is mapped over the partitions, and
    the labelled rows are shown as a three-column string DataFrame
    (``title``, ``summary``, ``pred``). Any exception is printed together with
    its traceback instead of being raised.

    Args:
        time: Batch time; only used (via ``str``) in the banner.
        rdd: The batch RDD, passed to ``spark.read.json``.
    """
    if rdd.isEmpty():
        return

    banner = "========= %s =========" % str(time)
    print(banner)

    try:
        # Batch -> DataFrame
        df = spark.read.json(rdd)
        print(f"Processing {df.count()} rows")

        # DataFrame -> RDD, labelled one partition at a time
        labelled_rdd = df.rdd.mapPartitions(predict_partition)

        # Labelled RDD -> DataFrame with an explicit all-string schema
        output_schema = StructType(
            [StructField(name, StringType(), True) for name in ("title", "summary", "pred")]
        )
        spark.createDataFrame(labelled_rdd, output_schema).show(truncate=False)

    except Exception as e:
        import traceback
        print(f"Error in process function: {e}")
        traceback.print_exc()
        

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, pipeline
from pyspark.sql.types import StructType, StructField
from pyspark.sql import Row

def predict_partition_onnx(iterator):
    """Label every row of one partition with the locally saved ONNX model.

    The partition is materialised into a list and the ONNX model plus its
    tokenizer are loaded once per call from
    ``./models/scibert-finetuned-arxiv-42-onnx``. If loading fails, labels come
    from a keyword heuristic with a random choice as the last resort.

    Args:
        iterator: Iterable of rows exposing ``title`` and ``summary``.

    Returns:
        A list with one ``Row(title, summary, pred)`` per input row; ``pred``
        is ``"prediction_failed"`` for rows that raised while being handled.
        An empty partition yields ``[]``.
    """
    # Convert iterator to list to process all rows
    rows = list(iterator)
    if not rows:
        return []

    # Try to load ONNX model once per partition
    model = None
    try:
        print(f"Loading ONNX model for partition with {len(rows)} rows...")

        onnx_model = ORTModelForSequenceClassification.from_pretrained("./models/scibert-finetuned-arxiv-42-onnx")
        tokenizer = AutoTokenizer.from_pretrained("./models/scibert-finetuned-arxiv-42-onnx")

        # Create pipeline with ONNX model. Truncation is requested explicitly,
        # matching the other model loaders in this notebook, instead of being
        # left implicit next to `max_length`.
        model = pipeline(
            "text-classification",
            model=onnx_model,
            tokenizer=tokenizer,
            max_length=256,
            truncation=True,
            device=-1  # CPU only
        )
        print("ONNX model loaded successfully for partition!")

    except Exception as e:
        print(f"Failed to load ONNX model for partition: {e}")
        model = None

    # Process all rows in the partition
    results = []
    categories = ["cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.IT"]

    for row in rows:
        try:
            text = row.title + "\n" + row.summary

            if model is not None:
                # Use ONNX model
                result = model(text)
                pred = result[0]["label"] if result and len(result) > 0 else "prediction_failed"
            else:
                # Fallback to mock prediction; lowercase once for all checks
                text_lower = text.lower()
                if "neural" in text_lower or "deep" in text_lower:
                    pred = "cs.LG"
                elif "vision" in text_lower or "image" in text_lower:
                    pred = "cs.CV"
                elif "language" in text_lower or "text" in text_lower:
                    pred = "cs.CL"
                elif "algorithm" in text_lower:
                    pred = "cs.AI"
                else:
                    pred = random.choice(categories)

            results.append(Row(title=row.title, summary=row.summary, pred=pred))

        except Exception as e:
            print(f"Error processing row: {e}")
            results.append(Row(title=row.title, summary=row.summary, pred="prediction_failed"))

    return results

def process_onnx(time, rdd):
    """Score one micro-batch with ``predict_partition_onnx`` and display it.

    Empty batches are skipped. Otherwise the batch is read as JSON, its row
    count is printed, ``predict_partition_onnx`` is mapped over the
    partitions, and the labelled rows are shown as a three-column string
    DataFrame (``title``, ``summary``, ``pred``). Any exception is printed
    together with its traceback instead of being raised.

    Args:
        time: Batch time; only used (via ``str``) in the banner.
        rdd: The batch RDD, passed to ``spark.read.json``.
    """
    if rdd.isEmpty():
        return

    banner = "========= %s =========" % str(time)
    print(banner)

    try:
        # Batch -> DataFrame
        df = spark.read.json(rdd)
        print(f"Processing {df.count()} rows with ONNX")

        # DataFrame -> RDD, labelled one partition at a time
        labelled_rdd = df.rdd.mapPartitions(predict_partition_onnx)

        # Labelled RDD -> DataFrame with an explicit all-string schema
        output_schema = StructType(
            [StructField(name, StringType(), True) for name in ("title", "summary", "pred")]
        )
        spark.createDataFrame(labelled_rdd, output_schema).show(truncate=False)

    except Exception as e:
        import traceback
        print(f"Error in process function: {e}")
        traceback.print_exc()


In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, pipeline
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, col

def load_and_broadcast_onnx_model():
    """Build the ONNX text-classification pipeline and broadcast it via ``sc``.

    The model and tokenizer are loaded from
    ``./models/scibert-finetuned-arxiv-42-onnx``.

    Returns:
        The object returned by ``sc.broadcast(classifier)``, or ``None`` when
        loading or broadcasting raised (the error is printed).
    """
    try:
        print("Loading ONNX model on driver...")

        onnx_model = ORTModelForSequenceClassification.from_pretrained("./models/scibert-finetuned-arxiv-42-onnx")
        tokenizer = AutoTokenizer.from_pretrained("./models/scibert-finetuned-arxiv-42-onnx")

        # Create pipeline with ONNX model. Truncation is requested explicitly,
        # matching the other model loaders in this notebook, instead of being
        # left implicit next to `max_length`.
        classifier = pipeline(
            "text-classification",
            model=onnx_model,
            tokenizer=tokenizer,
            max_length=256,
            truncation=True,
            device=-1  # CPU only
        )

        print("ONNX model loaded successfully on driver!")

        # NOTE(review): broadcasting presumably requires the pipeline object to
        # be serializable; a failure here is caught below and yields None.
        broadcast_model = sc.broadcast(classifier)
        return broadcast_model

    except Exception as e:
        print(f"Failed to load ONNX model on driver: {e}")
        return None

# Load and broadcast the ONNX model once
broadcast_onnx_model = load_and_broadcast_onnx_model()

def predict_with_broadcast_onnx(row):
    """Return a category label for one record using ``broadcast_onnx_model``.

    When the module-level broadcast handle (or its ``value``) is ``None``, a
    keyword heuristic is used instead, with a random choice as the last resort.

    Args:
        row: Object exposing string attributes ``title`` and ``summary``.

    Returns:
        The predicted label, or ``"prediction_failed"`` when the model returns
        nothing or any error occurs while handling the row.
    """
    try:
        handle = broadcast_onnx_model
        model_ready = handle is not None and handle.value is not None
        text = row.title + "\n" + row.summary

        if model_ready:
            # Run the broadcast classifier
            result = handle.value(text)
            if result and len(result) > 0:
                return result[0]["label"]
            return "prediction_failed"

        # No model available: keyword matching, first hit wins
        lowered = text.lower()
        keyword_rules = (
            (("neural", "deep"), "cs.LG"),
            (("vision", "image"), "cs.CV"),
            (("language", "text"), "cs.CL"),
            (("algorithm",), "cs.AI"),
        )
        for keywords, label in keyword_rules:
            for keyword in keywords:
                if keyword in lowered:
                    return label
        return random.choice(["cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.IT"])

    except Exception as e:
        print(f"Error in prediction: {e}")
        return "prediction_failed"

# Create UDF with the broadcast ONNX model
predict_onnx_udf = udf(predict_with_broadcast_onnx, StringType())

def process_broadcast_onnx(time, rdd):
    """Score one micro-batch with ``predict_onnx_udf`` and display the result.

    Empty batches are skipped. Otherwise the batch is read as JSON, its row
    count is printed, a ``pred`` column computed from the ``title`` and
    ``summary`` columns is appended, and the frame is shown untruncated. Any
    exception is printed together with its traceback instead of being raised.

    Args:
        time: Batch time; only used (via ``str``) in the banner.
        rdd: The batch RDD, passed to ``spark.read.json``.
    """
    if rdd.isEmpty():
        return

    banner = "========= %s =========" % str(time)
    print(banner)

    try:
        # Batch -> DataFrame
        df = spark.read.json(rdd)
        print(f"Processing {df.count()} rows with broadcast ONNX")

        # Bundle both text columns into the single struct argument the UDF takes
        features = struct(col("title"), col("summary"))
        df.withColumn("pred", predict_onnx_udf(features)).show(truncate=False)

    except Exception as e:
        import traceback
        print(f"Error in process function: {e}")
        traceback.print_exc()
        

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, pipeline
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import Row

def predict_partition_onnx_optimized(iterator):
    """Label every row of one partition, preferring the locally saved ONNX model.

    Model selection, once per call:
      1. If ``./models/scibert-finetuned-arxiv-42-onnx`` exists, load the ONNX
         model and tokenizer from it.
      2. If that path does not exist, no model is used.
      3. If loading raised, try the original ``gpham/scibert-finetuned-arxiv-42``
         pipeline; if that raises too, no model is used.
    Without a model, labels come from keyword lists (first match wins) with a
    random choice as the last resort.

    Args:
        iterator: Iterable of rows exposing ``title`` and ``summary``.

    Returns:
        A list with one ``Row(title, summary, pred)`` per input row; ``pred``
        is ``"prediction_failed"`` for rows that raised while being handled.
        An empty partition yields ``[]``.
    """
    import os

    rows = list(iterator)
    if len(rows) == 0:
        return []

    model = None
    try:
        print(f"Loading ONNX model for partition with {len(rows)} rows...")
        model_path = "./models/scibert-finetuned-arxiv-42-onnx"

        if not os.path.exists(model_path):
            print(f"ONNX model path not found: {os.path.abspath(model_path)}")
            model = None
        else:
            onnx_model = ORTModelForSequenceClassification.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model = pipeline(
                "text-classification",
                model=onnx_model,
                tokenizer=tokenizer,
                max_length=256,
                device=-1,
                truncation=True,
            )
            print("ONNX model loaded successfully for partition!")

    except Exception as e:
        print(f"Failed to load ONNX model: {e}")
        # Second chance: the original (non-ONNX) model
        try:
            print("Trying original model as fallback...")
            model = pipeline(
                "text-classification",
                model="gpham/scibert-finetuned-arxiv-42",
                device=-1,
                max_length=256,
                truncation=True,
            )
            print("Original model loaded as fallback!")
        except Exception as e2:
            print(f"Original model also failed: {e2}")
            model = None

    categories = ["cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.IT"]
    # Checked top to bottom; the first list containing a matching word decides
    keyword_table = (
        ("cs.LG", ["neural", "deep", "learning", "network"]),
        ("cs.CV", ["vision", "image", "visual", "computer vision"]),
        ("cs.CL", ["language", "text", "nlp", "linguistic"]),
        ("cs.AI", ["algorithm", "optimization", "artificial intelligence"]),
        ("cs.IT", ["information", "theory", "data"]),
    )

    results = []
    for row in rows:
        try:
            text = row.title + "\n" + row.summary

            if model is None:
                text_lower = text.lower()
                pred = None
                for label, words in keyword_table:
                    if any(word in text_lower for word in words):
                        pred = label
                        break
                if pred is None:
                    pred = random.choice(categories)
            else:
                result = model(text)
                if result and len(result) > 0:
                    pred = result[0]["label"]
                else:
                    pred = "prediction_failed"

            results.append(Row(title=row.title, summary=row.summary, pred=pred))

        except Exception as e:
            print(f"Error processing row: {e}")
            results.append(Row(title=row.title, summary=row.summary, pred="prediction_failed"))

    return results

def process_onnx_optimized(time, rdd):
    """Score one micro-batch with ``predict_partition_onnx_optimized`` and display it.

    Empty batches are skipped. Otherwise the batch is read as JSON, its row
    count is printed, ``predict_partition_onnx_optimized`` is mapped over the
    partitions, and the labelled rows are shown as a three-column string
    DataFrame (``title``, ``summary``, ``pred``). Any exception is printed
    together with its traceback instead of being raised.

    Args:
        time: Batch time; only used (via ``str``) in the banner.
        rdd: The batch RDD, passed to ``spark.read.json``.
    """
    if rdd.isEmpty():
        return

    banner = "========= %s =========" % str(time)
    print(banner)

    try:
        df = spark.read.json(rdd)
        print(f"Processing {df.count()} rows with optimized ONNX")

        # Label one partition at a time
        labelled_rdd = df.rdd.mapPartitions(predict_partition_onnx_optimized)

        # Explicit all-string schema for the labelled rows
        output_schema = StructType(
            [StructField(name, StringType(), True) for name in ("title", "summary", "pred")]
        )
        spark.createDataFrame(labelled_rdd, output_schema).show(truncate=False)

    except Exception as e:
        import traceback
        print(f"Error in process function: {e}")
        traceback.print_exc()
        

In [None]:
import requests
# import random
from pyspark.sql.functions import udf, struct, col
from pyspark.sql.types import StringType

def predict_with_api(row, api_url="http://localhost:8000/predict", timeout=5):
    """Return a category label for one record by calling a prediction HTTP API.

    The record's title and summary are POSTed as JSON. A 200 response is
    expected to carry the label under the ``"prediction"`` key. On any other
    status code, or when the request/decoding raises, a keyword heuristic on
    the concatenated title and summary is used instead.

    Args:
        row: Object exposing ``title`` and ``summary`` (converted with ``str``).
        api_url: Endpoint to POST to.
        timeout: Timeout passed to ``requests.post``.

    Returns:
        The API's prediction, or one of ``"cs.LG"``, ``"cs.CV"``, ``"cs.CL"``,
        ``"cs.AI"`` from the fallback heuristic.
    """
    title = str(row.title)
    summary = str(row.summary)

    try:
        response = requests.post(api_url,
                                 json={"title": title, "summary": summary},
                                 timeout=timeout)
        if response.status_code == 200:
            return response.json()["prediction"]
    except Exception as e:
        # Still best-effort, but no longer silent, and (unlike a bare `except:`)
        # KeyboardInterrupt / SystemExit are not swallowed.
        print(f"API prediction failed, using keyword fallback: {e}")

    # Fallback: first matching keyword wins, "cs.AI" otherwise
    text = (title + summary).lower()
    if "neural" in text:
        return "cs.LG"
    if "vision" in text:
        return "cs.CV"
    if "language" in text:
        return "cs.CL"
    return "cs.AI"

predict_api_udf = udf(predict_with_api, StringType())

def process_api(time, rdd):
    """Score one micro-batch with ``predict_api_udf`` and display the result.

    Empty batches are skipped. Otherwise a banner with the batch time is
    printed, the RDD is read as JSON into a DataFrame, a ``pred`` column
    computed from the ``title`` and ``summary`` columns is appended, and the
    resulting frame is shown.

    Args:
        time: Batch time; only used (via ``str``) in the banner.
        rdd: The batch RDD, passed to ``spark.read.json``.
    """
    if not rdd.isEmpty():
        banner = "========= %s =========" % str(time)
        print(banner)

        df = spark.read.json(rdd)

        # Bundle both text columns into the single struct argument the UDF takes
        features = struct(col("title"), col("summary"))
        df.withColumn("pred", predict_api_udf(features)).show()


In [None]:
ssc = StreamingContext(sc, 10)


In [None]:
# Open a socket text stream to seppe.net:7778 on the streaming context and
# hand every micro-batch RDD it produces to process_api.
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process_api)


In [None]:
ssc_t = StreamingThread(ssc)
ssc_t.start()


In [None]:
ssc_t.stop()
