In [1]:
import os
# Resolve the Spark distribution and the winutils directory, both looked up
# under ./spark relative to the current working directory.
_cwd = os.getcwd()
spark_home = os.path.abspath(_cwd + "/spark/spark-3.5.5-bin-hadoop3")
hadoop_home = os.path.abspath(_cwd + "/spark/winutils")
print(f"I am using the following SPARK_HOME: {spark_home}")

if os.name == 'nt':
    # Windows only: export HADOOP_HOME so it points at the winutils directory.
    os.environ["HADOOP_HOME"] = hadoop_home
    print(f"Windows detected: set HADOOP_HOME to: {os.environ['HADOOP_HOME']}")

    # Prepend <HADOOP_HOME>/bin to PATH (os.pathsep is ';' on Windows).
    hadoop_bin = os.path.join(hadoop_home, "bin")
    os.environ["PATH"] = os.pathsep.join([hadoop_bin, os.environ["PATH"]])
    print(f"  Also added Hadoop bin directory to PATH: {hadoop_bin}")

import findspark
import pyspark
from pyspark.streaming import StreamingContext

# Point findspark at the Spark distribution resolved above.
findspark.init(spark_home)

# Use getOrCreate() instead of the bare constructor: it reuses an existing
# SparkContext, so re-running this cell on a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()


I am using the following SPARK_HOME: d:\OneDrive - CGIAR\Master\Advanced Analytics\assignments\assignment-03\spark\spark-3.5.5-bin-hadoop3
Windows detected: set HADOOP_HOME to: d:\OneDrive - CGIAR\Master\Advanced Analytics\assignments\assignment-03\spark\winutils
  Also added Hadoop bin directory to PATH: d:\OneDrive - CGIAR\Master\Advanced Analytics\assignments\assignment-03\spark\winutils\bin


In [2]:
import threading

# Helper thread to prevent the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread wrapper that runs a streaming context off the calling thread.

    The context is started and awaited inside ``run`` so that the blocking
    ``awaitTermination`` call happens in this thread instead of the caller's.
    """

    def __init__(self, ssc):
        """Remember the streaming context ``ssc`` this thread will drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then block until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice and stop the streaming context gracefully.

        The underlying SparkContext is left running
        (``stopSparkContext=False``).
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)
        

In [3]:
# Streaming source: a "socket"-format reader connected to seppe.net:7778.
socketDF = (
    spark.readStream
    .format("socket")
    .option("host", "seppe.net")
    .option("port", 7778)
    .load()
)
socketDF.printSchema()


root
 |-- value: string (nullable = true)



In [4]:
from pyspark.sql.functions import from_json, schema_of_json


In [5]:
def process_row(df, epoch_id):
    """Parse, display and archive one streaming micro-batch of JSON lines.

    Written to be used as a ``foreachBatch`` callback.

    Args:
        df: The micro-batch DataFrame; its ``value`` column is read as a JSON
            string.
        epoch_id: Identifier of the micro-batch; it is only printed.

    Returns:
        None. Empty batches are skipped after printing ``epoch_id``.

    Notes:
        The JSON schema is inferred from the first row only, so this assumes
        every row in the batch has the same structure as that row.
    """
    print(epoch_id)

    # The batch is evaluated by several actions below (first / show / write);
    # cache it so the source is not recomputed for each one.
    df.persist()
    try:
        # first() returns None for an empty DataFrame, so one action serves as
        # both the emptiness check and the schema sample (no separate count()).
        first_row = df.first()
        if first_row is None:
            return

        schema = schema_of_json(first_row.value)
        df_cols = (
            df.selectExpr('CAST(value AS STRING)')
            .select(from_json('value', schema).alias('temp'))
            .select('temp.*')
        )
        df_cols.show()

        # Append the raw (unparsed) batch to data/raw as JSON.
        df.write.format("json").mode("append").save("data/raw")
    finally:
        # Always release the cached batch, including on early return or error.
        df.unpersist()


In [6]:
# Start the streaming query: process_row is invoked on each micro-batch,
# with a processing-time trigger of 5 seconds.
query = (
    socketDF.writeStream
    .trigger(processingTime='5 seconds')
    .foreachBatch(process_row)
    .start()
)


0
1
+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+
|                 aid|          categories|    main_category|           published|             summary|               title|
+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+
|http://arxiv.org/...|               cs.CV|            cs.CV|2025-04-21T11:41:22Z|Compositional tex...|DyST-XL: Dynamic ...|
|http://arxiv.org/...|             eess.SP|          eess.SP|2025-04-21T11:41:28Z|An undesirable co...|Blinding the Wire...|
|http://arxiv.org/...|cond-mat.mtrl-sci...|cond-mat.mtrl-sci|2025-04-21T11:41:32Z|Metal-organic fra...|Predicting Methan...|
|http://arxiv.org/...|   cs.CR,cs.AI,cs.SD|            cs.CR|2025-04-21T11:43:36Z|The accelerated a...|SOLIDO: A Robust ...|
|http://arxiv.org/...|               cs.PL|            cs.PL|2025-04-21T11:44:58Z|Dynamic race dete...|Dynamic Robustnes.

In [7]:
# Stop the streaming query started above.
query.stop()
