In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

def main(timeout, func, window):
  # Create a local StreamingContext and SparkContext.
  # In the databricks environment a default SparkContext already exists so we can't create a second one.
  sc = SparkContext("local[2]", "Streaming-Exercise")
  ssc = StreamingContext(sc, window)
  stream = ssc.socketTextStream("127.0.0.1", 9999)
  
  func(stream)
  try:
    ssc.start()             # Start the computation
    ssc.awaitTerminationOrTimeout(timeout)  # To see the output in Python we have to wait until it's finished.
  except Exception as e:
    print(str(e))
  finally:
    ssc.stop(False)


def print_stream(stream):
  # TODO - Print the Stream (one line of code)
  stream.pprint()

# Let it run (start first the netcat Notebook to see some output)
main(10,print_stream,1)

ModuleNotFoundError: No module named 'pyspark'

In [1]:
#Stream aus json file

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

# Variante 1: Erstellen eines lokalen StreamingContext und SparkContext. [Nur zum Testen!]
sc = SparkContext("local", "StructStreaming-Client-greguali")

# Initialisieren der Spark-Sitzung
spark = SparkSession.builder.appName("StructStreaming-Client-greguali").getOrCreate()

tmpdirname = "bird-detections"

# Sicherstellen, dass der Pfad zur JSON-Datei korrekt ist
json_file_path = f"bird_detections_example.json"

# Lesen der JSON-Datei in ein DataFrame
df = spark.read.json(json_file_path)

# Abrufen des Schemas des DataFrame
json_schema = df.schema

# Anzeigen des Schemas
df.printSchema()

root
 |-- algorithm: string (nullable = true)
 |-- certainty: string (nullable = true)
 |-- confidence: double (nullable = true)
 |-- id: long (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- metadata: string (nullable = true)
 |-- probability: double (nullable = true)
 |-- score: double (nullable = true)
 |-- soundscape: struct (nullable = true)
 |    |-- endTime: double (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- mode: string (nullable = true)
 |    |-- startTime: double (nullable = true)
 |    |-- url: string (nullable = true)
 |-- species: struct (nullable = true)
 |    |-- color: string (nullable = true)
 |    |-- commonName: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- imageUrl: string (nullable = true)
 |    |-- pngUrl: string (nullable = true)
 |    |-- scientificName: string (nullable = true)
 |    |-- thumbnailUrl: string (nullable = true)
 |-- stationId: long (nullable = true)
 |-- ti

In [2]:
def printDF(streamingDF):
  iter = 0
  while iter < 10:
    if(streamingDF.count() > 0):
      print("Number of entries in dataframe: "+ str(streamingDF.count()))
      streamingDF.show(20, False) # the parameter False prevents Spark from truncating the output
      iter += 1
    time.sleep(2)

In [3]:
# create the stream from the /tmp/tweets folder (change to json stream)
from pyspark.sql.functions import *
import time

# with or without file: prefix
inputPath = tmpdirname

streamingInputDF = (
  spark
    .readStream
    .schema(json_schema)
    .json(inputPath)
)


streamingInputDF.isStreaming

True

In [5]:
streamingETLQuery = streamingInputDF \
  .select("stationId", "species.id")\
  .writeStream \
  .format("memory") \
  .queryName("detectionstream") \
  .outputMode("append")\
  .start()
  
streamingDF = spark.sql("select count(*) as count, id from detectionstream group by id order by count desc")

printDF(streamingDF)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `detections`.`stationId` cannot be resolved. Did you mean one of the following? [`stationId`, `timestamp`, `certainty`, `metadata`, `species`].;
'Project ['detections.stationId, 'detections.species.id]
+- StreamingRelation DataSource(org.apache.spark.sql.SparkSession@13fc4ff9,json,List(),Some(StructType(StructField(algorithm,StringType,true),StructField(certainty,StringType,true),StructField(confidence,DoubleType,true),StructField(id,LongType,true),StructField(lat,DoubleType,true),StructField(lon,DoubleType,true),StructField(metadata,StringType,true),StructField(probability,DoubleType,true),StructField(score,DoubleType,true),StructField(soundscape,StructType(StructField(endTime,DoubleType,true),StructField(id,LongType,true),StructField(mode,StringType,true),StructField(startTime,DoubleType,true),StructField(url,StringType,true)),true),StructField(species,StructType(StructField(color,StringType,true),StructField(commonName,StringType,true),StructField(id,LongType,true),StructField(imageUrl,StringType,true),StructField(pngUrl,StringType,true),StructField(scientificName,StringType,true),StructField(thumbnailUrl,StringType,true)),true),StructField(stationId,LongType,true),StructField(timestamp,StringType,true))),List(),None,Map(path -> bird-detections),None), FileSource[bird-detections], [algorithm#34, certainty#35, confidence#36, id#37L, lat#38, lon#39, metadata#40, probability#41, score#42, soundscape#43, species#44, stationId#45L, timestamp#46]


In [2]:
#Aktueller Code!:
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
import json

def main(timeout, func, window):

    sc = SparkContext("local", "Streaming-Exercise")
    ssc = StreamingContext(sc, window)
    stream = ssc.socketTextStream("127.0.0.1", 9999)
    
    func(stream)
    try:
        ssc.start()             
        ssc.awaitTerminationOrTimeout(timeout)  
    except Exception as e:
        print(str(e))
    finally:
        ssc.stop(False)

def print_json_stream(stream):
    
    stream.map(lambda x: json.loads(x.decode('utf-8'))).pprint()


main(10, print_json_stream, 1)



-------------------------------------------
Time: 2025-03-18 15:00:54
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:00:55
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:00:56
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:00:57
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:00:58
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:00:59
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:01:00
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:01:01
-------------------------------------------

-------------------------------------------
Time: 2025-03-18 15:01:02
----------

In [1]:
sc.stop()

NameError: name 'sc' is not defined