1. Import modules and libraries

In [14]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.types import StructType
from src.churn.extract import extract_parking_data
import shutil
import os

In [15]:
# Build the session step by step: name the application, point it at the
# master at spark://spark-master:7077, then fetch the session (getOrCreate
# returns the already-running session when one exists).
session_builder = SparkSession.builder.appName("job_001_availableGroupParkingPipeline")
session_builder = session_builder.master("spark://spark-master:7077")
spark = session_builder.getOrCreate()

2. Load data into a processing environment

In [16]:
# Explicit schema for the parking_group records; every field is declared nullable.
# `location` is a nested struct holding the lat/lon pair.
parking_schema = (
    StructType()
    .add("adresse", StringType(), True)
    .add("grp_horodatage", TimestampType(), True)
    .add("grp_identifiant", IntegerType(), True)
    .add("grp_complet", IntegerType(), True)
    .add("grp_disponible", IntegerType(), True)
    .add("grp_exploitation", IntegerType(), True)
    .add("grp_statut", IntegerType(), True)
    .add(
        "location",
        StructType()
        .add("lat", DoubleType(), True)
        .add("lon", DoubleType(), True),
        True,
    )
)
# Directories passed to the extraction step: raw input and archive.
path = "/data/raw/"
f_archive = "/data/archive"

In [17]:
# Run the project extractor with the raw path, the Spark session, the explicit
# schema and the archive directory; the result is used below as a Spark DataFrame.
# NOTE(review): presumably reads the files under `path` using `parking_schema` and
# moves processed files to `f_archive` — confirm in src.churn.extract.
parking_group_df = extract_parking_data(path, spark,parking_schema,f_archive)

In [18]:
# Rename grp_horodatage to grp_timestamp; column data types are left unchanged.
parking_group_df = parking_group_df.withColumnRenamed("grp_horodatage", "grp_timestamp")

In [19]:
# Sum grp_disponible per 1-hour window, keeping only rows whose grp_statut equals 5.
# NOTE(review): the meaning of status code 5 is not documented in this notebook —
# confirm against the data source before relying on it.
hourly_available_parking = (
    parking_group_df
    .where(F.col("grp_statut") == 5)
    .groupBy(F.window(F.col("grp_timestamp"), "1 hour"))
    .agg(F.sum("grp_disponible").alias("total_parkings_available"))
)
hourly_available_parking.show(truncate=False)

+------------------------------------------+------------------------+
|window                                    |total_parkings_available|
+------------------------------------------+------------------------+
|{2025-09-14 01:00:00, 2025-09-14 02:00:00}|3258                    |
|{2025-12-31 15:00:00, 2025-12-31 16:00:00}|5222                    |
+------------------------------------------+------------------------+



In [20]:
# Sum grp_disponible per 1-day window, keeping only rows whose grp_statut equals 5.
# NOTE(review): the meaning of status code 5 is not documented in this notebook —
# confirm against the data source before relying on it.
daily_available_parking = (
    parking_group_df
    .where(F.col("grp_statut") == 5)
    .groupBy(F.window(F.col("grp_timestamp"), "1 day"))
    .agg(F.sum("grp_disponible").alias("total_parkings_available"))
)
daily_available_parking.show(truncate=False)

+------------------------------------------+------------------------+
|window                                    |total_parkings_available|
+------------------------------------------+------------------------+
|{2025-09-14 00:00:00, 2025-09-15 00:00:00}|3258                    |
|{2025-12-31 00:00:00, 2026-01-01 00:00:00}|5222                    |
+------------------------------------------+------------------------+



In [21]:
# Persist the hourly aggregate as Parquet (coalesced to one partition), replacing
# the output of any previous run.
hourly_output_path = "/data/cleaned/hourly"
print(f"hourly_available_parking has {hourly_available_parking.count()} rows")
try:
    (
        hourly_available_parking
        .coalesce(1)
        .write.format("parquet")
        .mode("overwrite")
        .save(hourly_output_path)
    )
except Exception as e:
    # Best-effort write: the notebook keeps running, but the real target path and
    # the full error text are reported. Printing only the class name (e.g.
    # Py4JJavaError) hides the underlying cause of the failure.
    print(f"Error writing to {hourly_output_path}: {type(e).__name__}: {e}")

hourly_available_parking has 2 rows
Error writing to /data: Py4JJavaError


In [22]:
# Persist the daily aggregate as Parquet (coalesced to one partition), replacing
# the output of any previous run.
daily_output_path = "/data/cleaned/daily"
print(f"daily_available_parking has {daily_available_parking.count()} rows")
try:
    (
        daily_available_parking
        .coalesce(1)
        .write.format("parquet")
        .mode("overwrite")
        .save(daily_output_path)
    )
except Exception as e:
    # Best-effort write: the notebook keeps running, but the real target path and
    # the full error text are reported. Printing only the class name (e.g.
    # Py4JJavaError) hides the underlying cause of the failure.
    print(f"Error writing to {daily_output_path}: {type(e).__name__}: {e}")

daily_available_parking has 2 rows
Error writing to /data: Py4JJavaError


In [25]:
# Flatten the hourly windows into started/ended columns by querying the in-memory
# aggregate through a temporary SQL view. Nothing is read back from disk here.
# The result is bound to `parking_hourly` (not `parking_daily`) so the name
# matches the data it holds.
hourly_available_parking.createOrReplaceTempView("parking_hourly")
parking_hourly = spark.sql(
    "SELECT window.start as started, window.end as ended, total_parkings_available FROM parking_hourly"
)
parking_hourly.show()

+-------------------+-------------------+------------------------+
|            started|              ended|total_parkings_available|
+-------------------+-------------------+------------------------+
|2025-12-31 15:00:00|2025-12-31 16:00:00|                    5222|
|2025-09-14 01:00:00|2025-09-14 02:00:00|                    3258|
+-------------------+-------------------+------------------------+



In [26]:
# Flatten the daily windows into started/ended columns by querying the in-memory
# aggregate through a temporary SQL view. Nothing is read back from disk here.
daily_available_parking.createOrReplaceTempView("parking_daily")
parking_daily = spark.sql(
    "SELECT window.start as started, window.end as ended, total_parkings_available FROM parking_daily"
)
parking_daily.show()

+-------------------+-------------------+------------------------+
|            started|              ended|total_parkings_available|
+-------------------+-------------------+------------------------+
|2025-09-14 00:00:00|2025-09-15 00:00:00|                    3258|
|2025-12-31 00:00:00|2026-01-01 00:00:00|                    5222|
+-------------------+-------------------+------------------------+

