In [3]:
import os, sys, datetime, pathlib
from pyspark.sql import SparkSession, functions as F
# Report the interpreter version, then create (or reuse) the Spark session for this lab.
print("Python:", sys.version)
spark = (
    SparkSession.builder
    .appName("de1-lab1")
    .getOrCreate()
)
print("Spark:", spark.version)

Python: 3.10.18 (main, Jun  5 2025, 13:14:17) [GCC 11.2.0]
Spark: 4.0.1


In [2]:
# Work from the lab directory so the relative paths used by the other cells
# ("data/", "outputs/", "proof/", "metrics/") resolve.
# The location can be overridden with the LAB1_DIR environment variable, so the
# notebook is not tied to one machine; the default is the original path.
LAB_DIR = os.environ.get("LAB1_DIR", "/mnt/c/Users/Justine/Data_engineering/lab1")
os.chdir(LAB_DIR)

In [4]:
import os

# Show the absolute path of the current working directory
working_dir = os.getcwd()
print(working_dir)


/mnt/c/Users/Justine/Data_engineering/lab1


In [5]:
# Input CSV files, relative to the current working directory.
src_a = "data/lab1_dataset_a.csv"
src_b = "data/lab1_dataset_b.csv"

# Read each file with a header row and schema inference enabled.
df_a = (
    spark.read
    .option("header","true")
    .option("inferSchema","true")
    .csv(src_a)
)
df_b = (
    spark.read
    .option("header","true")
    .option("inferSchema","true")
    .csv(src_b)
)

# Stack the two datasets by column name and mark the result for caching
# (cache() returns the same DataFrame it is called on).
df = df_a.unionByName(df_b).cache()

row_count = df.count()
print("Rows:", row_count)
df.printSchema()
df.show(5, truncate=False)

Rows: 2700
root
 |-- id: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- value: double (nullable = true)
 |-- text: string (nullable = true)

+---+-----------+-----+----------------------------------------------------------------------------+
|id |category   |value|text                                                                        |
+---+-----------+-----+----------------------------------------------------------------------------+
|0  |toys       |48.47|metrics ui data elt row columnar reduce warehouse shuffle join spark elt    |
|1  |books      |39.9 |metrics row lake aggregate columnar data reduce row columnar filter         |
|2  |grocery    |7.96 |lake join partition scala elt data                                          |
|3  |electronics|5.15 |spark scala elt filter join columnar lake lake plan warehouse columnar spark|
|4  |toys       |44.87|aggregate metrics row row filter lake map metrics columnar spark            |
+---+-----------+-----+----

In [25]:
import requests
import pandas as pd
import os
from datetime import datetime

def export_spark_metrics(spark, run_id, note="", output_csv="spark_metrics.csv"):
    """
    Export per-stage Spark metrics from the Spark UI REST API to a CSV file.

    One row is written for every stage returned by
    ``{uiWebUrl}/api/v1/applications/{applicationId}/stages``. Every call exports
    all stages the API returns, so calling this several times within the same
    application writes the earlier stages again.

    Columns: run_id, job_id, stage_id, task, note, files_read, input_size_bytes,
    shuffle_read_bytes, shuffle_write_bytes, timestamp. Note that ``files_read``
    is filled from the stage's ``inputRecords`` field and ``task`` from its
    ``name`` field.

    Args:
        spark: active SparkSession; only ``sparkContext.uiWebUrl`` and
            ``sparkContext.applicationId`` are read.
        run_id: identifier of the run (e.g. "r1", "baseline").
        note: free-form comment or tested variant (e.g. "broadcast join").
        output_csv: path of the output file. Rows are appended when the file
            already exists; the header is written only when the file is new or empty.

    Returns:
        None. Problems (no UI, API failure, no stages) are reported with a
        printed message instead of an exception.
    """
    ui_url = spark.sparkContext.uiWebUrl
    app_id = spark.sparkContext.applicationId

    if not ui_url:
        print(" Pas d'UI Spark détectée (aucun job en cours ?)")
        return

    try:
        # Fetch the stage list; the timeout keeps the cell from hanging forever
        # if the UI is unreachable, and raise_for_status() turns HTTP errors
        # into the same printed failure as connection errors.
        stages_url = f"{ui_url}/api/v1/applications/{app_id}/stages"
        response = requests.get(stages_url, timeout=10)
        response.raise_for_status()
        stages = response.json()
        if not isinstance(stages, list):
            raise ValueError(f"unexpected payload type: {type(stages).__name__}")
    except Exception as e:
        print(f" Erreur API Spark: {e}")
        return

    data = []
    for s in stages:
        if not isinstance(s, dict):
            # Ignore malformed entries rather than failing the whole export.
            continue
        job_ids = s.get("jobIds") or []
        data.append({
            "run_id": run_id,
            "job_id": job_ids[0] if job_ids else None,
            "stage_id": s.get("stageId"),
            "task": s.get("name", ""),
            "note": note,
            "files_read": s.get("inputRecords", 0),
            "input_size_bytes": s.get("inputBytes", 0),
            "shuffle_read_bytes": s.get("shuffleReadBytes", 0),
            "shuffle_write_bytes": s.get("shuffleWriteBytes", 0),
            "timestamp": datetime.now().isoformat()
        })

    df = pd.DataFrame(data)

    if df.empty:
        print(" Aucune métrique trouvée (aucun stage terminé ?)")
        return

    # Create the destination folder when the path has one.
    parent_dir = os.path.dirname(output_csv)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Append to the CSV; emit the header only for a new or empty file.
    write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
    df.to_csv(output_csv, mode="a", header=write_header, index=False)

    print(f" {len(df)} lignes exportées vers {output_csv}")


In [32]:
import requests
import pandas as pd
import os
from datetime import datetime

def export_spark_metrics(spark, run_id, task="", note="", output_csv="metrics/lab1_metrics.csv"):
    """
    Export only the essential per-stage Spark metrics to a CSV file.

    One row is written for every stage returned by
    ``{uiWebUrl}/api/v1/applications/{applicationId}/stages``. Every call exports
    all stages the API returns, so calling this after each workload writes the
    earlier stages again under the new ``task`` label.

    Columns: run_id, task, note, files_read, input_size_bytes,
    shuffle_read_bytes, shuffle_write_bytes, timestamp. Note that ``files_read``
    is filled from the stage's ``inputRecords`` field.

    Args:
        spark: active SparkSession; only ``sparkContext.uiWebUrl`` and
            ``sparkContext.applicationId`` are read.
        run_id: identifier of the run (e.g. "r1").
        task: label of the workload that was just executed.
        note: free-form comment stored with every row.
        output_csv: path of the output file. Its folder is created when
            missing; rows are appended when the file already exists and the
            header is written only when the file is new or empty.

    Returns:
        None. Problems (no UI, API failure, no stages) are reported with a
        printed message instead of an exception.
    """
    ui_url = spark.sparkContext.uiWebUrl
    app_id = spark.sparkContext.applicationId

    if not ui_url:
        print(" Pas d'UI Spark détectée.")
        return

    try:
        # The timeout keeps the cell from hanging if the UI is unreachable;
        # raise_for_status() reports HTTP errors through the same message as
        # connection errors instead of failing later on an error payload.
        response = requests.get(f"{ui_url}/api/v1/applications/{app_id}/stages", timeout=10)
        response.raise_for_status()
        stages = response.json()
        if not isinstance(stages, list):
            raise ValueError(f"unexpected payload type: {type(stages).__name__}")
    except Exception as e:
        print(f" Erreur API Spark: {e}")
        return

    if not stages:
        print("Aucun stage trouvé.")
        return

    data = [
        {
            "run_id": run_id,
            "task": task,
            "note": note,
            "files_read": s.get("inputRecords", 0),
            "input_size_bytes": s.get("inputBytes", 0),
            "shuffle_read_bytes": s.get("shuffleReadBytes", 0),
            "shuffle_write_bytes": s.get("shuffleWriteBytes", 0),
            "timestamp": datetime.now().isoformat()
        }
        for s in stages
        if isinstance(s, dict)  # ignore malformed entries
    ]

    df = pd.DataFrame(data)
    if df.empty:
        print("Aucune métrique trouvée.")
        return

    # Create the destination folder when the path has one.
    parent_dir = os.path.dirname(output_csv)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Append to the CSV; emit the header only for a new or empty file.
    write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
    df.to_csv(output_csv, mode="a", header=write_header, index=False)

    print(f" {len(df)} lignes exportées vers {output_csv}")


In [29]:
# RDD pipeline: split the 'text' column into lowercase tokens and count each token.
rdd = (
    df.select("text")
    .rdd
    .flatMap(lambda record: (record[0] or "").lower().split())  # null text counts as empty
)
pair = rdd.map(lambda token: (token, 1))
counts = pair.reduceByKey(lambda left, right: left + right)

# Highest count first; ties are broken alphabetically by token.
top_rdd = (
    counts
    .sortBy(lambda token_count: (-token_count[1], token_count[0]))
    .take(10)
)
top_rdd

[('lake', 1215),
 ('scala', 1200),
 ('elt', 1199),
 ('metrics', 1190),
 ('row', 1183),
 ('join', 1169),
 ('warehouse', 1168),
 ('shuffle', 1160),
 ('ui', 1145),
 ('aggregate', 1144)]

In [7]:
# Save the top tokens as CSV (token,count)
import csv  # imported here because the notebook's setup cell does not import it

pathlib.Path("outputs").mkdir(exist_ok=True)
# csv.writer quotes any token containing a comma, a quote or a newline; rows
# built by hand with an f-string would come out as malformed CSV in that case.
# newline="" with lineterminator="\n" writes "\n" row endings on every platform.
with open("outputs/top10_rdd.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["token", "count"])
    writer.writerows(top_rdd)
print("Wrote outputs/top10_rdd.csv")

Wrote outputs/top10_rdd.csv


In [8]:
# Trigger an action and record a textual plan for evidence
# The module is imported under an alias because other cells in this notebook run
# `from datetime import datetime`, which rebinds the bare name `datetime` to the
# class and makes `datetime.datetime.now()` raise AttributeError.
import datetime as dt

_ = counts.count()
# NOTE(review): this is the executed plan of the DataFrame `df`, not the lineage
# of the `counts` RDD — confirm this is the evidence that is wanted.
plan_rdd = df._jdf.queryExecution().executedPlan().toString()
pathlib.Path("proof").mkdir(exist_ok=True)
# Explicit encoding, consistent with how outputs/top10_rdd.csv is written.
with open("proof/plan_rdd.txt", "w", encoding="utf-8") as f:
    f.write(str(dt.datetime.now()) + "\n\n")
    f.write(plan_rdd)
print("Saved proof/plan_rdd.txt")

Saved proof/plan_rdd.txt


In [9]:
import glob, shutil

# Split the lowercased text on whitespace, one row per token, dropping empty tokens.
tokens = F.explode(
    F.split(F.lower(F.col("text")), "\\s+")
).alias("token")
df_tokens = df.select(tokens).where(F.col("token") != "")
agg_df = df_tokens.groupBy("token").agg(F.count("*").alias("count"))

# Highest count first; ties are broken alphabetically by token.
top_df = (
    agg_df
    .orderBy(F.desc("count"), F.asc("token"))
    .limit(10)
)
top_df.show(truncate=False)

# Write the result as a single part file inside a temporary output directory.
(
    top_df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header","true")
    .csv("outputs/top10_df_tmp")
)

# Copy the single part file to a stable path (the temporary directory is kept).
part_files = glob.glob("outputs/top10_df_tmp/part*")
part = part_files[0]
shutil.copy(part, "outputs/top10_df.csv")
print("Wrote outputs/top10_df.csv")

                                                                                

+---------+-----+
|token    |count|
+---------+-----+
|lake     |1215 |
|scala    |1200 |
|elt      |1199 |
|metrics  |1190 |
|row      |1183 |
|join     |1169 |
|warehouse|1168 |
|shuffle  |1160 |
|ui       |1145 |
|aggregate|1144 |
+---------+-----+

Wrote outputs/top10_df.csv


In [10]:
# Record the executed plan of the DataFrame top-N query for evidence.
# The module is imported under an alias because other cells in this notebook run
# `from datetime import datetime`, which rebinds the bare name `datetime` to the
# class and makes `datetime.datetime.now()` raise AttributeError.
import datetime as dt

plan_df = top_df._jdf.queryExecution().executedPlan().toString()
# Create the folder here as well so this cell does not depend on another cell having done it.
pathlib.Path("proof").mkdir(exist_ok=True)
with open("proof/plan_df.txt", "w", encoding="utf-8") as f:
    f.write(str(dt.datetime.now()) + "\n\n")
    f.write(plan_df)
print("Saved proof/plan_df.txt")

Saved proof/plan_df.txt


In [11]:
# Case A: keep every column, then aggregate 'value' per 'category'.
all_cols = (
    df.select("*")
    .groupBy("category")
    .agg(F.sum("value").alias("sum_value"))
)
all_cols.explain("formatted")
_ = all_cols.count()  # run the query so it shows up in the Spark UI

# Case B: project only the two needed columns before the same aggregation.
proj = (
    df.select("category","value")
    .groupBy("category")
    .agg(F.sum("value").alias("sum_value"))
)
proj.explain("formatted")
_ = proj.count()  # run the query so it shows up in the Spark UI

print("Open Spark UI at http://localhost:4040 while each job runs and record metrics into lab1_metrics_log.csv")


== Physical Plan ==
AdaptiveSparkPlan (9)
+- HashAggregate (8)
   +- Exchange (7)
      +- HashAggregate (6)
         +- InMemoryTableScan (1)
               +- InMemoryRelation (2)
                     +- Union (5)
                        :- Scan csv  (3)
                        +- Scan csv  (4)


(1) InMemoryTableScan
Output [2]: [category#18, value#19]
Arguments: [category#18, value#19]

(2) InMemoryRelation
Arguments: [id#17, category#18, value#19, text#20], StorageLevel(disk, memory, deserialized, 1 replicas)

(3) Scan csv 
Output [4]: [id#17, category#18, value#19, text#20]
Batched: false
Location: InMemoryFileIndex [file:/mnt/c/Users/Justine/Data_engineering/lab1/data/lab1_dataset_a.csv]
ReadSchema: struct<id:int,category:string,value:double,text:string>

(4) Scan csv 
Output [4]: [id#38, category#39, value#40, text#41]
Batched: false
Location: InMemoryFileIndex [file:/mnt/c/Users/Justine/Data_engineering/lab1/data/lab1_dataset_b.csv]
ReadSchema: struct<id:int,category:string,va

In [33]:
# Record the Spark metrics after each workload.
run_label = "r1"

# Top-N RDD
top_rdd = counts.sortBy(lambda kv: (-kv[1], kv[0])).take(10)
export_spark_metrics(spark, run_id=run_label, task="topN_rdd")

# Top-N DataFrame, then the projection with all columns and with minimal columns.
workloads = [
    ("topN_df", lambda: top_df.show(truncate=False)),
    ("projection_all_cols", all_cols.count),
    ("projection_min_cols", proj.count),
]
for task_name, run_workload in workloads:
    _ = run_workload()
    export_spark_metrics(spark, run_id=run_label, task=task_name)



 151 lignes exportées vers metrics/lab1_metrics.csv
+---------+-----+
|token    |count|
+---------+-----+
|lake     |1215 |
|scala    |1200 |
|elt      |1199 |
|metrics  |1190 |
|row      |1183 |
|join     |1169 |
|warehouse|1168 |
|shuffle  |1160 |
|ui       |1145 |
|aggregate|1144 |
+---------+-----+

 154 lignes exportées vers metrics/lab1_metrics.csv
 160 lignes exportées vers metrics/lab1_metrics.csv
 166 lignes exportées vers metrics/lab1_metrics.csv


In [None]:
# Shut down the Spark session created in the setup cell.
spark.stop()
print("Spark session stopped.")