In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, concat_ws, first

In [None]:
# Create (or reuse) the Spark session, configuring the Hadoop default
# filesystem to point at the HDFS namenode.
session_builder = SparkSession.builder.config(
    "spark.hadoop.fs.defaultFS", "hdfs://namenode:9000"
)
spark = session_builder.getOrCreate()

In [None]:
# Load the four input CSV files from HDFS. Every file is read with a header
# row and with Spark's schema inference turned on.
# NOTE(review): schema inference may turn zero-padded identifiers (e.g. NACE
# codes) into integers -- confirm the resulting types with printSchema().
csv_options = {"header": True, "inferSchema": True}

activity = spark.read.csv("hdfs://namenode:9000/data/activity.csv", **csv_options)
code = spark.read.csv("hdfs://namenode:9000/data/code.csv", **csv_options)
denomination = spark.read.csv("hdfs://namenode:9000/data/denomination.csv", **csv_options)
address = spark.read.csv("hdfs://namenode:9000/data/address.csv", **csv_options)

# Preview the first five rows of each input, in load order.
for loaded_frame in (activity, code, denomination, address):
    loaded_frame.show(5)

In [None]:
# Keep only the code rows in category "Nace2008" with language "FR",
# and retain just the code and its description.
is_nace2008 = col("Category") == "Nace2008"
is_french = col("Language") == "FR"
code2008fr = code.where(is_nace2008 & is_french).select("Code", "Description")

# Keep only the activities whose NaceVersion is 2008.
activity2008 = activity.where(col("NaceVersion") == 2008)

# Left-join the activities with the filtered codes to attach the NACE
# description; the redundant "Code" column from the right side is dropped.
nace_code_matches = activity2008["NaceCode"] == code2008fr["Code"]
activity_desc = (
    activity2008
    .join(code2008fr, on=nace_code_matches, how="left")
    .drop(code2008fr["Code"])
)

activity_desc.show(5)

In [None]:
# Select the enterprise name: rows with TypeOfDenomination 1 and Language 1
# (the original note describes this combination as the French name).
denomination_fr = (
    denomination
    .where(col("TypeOfDenomination") == 1)
    .where(col("Language") == 1)
    .select("EntityNumber", "Denomination")
)

denomination_fr.show(5)


In [None]:
# Attach the enterprise name to each activity row. The left join keeps
# activities whose entity has no matching denomination.
activity_with_name = activity_desc.join(denomination_fr, on="EntityNumber", how="left")

activity_with_name.show(5)

In [None]:
# Keep only the addresses whose TypeOfAddress is 'REGO', retaining the
# entity number, the French municipality name and the zip code.
is_rego_address = address["TypeOfAddress"] == "REGO"
address_rego = address.where(is_rego_address).select(
    "EntityNumber", "MunicipalityFR", "Zipcode"
)

address_rego.show(5)

In [None]:
# Attach the 'REGO' address to each named activity row. The left join keeps
# rows whose entity has no such address.
df = activity_with_name.join(address_rego, on="EntityNumber", how="left")

df.show(5)

In [None]:
# Collapse the joined rows to one row per entity:
#   - EnterpriseName / Municipality / Zipcode: one value taken from the group
#   - Activities: all activity descriptions of the entity, joined with ";"
# NOTE(review): if an entity has several matching denominations or addresses,
# the upstream joins multiply its rows and the same description may be
# repeated in "Activities" -- confirm whether de-duplication is wanted.
final_df = df.groupBy("EntityNumber").agg(
    first("Denomination").alias("EnterpriseName"),
    concat_ws(";", collect_list("Description")).alias("Activities"),
    first("MunicipalityFR").alias("Municipality"),
    first("Zipcode").alias("Zipcode")
)

final_df.show(5)

# Reduce to a single partition so the export produces a single part file.
final_single = final_df.coalesce(1)

# Save as CSV with a header row. The default save mode raises an error when
# the target path already exists, which made this cell fail on every re-run
# of the notebook; "overwrite" replaces the previous export instead.
final_single.write.csv("/data/output/result.csv", header=True, mode="overwrite")