In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from core.s3.settings import S3Settings
import os

def create_spark_session(access_key: str, secret_key: str) -> SparkSession:
    """
    Create (or reuse) a Spark session able to read/write Scaleway object storage via s3a.

    Args:
        access_key: S3 access key passed to ``fs.s3a.access.key``.
        secret_key: S3 secret key passed to ``fs.s3a.secret.key``.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.

    Side effects:
        Overwrites the ``PYSPARK_SUBMIT_ARGS`` environment variable so the
        hadoop-aws / spark-hadoop-cloud packages are requested at JVM launch.
    """
    # Single source of truth for the Maven coordinates (used twice below).
    packages = (
        "org.apache.hadoop:hadoop-aws:3.3.4,"
        "org.apache.spark:spark-hadoop-cloud_2.12:3.4.0"
    )
    os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {packages} pyspark-shell"

    # Exactly one collector may be selected: the original list enabled both G1 and
    # ZGC (-XX:+UseZGC behind -XX:+UnlockExperimentalVMOptions), which HotSpot
    # rejects at startup. G1 is kept because the other flags are G1 tuning flags.
    # NOTE(review): -XX:+PrintGCDetails / -XX:+PrintGCTimeStamps / -Xloggc are the
    # JDK 8 style GC logging flags — confirm they are accepted by the JVM in use.
    g1gc_options = (
        "-XX:+UseG1GC "
        "-XX:MaxGCPauseMillis=100 "
        "-XX:G1HeapRegionSize=32m "
        "-XX:+UseStringDeduplication "
        "-XX:+DisableExplicitGC "
        "-XX:+UseGCOverheadLimit "
        "-XX:GCTimeRatio=9 "
        "-XX:+PrintGCDetails "
        "-XX:+PrintGCTimeStamps "
        "-Xloggc:/tmp/spark-gc.log"
    )

    spark = (
        SparkSession.builder.appName("Scaleway S3 Read JSON")
        .config("spark.jars.packages", packages)
        .config("spark.hadoop.fs.s3a.endpoint", "https://s3.fr-par.scw.cloud")
        .config("spark.hadoop.fs.s3a.access.key", access_key)
        .config("spark.hadoop.fs.s3a.secret.key", secret_key)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.driver.host", "localhost")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        # New settings added to work around the ClassNotFoundException
        .config("spark.hadoop.fs.s3a.experimental.input.fadvise", "normal")
        .config("spark.hadoop.fs.s3a.connection.maximum", "1000")
        .config("spark.hadoop.fs.s3a.threads.max", "20")
        .config("spark.hadoop.fs.s3a.threads.core", "10")
        .config("spark.hadoop.fs.s3a.buffer.dir", "/tmp")
        .config("spark.hadoop.fs.s3a.block.size", "134217728")  # 128MB
        .config("spark.hadoop.fs.s3a.multipart.size", "134217728")  # 128MB
        .config("spark.hadoop.fs.s3a.multipart.threshold", "134217728")  # 128MB
        # Settings meant to avoid commit-protocol problems
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "64MB")
        .config("spark.sql.shuffle.partitions", "200")
        .config("spark.default.parallelism", "200")
        .config("spark.executor.memory", "10g")
        .config("spark.driver.memory", "10g")
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.executor.extraJavaOptions", g1gc_options)
        .config("spark.driver.extraJavaOptions", g1gc_options)
        .getOrCreate()
    )

    return spark

# Project settings object; its S3_KEY / S3_SECRET attributes are read just below.
settings = S3Settings()

# Notebook-wide Spark session, configured with the S3 credentials from `settings`.
spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)

In [None]:
import re
import time
import random
from abc import abstractmethod
from datetime import datetime
from itertools import islice
from logging import Logger
from typing import Optional

from pyspark.sql import DataFrame, SparkSession

from core.caching_utils import CachedETLSpark
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from core.spark_utils import get_optimal_nb_partitions
from transform.raw_tss.config import (
    ALL_MAKES,
    S3_RAW_TSS_KEY_FORMAT,
    SCHEMAS,
)


class ResponseToRawTss(CachedETLSpark):
    """
    Processes the API responses stored under ``response/<make>/`` on Scaleway and
    appends them, batch by batch, to the raw time-series parquet dataset.

    Subclasses supply the make-specific transformation through ``parse_data``.
    """

    def __init__(
        self,
        make: str,
        force_update: bool = False,
        writing_mode: Optional[str] = "append",
        spark: SparkSession = None,
        logger: Logger = None,
        **kwargs,
    ):
        """
        Initialise the telemetry processor.

        Args:
            make: Vehicle make; selects the response prefix, the JSON schema and the output key.
            force_update: If True, forces the data update in CachedETLSpark.
            writing_mode: Accepted for interface compatibility; not used by this class.
            spark: Spark session used for data processing.
            logger: Logger for progress messages; a module logger is used when omitted.
            **kwargs: Forwarded to ``CachedETLSpark.__init__``.
        """
        import logging  # local import: only `Logger` is imported at file level

        # Fall back to a real logger so the many `self.logger.info(...)` calls
        # cannot fail with AttributeError when no logger is supplied.
        self.logger = logger if logger is not None else logging.getLogger(__name__)
        self.spark = spark
        self.make = make
        self.bucket = S3Service()
        self.settings = S3Settings()
        self.base_s3_path = f"s3a://{self.settings.S3_BUCKET}"
        self.raw_tss_path = S3_RAW_TSS_KEY_FORMAT.format(brand=self.make)
        super().__init__(
            self.raw_tss_path,
            "s3",
            force_update=force_update,
            **kwargs,
        )

    def run(self):
        """
        Run the extract / transform / load loop.

        Lists the response files not yet parsed, derives a batch size, then for each
        batch of VINs reads the JSON files, calls ``parse_data`` and appends the
        result to ``self.raw_tss_path``. Returns nothing.
        """
        start = time.time()
        keys_to_download_per_vin, paths_to_exclude = self._get_keys_to_download()
        end = time.time()

        # TO REMOVE — debug limiter kept from the original: only 20 VINs are processed.
        keys_to_download_per_vin = dict(islice(keys_to_download_per_vin.items(), 20))

        self.logger.info(
            f"Temps écoulé pour récupérer les clés à télécharger: {end - start:.2f} secondes"
        )

        start = time.time()
        optimal_partitions_nb, batch_size = self._set_optimal_spark_parameters(
            keys_to_download_per_vin, paths_to_exclude
        )
        end = time.time()
        self.logger.info(
            f"Temps écoulé pour déterminer les paramètres Spark: {end - start:.2f} secondes"
        )

        nb_vins = len(keys_to_download_per_vin)
        # Ceiling division: batch_size is always >= 1 (see _set_optimal_spark_parameters).
        nb_batches = -(-nb_vins // batch_size)
        self.logger.info(f"Batch size: {batch_size}")
        self.logger.info(f"Nb de vins: {nb_vins}")
        self.logger.info(f"Nb de batches: {nb_batches}")

        # Write in batches so that memory is not saturated.
        for batch_num, batch in enumerate(
            self._batch_dict_items(keys_to_download_per_vin, batch_size), 1
        ):
            self.logger.info(f"Batch {batch_num}:")

            start = time.time()
            # Extract
            raw_tss_unparsed = self._download_keys(batch)
            end = time.time()
            self.logger.info(
                f"Temps écoulé pour télécharger les json en spark {batch_num}: {end - start:.2f} secondes"
            )

            start = time.time()
            # Transform
            raw_tss_parsed = self.parse_data(raw_tss_unparsed, optimal_partitions_nb)
            end = time.time()
            self.logger.info(
                f"Temps écoulé pour transformer les données du batch {batch_num}: {end - start:.2f} secondes"
            )

            start = time.time()
            # Load
            self.bucket.append_spark_df_to_parquet(raw_tss_parsed, self.raw_tss_path)
            end = time.time()
            self.logger.info(
                f"Temps écoulé pour écrire les données dans le bucket {batch_num}: {end - start:.2f} secondes"
            )

            raw_tss_parsed.unpersist()
            del raw_tss_parsed

        self.logger.info(f"Traitement terminé pour {self.make}")

    def _set_optimal_spark_parameters(
        self, keys_to_download_per_vin: dict, paths_to_exclude: list[str], nb_cores: int = 8
    ) -> tuple[int, int]:
        """
        Derive the partition count and the number of VINs per batch.

        Args:
            keys_to_download_per_vin: Mapping VIN -> list of S3 keys still to process.
            paths_to_exclude: Keys passed as ``prefix_to_exclude`` when measuring the
                size of ``response/<make>/``.
            nb_cores: Number of CPU cores assumed available. Default: 8.

        Returns:
            ``(4 * nb_cores, vins_per_batch)``. When there is no VIN to process the
            batch size is 1, so callers can always unpack a 2-tuple.

        Raises:
            ValueError: If ``nb_cores`` is not strictly positive.
        """
        if nb_cores <= 0:
            raise ValueError("Nombre de cœurs doit être un entier positif")

        partitions_nb = 4 * nb_cores
        nb_vins = len(keys_to_download_per_vin)

        if nb_vins == 0:
            # Checked before the S3 size lookup, which is pointless without VINs.
            self.logger.warning("Aucun VIN à traiter, retour de batch_size = 1")
            return (partitions_nb, 1)

        file_size, _ = self.bucket.get_object_size(
            f"response/{self.make}/", prefix_to_exclude=paths_to_exclude
        )

        optimal_partitions = get_optimal_nb_partitions(file_size, nb_vins)
        self.logger.info(f"Nombre optimal de partitions: {optimal_partitions}")

        vin_per_batch = max(1, int((nb_vins / optimal_partitions) * nb_cores * 4))
        self.logger.info(f"Vin par batch: {vin_per_batch}")

        return (partitions_nb, vin_per_batch)

    def _group_paths_by_vin(self, paths: list[str]) -> dict[str, list[str]]:
        """
        Group S3 keys by VIN, the VIN being the second-to-last path segment.

        Keys containing ``/temp/`` and keys with fewer than two segments are ignored.
        """
        grouped: dict[str, list[str]] = {}

        for path in paths:
            if "/temp/" in path:
                continue
            parts = path.strip("/").split("/")
            if len(parts) < 2:
                continue  # ignore invalid paths
            grouped.setdefault(parts[-2], []).append(path)

        return grouped

    def _batch_dict_items(self, dictionary: dict, batch_size: int):
        """
        Yield successive sub-dictionaries of at most ``batch_size`` items.

        Raises:
            ValueError: If ``batch_size`` is not strictly positive (raised when the
                generator is first iterated).
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        # Materialise once; re-slicing the dict view from the start for every
        # batch made the original quadratic in the number of items.
        items = list(dictionary.items())
        for i in range(0, len(items), batch_size):
            yield dict(items[i : i + batch_size])

    def _get_keys_to_download(self) -> tuple[dict[str, list[str]], list[str]]:
        """
        List the response files that are newer than what has already been parsed.

        The most recent ``date`` per VIN is read from the existing raw TSS dataset
        (when it exists and has a ``date`` column). For those VINs, only files whose
        name (``YYYY-MM-DD.json``) is strictly later than that date are kept; the
        others are collected as paths to exclude.

        Returns:
            A tuple ``(keys_per_vin, paths_to_exclude)``:
            - ``keys_per_vin``: VIN -> non-empty list of S3 keys to process, in a
              randomised VIN order.
              Format: {'VIN123': ['response/brand/VIN123/2024-01-01.json', ...]}
            - ``paths_to_exclude``: keys already covered by the existing dataset.

        Raises:
            ValueError: If a file name or a stored date does not match the expected
                ``%Y-%m-%d`` format.
        """
        last_parsed_date_dict = None

        if self.bucket.check_spark_file_exists(self.raw_tss_path):
            raw_tss = self.bucket.read_parquet_df_spark(self.spark, self.raw_tss_path)
            if "date" in raw_tss.columns:
                last_dates_df = (
                    raw_tss.select("vin", "date")
                    .groupBy("vin")
                    .agg({"date": "max"})
                    .withColumnRenamed("max(date)", "last_parsed_date")
                )

                last_parsed_date_dict = (
                    last_dates_df.toPandas()
                    .set_index("vin")["last_parsed_date"]
                    .to_dict()
                )
            else:
                self.logger.info("Colonne 'date' non trouvée dans le dataset présent.")

        vins_paths = self.bucket.list_files(f"response/{self.make}/", type_file=".json")
        vins_paths_grouped = self._group_paths_by_vin(vins_paths)

        paths_to_exclude: list[str] = []

        self.logger.debug(f"Last parsed dates per VIN: {last_parsed_date_dict}")

        if last_parsed_date_dict:
            for vin, paths in vins_paths_grouped.items():
                if vin not in last_parsed_date_dict:
                    continue

                # str(...).split()[0] keeps only the date part of a timestamp-like value.
                cutoff = datetime.strptime(
                    str(last_parsed_date_dict[vin]).split()[0], "%Y-%m-%d"
                )

                # Parse each file date once and route the path to exactly one list.
                newer_paths = []
                for path in paths:
                    file_date = datetime.strptime(path.split("/")[-1], "%Y-%m-%d.json")
                    if file_date > cutoff:
                        newer_paths.append(path)
                    else:
                        paths_to_exclude.append(path)

                # Re-assigning an existing key is safe while iterating over items().
                vins_paths_grouped[vin] = newer_paths

        vins_paths_grouped = {k: v for k, v in vins_paths_grouped.items() if v}

        # Shuffle the vins to avoid skewness
        vins_paths_grouped = dict(
            random.sample(list(vins_paths_grouped.items()), k=len(vins_paths_grouped))
        )

        return (vins_paths_grouped, paths_to_exclude)

    def _download_keys(self, batch: dict[str, list[str]]) -> DataFrame:
        """
        Read every JSON file of the batch into a single Spark DataFrame.

        The files are read as multi-line JSON with the schema ``SCHEMAS[self.make]``.
        """
        schema = SCHEMAS[self.make]

        keys_to_download_str = [
            f"s3a://{self.settings.S3_BUCKET}/{key}"
            for paths in batch.values()
            for key in paths
        ]

        return (
            self.spark.read.option("multiline", "true")
            .schema(schema)
            .json(keys_to_download_str)
        )

    # NOTE(review): @abstractmethod is only enforced if the class hierarchy uses
    # ABCMeta — confirm CachedETLSpark does, otherwise this is documentation only.
    @abstractmethod
    def parse_data(self, df: DataFrame, optimal_partitions_nb: int) -> DataFrame:
        """Make-specific transformation of the raw JSON DataFrame; implemented by subclasses."""
        pass


In [None]:
from pyspark.sql.functions import col, explode, expr, udf
from pyspark.sql.types import *
from pyspark.sql.types import (ArrayType, BooleanType, DateType, DoubleType,
                               IntegerType, LongType, StringType, StructField,
                               StructType, TimestampType)
from typing import Optional
from pyspark.sql import SparkSession
from logging import Logger
import logging
from dotenv import load_dotenv
from core.s3.settings import S3Settings
from core.spark_utils import create_spark_session
from core.console_utils import main_decorator
import sys
from pyspark.sql import DataFrame
from pyspark.sql.functions import input_file_name, regexp_extract, to_timestamp, first
from pyspark.sql.functions import col, explode, array, struct, lit, size
from functools import reduce


class MobilisightResponseToRaw(ResponseToRawTss):
    """
    ``ResponseToRawTss`` specialisation for the Mobilisight API responses.

    The make defaults to ``"stellantis"``.
    """

    def __init__(
        self,
        make: str = "stellantis",
        force_update: bool = False,
        writing_mode: Optional[str] = "append",
        spark: SparkSession = None,
        logger: Logger = None,
        **kwargs,
    ):
        """Forward every argument unchanged to ``ResponseToRawTss.__init__``."""
        base_arguments = {
            "make": make,
            "force_update": force_update,
            "writing_mode": writing_mode,
            "spark": spark,
            "logger": logger,
        }
        super().__init__(**base_arguments, **kwargs)

    def parse_data(self, df: DataFrame, optimal_partitions_nb: int) -> DataFrame:
        """
        Redistribute the raw DataFrame without altering its content.

        Args:
            df (DataFrame): Raw JSON data read for the current batch.
            optimal_partitions_nb (int): Partition count passed to ``coalesce``.

        Returns:
            DataFrame: ``df`` repartitioned on ``vin`` and then coalesced.
        """
        partitioned_by_vin = df.repartition('vin')
        return partitioned_by_vin.coalesce(optimal_partitions_nb)

In [None]:
import logging
import sys
from transform.raw_tss.config import SCHEMAS

# Send log records of level INFO and above to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)

# Fix: use getLogger instead of Logger
logger = logging.getLogger('Logger RawTss')

# NOTE(review): `run()` contains no return statement, so `df` is bound to None here.
df = MobilisightResponseToRaw(make='stellantis', force_update=True, spark=spark, logger=logger).run()

In [None]:
# NOTE(review): relies on `S3Service` and `spark` from earlier cells.
s3 = S3Service()

# Read the stellantis raw time-series parquet dataset as a Spark DataFrame.
# NOTE(review): presumably the dataset written by the ETL run — confirm the key.
df = s3.read_parquet_df_spark(spark=spark, key='raw_ts/stellantis/time_series/raw_ts_spark.parquet')

In [None]:
from pyspark.sql.types import StructType, ArrayType
from typing import Dict
from pyspark.sql import functions as F

def build_fields_from_schema(
    schema: StructType,
    prefix: str = "",
    naming_sep: str = "_"
) -> Dict[str, Dict]:
    """
    Walk a StructType recursively and describe every array-of-struct field whose
    element struct has a 'datetime' member, for later flat extraction.

    Args:
      schema: Spark schema (StructType) to inspect
      prefix: dotted path of the current position, e.g. "electricity.level"
      naming_sep: separator used when building output column names

    Returns:
      Dict keyed by dotted path; each value is
      {"path": <dotted path>, "fields": {<struct member>: <output column name>}}.
      The 'datetime' member itself is never listed in "fields".
    """
    collected = {}

    for entry in schema.fields:
        dotted_path = f"{prefix}.{entry.name}" if prefix else entry.name
        dtype = entry.dataType

        if isinstance(dtype, ArrayType) and isinstance(dtype.elementType, StructType):
            element = dtype.elementType
            member_names = [member.name for member in element.fields]

            if "datetime" not in member_names:
                # No datetime at this level: look for candidates deeper in the element.
                collected.update(build_fields_from_schema(element, dotted_path, naming_sep))
                continue

            # Output names are the path segments joined by the separator, then the
            # member name (e.g. electricity_level_percentage).
            column_prefix = dotted_path.replace('.', naming_sep)
            collected[dotted_path] = {
                "path": dotted_path,
                "fields": {
                    name: f"{column_prefix}{naming_sep}{name}"
                    for name in member_names
                    if name != "datetime"
                },
            }

        elif isinstance(dtype, StructType):
            # Plain struct: recurse into it.
            collected.update(build_fields_from_schema(dtype, dotted_path, naming_sep))

        # Any other field type is ignored: only arrays of structs are extracted.

    return collected

# Build the extraction map from the stellantis schema.
fields = build_fields_from_schema(SCHEMAS["stellantis"])

# Show the resulting map.
print(fields)

def explode_and_select(df, path, field_mapping):
    """
    Explode the array of structs found at `path` and project the requested members.

    Args:
        df: DataFrame with a "vin" column and the array column at `path`.
        path: column path of the array to explode (``explode_outer`` is used).
        field_mapping: dict struct_member -> output column name.

    Returns:
        DataFrame with columns (vin, datetime, <mapped columns...>).
    """
    entry_alias = "exploded_struct"

    per_entry = df.select("vin", F.explode_outer(path).alias(entry_alias))

    mapped_columns = [
        F.col(f"{entry_alias}.{member}").alias(output_name)
        for member, output_name in field_mapping.items()
    ]

    return per_entry.select(
        F.col("vin"),
        F.col(f"{entry_alias}.datetime").alias("datetime"),
        *mapped_columns,
    )

def merge_all_exploded(dfs):
    """
    Combine the exploded DataFrames into one, joining successively on
    (vin, datetime) with a full outer join, left to right.
    """
    from functools import reduce

    join_keys = ["vin", "datetime"]

    def _outer_join(left, right):
        return left.join(right, on=join_keys, how="full_outer")

    return reduce(_outer_join, dfs)


# Build the list of exploded DataFrames, one per entry of `fields`.
dfs = []
for key, params in fields.items():
    print(key, params)
    path = params["path"]
    # NOTE(review): `spark_path` is assigned but not used in this cell.
    spark_path = F.col(path)
    df_exp = explode_and_select(df, path, params["fields"])
    dfs.append(df_exp)

# Merge all the DataFrames on (vin, datetime)
df_final = merge_all_exploded(dfs)

In [None]:
# Preview the merged DataFrame.
df_final.show()

In [None]:
from pyspark.sql.functions import col, explode_outer, expr, coalesce, explode

# Repartition the raw DataFrame into 32 partitions and preview it.
# NOTE(review): the original comment said "explode all the necessary fields",
# but nothing is exploded in this cell.
df_flat = df.repartition(32)

df_flat.show()




def extract_timeseries(df_raw, vin_col, path, field_mapping):
    """
    Explode the array column at `path` and return one row per entry with the
    VIN column, the entry's `datetime`, and the members listed in `field_mapping`
    (dict struct_member -> output column name).
    """
    entries = df_raw.select(col(vin_col), explode(col(path)).alias("entry"))

    projected = [col(vin_col), col("entry.datetime").alias("datetime")]
    projected.extend(
        col(f"entry.{member}").alias(output_name)
        for member, output_name in field_mapping.items()
    )

    return entries.select(*projected)

In [None]:
# Show the `data` value of the row labelled 0 among rows whose `data` is not null.
# NOTE(review): `test` is not defined in the visible cells of this notebook — confirm where it comes from.
test[test.data.notna()].loc[0, :].data

In [None]:
# List the distinct values of the `signal` column.
df.select('signal').distinct().show()

In [None]:
from pyspark.sql.functions import col, explode, array, struct, lit

# Array of structs {timestamp: long, data: {value: double, unit: string}}.
target_schema = ArrayType(StructType([
    StructField("timestamp", LongType()),
    StructField("data", StructType([
        StructField("value", DoubleType()),
        StructField("unit", StringType())
    ]))
]))

def normalize_entries(entries):
    """
    Normalise a list of signal entries to ``{"timestamp", "data": {"value", "unit"}}``.

    Each entry may be a dict or an object exposing ``asDict()`` (Spark hands struct
    values to Python UDFs as ``Row`` objects, which have no ``.get``). The entry's
    ``data`` may itself be a mapping with ``value`` / ``unit``, or a bare value, in
    which case it becomes ``value`` and ``unit`` is None.

    Args:
        entries: Iterable of entries, or None.

    Returns:
        None when ``entries`` is None, otherwise a list of normalised dicts.
        Entries that are None are skipped.
    """
    if entries is None:
        return None

    def _as_mapping(obj):
        # Row-like objects are converted to plain dicts; anything else is returned as is.
        as_dict = getattr(obj, "asDict", None)
        return as_dict() if callable(as_dict) else obj

    normalized = []
    for raw_entry in entries:
        entry = _as_mapping(raw_entry)
        if entry is None:
            continue  # a null element carries no information

        timestamp = entry.get("timestamp", None)
        data = _as_mapping(entry.get("data", None))
        if isinstance(data, dict):
            value = data.get("value", None)
            unit = data.get("unit", None)
        else:
            value = data
            unit = None
        # NOTE(review): `value` is not coerced; confirm non-float values are
        # acceptable for the DoubleType declared in the UDF return schema.
        normalized.append({"timestamp": timestamp, "data": {"value": value, "unit": unit}})
    return normalized

# Wrap normalize_entries as a Spark UDF whose declared return type is target_schema.
normalize_entries_udf = udf(normalize_entries, target_schema)

# Column paths of the signals to normalise; each signal's key is the last path segment.
SIGNAL_PATHS = [
    "diagnostics.odometer",
    "diagnostics.battery_voltage",
    "diagnostics.engine_coolant_temperature",
    "charging.battery_level",
    "charging.battery_level_at_departure",
    "charging.charging_rate",
    "charging.estimated_range",
    "charging.max_range",
    "charging.plugged_in",
    "charging.fully_charged_end_times",
    "charging.preconditioning_scheduled_time",
    "charging.preconditioning_remaining_time",
    "charging.preconditioning_departure_status",
    "charging.smart_charging_status",
    "charging.starter_battery_state",
    "charging.status",
    "usage.drive_start_time",
    "usage.drive_end_time",
    "usage.total_drive_distance",
    "usage.total_drive_duration",
    "usage.total_drive_energy_consumed",
    "charging_session.session_start_time",
    "charging_session.session_end_time",
    "charging_session.session_energy_delivered",
    "charging_session.session_duration",
]

# One struct {key, entries} per signal, gathered in a single "signals" array column.
# Built from the list above: most of the hand-written lines called the undefined
# name `normalize_entries_udfcol` (missing "(" between the UDF and `col`).
df_zipped = df.select(
    "vin",
    array(*[
        struct(
            lit(signal_path.rsplit(".", 1)[-1]).alias("key"),
            normalize_entries_udf(col(signal_path)).alias("entries"),
        )
        for signal_path in SIGNAL_PATHS
    ]).alias("signals")
)

# First explode: one row per (vin, signal). Second explode: one row per entry of that signal.
df_exploded = df_zipped.select(
    "vin",
    explode("signals").alias("signal")
).select(
    "vin",
    col("signal.key").alias("key"),
    explode(col("signal.entries")).alias("entry")
)

# The original call was left unclosed (`show(`), which is a syntax error for the cell.
df_exploded.show()

# Développement

In [None]:
# Committer / shuffle-retry settings applied to the already-created session.
# NOTE(review): confirm these take effect when set at runtime; they may need to be
# passed to the session builder instead.
spark.conf.set("spark.hadoop.fs.s3a.committer.name", "directory")
spark.conf.set("spark.sql.sources.commitProtocolClass", 
               "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol")
spark.conf.set("spark.sql.parquet.output.committer.class", 
               "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter")
spark.conf.set("spark.shuffle.io.retryWait", "60s")    # Wait longer before retrying
spark.conf.set("spark.shuffle.io.maxRetries", "10")    # More retries

In [None]:
from pydantic_settings import BaseSettings
from pydantic import Field
import boto3
from abc import ABC, abstractmethod
import asyncio
import aioboto3
from botocore.exceptions import ClientError
from fastapi import Depends
from datetime import datetime
import msgspec
import asyncio
from datetime import datetime
import pandas as pd
import pyarrow.parquet as pq
import io
from abc import ABC, abstractmethod
import pyarrow as pa
import pyarrow.fs as fs
import pyarrow.dataset as ds
from typing_extensions import Annotated, Iterable
import re

In [None]:
class AsyncS3:
    """
    Small S3 helper for Scaleway object storage.

    Exposes async object operations (through aioboto3), synchronous listings
    (through boto3) and a pyarrow filesystem for partitioned parquet datasets.
    Async operations are throttled by a shared semaphore.
    """

    REGION = "fr-par"
    ENDPOINT_URL = "https://s3.fr-par.scw.cloud"

    def __init__(self, max_concurrency: int = 100, bucket: str = "bib-platform-prod-data"):
        """
        Args:
            max_concurrency: Maximum number of async operations holding the semaphore at once.
            bucket: Bucket every operation targets.
        """
        # Credentials are read from the project settings instead of being committed
        # in the notebook. The key pair that used to be hard-coded here should be
        # considered exposed and rotated.
        self._settings = S3Settings()
        self.access_key = self._settings.S3_KEY
        self.secret_key = self._settings.S3_SECRET

        self.session = aioboto3.Session(
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            region_name=self.REGION,
        )
        self.bucket = bucket
        self._sem = asyncio.Semaphore(max_concurrency)
        self.filesystem = fs.S3FileSystem(
            access_key=self.access_key,
            secret_key=self.secret_key,
            endpoint_override=self.ENDPOINT_URL,
            region=self.REGION,
        )

        self.client_non_async = boto3.client(
            "s3",
            region_name=self.REGION,
            endpoint_url=self.ENDPOINT_URL,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
        )

    @property
    def _client(self):
        """A new async S3 client context manager; use it with ``async with``."""
        return self.session.client(
            "s3",
            region_name=self.REGION,
            endpoint_url=self.ENDPOINT_URL,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
        )

    def list_folders(self, path: str = ""):
        """Return the sorted names of the immediate "sub-folders" (common prefixes) under ``path``."""
        folders = set()
        paginator = self.client_non_async.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self.bucket, Prefix=path, Delimiter="/"):
            for cp in page.get("CommonPrefixes", []):
                prefix = cp.get("Prefix")
                if prefix:
                    folders.add(prefix.rstrip("/").split("/")[-1])
        return sorted(folders)

    def list_files(self, path: str = "", type_file: str = ""):
        """Return the sorted keys under ``path`` that end with ``type_file`` (all keys when empty)."""
        files = []
        paginator = self.client_non_async.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self.bucket, Prefix=path):
            for content in page.get("Contents", []):
                key = content["Key"]
                if key.endswith(type_file):
                    files.append(key)
        return sorted(files)

    async def get_file(self, path: str) -> Optional[bytes]:
        """Download one object; returns None when the key does not exist."""
        async with self._sem:
            async with self._client as client:  # type: ignore
                try:
                    response = await client.get_object(Bucket=self.bucket, Key=path)
                    async with response["Body"] as stream:
                        return await stream.read()
                except ClientError as e:
                    if e.response["Error"]["Code"] == "NoSuchKey":
                        return None
                    raise

    async def get_files(self, paths):
        """
        Download several objects concurrently.

        Returns:
            dict key -> content, without the keys that do not exist.
        """
        results: dict[str, bytes] = {}

        async def download(key: str):
            # get_file already takes the semaphore. Taking it here as well made every
            # task hold one permit while waiting for a second one, which deadlocks as
            # soon as `max_concurrency` downloads are in flight.
            data = await self.get_file(key)
            if data is not None:
                results[key] = data

        await asyncio.gather(*(download(f) for f in paths))
        return results

    async def upload_file(self, path: str, file: bytes) -> None:
        """Upload ``file`` to the key ``path``."""
        async with self._client as client:  # type: ignore
            await client.put_object(Bucket=self.bucket, Key=path, Body=file)

    async def delete_file(self, path: str) -> bool:
        """Delete one object; returns False when the service reports NoSuchKey."""
        async with self._sem:
            async with self._client as client:  # type: ignore
                try:
                    await client.delete_object(Bucket=self.bucket, Key=path)
                    return True
                except ClientError as e:
                    if e.response["Error"]["Code"] == "NoSuchKey":
                        return False
                    raise

    async def download_folder(self, folder_path: str):
        """Download every object under ``folder_path``; returns dict key -> content."""
        # list_files is synchronous (it cannot be awaited); run it in a worker thread
        # so the event loop is not blocked while the listing is paginated.
        files = await asyncio.to_thread(self.list_files, folder_path)
        return await self.get_files(files)

    async def delete_folder(self, prefix: str) -> int:
        """
        Delete every object under ``prefix``.

        Returns:
            Number of keys submitted for deletion minus the errors reported by the service.
        """
        deleted_count = 0
        async with self._sem:
            async with self._client as client:  # type: ignore
                paginator = client.get_paginator("list_objects_v2")
                async for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
                    contents = page.get("Contents", [])
                    if not contents:
                        continue

                    # Batch delete (doc says max 1000 at a time)
                    for i in range(0, len(contents), 1000):
                        batch = contents[i : i + 1000]
                        keys = [{"Key": obj["Key"]} for obj in batch]

                        response = await client.delete_objects(
                            Bucket=self.bucket, Delete={"Objects": keys, "Quiet": True}
                        )
                        # In quiet mode only the failures are listed in the response.
                        errors = (response or {}).get("Errors", [])
                        deleted_count += len(keys) - len(errors)

            return deleted_count

    def get_parquet_partitioned_file(self, path: str):
        """Open the hive-partitioned parquet dataset stored at ``path`` in the bucket."""
        return ds.dataset(
            f"{self.bucket}/{path}",
            filesystem=self.filesystem,
            format="parquet",
            partitioning="hive",
        )

In [None]:


class ResponseToRaw(ABC):
    """Base converter from stored API responses to the raw time series.

    It can be extended for every car brand: only ``_get_files_to_add`` has to
    be implemented. That method returns the new data as a list of dicts; each
    dict represents one datapoint and must contain at least the keys 'date'
    and 'vin'.
    """

    def __init__(self, brand_prefix: str, spark) -> None:
        self._s3 = AsyncS3()
        self.brand_prefix = brand_prefix
        self.spark = spark

    async def convert(self):
        """Append the datapoints newer than the stored time series and save the result."""
        ts = await self._get_ts()
        latest = pd.to_datetime(ts["readable_date"]).max() if ts.shape[0] > 0 else pd.NaT
        # datetime.min stands for "no previous data": datetime(0, 0, 0) is not a
        # valid date, and an all-null column yields NaT which never compares greater.
        ts_last_date: datetime = datetime.min if pd.isna(latest) else latest.to_pydatetime()

        new_files = await self._get_files_to_add(ts_last_date)
        new_df = pd.DataFrame(new_files)
        extended_ts = pd.concat([ts, new_df])
        await self._save_ts(extended_ts)

    async def _get_ts(self) -> pd.DataFrame:
        """Load the ``readable_date`` column of the stored time series as a pandas frame."""
        dataset = self._s3.get_parquet_partitioned_file(
            f"raw_ts/{self.brand_prefix}/time_series/spark_raw_tss.parquet"
        )
        return dataset.to_table(columns=["readable_date"]).to_pandas()

    async def _save_ts(self, df: pd.DataFrame):
        """Serialize ``df`` to parquet in memory and upload it as ``cp_spark_raw_tss.parquet``."""
        buffer = io.BytesIO()
        df.to_parquet(buffer, engine="pyarrow")
        parquet_bytes = buffer.getvalue()
        await self._s3.upload_file(
            f"raw_ts/{self.brand_prefix}/time_series/cp_spark_raw_tss.parquet", parquet_bytes
        )

    @abstractmethod
    async def _get_files_to_add(self, last_date: datetime):
        """Return the datapoints newer than ``last_date`` as a list of dicts."""
        pass

    def _paths_to_download(self, last_date: datetime, **kwargs):
        """Return the daily response files dated strictly after ``last_date``.

        Args:
            last_date: Only files named ``YYYY-MM-DD.json`` with a later date are kept.
            **kwargs: Forwarded to ``self._s3.list_files``.

        Returns:
            Flat list of the selected paths, for all VIN folders of the brand.
        """
        # NOTE(review): list_folders / list_files are called synchronously here;
        # if the S3 helper exposes them as coroutines they must be awaited — confirm.
        vins = self._s3.list_folders(f"response/{self.brand_prefix}/")
        vins_data = [
            self._s3.list_files(f"response/{self.brand_prefix}/{vin}/", **kwargs) for vin in vins
        ]

        # The whole file name must be a date, otherwise strptime below would raise.
        daily_file = re.compile(r"\d{4}-\d{2}-\d{2}\.json")

        path_to_dl = []
        for vin_daily_files in vins_data:
            for path in vin_daily_files:
                file_name = path.split("/")[-1]
                if not daily_file.fullmatch(file_name):
                    continue
                if datetime.strptime(file_name, "%Y-%m-%d.json") > last_date:
                    path_to_dl.append(path)

        return path_to_dl

In [None]:
def align_dataframes_for_union(df1, df2, strategy="intersection"):
    """Give two DataFrames the same columns, in the same order, before a union.

    Args:
        df1, df2: DataFrames to align.
        strategy: "intersection" keeps only the columns present in both frames;
            "union" keeps every column and adds the missing ones as NULL
            columns cast to string.

    Returns:
        Tuple ``(df1_aligned, df2_aligned)`` with identical, alphabetically
        sorted column lists.

    Raises:
        ValueError: if ``strategy`` is neither "intersection" nor "union".
    """
    # Fail early with a clear message instead of an UnboundLocalError at the return.
    if strategy not in ("intersection", "union"):
        raise ValueError(
            f"Unknown strategy {strategy!r}: expected 'intersection' or 'union'"
        )

    cols1 = set(df1.columns)
    cols2 = set(df2.columns)

    print(f"DataFrame 1: {len(cols1)} colonnes")
    print(f"DataFrame 2: {len(cols2)} colonnes")

    if strategy == "intersection":
        common_cols = cols1 & cols2
        print(f"Colonnes communes: {len(common_cols)}")

        # Report what each frame is missing compared with the other one.
        missing_in_df1 = cols2 - cols1
        missing_in_df2 = cols1 - cols2

        if missing_in_df1:
            print(f"Colonnes manquantes dans df1: {missing_in_df1}")
        if missing_in_df2:
            print(f"Colonnes manquantes dans df2: {missing_in_df2}")

        ordered = sorted(common_cols)
        return df1.select(*ordered), df2.select(*ordered)

    all_cols = sorted(cols1 | cols2)
    print(f"Toutes les colonnes: {len(all_cols)}")

    # `name` (not `col`) so the Spark `col` function is not shadowed.
    for name in all_cols:
        if name not in cols1:
            df1 = df1.withColumn(name, lit(None).cast("string"))
        if name not in cols2:
            df2 = df2.withColumn(name, lit(None).cast("string"))

    return df1.select(*all_cols), df2.select(*all_cols)

In [None]:
from pyspark.sql.types import StructType, StructField, LongType, DoubleType, StringType
from pyspark.sql.types import StructType, StructField, StringType
import time


In [None]:
from pyspark.sql.types import *

# Members of the per-datapoint "value" struct. The string-typed members are
# generated from their names; the order of the fields is kept as declared.
_value_type = StructType(
    [
        StructField("doubleValue", DoubleType(), True),
        StructField("intValue", IntegerType(), True),
        StructField("booleanValue", BooleanType(), True),
        *[
            StructField(field_name, StringType(), True)
            for field_name in (
                "stringValue",
                "carTypeValue",
                "bmsStateValue",
                "climateKeeperModeValue",
                "chargePortValue",
                "defrostModeValue",
                "detailedChargeStateValue",
                "fastChargerValue",
                "hvacAutoModeValue",
                "hvacPowerValue",
                "sentryModeStateValue",
            )
        ],
        StructField("invalid", BooleanType(), True),
    ]
)

# One element of the "data" array: a key and its typed value.
_data_item_type = StructType(
    [
        StructField("key", StringType(), True),
        StructField("value", _value_type),
    ]
)

# Explicit schema used when reading the response JSON files.
schema = StructType(
    [
        StructField("vin", StringType(), True),
        StructField("timestamp", LongType(), True),
        StructField("readable_date", StringType(), True),
        StructField("createdAt", StringType(), True),
        StructField("data", ArrayType(_data_item_type), True),
    ]
)

In [None]:
from pyspark.sql.functions import col, coalesce, explode, expr
from pyspark.sql.functions import udf



class TFTResponseToRawSpark(ResponseToRaw):
    """Spark-based ``ResponseToRaw`` for the Tesla fleet-telemetry responses."""

    # Members of the "value" struct, tried in this order; the first non-null one
    # is used. The last four are declared in the JSON schema of this notebook
    # but were previously never extracted, so their values came out as NULL.
    _VALUE_FIELDS = (
        "stringValue",
        "doubleValue",
        "intValue",
        "booleanValue",
        "detailedChargeStateValue",
        "cableTypeValue",
        "climateKeeperModeValue",
        "defrostModeValue",
        "fastChargerValue",
        "hvacAutoModeValue",
        "hvacPowerValue",
        "carTypeValue",
        "bmsStateValue",
        "chargePortValue",
        "sentryModeStateValue",
    )

    def _get_files_to_add(self, last_date: datetime, schema):
        """Read, parse and pivot the response files newer than ``last_date``.

        The files are processed one VIN at a time; the per-VIN results are
        unioned on their common columns, saved as ``raw_ts_spark.parquet`` and
        returned.

        NOTE(review): ``ResponseToRaw.convert`` awaits
        ``_get_files_to_add(last_date)``, whereas this override is synchronous
        and needs ``schema`` — confirm how it is meant to be called.

        Args:
            last_date: Only files dated after this value are processed.
            schema: Spark schema used to read the JSON files.

        Returns:
            The unioned Spark DataFrame, or None when there is nothing to add.
        """
        logger.info("Retrieve files to %s", datetime.now())
        path_to_download: list[str] = self._paths_to_download(last_date, type_file=".json")
        logger.info("Change all files to complete path %s", datetime.now())
        logger.info("%d", len(path_to_download))

        # Only the first 1000 paths are processed.
        path_to_download = [f"s3a://bib-platform-prod-data/{path}" for path in path_to_download][:1000]

        path_to_download = self.group_paths_by_vin(path_to_download)

        df_write = None

        for vin, paths in path_to_download.items():
            sp_df = self.spark.read.option("multiline", "true").schema(schema).json(paths)

            logger.info("Parse the data %s", datetime.now())
            parsed_df = self._parse_tesla_data(sp_df)

            logger.info("Pivot the data %s", datetime.now())
            pivoted = (
                parsed_df.repartition("vin")
                .groupBy("vin", "timestamp", "readable_date", "createdAt")
                .pivot("key")
                .agg(expr("first(value)"))
            )

            if df_write is not None:
                pivoted, df_write = align_dataframes_for_union(
                    pivoted, df_write, strategy="intersection"
                )
                df_write = df_write.union(pivoted).dropDuplicates()
            else:
                df_write = pivoted

        if df_write is None:
            # Nothing newer than last_date: there is no DataFrame to save.
            logger.info("No new file to add")
            return None

        self._save_as_parquet(df_write, "raw_ts_spark.parquet")

        return df_write

    def group_paths_by_vin(self, paths):
        """Group ``paths`` by their parent folder name, used as the VIN.

        Paths with fewer than two '/'-separated parts are ignored.

        Returns:
            Dict mapping each VIN to the list of its paths, in input order.
        """
        grouped = {}

        for path in paths:
            parts = path.strip("/").split("/")
            if len(parts) < 2:
                continue  # ignore invalid paths
            grouped.setdefault(parts[-2], []).append(path)

        return grouped

    def _parse_tesla_data(self, df):
        """Flatten the ``data`` array into one row per (datapoint, key).

        Returns:
            DataFrame with the columns vin, timestamp, readable_date,
            createdAt, key and value, where ``value`` is the string form of the
            first non-null member of the original value struct.
        """
        # Explode the data array: one row per array element.
        exploded_df = df.select(
            "vin", "timestamp", "readable_date", "createdAt",
            explode("data").alias("data_item")
        )

        # Extract key and value from each element.
        parsed_df = exploded_df.select(
            "vin", "timestamp", "readable_date", "createdAt",
            col("data_item.key").alias("key"),
            col("data_item.value").alias("value")
        )

        # Bound to a local so the UDF closure captures the tuple only, not `self`.
        fields = self._VALUE_FIELDS

        def extract_value(value_struct):
            if value_struct is None:
                return None
            for field in fields:
                candidate = getattr(value_struct, field, None)
                if candidate is not None:
                    return str(candidate)
            return None

        extract_value_udf = udf(extract_value, StringType())

        result_df = parsed_df.select(
            "vin", "timestamp", "readable_date", "createdAt", "key",
            extract_value_udf("value").alias("value")
        )

        return result_df

    def _save_as_parquet(self, df, key: str):
        """Merge ``df`` with the parquet data stored under ``key`` and overwrite it.

        Meant to concatenate with what already exists and overwrite the files
        already present in the bucket. Speed and viability are unknown.

        NOTE(review): when existing data is found, ``df_write`` is derived from
        a read of ``s3_path`` and is then written back to that same path with
        mode "overwrite" — confirm Spark accepts this.

        Args:
            df: Spark DataFrame to save.
            key: File name appended to the time-series prefix.
        """
        s3_path = f"s3a://bib-platform-prod-data/raw_ts/tesla-fleet-telemetry/time_series/{key}"

        try:
            # Try to read the existing data.
            processed = self.spark.read.parquet(s3_path)

            if processed.count() > 0:
                processed, df = align_dataframes_for_union(
                    processed, df, strategy="intersection"
                )
                df_write = processed.union(df).dropDuplicates()
            else:
                df_write = df

        except Exception as e:
            # A missing or unreadable target means there is nothing to merge with.
            message = str(e)
            if (
                "PATH_NOT_FOUND" in message
                or "does not exist" in message
                or "UNABLE_TO_INFER_SCHEMA" in message
            ):
                df_write = df
            else:
                # Any other error is re-raised with its original traceback.
                raise

        print("Write the data", datetime.now())
        df = df.persist()
        df.count()
        (
            df_write.coalesce(100)
            .write.mode("overwrite")
            .option("parquet.compression", "snappy")
            .option("parquet.block.size", 67108864)
            .partitionBy("vin")
            .parquet(s3_path)
        )

In [None]:
# Build the Tesla fleet-telemetry converter on the existing `spark` session.
TRtR = TFTResponseToRawSpark('tesla-fleet-telemetry', spark)

# Response files dated strictly after 2025-01-01; type_file is forwarded to the file listing.
paths = TRtR._paths_to_download(datetime(2025, 1, 1), type_file=".json")

In [None]:
def group_paths_by_vin(paths):
    """Group ``paths`` by their parent folder name, used as the VIN.

    Paths with fewer than two '/'-separated parts are ignored.

    Returns:
        Dict mapping each VIN to the list of its paths, in input order.
    """
    by_vin = {}
    for full_path in paths:
        segments = full_path.strip("/").split("/")
        if len(segments) >= 2:
            # The second-to-last segment is the VIN folder.
            by_vin.setdefault(segments[-2], []).append(full_path)
    return by_vin

# Prefix every relative path with the s3a:// bucket URI, then group the URIs per VIN.
paths_added = [f"s3a://bib-platform-prod-data/{path}" for path in paths]
paths_grouped = group_paths_by_vin(paths_added)

In [None]:
# Work on the VINs at positions 400..439 of paths_grouped (manual batching).
keys = list(paths_grouped.keys())[400:440] # Next 460 - 480
# One list of paths per selected VIN, then a single flat list of all those paths.
paths_final = [paths_grouped[key] for key in keys]
flattened = [item for sublist in paths_final for item in sublist]

In [None]:
start = time.time()

# Build the JSON reader over the selected paths using the explicit schema.
sp_df = spark.read.option("multiline", "true").schema(schema).json(flattened)
end = time.time()

# Repartition by 'vin', then coalesce to 32 partitions.
sp_df = sp_df.repartition('vin')
sp_df = sp_df.coalesce(32)


# NOTE: not the last expression of the cell, so this value is not displayed.
sp_df.rdd.getNumPartitions()

# Only the read call above is inside the timed window.
print(f"Temps écoulé avec schema: {end - start:.2f} secondes")

In [None]:
# Show the current number of partitions of sp_df.
print(sp_df.rdd.getNumPartitions())

In [None]:
def parse_tesla_data(df):
    """Flatten the ``data`` array into one row per (datapoint, key).

    Args:
        df: Spark DataFrame with the columns vin, timestamp, readable_date,
            createdAt and data (array of key/value structs).

    Returns:
        DataFrame with the columns vin, timestamp, readable_date, createdAt,
        key and value, where ``value`` is the string form of the first
        non-null member of the original value struct.
    """
    # Members of the value struct, tried in this order. The last four are
    # declared in the JSON schema of this notebook but were previously never
    # extracted, so their values came out as NULL.
    value_fields = (
        "stringValue",
        "doubleValue",
        "intValue",
        "booleanValue",
        "detailedChargeStateValue",
        "cableTypeValue",
        "climateKeeperModeValue",
        "defrostModeValue",
        "fastChargerValue",
        "hvacAutoModeValue",
        "hvacPowerValue",
        "carTypeValue",
        "bmsStateValue",
        "chargePortValue",
        "sentryModeStateValue",
    )

    # Explode the data array: one row per array element.
    exploded_df = df.select(
        "vin", "timestamp", "readable_date", "createdAt",
        explode("data").alias("data_item")
    )

    # Extract key and value from each element.
    parsed_df = exploded_df.select(
        "vin", "timestamp", "readable_date", "createdAt",
        col("data_item.key").alias("key"),
        col("data_item.value").alias("value")
    )  # .coalesce(32) still to be tested

    def extract_value(value_struct):
        if value_struct is None:
            return None
        for field in value_fields:
            candidate = getattr(value_struct, field, None)
            if candidate is not None:
                return str(candidate)
        return None

    extract_value_udf = udf(extract_value, StringType())

    result_df = parsed_df.select(
        "vin", "timestamp", "readable_date", "createdAt", "key",
        extract_value_udf("value").alias("value")
    )

    return result_df


# Time the construction of the parsed DataFrame.
# NOTE(review): Spark transformations are lazy, so these timings may not include
# the actual computation — confirm before relying on them.
start = time.time()
parsed_df = parse_tesla_data(sp_df)
end = time.time()
print(f"Temps écoulé pour parsing: {end - start:.2f} secondes")

# Time the pivot: one column per distinct `key`, keeping the first value of each group.
start = time.time()
pivoted = parsed_df.groupBy("vin", "timestamp", "readable_date", "createdAt") \
    .pivot("key") \
    .agg(expr("first(value)"))
end = time.time()
print(f"Temps écoulé pour pivoté: {end - start:.2f} secondes")

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException
from datetime import datetime
import logging

logger = logging.getLogger(__name__)

def save_as_parquet(df: DataFrame, key: str):
    """Append ``df`` as parquet files, partitioned by 'vin', under ``key`` in S3.

    The data is appended as is: nothing already stored at the target path is
    read, merged or de-duplicated. The write duration is printed.

    Args:
        df: Spark DataFrame to write; it must have a 'vin' column.
        key: File name appended to the time-series prefix.
    """
    s3_path = f"s3a://bib-platform-prod-data/raw_ts/tesla-fleet-telemetry/time_series/{key}"

    start = time.time()
    (
        df.repartition('vin')  # repartition by the 'vin' column before writing
        .write
        .mode("append")  # accumulate without re-reading the existing data
        .option("parquet.compression", "snappy")
        .option("parquet.block.size", 134217728)  # 128MB
        .partitionBy("vin")
        .parquet(s3_path)
    )
    end = time.time()
    print(f"Temps écoulé pour écrire: {end - start:.2f} secondes")

# Time the whole call (save_as_parquet also prints its own write duration).
start = time.time()
save_as_parquet(pivoted, "raw_ts_spark.parquet")
end = time.time()
print(f"Temps écoulé pour écrire: {end - start:.2f} secondes")

# To delete

In [None]:
import boto3
from core.s3.s3_utils import S3Service


bucket = S3Service()

# Total data size in bytes (4019.10141225962 MiB) and number of VINs.
total_size = 4019.10141225962 *1024 *1024
nb_vin = 550

In [None]:
# Latest value of the "date" column: sort descending and take the first row.
# NOTE: `ptss` is only assigned in the following cell, so on a fresh kernel
# that cell has to run before this one.
max_date = ptss.coalesce(32).orderBy("date", ascending=False).select("date").first()[0]
print(f"Date maximale: {max_date}")

In [None]:
# Read the partition of a single VIN from the results dataset.
# NOTE(review): the original line was an unterminated string ending in "ts" after
# the 17-character VIN; the trailing "ts" was dropped — confirm the intended path.
ptss = bucket.read_parquet_df_spark(spark, 'raw_results/spark_tesla-fleet-telemetry.parquet/vin=XP7YGCFS4RB395709')









In [None]:
# Convert the Spark DataFrame to pandas with toPandas().
df_pd = ptss.toPandas()

In [None]:


# df_pd[['createdAt', 'readable_date']][df_pd.readable_date.isna()].sort_values('createdAt', ascending=False)

# Display the rows ordered by ascending 'charging_status_idx'.
df_pd.sort_values('charging_status_idx')

In [None]:
from core.s3.s3_utils import S3Service

bucket = S3Service()

# get_object_size returns a pair, unpacked here as (size, length) for the prefix;
# `size` is later passed as file_size_bytes to get_ideal_nb_partitions.
size, length = bucket.get_object_size("response/tesla-fleet-telemetry/")



def get_ideal_nb_partitions(file_size_bytes: float, nb_vin: int) -> int:
    """Compute a Spark partition count from the data size and the number of VINs.

    The average size per VIN is compared with a target of 128 MB per
    partition. When one VIN already weighs more than 128 MB, one partition per
    VIN is returned; otherwise several VINs share a partition and the count is
    rounded up to an even number.

    Args:
        file_size_bytes (float): Total size of the data in bytes.
        nb_vin (int): Number of VINs (vehicles) in the data.

    Returns:
        int: Number of partitions, always at least 1.

    Raises:
        ValueError: If file_size_bytes or nb_vin is negative or zero.
    """
    # Parameter validation
    if file_size_bytes <= 0 or nb_vin <= 0:
        raise ValueError("file_size_bytes et nb_vin doivent être positifs")

    # Average size per VIN
    size_file_mb = file_size_bytes / (1024 * 1024)
    print(f"Taille du fichier: {size_file_mb:.2f} MB")

    avg_size_file_vin_mb = size_file_mb / nb_vin
    print(f"Taille moyenne par VIN: {avg_size_file_vin_mb:.2f} MB")

    # Number of VINs that fit in one 128 MB partition
    nb_vin_ideal_size = 128 / avg_size_file_vin_mb
    print(f"Nombre idéal de VINs par partition: {nb_vin_ideal_size:.2f}")

    if nb_vin_ideal_size < 0.5:
        print("⚠️  Taille moyenne par VIN > 256 MB, partitionnement par VIN non recommandé")
        return nb_vin
    if nb_vin_ideal_size < 1:
        print("ℹ️  Taille par VIN optimale, utilisation du nombre de VINs")
        return nb_vin

    # At least one partition: small inputs would otherwise truncate to 0.
    optimal_partitions = max(1, int(nb_vin / nb_vin_ideal_size))
    # Round odd counts up to the next even number.
    if optimal_partitions % 2:
        optimal_partitions += 1
    print(f"✅ Partitionnement optimisé: {optimal_partitions} partitions")
    # Plain int: a trailing comma here would return a 1-tuple.
    return optimal_partitions

# Partition count for the measured response size, with 550 VINs.
get_ideal_nb_partitions(size, 550)

In [None]:
# NOTE(review): the divisor is 104*1024 — possibly meant to be 1024*1024; confirm.
total_size / (104*1024)

In [None]:
# Rebinds `df` to its coalesce(32) result, so re-running this cell applies it again.
df = df.coalesce(32)

In [None]:
from pyspark.sql import functions as F
# Distinct VINs having at least one non-null `soh` value.
# (groupBy(...) returns a GroupedData object, which has no distinct() method.)
result = (
    df.filter(F.col("soh").isNotNull())
    .select("vin")
    .distinct()
)

In [None]:
# Tous les VINs
# Every VIN present in df, collected into a Python set.
all_vins = df.select("vin").distinct().collect()
all_vin_set = set(row["vin"] for row in all_vins)

# VINs that have at least one non-null SOH value.
vins_with_soh = (
    df.filter(F.col("soh").isNotNull())
    .select("vin")
    .distinct()
    .collect()
)
vins_with_soh_set = set(row["vin"] for row in vins_with_soh)

# VINs without any SOH value: the difference of the two sets.
vins_without_soh_set = all_vin_set.difference(vins_with_soh_set)

print(f"VINs sans SOH: {vins_without_soh_set}")

In [None]:
# Inspect the SOH-related columns of one VIN as a pandas frame.
df.select("vin", "model", "net_capacity", "energy_added", "soc_diff", "soh").filter(F.col("vin") == "5YJ3E7EB1KF334219").toPandas()

In [None]:
# Display the set of VINs without any SOH value.
vins_without_soh_set

In [None]:

# Number of VINs without any non-null SOH value.
# count() on a column counts its non-null values; counting the boolean cast of
# isNotNull() would count every row, so the `== 0` filter could never match.
result = (
    df.groupBy("vin")
    .agg(F.count(F.col("soh")).alias("soh_count"))
    .filter(F.col("soh_count") == 0)
    .count()
)

print(f"Nombre de VINs sans SOH: {result}")

In [None]:
# NOTE(review): `raw` is not assigned in any of the surrounding cells — confirm
# it is still defined, or drop this cell.
raw

In [None]:
from pyspark.sql.functions import *












In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from core.s3.settings import S3Settings
import os

def create_spark_session(access_key: str, secret_key: str) -> SparkSession:
    """Create (or reuse) a Spark session configured for the Scaleway S3 endpoint.

    Args:
        access_key: S3 access key, set as ``fs.s3a.access.key``.
        secret_key: S3 secret key, set as ``fs.s3a.secret.key``.

    Returns:
        The session returned by ``SparkSession.builder...getOrCreate()``.
    """
    packages = "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-hadoop-cloud_2.12:3.4.0"
    os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {packages} pyspark-shell"

    # G1 only: the previous option string also enabled ZGC (-XX:+UseZGC behind
    # -XX:+UnlockExperimentalVMOptions), and a JVM accepts a single collector.
    # NOTE(review): -XX:+PrintGCDetails, -XX:+PrintGCTimeStamps and -Xloggc are
    # legacy GC-logging flags — confirm the target JVM version accepts them.
    g1gc_options = (
        "-XX:+UseG1GC "
        "-XX:MaxGCPauseMillis=100 "
        "-XX:G1HeapRegionSize=32m "
        "-XX:+UseStringDeduplication "
        "-XX:+DisableExplicitGC "
        "-XX:+UseGCOverheadLimit "
        "-XX:GCTimeRatio=9 "
        "-XX:+PrintGCDetails "
        "-XX:+PrintGCTimeStamps "
        "-Xloggc:/tmp/spark-gc.log"
    )

    spark = (
        SparkSession.builder.appName("Scaleway S3 Read JSON")
        .config("spark.jars.packages", packages)
        .config("spark.hadoop.fs.s3a.endpoint", "https://s3.fr-par.scw.cloud")
        .config("spark.hadoop.fs.s3a.access.key", access_key)
        .config("spark.hadoop.fs.s3a.secret.key", secret_key)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.driver.host", "localhost")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        # Extra settings added to work around the ClassNotFoundException
        .config("spark.hadoop.fs.s3a.experimental.input.fadvise", "normal")
        .config("spark.hadoop.fs.s3a.connection.maximum", "1000")
        .config("spark.hadoop.fs.s3a.threads.max", "20")
        .config("spark.hadoop.fs.s3a.threads.core", "10")
        .config("spark.hadoop.fs.s3a.buffer.dir", "/tmp")
        .config("spark.hadoop.fs.s3a.block.size", "134217728")  # 128MB
        .config("spark.hadoop.fs.s3a.multipart.size", "134217728")  # 128MB
        .config("spark.hadoop.fs.s3a.multipart.threshold", "134217728")  # 128MB
        # Settings meant to avoid commit-protocol problems
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.adaptive.shuffle.targetPostShuffleInputSize", "64MB")
        .config("spark.sql.shuffle.partitions", "200")
        .config("spark.default.parallelism", "200")
        .config("spark.executor.memory", "10g")
        .config("spark.driver.memory", "10g")
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.executor.extraJavaOptions", g1gc_options)
        .config("spark.driver.extraJavaOptions", g1gc_options)
        .getOrCreate()
    )

    return spark

# Credentials come from the S3Settings object rather than being hard-coded.
settings = S3Settings()

spark = create_spark_session(settings.S3_KEY, settings.S3_SECRET)

In [None]:
# Load the v1 results dataset through the S3 helper.
df = bucket.read_parquet_df_spark(spark, "raw_results/spark_tesla-fleet-telemetry_v1.parquet")

In [None]:
# Display the column names of df.
df.columns

In [None]:
# Number of rows with a non-null consumption value.
# The filter previously used the name 'consu', which is not among the selected columns.
df[['consumption', 'estimated_cycles', 'odometer', 'range', 'soh', 'vin']].filter(col('consumption').isNotNull()).count()

In [None]:
# All rows of one VIN, as a pandas frame.
df.filter(F.col("vin") == "LRW3E7ET6RC169216").toPandas()

In [None]:
from pyspark.sql import functions as F

# Show the rows whose soh is not null (show() output).
df.filter(F.col('soh').isNotNull()).show()

In [None]:
from core.sql_utils import *

# One engine per environment.
prod_engine = get_sqlalchemy_engine(is_prod=True)
dev_engine = get_sqlalchemy_engine()
# Prod: Tesla vehicle data restricted to a single fleet_id (also selects `version`).
df_vehicle_prod = pd.read_sql("""
    select vin, soh, timestamp, version
    from vehicle_data vd
    left join vehicle vm 
    on vm.id = vd.vehicle_id
    left join vehicle_model vm2 
    on vm2.id = vm.vehicle_model_id
    left join oem o 
    on o.id= vm2.oem_id
    where 1=1
    and o.oem_name = 'tesla'
    and vm.fleet_id = '70260bd9-2449-4f5b-81f2-73d9cb6b4b93'
    """
    , prod_engine)
 
 
# Dev: same query without the fleet filter and without `version`.
df_vehicle_dev = pd.read_sql("""
    select vin, soh, timestamp
    from vehicle_data vd
    left join vehicle vm 
    on vm.id = vd.vehicle_id
    left join vehicle_model vm2 
    on vm2.id = vm.vehicle_model_id
    left join oem o 
    on o.id= vm2.oem_id
    where 1=1
    and o.oem_name = 'tesla'
    """
    , dev_engine)

# Left join dev -> prod on (vin, timestamp); soh becomes soh_dev / soh_prod.
merged_df = pd.merge(df_vehicle_dev, df_vehicle_prod, on=['vin', 'timestamp'], how='left', suffixes=('_dev', '_prod'))
# Display the rows where both environments have a SOH value.
merged_df[['vin', 'timestamp', 'soh_dev', 'soh_prod']][merged_df['soh_dev'].notna() & merged_df['soh_prod'].notna()]

In [None]:
# Same dev -> prod left join as in the previous cell, rebuilt from the source frames.
merged_df = pd.merge(df_vehicle_dev, df_vehicle_prod, on=['vin', 'timestamp'], how='left', suffixes=('_dev', '_prod'))

# SOH difference, dev minus prod.
merged_df['soh_diff'] = merged_df['soh_dev'] - merged_df['soh_prod']
# NOTE: not the last expression of the cell, so this frame is not displayed.
merged_df[['vin', 'timestamp', 'soh_dev', 'soh_prod', 'soh_diff']][merged_df['soh_dev'].notna() & merged_df['soh_prod'].notna()].sort_values(by='timestamp', ascending=False)

# Row(s) holding the smallest soh_diff.
merged_df[merged_df.soh_diff.min() == merged_df.soh_diff]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the rows where both soh_dev and soh_prod are present
filtered_df = merged_df[merged_df['soh_dev'].notna() & merged_df['soh_prod'].notna()]

# Create the figure with a 2x2 grid of subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Répartition des différences de SOH (dev - prod)', fontsize=16, fontweight='bold')

# 1. Histogram of the distribution, with a reference line at 0
axes[0, 0].hist(filtered_df['soh_diff'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Distribution des différences de SOH')
axes[0, 0].set_xlabel('Différence SOH (dev - prod)')
axes[0, 0].set_ylabel('Fréquence')
axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.7, label='Différence = 0')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Box plot
axes[0, 1].boxplot(filtered_df['soh_diff'], patch_artist=True, 
                   boxprops=dict(facecolor='lightgreen', alpha=0.7))
axes[0, 1].set_title('Box Plot des différences de SOH')
axes[0, 1].set_ylabel('Différence SOH (dev - prod)')
axes[0, 1].grid(True, alpha=0.3)

# 3. Scatter plot of soh_dev against soh_prod, with the parity line y = x
axes[1, 0].scatter(filtered_df['soh_prod'], filtered_df['soh_dev'], alpha=0.6, s=20)
axes[1, 0].plot([filtered_df['soh_prod'].min(), filtered_df['soh_prod'].max()], 
                [filtered_df['soh_prod'].min(), filtered_df['soh_prod'].max()], 
                'r--', alpha=0.7, label='Ligne de parité')
axes[1, 0].set_title('SOH Dev vs SOH Prod')
axes[1, 0].set_xlabel('SOH Prod')
axes[1, 0].set_ylabel('SOH Dev')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Violin plot
axes[1, 1].violinplot(filtered_df['soh_diff'], showmeans=True)
axes[1, 1].set_title('Violin Plot des différences de SOH')
axes[1, 1].set_ylabel('Différence SOH (dev - prod)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Descriptive statistics
print("Statistiques descriptives des différences de SOH:")
print(filtered_df['soh_diff'].describe())

# Share of positive / negative / zero differences
positive_diff = (filtered_df['soh_diff'] > 0).sum()
negative_diff = (filtered_df['soh_diff'] < 0).sum()
zero_diff = (filtered_df['soh_diff'] == 0).sum()
total = len(filtered_df)
# With no comparable row every count is 0; dividing by 1 then prints 0.0%
# instead of raising ZeroDivisionError.
pct_base = total or 1

print("\nRépartition des différences:")
print(f"Différences positives (dev > prod): {positive_diff} ({positive_diff/pct_base*100:.1f}%)")
print(f"Différences négatives (dev < prod): {negative_diff} ({negative_diff/pct_base*100:.1f}%)")
print(f"Différences nulles (dev = prod): {zero_diff} ({zero_diff/pct_base*100:.1f}%)")