In [None]:
import os

from core.s3_utils import S3_Bucket
from core.spark_utils import create_spark_session

# Project helper imported from core.s3_utils.
bucket = S3_Bucket()

# Create the Spark session with the credentials loaded by the bucket helper.
creds = bucket.get_creds_from_dot_env()
spark_session = create_spark_session(
    creds["aws_access_key_id"],
    creds["aws_secret_access_key"],
)

# Runtime settings applied to the session. Every key appears exactly once so
# there is a single source of truth for each value.
SPARK_SETTINGS = {
    # Adaptive query execution
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.adaptive.advisoryPartitionSizeInBytes": "128m",
    "spark.sql.debug.maxToStringFields": 1000,
    # Scaleway S3 specific configuration (explicit endpoint/region so the
    # AWS defaults are not used)
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    "spark.hadoop.fs.s3a.access.key": creds["aws_access_key_id"],
    "spark.hadoop.fs.s3a.secret.key": creds["aws_secret_access_key"],
    "spark.hadoop.fs.s3a.endpoint": "s3.fr-par.scw.cloud",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "true",
    "spark.hadoop.fs.s3a.connection.maximum": "100",
    "spark.hadoop.fs.s3a.endpoint.region": "fr-par",
}
for conf_key, conf_value in SPARK_SETTINGS.items():
    spark_session.conf.set(conf_key, conf_value)

# Read one day of raw telemetry for a single VIN through the s3a:// URL.
# NOTE(review): df_raw is not referenced by any later visible cell.
df_raw = (
    spark_session.read
    .option("multiline", "true")
    .json("s3a://bib-platform-prod-data/response/tesla-fleet-telemetry/5YJ3E7EB1KF334219/2025-03-24.json")
)

In [None]:
import random

# List the sub-folders under the telemetry prefix and draw exactly one at random.
# NOTE(review): presumably one sub-folder per VIN — confirm against list_subfolders.
vins = bucket.list_subfolders("response/tesla-fleet-telemetry/")
(vin_random,) = random.sample(vins, 1)
print(vin_random)

In [None]:
# Load every *.json file stored under the sampled VIN's prefix, with the
# "multiline" JSON reader option enabled.
sp_raw = (
    spark_session.read
    .option("multiline", "true")
    .json(f"s3a://bib-platform-prod-data/response/tesla-fleet-telemetry/{vin_random}/*.json")
)

In [None]:
# Number of partitions of the RDD backing sp_raw (shown as the cell output).
sp_raw.rdd.getNumPartitions()

In [None]:
# Print a preview of the raw telemetry rows (show() limits the output itself).
sp_raw.show()

In [None]:

# Load a single Parquet part file (the vin=XP7YGCES5RB479409 partition) through
# the bucket helper, then count its rows.
# NOTE(review): this VIN is hard-coded while sp_raw is read for a randomly
# sampled VIN, so the two DataFrames may describe different vehicles.
df_spark = bucket.read_parquet_df_spark(
    spark_session,
    "raw_ts/tesla-fleet-telemetry/time_series/spark_raw_tss.parquet/vin=XP7YGCES5RB479409/part-00000-d9d5b1f6-565e-453c-96f3-c650091ecb9a.c000.snappy.parquet",
)
df_spark.count()

In [None]:
# Column names intended to be dropped.
# NOTE(review): not referenced by any visible cell — confirm it is still needed.
LIST_COL_TO_DROP = ["model"]
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Fields of the telemetry ``value`` struct, in the order they are probed.
_VALUE_FIELD_PRIORITY = (
    "stringValue",
    "doubleValue",
    "intValue",
    "booleanValue",
    "detailedChargeStateValue",
    "cableTypeValue",
    "climateKeeperModeValue",
    "defrostModeValue",
    "fastChargerValue",
    "hvacAutoModeValue",
    "hvacPowerValue",
)


def _first_value_as_str(value_struct, field_names=_VALUE_FIELD_PRIORITY):
    """Return ``str()`` of the first non-null field of ``value_struct``.

    Args:
        value_struct: Object exposing the typed values as attributes, or ``None``.
        field_names: Attribute names to probe, in priority order.

    Returns:
        The string form of the first attribute that exists and is not ``None``;
        ``None`` when ``value_struct`` is ``None`` or no listed attribute is set.
    """
    if value_struct is None:
        return None
    for name in field_names:
        # getattr with a default treats a missing attribute like a null one.
        candidate = getattr(value_struct, name, None)
        if candidate is not None:
            return str(candidate)
    return None


def parse_tesla_data(df, value_fields=None):
    """Flatten the ``data`` array into one row per (record, key) pair.

    Args:
        df: DataFrame with the columns ``vin``, ``timestamp``,
            ``readable_date``, ``createdAt`` and an array column ``data`` whose
            items expose ``key`` and ``value``.
        value_fields: Optional iterable of ``value`` attribute names to probe,
            in priority order. Defaults to ``_VALUE_FIELD_PRIORITY``.

    Returns:
        DataFrame with the columns ``vin``, ``timestamp``, ``readable_date``,
        ``createdAt``, ``key`` and ``value``, where ``value`` is the string form
        of the first non-null probed field. It is null when none of the probed
        fields is set, so value types missing from the list are dropped silently.
    """
    fields = _VALUE_FIELD_PRIORITY if value_fields is None else tuple(value_fields)
    extract_value_udf = udf(
        lambda value_struct: _first_value_as_str(value_struct, fields),
        StringType(),
    )

    record_cols = ["vin", "timestamp", "readable_date", "createdAt"]

    # One output row per element of the ``data`` array.
    exploded_df = df.select(*record_cols, explode("data").alias("data_item"))

    # Keep the key as-is and collapse the typed value into a single string.
    return exploded_df.select(
        *record_cols,
        col("data_item.key").alias("key"),
        extract_value_udf(col("data_item.value")).alias("value"),
    )

In [None]:
# Flatten the raw telemetry read from JSON into rows of key/value pairs.
final_df = parse_tesla_data(sp_raw)

In [None]:
# Show the key/value rows recorded at one specific readable_date.
# NOTE(review): repartitioning by readable_date before filtering looks
# unnecessary for a simple display — confirm whether it can be dropped.
(
    final_df
    .repartition('readable_date')
    .filter(col('readable_date') == '2025-06-11 00:00:02')
    .show()
)

In [None]:
# Show the rows of the Parquet-backed DataFrame for the same readable_date,
# for comparison with the key/value DataFrame.
(
    df_spark
    .filter(col('readable_date') == '2025-06-11 00:00:02')
    .show()
)

In [None]:
def estimate_repr_size(df):
    """Return the summed length of ``str(row)`` over every row of ``df``.

    This counts characters of the Row representation, not bytes on disk.
    ``RDD.sum()`` is used instead of ``reduce`` so an empty DataFrame gives 0
    rather than raising.
    """
    return df.rdd.map(lambda row: len(str(row))).sum()


df1_size = estimate_repr_size(final_df)
df2_size = estimate_repr_size(df_spark)

print(f"Taille estimée de df1 : {df1_size / 1024:.2f} Ko")
print(f"Taille estimée de df2 : {df2_size / 1024:.2f} Ko")

In [None]:
def get_dir_size(path):
    """Return the total size in bytes of all files under ``path``, recursively.

    Args:
        path: Directory walked with ``os.walk``.

    Returns:
        int: Sum of ``os.path.getsize`` over every file found; ``0`` when the
        walk yields no files (empty directory, or a path ``os.walk`` cannot list).
    """
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(path)
        for filename in filenames
    )

# Measure two local Parquet directories and report their sizes.
# NOTE(review): both paths look like placeholders — replace with real directories.
size1, size2 = map(get_dir_size, ("/chemin/vers/parquet1", "/chemin/vers/parquet2"))

print(f"Parquet1 : {size1 / 1024 / 1024:.2f} Mo")
print(f"Parquet2 : {size2 / 1024 / 1024:.2f} Mo")

In [None]:
# Bring both DataFrames down to 4 partitions and mark them for caching.
# cache() has to be called (with parentheses) and its result kept: repartition()
# and coalesce() return new DataFrames, so without rebinding the names the
# cells below would keep using the original, uncached DataFrames.
df_spark = df_spark.repartition(4).cache()
final_df = final_df.coalesce(4).cache()

In [None]:
# Write df_spark as Parquet to the "many_cols" location, replacing existing output.
# NOTE(review): the target bucket name contains "prod" — confirm overwriting is intended.
(
    df_spark.write
    .mode("overwrite")
    .parquet("s3a://bib-platform-prod-data/raw_ts/tesla-fleet-telemetry/time_series/many_cols")
)

In [None]:
# Write the key/value DataFrame as Parquet, replacing any existing output.
# The path uses a single "/" before "tmp" so it contains no empty segment.
# NOTE(review): the target bucket name contains "prod" — confirm overwriting is intended.
(
    final_df.write
    .mode("overwrite")
    .parquet("s3a://bib-platform-prod-data/raw_ts/tesla-fleet-telemetry/time_series/tmp/many_rows")
)

In [None]:
# Bare expression: Jupyter renders the DataFrame's repr as the cell output.
df_spark