# 0) Imports & config de base

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
# Widen pandas' console rendering so wide frames are shown without truncation:
# up to 120 columns, on lines of up to 160 characters.
pd.options.display.max_columns = 120
pd.options.display.width = 160

# 1) Chemins projet

In [4]:
# Project root: the directory one level above this file's folder when
# `__file__` is defined (script run), otherwise one level above the current
# working directory (interactive / notebook run).
PROJ = (
    Path(__file__).resolve().parents[1]
    if "__file__" in globals()
    else Path.cwd().resolve().parents[0]
)

# Data layout underneath the project root.
DATA = PROJ / "data"
RAW = DATA / "raw"
OUT_INTERIM = DATA / "interim"
OUT_PROCESSED = DATA / "processed"

# Only the two output folders are created here; an already existing folder
# is left as is.
for out_dir in (OUT_INTERIM, OUT_PROCESSED):
    out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations for a quick visual check.
for label, location in (
    ("PROJ   =", PROJ),
    ("DATA   =", DATA),
    ("INTERIM=", OUT_INTERIM),
    ("PROC   =", OUT_PROCESSED),
):
    print(label, location)

PROJ   = C:\Users\chris\Desktop\Portfolio\my-portfolio\projects\scania-ml
DATA   = C:\Users\chris\Desktop\Portfolio\my-portfolio\projects\scania-ml\data
INTERIM= C:\Users\chris\Desktop\Portfolio\my-portfolio\projects\scania-ml\data\interim
PROC   = C:\Users\chris\Desktop\Portfolio\my-portfolio\projects\scania-ml\data\processed


# 2) Déclaration des fichiers attendus (split = train/validation/test)

In [5]:
# Expected input files for each split, keyed by split name and then by role.
# A value of None marks a file that does not exist for that split.
FILES = {}

# Train: no direct label file (labels are derived from the time-to-event table).
FILES["train"] = {
    "readouts": DATA / "train_operational_readouts.csv",
    "specs": DATA / "train_specifications.csv",
    "labels": None,
    "tte": DATA / "train_tte.csv",  # time-to-event
}

# Validation: the label file contains class_label; no time-to-event file is provided.
FILES["validation"] = {
    "readouts": DATA / "validation_operational_readouts.csv",
    "specs": DATA / "validation_specifications.csv",
    "labels": DATA / "validation_labels.csv",
    "tte": None,
}

# Test: label file available, no time-to-event file.
FILES["test"] = {
    "readouts": DATA / "test_operational_readouts.csv",
    "specs": DATA / "test_specifications.csv",
    "labels": DATA / "test_labels.csv",
    "tte": None,
}

# 3) Aide : lecture mémoire-friendly (infère un dtypes map sur un échantillon)

In [6]:
def infer_dtypes(csv_path, nrows=5000):
    """Infer a compact dtype map for a CSV from a sample of its first rows.

    Parameters
    ----------
    csv_path : path or file-like
        Forwarded as-is to ``pd.read_csv``.
    nrows : int, default 5000
        Number of leading rows read to build the sample.

    Returns
    -------
    dict
        Maps every sampled column name to a dtype string:

        * float columns   -> ``"float32"``
        * integer columns -> ``"int32"`` when every sampled value fits in a
          signed 32-bit integer, otherwise ``"int64"``
        * anything else   -> ``"category"`` when fewer than half of the
          sampled rows hold distinct values, otherwise ``"string"``

    Notes
    -----
    The map only reflects the sampled rows.
    NOTE(review): rows beyond the sample that hold missing or out-of-range
    values in a column inferred as integer will presumably make a later
    ``pd.read_csv(..., dtype=...)`` fail - confirm on the full files.
    """
    sample = pd.read_csv(csv_path, nrows=nrows)
    int32_limits = np.iinfo(np.int32)
    # Guard against division by zero when the sample has no rows.
    n_sampled = max(len(sample), 1)

    dtypes = {}
    for col in sample.columns:
        values = sample[col]
        if pd.api.types.is_float_dtype(values):
            dtypes[col] = "float32"
        elif pd.api.types.is_integer_dtype(values):
            # Check BOTH ends of the range: a large negative value overflows
            # int32 just as a large positive one does.
            fits_int32 = (
                int32_limits.min <= values.min()
                and values.max() <= int32_limits.max
            )
            dtypes[col] = "int32" if fits_int32 else "int64"
        else:
            # Low-cardinality columns are cheaper as categoricals.
            is_low_cardinality = values.nunique() / n_sampled < 0.5
            dtypes[col] = "category" if is_low_cardinality else "string"
    return dtypes

def read_csv_optimized(csv_path, dtype_map=None):
    """Read a CSV into a DataFrame using an explicit per-column dtype map.

    Parameters
    ----------
    csv_path : path or file-like
        Forwarded as-is to ``pd.read_csv``.
    dtype_map : dict, optional
        Column name -> dtype mapping handed to ``pd.read_csv``. When None,
        the mapping is obtained from ``infer_dtypes(csv_path)``.

    Returns
    -------
    pandas.DataFrame
        The file contents, read with the resolved dtype mapping.
    """
    resolved_dtypes = infer_dtypes(csv_path) if dtype_map is None else dtype_map
    return pd.read_csv(csv_path, dtype=resolved_dtypes)

# 4) Fonctions de chargement/merging par split