Importation des bibliothèques

In [18]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils.multiclass import type_of_target
from scipy.sparse import csr_matrix, issparse


On commence par le chargement de toutes les données

In [19]:
class Preprocess:
    """Load, inspect, split and build a preprocessing pipeline for one dataset folder.

    The folder must be named ``data_<X>`` and directly contain exactly one
    ``.data`` file (features, either dense whitespace-separated values or sparse
    ``index:value`` tokens), one ``.solution`` file (targets) and one ``.type``
    file (one feature type per line).

    Typical usage: ``p = Preprocess(path); p.load(); p.split(); p.build_preprocessor()``.
    """

    def __init__(self, path):
        """Remember the dataset folder; nothing is read until ``load()`` is called."""
        self.path = path
        # Attributes filled by load()
        self.data = None
        self.labels = None
        self.types = None
        self.name = None
        # Attributes filled by get_dataset_info() / build_preprocessor()
        self.numerical_index = None
        self.categorical_index = None
        self.binary_index = None
        # Attributes filled by split()
        self.train_data = None
        self.test_data = None
        self.train_labels = None
        self.test_labels = None
        self.validation_data = None
        self.validation_labels = None

    @staticmethod
    def _looks_sparse(data_path):
        """Return True when the first non-blank line of the file contains ``:``."""
        with open(data_path, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                if line.strip():
                    return ":" in line
        return False

    @staticmethod
    def load_dataset(path):
        """Read the .data / .solution / .type files found in ``path``.

        Returns:
            ``(data, labels, types, name)`` where ``data`` is a DataFrame (dense
            file) or a CSR matrix (sparse file), ``labels`` is a DataFrame,
            ``types`` is the list of values read from the .type file and ``name``
            is the .data file name without its extension.
            ``None`` (after printing a message) when the folder layout is invalid.

        Raises:
            ValueError: when the number of features or rows is inconsistent
                between the files.
        """
        if not os.path.isdir(path):
            print("Erreur: le chemin donné n'est pas un dossier ou c'est un chemin qui n'existe pas.")
            return None

        folder = os.path.basename(os.path.normpath(path))
        if not folder.startswith("data_"):
            print("Erreur: veuillez donner un dossier de cette forme (data_<Lettre>).")
            return None

        entries = os.listdir(path)
        if any(os.path.isdir(os.path.join(path, entry)) for entry in entries):
            print("Veuillez insérer le chemin d'un dossier qui contient directement les fichiers .data, .solution et .type")
            return None

        data_files = [f for f in entries if f.endswith(".data")]
        sol_files = [f for f in entries if f.endswith(".solution")]
        type_files = [f for f in entries if f.endswith(".type")]

        if len(data_files) != 1 or len(sol_files) != 1 or len(type_files) != 1:
            print("Erreur: le dossier doit contenir exactement 1 fichier .data, 1 fichier .solution et 1 fichier .type.")
            return None

        # Strip only the trailing extension: str.replace() would also remove a
        # ".data" occurring in the middle of the file name.
        name = data_files[0][: -len(".data")]
        # Use the files that were actually found instead of rebuilding their
        # names from `name`, so differing base names do not break loading.
        data_path = os.path.join(path, data_files[0])
        solution_path = os.path.join(path, sol_files[0])
        type_path = os.path.join(path, type_files[0])

        # Labels and types are read first: the sparse loader needs n_features
        # and the expected number of rows.
        labels = pd.read_csv(solution_path, sep=r"\s+", header=None)
        types = pd.read_csv(type_path, header=None)[0].tolist()
        n_features = len(types)

        if Preprocess._looks_sparse(data_path):
            print("dataset sparse détecté (format index:val).")
            X = Preprocess.load_sparse_data_file(data_path, n_features=n_features, n_rows_expected=len(labels))
            return X, labels, types, name

        data = pd.read_csv(data_path, sep=r"\s+", header=None)
        if data.shape[1] != n_features:
            raise ValueError(f"Mismatch features: X={data.shape[1]} mais .type={n_features}")
        # Same consistency check as the sparse path.
        if data.shape[0] != len(labels):
            raise ValueError(f"Mismatch lignes: X={data.shape[0]} vs y={len(labels)}")

        return data, labels, types, name

    @staticmethod
    def load_sparse_data_file(data_path, n_features, n_rows_expected=None):
        """Parse a sparse ``index:value`` text file into a CSR matrix.

        Every line of the file is one row (a blank line is an all-zero row);
        tokens without ``:`` are ignored.

        Column indices are treated as 1-based, and shifted to 0-based, when the
        smallest index is 1, or when no index 0 is present and the largest index
        equals ``n_features`` (which cannot be a valid 0-based index).

        Args:
            data_path: path of the sparse text file.
            n_features: number of columns of the resulting matrix.
            n_rows_expected: when given, the number of rows the file must have.

        Returns:
            A ``scipy.sparse.csr_matrix`` of shape ``(n_rows, n_features)``.

        Raises:
            ValueError: on a row-count mismatch or an out-of-range column index.
        """
        data_vals, row_ind, col_ind = [], [], []
        n_rows = 0

        with open(data_path, "r", encoding="utf-8", errors="ignore") as f:
            for r_idx, line in enumerate(f):
                n_rows = r_idx + 1
                for token in line.split():
                    if ":" not in token:
                        continue
                    c_str, v_str = token.split(":", 1)
                    row_ind.append(r_idx)
                    col_ind.append(int(c_str))
                    data_vals.append(float(v_str))

        # Row consistency is checked first so that an empty file is validated too.
        if n_rows_expected is not None and n_rows != n_rows_expected:
            raise ValueError(f"Mismatch lignes: X={n_rows} vs y={n_rows_expected}")

        # Empty file, or file without any index:value token -> all-zero matrix.
        if not col_ind:
            return csr_matrix((n_rows, n_features))

        lowest, highest = min(col_ind), max(col_ind)
        one_based = lowest == 1 or (lowest > 1 and highest == n_features)
        if one_based:
            col_ind = [c - 1 for c in col_ind]
            highest -= 1

        if highest >= n_features:
            raise ValueError(
                f"Index colonne hors limite: max_col={highest} mais n_features={n_features}. "
                "Vérifie la base (0/1) et le fichier .type."
            )

        return csr_matrix((data_vals, (row_ind, col_ind)), shape=(n_rows, n_features))

    def load(self):
        """Load the dataset at ``self.path`` into ``data``, ``labels``, ``types`` and ``name``.

        Returns the ``(data, labels, types, name)`` tuple, or ``None`` when
        ``load_dataset`` rejected the folder (an error message was printed).
        """
        result = Preprocess.load_dataset(self.path)
        if result is None:
            return None

        self.data, self.labels, self.types, self.name = result
        return result

    def detect_task_type(self):
        """Infer the kind of learning problem from the loaded labels.

        Returns one of ``"binary"``, ``"multiclass"``, ``"regression"``,
        ``"multiclass_onehot"``, ``"multilabel"``, ``"regression_multioutput"``
        or ``"inconnu_<type>"``; ``None`` when labels are not loaded.
        """
        if self.labels is None:
            print("Erreur: appelle load() avant detect_task_type().")
            return None

        y = self.labels.values
        n_cols = y.shape[1]

        # Single target column
        if n_cols == 1:
            y_1d = self.labels.iloc[:, 0].to_numpy()
            target_type = type_of_target(y_1d)

            if target_type == "binary":
                return "binary"

            if target_type == "continuous":
                return "regression"

            if target_type == "multiclass":
                # Integer-valued regression targets are reported as "multiclass":
                # treat the target as regression when it has too many distinct values.
                n_unique = len(np.unique(y_1d))
                if n_unique > max(20, int(0.05 * len(y_1d))):
                    return "regression"
                return "multiclass"

            return f"inconnu_{target_type}"

        # Several target columns
        target_type = type_of_target(y)

        if target_type == "multilabel-indicator":
            # Exactly one active column per row means one-hot encoded classes.
            if np.all(y.sum(axis=1) == 1):
                return "multiclass_onehot"
            return "multilabel"

        if target_type == "continuous-multioutput":
            return "regression_multioutput"

        return f"inconnu_{target_type}"

    def get_dataset_info(self):
        """Return a summary dict (shapes, feature-type counts, missing values, task, distribution).

        Also fills ``numerical_index`` / ``categorical_index`` / ``binary_index``
        when they are not computed yet. Returns ``None`` when the dataset is not loaded.
        """
        if self.data is None or self.labels is None or self.types is None:
            print("Erreur: appelle load() avant get_dataset_info().")
            return None

        # Compute the feature indices if not done yet
        if self.numerical_index is None or self.categorical_index is None or self.binary_index is None:
            self.numerical_index, self.categorical_index, self.binary_index = self.get_feature_type_indices()

        task = self.detect_task_type()
        if issparse(self.data):
            # Only explicitly stored values can be NaN in a sparse matrix.
            missing_in_data = int(np.isnan(self.data.data).sum())
        else:
            missing_in_data = int(self.data.isna().sum().sum())

        info = {
            "nom_du_dataset": self.name,

            ".data_shape": self.data.shape,          # shape of the .data content
            ".solution_shape": self.labels.shape,    # shape of the .solution content
            ".type_shape": len(self.types),          # number of features described in .type

            # Feature-type breakdown
            "nombre_numerical_feature": len(self.numerical_index),
            "nombre_categorical_feature": len(self.categorical_index),
            "nombre_binary_feature": len(self.binary_index),

            # Missing values (NaN)
            "missing_in_data": missing_in_data,
            "missing_in_solution": int(self.labels.isna().sum().sum()),

            # Task type
            "task_type": task,
        }

        # Class / label distribution
        if task in ("binary", "multiclass"):
            info["class_distribution"] = self.labels.iloc[:, 0].value_counts().to_dict()
        elif task == "multiclass_onehot":
            y_class = self.labels.values.argmax(axis=1)
            unique, counts = np.unique(y_class, return_counts=True)
            info["class_distribution"] = {int(k): int(v) for k, v in zip(unique, counts)}
        elif task == "multilabel":
            info["label_distribution"] = {
                f"label_{j}": int(self.labels.iloc[:, j].sum())
                for j in range(self.labels.shape[1])
            }
            info["class_distribution"] = None
        else:
            # Regression and unknown tasks have no class distribution.
            info["class_distribution"] = None

        return info

    def get_stratify_vector(self, y_df):
        """Return a 1-D class vector usable as ``stratify=`` for ``y_df``, or ``None``.

        The task type is the one detected on the full ``self.labels``; only
        classification tasks with a single class per row are stratified.
        """
        if y_df is None:
            return None

        task = self.detect_task_type()

        if task in ("binary", "multiclass"):
            return y_df.iloc[:, 0].to_numpy()

        if task == "multiclass_onehot":
            return y_df.values.argmax(axis=1)

        # No stratification for the other problem types
        return None

    def check_imbalance(self, rare_threshold=0.10):
        """Report classes whose share of the samples is below ``rare_threshold``.

        Returns:
            ``(is_imbalanced, distribution_percent, rare_classes)``; for
            non-stratifiable tasks the result is ``(False, None, [])``.
        """
        y_strat = self.get_stratify_vector(self.labels)
        if y_strat is None:
            return False, None, []

        unique, counts = np.unique(y_strat, return_counts=True)
        total = counts.sum()

        distribution_percent = {
            k: round((v / int(total)) * 100, 2)
            for k, v in zip(unique, counts)
        }

        rare_classes = [cls for cls, pct in distribution_percent.items() if pct < rare_threshold * 100]
        is_imbalanced = (len(rare_classes) > 0)
        return is_imbalanced, distribution_percent, rare_classes

    def get_feature_type_indices(self):
        """Return ``(numerical, categorical, binary)`` index lists read from ``self.types``.

        Type names are compared case-insensitively after stripping whitespace;
        any other type name is not listed in any of the three lists.
        """
        buckets = {"numerical": [], "categorical": [], "binary": []}
        for i, t in enumerate(self.types):
            key = str(t).strip().lower()
            if key in buckets:
                buckets[key].append(i)
        return buckets["numerical"], buckets["categorical"], buckets["binary"]

    def build_preprocessor(self, scale_numeric=True):
        """Build the (unfitted) cleaning / encoding / scaling transformer.

        Sparse data: a ``MaxAbsScaler`` pipeline when ``scale_numeric`` is True,
        otherwise the string ``"passthrough"``.
        Dense data: a ``ColumnTransformer`` with median imputation (plus optional
        standard scaling) for numerical columns, most-frequent imputation plus
        one-hot encoding for categorical columns and most-frequent imputation for
        binary columns. Columns of any other type are dropped.

        Returns ``None`` (after printing a message) when feature types are not loaded.
        """
        # Sparse datasets get a simple preprocessing
        if self.data is not None and issparse(self.data):
            if scale_numeric:
                return Pipeline(steps=[("scaler", MaxAbsScaler())])
            return "passthrough"

        if self.types is None:
            print("Erreur: appelle load() avant build_preprocessor().")
            return None

        self.numerical_index, self.categorical_index, self.binary_index = self.get_feature_type_indices()

        # Numerical features: NaN replaced by the median. Scaling is optional
        # because some models need it and others (e.g. tree ensembles) do not.
        numerical_steps = [("imputer", SimpleImputer(strategy="median"))]
        if scale_numeric:
            numerical_steps.append(("scaler", StandardScaler()))
        numerical_pipeline = Pipeline(steps=numerical_steps)

        # Categorical features: NaN replaced by the most frequent value, then one-hot encoded
        categorical_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ])

        # Binary features only need imputation: no encoding is applied to them
        binary_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
        ])

        transformers = []
        if len(self.numerical_index) > 0:
            transformers.append(("num", numerical_pipeline, self.numerical_index))
        if len(self.categorical_index) > 0:
            transformers.append(("cat", categorical_pipeline, self.categorical_index))
        if len(self.binary_index) > 0:
            transformers.append(("bin", binary_pipeline, self.binary_index))

        return ColumnTransformer(transformers=transformers, remainder="drop")

    @staticmethod
    def _split_with_fallback(X, y, test_size, random_state, stratify):
        """Run ``train_test_split``, retrying without stratification if it is rejected."""
        if stratify is not None:
            try:
                return train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=stratify)
            except ValueError:
                # Typically a class with too few members to be stratified.
                print("Avertissement: stratification impossible (classe trop rare ?), split réalisé sans stratify.")
        return train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=None)

    def split(self, test_size=0.2, validation_size=0.2, random_state=42):
        """Split the loaded data into train / validation / test sets.

        Must be called after ``load()`` and before fitting any preprocessing, to
        avoid data leakage. Splits are stratified when the task allows it.

        Returns:
            ``(train_data, validation_data, test_data, train_labels,
            validation_labels, test_labels)``, also stored on the instance;
            ``None`` when the dataset is not loaded.

        Raises:
            ValueError: when ``test_size`` or ``validation_size`` is not
                strictly positive, or when scikit-learn rejects the sizes.
        """
        if self.data is None or self.labels is None:
            print("Erreur: appelle load() avant split().")
            return None

        if test_size <= 0 or validation_size <= 0:
            raise ValueError(
                "test_size et validation_size doivent être strictement positifs "
                f"(reçu test_size={test_size}, validation_size={validation_size})."
            )

        temp_size = test_size + validation_size

        # First split: train vs (validation + test)
        self.train_data, X_temp, self.train_labels, y_temp = Preprocess._split_with_fallback(
            self.data, self.labels, temp_size, random_state, self.get_stratify_vector(self.labels)
        )

        validation_ratio = validation_size / temp_size

        # Second split: validation vs test
        self.validation_data, self.test_data, self.validation_labels, self.test_labels = Preprocess._split_with_fallback(
            X_temp, y_temp, (1 - validation_ratio), random_state, self.get_stratify_vector(y_temp)
        )

        return self.train_data, self.validation_data, self.test_data, self.train_labels, self.validation_labels, self.test_labels


Ici, on déclare une classe qui permet de charger le jeu de données et de retourner le contenu de chaque fichier (.data, .solution et .type).

Ici, on teste la méthode load pour vérifier que tout fonctionne correctement.


In [32]:
# Smoke test: load one dataset folder and display its summary.
p = Preprocess("/info/corpus/ChallengeMachineLearning/data_B")
# load() returns None on failure and a non-empty tuple on success.
if p.load() is not None:
    print("Data loaded succefully")
p.get_dataset_info()


Data loaded succefully


{'nom_du_dataset': 'data_B',
 '.data_shape': (5000, 16),
 '.solution_shape': (5000, 1),
 '.type_shape': 16,
 'nombre_numerical_feature': 16,
 'nombre_categorical_feature': 0,
 'nombre_binary_feature': 0,
 'missing_in_data': 0,
 'missing_in_solution': 0,
 'task_type': 'regression',
 'class_distribution': None}

In [33]:
# Display the (numerical, categorical, binary) feature index lists derived from the .type file.
p.get_feature_type_indices()

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [], [])

In [34]:
# Build the preprocessing object with default options (scale_numeric=True); nothing is fitted here.
p.build_preprocessor()

In [35]:
# Split with the defaults (test_size=0.2, validation_size=0.2, random_state=42).
# The returned 6-tuple (train/validation/test data, then labels) is echoed as the cell output.
p.split()

(          0   1     2       3     4     5     6    7   8       9       10  \
 4576  4.0313  32   131 -118.28   580   690  1464  105  36 -121.70 -117.60   
 1088  2.3314  30   439 -118.46   410  2259  2397  482  29 -120.63 -117.65   
 479   4.3667  32   323 -116.87  1352  3505   539  745  52 -121.44 -122.38   
 727   3.0352  36   405 -118.53   307  2696  4109  622  27 -120.47 -117.75   
 3783  5.2852  29   841 -121.67   348  1348  1643  210  25 -118.35 -118.54   
 ...      ...  ..   ...     ...   ...   ...   ...  ...  ..     ...     ...   
 4426  4.4000  30   422 -121.28   403  2095  2569  410  51 -118.19 -117.36   
 466   1.7500  23  1018 -122.04   510   908  1610  232  43 -122.41 -118.20   
 3092  3.6563  27   539 -118.52   349   838  1229  161  17 -117.79 -117.76   
 3772  2.1005   2   652 -118.03   255  1863  2600  537  35 -118.58 -117.88   
 860   2.6631  38   211 -118.17   529  3253  3558  932  26 -122.31 -121.91   
 
         11     12   13     14      15  
 4576   299  33.93  1