# Implémentation des bibliothèques

In [22]:
#!pip install import-ipynb


In [23]:
import pandas as pd  # Manipulation de données
import numpy as np  # Calculs numériques
import matplotlib.pyplot as plt  # Visualisation de données
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import seaborn as sns  # Visualisation avancée
import sklearn
import random
from datetime import datetime
import seaborn as sns
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import RandomOverSampler
# Importation des bibliothèques de scikit-learn pour la modélisation
from sklearn.model_selection import train_test_split  # Division des données en ensembles d'entraînement et de test
from sklearn.ensemble import RandomForestClassifier  # Modèle de classification Random Forest
from sklearn.linear_model import LinearRegression  # Modèle de régression linéaire
from sklearn.pipeline import make_pipeline  # Construction de pipelines pour le prétraitement et les modèles
from sklearn.preprocessing import StandardScaler, PolynomialFeatures  # Normalisation et transformation polynomiale
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer  # Imputation des valeurs manquantes
from sklearn.decomposition import PCA  # Réduction de dimension avec PCA
from sklearn.metrics import accuracy_score, classification_report   # Évaluation des modèles
from sklearn.metrics import confusion_matrix

# Import des outils de scikit-learn nécessaires
from sklearn.model_selection import GridSearchCV  # Optimisation des hyperparamètres
from sklearn.metrics import roc_auc_score, roc_curve  # Évaluation avec AUC-ROC
from sklearn.model_selection import cross_val_score  # Validation croisée
from sklearn.metrics import mean_squared_error

Un **Random Forest Classifier** repose su les principes suivants en partant de ** notre base de données** :
- **Bagging (Bootstrap Aggregation)** : Échantillonnage aléatoire avec remplacement.  
- **Arbres de décision (Decision Trees)** : Classificateurs individuels. (Random Subspace Method)
- **Sélection aléatoire des features** : Chaque arbre utilise un sous-ensemble des caractéristiques.  

# Importation des données

In [24]:
# Charger le fichier Excel
# df = pd.read_excel("C:\\Users\\anton\\Downloads\\sncf_data.xlsx")  
df = pd.read_excel("C:\\Users\\\scoup\\Downloads\\sncf_data.xlsx")
# Convertir en CSV
df.to_csv("data_csv", index=False, encoding="utf-8")
# On ne prend pas en compte les colonnes "qté cible" et "product policy" (trop de valeurs manquantes)*
df = df.drop(columns=['qté cible','product policy (short-term vision year - 2023)'])
df.fillna(0, inplace=True)  # Remplace les NaN par 0

# Convertir les colonnes de dates en nombre de jours depuis aujourd'hui
for col in df.select_dtypes(include=['datetime64']):
    df[col] = (datetime.today() - df[col]).dt.days 
    
# Séparer les features (X) et la cible (y)
target_column = "label"  # Remplace par le nom de ta colonne cible
X = df.drop(columns=[target_column])
y = df[target_column]

X

  df = pd.read_excel("C:\\Users\\\scoup\\Downloads\\sncf_data.xlsx")


Unnamed: 0,symbol,supplier,serial letter,repair complexity (0NA),evaluation ratio between repair cost and cost of new (0NA),\nexisting substitute product (0NA),component having a role in security (0NA),does the product/block if the component have a role in security? (0NA),several components involved? (0NA),is the product recent? (<10 years) (0NA),\nis the cost of inventory expensive (0NA),substitution component not available (0NA),do we have equipment to carry out approval tests? (0NA),have we done a technical validation? (0NA),generally at least equivalent criterion,active quantity,quantity of existing stock at SNCF?,"no longer supply (new or used), no longer supply new, reparable, supply new",processing date
0,79544181,SCLE,AB,1,1,1,1,1,1,0,0.0,0,0,0,1,1050,837,supply new,602
1,79402561,SCLE,BA,0,0,0,0,0,1,0,0.0,1,1,1,1,436,999,supply new,596
2,79540265,SCLE,BA,0,0,1,0,1,0,0,0.0,0,1,1,1,70,1655,supply new,940
3,79540265,SCLE,BB,0,0,1,0,1,0,0,0.0,0,1,1,1,149,1655,supply new,940
4,79540265,SCLE,BBM,0,0,1,0,1,0,0,0.0,0,1,1,1,24,1655,supply new,940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,79545633,HITACHI,B__,1,0,0,1,1,1,0,0.0,1,1,1,0,0,876,supply new,2189
313,79545464,Alstom,AAA,0,0,0,1,0,0,0,0.0,0,1,1,1,6844,2305,supply new,5535
314,79545106,Alstom,AE,0,0,0,1,1,1,0,0.0,0,1,1,1,30,3583,supply new,891
315,79545106,Alstom,AE,0,0,0,1,1,1,0,0.0,0,1,1,1,30,3583,supply new,891


In [25]:
# Stocker la colonne "symbol" pour l'afficher plus tard
X["symbol"] = df["symbol"]
categorical_features = X.nunique()[X.nunique() == 2].index.tolist()
# Encodage des variables catégorielles si nécessaire
X = pd.get_dummies(X)
# je crois il faut pas supprimer la première colonne
# X = pd.get_dummies(X)

X["\nis the cost of inventory expensive (0NA)"]=X["\nis the cost of inventory expensive (0NA)"].astype(int)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Columns: 114 entries, symbol to no longer supply (new or used), no longer supply new, reparable, supply new_supply new
dtypes: bool(98), int64(16)
memory usage: 70.1 KB


# Fonction test_train_split

In [26]:
def custom_train_test_split_df(X, y, test_size, random_state, stratify=None):
    np.random.seed(random_state)
    # Vérifie si `X` est un DataFrame pour conserver les noms des colonnes.
    if isinstance(X, pd.DataFrame):
        columns = X.columns  # Sauvegarder les noms des colonnes
    else:
        columns = None  # Gérer les cas où X n'est pas un DataFrame
    
    X = np.array(X)
    y = np.array(y)
    
    if stratify is not None:
        # Séparation stratifiée
        unique_classes, class_counts = np.unique(y, return_counts=True)
        X_train, X_test, y_train, y_test = [], [], [], []
        
        for class_label, count in zip(unique_classes, class_counts):
            # Sélectionner les indices des échantillons de la classe actuelle
            indices = np.where(y == class_label)[0]
            np.random.shuffle(indices)
            test_count = int(count * test_size)
            test_indices = indices[:test_count]
            train_indices = indices[test_count:]
            # Ajouter les échantillons aux ensembles d'entraînement et de test
            X_train.extend(X[train_indices])
            X_test.extend(X[test_indices])
            y_train.extend(y[train_indices])
            y_test.extend(y[test_indices])
        
        X_train, X_test = np.array(X_train), np.array(X_test)
        y_train, y_test = np.array(y_train), np.array(y_test)
    
    else:
        indices = np.arange(len(X))
        np.random.shuffle(indices)
        test_count = int(len(X) * test_size)
        test_indices = indices[:test_count]
        train_indices = indices[test_count:]
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
    
    if columns is not None:
        df_train = pd.DataFrame(X_train, columns=columns)
        df_test = pd.DataFrame(X_test, columns=columns)
    else:
        df_train = pd.DataFrame(X_train)
        df_test = pd.DataFrame(X_test)
    # Ajouter la colonne cible au train et test
    df_train['label'] = y_train
    df_test['label'] = y_test
    
    return df_train, df_test

In [27]:
df_train,df_test = custom_train_test_split_df(X, y, test_size=0.2,random_state=42,stratify=y)
len(df_train)
df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Columns: 115 entries, symbol to label
dtypes: object(115)
memory usage: 230.1+ KB


Unnamed: 0,symbol,repair complexity (0NA),evaluation ratio between repair cost and cost of new (0NA),\nexisting substitute product (0NA),component having a role in security (0NA),does the product/block if the component have a role in security? (0NA),several components involved? (0NA),is the product recent? (<10 years) (0NA),\nis the cost of inventory expensive (0NA),substitution component not available (0NA),...,serial letter_EBH,serial letter_EC,serial letter_ECF,serial letter_FA,serial letter_FB,serial letter____,"no longer supply (new or used), no longer supply new, reparable, supply new_no longer supply new","no longer supply (new or used), no longer supply new, reparable, supply new_reparable","no longer supply (new or used), no longer supply new, reparable, supply new_supply new",label
0,79545473,0,0,1,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,LBO
1,79545019,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,LBO
2,79545017,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,LBO
3,79520542,0,0,1,1,1,0,0,1,1,...,False,False,False,False,False,False,False,True,False,LBO
4,79545016,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,LBO


# Boostrapping

In [28]:
def bootstrapping(train_df, n_bootstrap):
    bootstrap_indices = np.random.randint(low=0, high=len(train_df), size=n_bootstrap)
    df_bootstrapped = train_df.iloc[bootstrap_indices]
    return df_bootstrapped

In [46]:
bootstrapping(df_train, 10)

Unnamed: 0,symbol,repair complexity (0NA),evaluation ratio between repair cost and cost of new (0NA),\nexisting substitute product (0NA),component having a role in security (0NA),does the product/block if the component have a role in security? (0NA),several components involved? (0NA),is the product recent? (<10 years) (0NA),\nis the cost of inventory expensive (0NA),substitution component not available (0NA),...,serial letter_EBH,serial letter_EC,serial letter_ECF,serial letter_FA,serial letter_FB,serial letter____,"no longer supply (new or used), no longer supply new, reparable, supply new_no longer supply new","no longer supply (new or used), no longer supply new, reparable, supply new_reparable","no longer supply (new or used), no longer supply new, reparable, supply new_supply new",label
49,79545625,1,0,0,1,1,1,0,0,1,...,False,False,False,False,False,False,True,False,False,redesign majeur
81,79544362,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,redesign majeur
4,79545016,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,LBO
147,79520902,0,1,1,1,1,0,1,0,0,...,False,False,False,False,False,True,False,False,True,stock
86,79544359,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,redesign majeur
125,79402561,0,0,0,0,0,1,0,0,1,...,False,False,False,False,False,False,False,False,True,redesign mineur
74,79544361,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,redesign majeur
164,79543832,0,0,0,0,1,1,0,0,0,...,False,False,False,False,False,False,True,False,False,substitution
21,79545017,0,0,0,1,1,0,1,0,1,...,False,False,False,False,False,False,False,False,True,LBO
185,79565031,0,0,0,0,1,0,1,0,0,...,False,False,False,False,False,False,False,True,False,substitution


# Fonction permettant de distinguer les variables binaires(1/0 ou Vrai/Faux) et les variables continue (active quantity,quantity existing,processing date)

In [30]:
def determine_type_of_feature(df):
    
    feature_types = []
    # On considère qu'une variable catégorielle a moins de 2 valeurs uniques vu qu'on a fait un encodage one-hot
    n_unique_values_treshold = 2
    for feature in df.columns:
        if feature != "label":
            unique_values = df[feature].unique()
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")
    
    return feature_types

In [31]:
determine_type_of_feature(df_train)

['continuous',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'continuous',
 'continuous',
 'continuous',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorica

# Fonction d'arbre de decision 

## Purete d'une feuille

In [32]:
# Une feuille est-elle pure 
def check_purity(data):
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)
    # il y a qu'une seule classe
    if len(unique_classes) == 1:
        return True
    else:
        return False

## Determine la classe majoritaire

In [33]:
def class_majoritaire(data):
    label_column = data[:, -1]
    unique_classes, count_unique_classes = np.unique(label_column, return_counts=True)
    # on prend la classe majoritaire
    index = count_unique_classes.argmax()
    class_majoritaire = unique_classes[index]
    return class_majoritaire

In [34]:
# Quelles sont les séparartions possible dans chaque colonne? 
def get_potential_splits(data):
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):  # excluding the last column which is the label
        values = data[:, column_index]
        unique_values = np.unique(values)
        
        potential_splits[column_index] = unique_values
    
    return potential_splits

In [35]:
# 1.4 Calcul de l'impurete de gini
def calculate_gini(data):
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)
    # probabilité de chaque classe
    probabilities = counts / counts.sum()
    # gini
    gini = 1 - sum(probabilities ** 2)
    
    return gini

In [36]:
# calcul de l'impurete gini pour un split
def calculate_overall_gini(data_below, data_above):
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n
    
    overall_gini = (p_data_below * calculate_gini(data_below) 
                    + p_data_above * calculate_gini(data_above))
    
    return overall_gini

### Faire les separations pour diminuer l'impurete de gini

In [37]:
# Split data
def split_data(data, split_column, split_value):
    split_column_values = data[:, split_column]
    
    type_of_feature = FEATURE_TYPES[split_column]
    if type_of_feature == "continuous":
        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values >  split_value]
    
    # Feature is categorical   
    else:
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]
    
    return data_below, data_above

In [38]:
# Détermination du meilleur split en utilisant l'impureté de Gini
def determine_best_split(data, potential_splits):
    overall_gini = 9999  # Initialiser avec une valeur élevée
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_gini = calculate_overall_gini(data_below, data_above)
            
            if current_overall_gini <= overall_gini:
                overall_gini = current_overall_gini
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value


# Algorithme de construction de l'arbre de décision

In [39]:
def decision_tree_algorithm(df, counter=0, min_samples=1, max_depth=5, random_subspace=None):
    
    # Préparation des données
    if counter == 0:
        global COLUMN_HEADERS, FEATURE_TYPES
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df           
    
    # Cas d'arrêt
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = class_majoritaire(data)
        return classification

    # Sélection des features aléatoires pour Random Forest
    if random_subspace:
        feature_indices = np.random.choice(len(COLUMN_HEADERS) - 1, size=random_subspace, replace=False)
    else:
        feature_indices = range(len(COLUMN_HEADERS) - 1)  # Utiliser toutes les features sauf la cible

    # Incrémentation de la profondeur
    counter += 1

    # Sélection des meilleurs splits en fonction des features sélectionnées
    potential_splits = get_potential_splits(data)
    potential_splits = {k: v for k, v in potential_splits.items() if k in feature_indices}
    
    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)
    
    # Vérifier si l'un des groupes est vide
    if len(data_below) == 0 or len(data_above) == 0:
        classification = class_majoritaire(data)
        return classification
    
    # Construire la question
    feature_name = COLUMN_HEADERS[split_column]
    type_of_feature = FEATURE_TYPES[split_column]
    if type_of_feature == "continuous":
        question = "{} <= {}".format(feature_name, split_value)
    else:
        question = "{} = {}".format(feature_name, split_value)
    
    # Initialiser le sous-arbre
    sub_tree = {question: []}
    
    # Récursion pour construire les sous-arbres
    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth, random_subspace)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth, random_subspace)
    
    # Optimisation : Si les deux réponses sont identiques, on ne garde pas la question
    if yes_answer == no_answer:
        sub_tree = yes_answer
    else:
        sub_tree[question].append(yes_answer)
        sub_tree[question].append(no_answer)
    
    return sub_tree


## Faire une prédiction

In [40]:
def predict_example(example, tree):
    # extrait le noeud de l'arbre
    question = list(tree.keys())[0]
    # Divise les élèments de la racine, le nom, l'opérateur de comparaison et la valeur
    # Vérifier si la clé est un label final
    if question in ["substitution", "redesign mineur", "stock", "redesign majeur", "LBO"]:
        return question

    feature_name, comparison_operator, value = question.split(" ",maxsplit=2)

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            # on va a gauche de l'arbre
            answer = tree[question][0]
        else:
            # on va a droite de l'arbre
            answer = tree[question][1]
    
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            # on va a gauche de l'arbre
            answer = tree[question][0]
        else:
            # on va a droite de l'arbre
            answer = tree[question][1]

    # on a atteint une feuille car ce n'est pas un dictionnaire, on retourne la réponse
    if not isinstance(answer, dict):
        return answer
    
    # Sinon on continue de parcourir l'arbre sur le sous-arbre
    else:
        residual_tree = answer
        return predict_example(example, residual_tree)


# 3.2 Retourne la prédiction pour chaque ligne du set de test


In [41]:
def decision_tree_predictions(test_df, tree):
    predictions = test_df.apply(predict_example, args=(tree,), axis=1)
    return predictions

# Application Arbre de décision

In [42]:
# Création de l'arbre de décision
def random_forest_algorithm(train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
    forest = []
    for i in range(n_trees):
        df_bootstrapped = bootstrapping(train_df, n_bootstrap)
        tree = decision_tree_algorithm(df_bootstrapped, max_depth=dt_max_depth, random_subspace=n_features)
        forest.append(tree)
    
    return forest
# Prédiction avec un random forest
def random_forest_predictions(df_test, forest):
    df_predictions = {}
    for i in range(len(forest)):
        column_name = "tree_{}".format(i)
        predictions = decision_tree_predictions(df_test, tree=forest[i])
        df_predictions[column_name] = predictions

    df_predictions = pd.DataFrame(df_predictions)
    random_forest_predictions = df_predictions.mode(axis=1).iloc[:, 0]
    return random_forest_predictions


In [43]:
forest = random_forest_algorithm(df_train, n_trees=4, n_bootstrap=800, n_features=2, dt_max_depth=5)
print("Colonnes de df_test :", df_test.columns)
print("Colonnes utilisées dans l'entraînement :", COLUMN_HEADERS)


Colonnes de df_test : Index(['symbol', 'repair complexity (0NA)',
       'evaluation ratio between repair cost and cost of new (0NA)',
       '\nexisting substitute product (0NA)',
       'component having a role in security (0NA)',
       'does the product/block if the component have a role in security? (0NA)',
       'several components involved? (0NA)',
       'is the product recent? (<10 years) (0NA)',
       '\nis the cost of inventory expensive (0NA)',
       'substitution component not available (0NA)',
       ...
       'serial letter_EBH', 'serial letter_EC', 'serial letter_ECF',
       'serial letter_FA', 'serial letter_FB', 'serial letter____',
       'no longer supply (new or used), no longer supply new, reparable, supply new_no longer supply new',
       'no longer supply (new or used), no longer supply new, reparable, supply new_reparable',
       'no longer supply (new or used), no longer supply new, reparable, supply new_supply new',
       'label'],
      dtype='object

In [48]:
missing_cols = set(df_train.columns) - set(df_test.columns)
extra_cols = set(df_test.columns) - set(df_train.columns)

print("📌 Colonnes dans df_train :", list(df_train.columns))
print("📌 Colonnes dans df_test :", list(df_test.columns))

if missing_cols:
    print("❌ Colonnes manquantes dans df_test :", missing_cols)
if extra_cols:
    print("⚠️ Colonnes en trop dans df_test :", extra_cols)

if not missing_cols and not extra_cols:
    print("✅ Les colonnes de df_train et df_test sont identiques.")


📌 Colonnes dans df_train : ['symbol', 'repair complexity (0NA)', 'evaluation ratio between repair cost and cost of new (0NA)', '\nexisting substitute product (0NA)', 'component having a role in security (0NA)', 'does the product/block if the component have a role in security? (0NA)', 'several components involved? (0NA)', 'is the product recent? (<10 years) (0NA)', '\nis the cost of inventory expensive (0NA)', 'substitution component not available (0NA)', 'do we have equipment to carry out approval tests? (0NA)', 'have we done a technical validation? (0NA)', 'generally at least equivalent criterion', 'active quantity', 'quantity of existing stock at SNCF?', 'processing date', 'supplier_ALSETEX', 'supplier_Alstom', 'supplier_EIFFAGE', 'supplier_EIVBG', 'supplier_HITACHI', 'supplier_JPVE', 'supplier_Mersen', 'supplier_NSE', 'supplier_SCLE', 'supplier_Schneider', 'supplier_TechPower', 'serial letter_AA', 'serial letter_AAA', 'serial letter_AAB', 'serial letter_AB', 'serial letter_ABA', 'se

In [44]:
predictions = random_forest_predictions(df_test, forest)


KeyError: 'serial'

# Accuracy

In [None]:
def accuracy_metric(y_test, y_predicted):
	correct = 0
	for i in range(len(y_test)):
		if y_test[i] == y_predicted[i]:
			correct += 1
	return correct / float(len(y_test)) * 100.0

In [None]:
accuracy = calculate_accuracy(predictions, df_test.label)

print("Accuracy = {}".format(accuracy))

NameError: name 'calculate_accuracy' is not defined