In [11]:
import pandas as pd  # Manipulation de données
import numpy as np  # Calculs numériques
import matplotlib.pyplot as plt  # Visualisation de données
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import seaborn as sns  # Visualisation avancée
import sklearn
from datetime import datetime
import seaborn as sns
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTENC
# Importation des bibliothèques de scikit-learn pour la modélisation
from sklearn.model_selection import train_test_split  # Division des données en ensembles d'entraînement et de test
from sklearn.ensemble import RandomForestClassifier  # Modèle de classification Random Forest
from sklearn.linear_model import LinearRegression  # Modèle de régression linéaire
from sklearn.pipeline import make_pipeline  # Construction de pipelines pour le prétraitement et les modèles
from sklearn.preprocessing import StandardScaler, PolynomialFeatures  # Normalisation et transformation polynomiale
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer  # Imputation des valeurs manquantes
from sklearn.decomposition import PCA  # Réduction de dimension avec PCA
from sklearn.metrics import accuracy_score, classification_report   # Évaluation des modèles
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize


# Import des outils de scikit-learn nécessaires
from sklearn.model_selection import GridSearchCV  # Optimisation des hyperparamètres
from sklearn.metrics import roc_auc_score, roc_curve  # Évaluation avec AUC-ROC
from sklearn.model_selection import cross_val_score  # Validation croisée
from sklearn.metrics import mean_squared_error

In [12]:
#df = pd.read_excel("C:\\Users\\anton\\Downloads\\sncf_data.xlsx")  
df = pd.read_excel("C:\\Users\\\scoup\\Downloads\\sncf_data.xlsx")
# Convertir en CSV
df.to_csv("data_csv", index=False, encoding="utf-8")

ligns,columns = df.shape
# On ne prend pas en compte les colonnes "qté cible" et "product policy" (trop de valeurs manquantes)*

df = df.drop(columns=['qté cible','product policy (short-term vision year - 2023)'])
# On remplace les cellules vides par la valeur 0

df.fillna(0, inplace=True)  # Remplace les NaN par 0
# Convertir les colonnes de dates en nombre de jours depuis aujourd'hui
for col in df.select_dtypes(include=['datetime64']):
    df[col] = (datetime.today() - df[col]).dt.days 

# Séparer les features (X) et la cible (y)
target_column = "label"
X = df.drop(columns=[target_column])
y = df[target_column]
# Stocker la colonne "symbol" pour l'afficher plus tard
X["symbol"] = df["symbol"]
categorical_features = X.nunique()[X.nunique() == 2].index.tolist()
# Encodage des variables catégorielles si nécessaire
X = pd.get_dummies(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Columns: 114 entries, symbol to no longer supply (new or used), no longer supply new, reparable, supply new_supply new
dtypes: bool(98), float64(1), int64(15)
memory usage: 70.1 KB


  df = pd.read_excel("C:\\Users\\\scoup\\Downloads\\sncf_data.xlsx")


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,random_state=42,stratify=y)


# RANDOM FOREST "MANUEL"

In [14]:
target_column = "label"
X = df.drop(columns=[target_column])
y = df[target_column]

class DecisionTree:
    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y, depth=0):
        if depth >= self.max_depth or len(y) < self.min_samples_split or len(set(y)) == 1:
            return Counter(y).most_common(1)[0][0]  # Feuille avec la classe majoritaire

        best_feature, best_threshold = self.best_split(X, y)
        if best_feature is None:
            return Counter(y).most_common(1)[0][0]  

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left_subtree = self.fit(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self.fit(X[right_indices], y[right_indices], depth + 1)

        return (best_feature, best_threshold, left_subtree, right_subtree)

    def best_split(self, X, y):
        best_gini = float('inf')
        best_feature, best_threshold = None, None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_y = y[X[:, feature] <= threshold]
                right_y = y[X[:, feature] > threshold]

                gini = self.gini_impurity(left_y, right_y)
                if gini < best_gini:
                    best_gini, best_feature, best_threshold = gini, feature, threshold

        print(f"🔎 Meilleur split : Feature {best_feature}, Threshold {best_threshold}")
        
        return best_feature, best_threshold
        if best_feature is None:
            return Counter(y).most_common(1)[0][0]  # Retourne la classe majoritaire


    def gini_impurity(self, left_y, right_y):
        def gini(y):
            probs = np.array(list(Counter(y).values())) / len(y)
            return 1 - np.sum(probs**2)

        return (len(left_y) * gini(left_y) + len(right_y) * gini(right_y)) / (len(left_y) + len(right_y))

    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, node):
        if not isinstance(node, tuple):
            return node
        feature, threshold, left, right = node

        if feature >= len(x):  # Si la feature n'existe pas
            print(f"❌ Erreur : feature {feature} hors limites ({len(x)})")
            return None

        return self._predict_tree(x, left if x[feature] <= threshold else right)



In [15]:
class RandomForest:
    def __init__(self, n_trees=10, max_depth=5, min_samples_split=2, sample_size=0.8):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.sample_size = sample_size
        self.trees = []

    def fit(self, X, y):
        for i in range(self.n_trees):
            idxs = np.random.choice(len(y), size=int(self.sample_size * len(y)), replace=True)
            X_sample, y_sample = X[idxs], y[idxs]

            print(f"🌱 Arbre {i+1} : {len(X_sample)} échantillons")  # Vérifier que l'on a bien des données
        
            if len(X_sample) == 0 or len(y_sample) == 0:
                print("❌ Problème : échantillon vide !")
                continue

            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.tree = tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.array([Counter(predictions[:, i]).most_common(1)[0][0] for i in range(X.shape[0])])

# Séparer en train/test
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)
X_test = pd.DataFrame(X_test, columns=X.columns)

# Initialiser le Random Forest
rf = RandomForest(n_trees=10, max_depth=5)
rf.fit(X_train, y_train)

# Faire des prédictions
y_pred = rf.predict(X_test)

# Évaluer la précision
accuracy = np.mean(y_pred == y_test)
print(f"Précision du modèle : {accuracy:.2f}")


🌱 Arbre 1 : 202 échantillons
🔎 Meilleur split : Feature 6, Threshold 0
🔎 Meilleur split : Feature 0, Threshold 79402561
🔎 Meilleur split : Feature 0, Threshold 79502666
🔎 Meilleur split : Feature 0, Threshold 79501719
🔎 Meilleur split : Feature 18, Threshold 890
🔎 Meilleur split : Feature 8, Threshold 0
🔎 Meilleur split : Feature 0, Threshold 79520542
🔎 Meilleur split : Feature 5, Threshold 0
🔎 Meilleur split : Feature 0, Threshold 79544362
🔎 Meilleur split : Feature 0, Threshold 79520787
🔎 Meilleur split : Feature 9, Threshold 0
🔎 Meilleur split : Feature 0, Threshold 79545464
🌱 Arbre 2 : 202 échantillons
🔎 Meilleur split : Feature 11, Threshold 0
🔎 Meilleur split : Feature 13, Threshold 0
🔎 Meilleur split : Feature 0, Threshold 79544181
🔎 Meilleur split : Feature 15, Threshold 2706
🔎 Meilleur split : Feature 0, Threshold 79502666
🔎 Meilleur split : Feature 1, Threshold Alstom
🔎 Meilleur split : Feature 18, Threshold 686
🔎 Meilleur split : Feature 8, Threshold 0
🔎 Meilleur split : Fea

TypeError: '<=' not supported between instances of 'str' and 'int'

In [None]:
# ✅ Vérifier si la colonne "symbol" existe
if "symbol" in X_test.columns:
    symbol_col = X_test["symbol"].reset_index(drop=True)
else:
    print("⚠️ La colonne 'symbol' est absente de X_test !")
    symbol_col = pd.Series(["Inconnu"] * len(y_test))

# ✅ Assurer que toutes les colonnes ont la même taille
min_len = min(len(symbol_col), len(y_pred), len(y_test))

df_results = pd.DataFrame({
    "Symbol": symbol_col[:min_len],  
    "Prédiction": y_pred[:min_len],  
    "Réel": y_test[:min_len].reset_index(drop=True)
})

# ✅ Fonction pour colorer les cellules
def color_cells(row):
    color = 'background-color: green' if row["Prédiction"] == row["Réel"] else 'background-color: red'
    return [color] * len(row)

# ✅ Appliquer le style et afficher
styled_df = df_results.style.apply(color_cells, axis=1)
styled_df


NameError: name 'y_pred' is not defined