In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

file = r"C:\Users\DELL\Desktop\project_fire\data\processed\fire_soil_balanced.csv"

chunksize = 500000  # nombre de lignes par chunk
train_parts = []
test_parts = []

# --- Lecture et traitement chunk par chunk ---
for chunk in pd.read_csv(file, chunksize=chunksize):
    # Suppression de geometry si existe
    if "geometry" in chunk.columns:
        chunk = chunk.drop(columns=["geometry"])

    # Séparer X / y
    X = chunk.drop(columns=['fire'])
    y = chunk['fire']

    # Encodage One-Hot pour les colonnes catégorielles
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    if len(cat_cols) > 0:
        X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

    # Reconstituer chunk preprocessé
    processed = pd.concat([X, y], axis=1)

    # Split 70% train / 30% test
    mask = np.random.rand(len(processed)) < 0.7
    train_parts.append(processed[mask])
    test_parts.append(processed[~mask])

# --- Concaténer tous les chunks ---
train_df = pd.concat(train_parts, ignore_index=True)
test_df = pd.concat(test_parts, ignore_index=True)

print("Taille train :", train_df.shape)
print("Taille test :", test_df.shape)
print("Distribution train :", train_df['fire'].value_counts(normalize=True))
print("Distribution test :", test_df['fire'].value_counts(normalize=True))

# --- Séparer features / target ---
X_train = train_df.drop(columns=['fire'])
y_train = train_df['fire']
X_test = test_df.drop(columns=['fire'])
y_test = test_df['fire']

# --- Random Forest ---
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# --- Évaluation ---
y_pred = rf_model.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))
print("\nClassification report :\n", classification_report(y_test, y_pred))
print("\nConfusion matrix :\n", confusion_matrix(y_test, y_pred))


Taille train : (73113, 43)
Taille test : (30997, 43)
Distribution train : fire
0    0.801554
1    0.198446
Name: proportion, dtype: float64
Distribution test : fire
0    0.796335
1    0.203665
Name: proportion, dtype: float64
Accuracy : 0.9748362744781753

Classification report :
               precision    recall  f1-score   support

           0       0.99      0.98      0.98     24684
           1       0.93      0.95      0.94      6313

    accuracy                           0.97     30997
   macro avg       0.96      0.97      0.96     30997
weighted avg       0.98      0.97      0.97     30997


Confusion matrix :
 [[24204   480]
 [  300  6013]]


In [2]:
from sklearn.metrics import accuracy_score

# --- Accuracy sur le training set ---
y_train_pred = rf_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy sur le training set :", train_accuracy)


Accuracy sur le training set : 0.9985228345164334


In [2]:
import numpy as np
from collections import Counter

class RandomForestScratch:

    def __init__(self, n_trees=10, max_depth=5, min_samples_split=2, max_features=None):
        """
        n_trees : nombre d'arbres
        max_depth : profondeur max des arbres
        min_samples_split : min de samples pour splitter
        max_features : nombre de variables choisies aléatoirement à chaque split
        """
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        n_samples = X.shape[0]

        for i in range(self.n_trees):
            # --- 1) Tirage bootstrap : on prend N échantillons avec remise
            idx = np.random.choice(n_samples, n_samples, replace=True)
            X_sample = X[idx]
            y_sample = y[idx]

            # --- 2) Créer un arbre
            tree = DecisionTreeScratch(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )

            # --- 3) Entraîner l'arbre
            tree.fit(X_sample, y_sample)

            self.trees.append(tree)

    def predict(self, X):
        # Chaque arbre prédit
        tree_preds = np.array([tree.predict(X) for tree in self.trees])

        # Vote majoritaire
        final_pred = []
        for sample_preds in tree_preds.T:
            vote = Counter(sample_preds).most_common(1)[0][0]
            final_pred.append(vote)

        return np.array(final_pred)


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

file = r"C:\Users\DELL\Desktop\project_fire\data\processed\fire_soil_merged_cleaned.csv"

chunksize = 500000
train_parts = []
test_parts = []

# --- Lecture et préprocessing chunk par chunk ---
for chunk in pd.read_csv(file, chunksize=chunksize):
    # Supprimer geometry si existante
    if "geometry" in chunk.columns:
        chunk = chunk.drop(columns=["geometry"])
    
    # Séparer X / y
    X = chunk.drop(columns=["fire"])
    y = chunk["fire"]
    
    # Encodage One-Hot des colonnes catégorielles
    cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    if len(cat_cols) > 0:
        X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
    
    # Reconstituer chunk
    processed = pd.concat([X, y], axis=1)
    
    # Split 70% train / 30% test
    mask = np.random.rand(len(processed)) < 0.7
    train_parts.append(processed[mask])
    test_parts.append(processed[~mask])

# --- Concaténer tous les chunks ---
train_df = pd.concat(train_parts, ignore_index=True)
test_df = pd.concat(test_parts, ignore_index=True)

print("Train set :", train_df.shape)
print("Test set :", test_df.shape)

# --- Séparer X / y ---
X_train = train_df.drop(columns=["fire"]).values
y_train = train_df["fire"].values
X_test = test_df.drop(columns=["fire"]).values
y_test = test_df["fire"].values

# --- Random Forest From Scratch ---
rf = RandomForestScratch(n_trees=20, max_depth=8, min_samples_split=20)
rf.fit(X_train, y_train)

# --- Prédictions ---
y_pred = rf.predict(X_test)

# --- Résultats ---
print("\n=== RANDOM FOREST FROM SCRATCH ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


  for chunk in pd.read_csv(file, chunksize=chunksize):


MemoryError: Unable to allocate 5.58 GiB for an array with shape (500000, 11983) and data type bool

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from collections import Counter

# ---------------------------
# Random Tree / Random Forest From Scratch
# ---------------------------

class DecisionTreeNode:
    def __init__(self, gini=None, num_samples=None, num_samples_per_class=None,
                 predicted_class=None, feature_index=None, threshold=None,
                 left=None, right=None):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right

class DecisionTreeScratch:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def gini(self, y):
        m = len(y)
        return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in np.unique(y))

    def best_split(self, X, y):
        m, n = X.shape
        if m <= 1:
            return None, None

        parent_gini = self.gini(y)
        best_gini = 1.0
        best_idx, best_thr = None, None

        for idx in range(n):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * len(np.unique(y))
            num_right = Counter(classes)
            for i in range(1, m):
                c = classes[i-1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum((num_left[x]/i)**2 for x in num_left)
                gini_right = 1.0 - sum((num_right[x]/(m-i))**2 for x in num_right)
                gini_total = (i*gini_left + (m-i)*gini_right)/m
                if thresholds[i] == thresholds[i-1]:
                    continue
                if gini_total < best_gini:
                    best_gini = gini_total
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i-1]) / 2
        return best_idx, best_thr

    def build_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
        predicted_class = np.argmax(num_samples_per_class)
        node = DecisionTreeNode(
            gini=self.gini(y),
            num_samples=y.size,
            num_samples_per_class=num_samples_per_class,
            predicted_class=predicted_class,
        )

        if depth < self.max_depth and y.size >= self.min_samples_split:
            idx, thr = self.best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self.build_tree(X_left, y_left, depth + 1)
                node.right = self.build_tree(X_right, y_right, depth + 1)
        return node

    def fit(self, X, y):
        self.tree = self.build_tree(X, y)

    def predict_sample(self, x, node):
        if node.left is None and node.right is None:
            return node.predicted_class
        if x[node.feature_index] < node.threshold:
            return self.predict_sample(x, node.left)
        else:
            return self.predict_sample(x, node.right)

    def predict(self, X):
        return np.array([self.predict_sample(x, self.tree) for x in X])

class RandomForestScratch:
    def __init__(self, n_trees=10, max_depth=5, min_samples_split=2):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        n_samples = X.shape[0]
        for _ in range(self.n_trees):
            idxs = np.random.choice(n_samples, n_samples, replace=True)
            tree = DecisionTreeScratch(max_depth=self.max_depth,
                                       min_samples_split=self.min_samples_split)
            tree.fit(X[idxs], y[idxs])
            self.trees.append(tree)

    def predict(self, X):
        # vote majoritaire
        predictions = np.array([tree.predict(X) for tree in self.trees])
        y_pred = []
        for i in range(X.shape[0]):
            counts = np.bincount(predictions[:, i])
            y_pred.append(np.argmax(counts))
        return np.array(y_pred)

# ---------------------------
# Lecture et traitement chunk par chunk
# ---------------------------

file = r"C:\Users\DELL\Desktop\project_fire\data\processed\fire_soil_merged_cleaned.csv"
chunksize = 100000  # plus petit pour réduire la mémoire
forest_trees = []
n_trees_per_chunk = 5  # exemple

# On stocke les arbres de chaque chunk
for chunk in pd.read_csv(file, chunksize=chunksize):
    if "geometry" in chunk.columns:
        chunk = chunk.drop(columns=["geometry"])
    
    X = chunk.drop(columns=["fire"])
    y = chunk["fire"]
    
    # One-hot encoding
    cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    if cat_cols:
        X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
    
    # Optimisation type
    for col in X.select_dtypes(include=['float64']).columns:
        X[col] = X[col].astype(np.float32)
    for col in X.select_dtypes(include=['int64']).columns:
        X[col] = X[col].astype(np.int32)
    
    X_values = X.values
    y_values = y.values

    # Créer des arbres pour ce chunk
    for _ in range(n_trees_per_chunk):
        idxs = np.random.choice(len(X_values), len(X_values), replace=True)
        tree = DecisionTreeScratch(max_depth=8, min_samples_split=20)
        tree.fit(X_values[idxs], y_values[idxs])
        forest_trees.append(tree)

# ---------------------------
# Prédictions sur un autre chunk ou échantillon test
# ---------------------------

# Exemple : prendre un chunk test
test_chunk = pd.read_csv(file, chunksize=100000)
test_df = next(test_chunk)
if "geometry" in test_df.columns:
    test_df = test_df.drop(columns=["geometry"])
X_test = test_df.drop(columns=["fire"])
y_test = test_df["fire"]

cat_cols = X_test.select_dtypes(include=["object", "category"]).columns.tolist()
if cat_cols:
    X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

# Adapter colonnes pour match avec le training (si des colonnes manquent)
for col in X_test.columns:
    if col not in X.columns:
        X_test[col] = 0
for col in X.columns:
    if col not in X_test.columns:
        X_test[col] = 0

X_test = X_test[X.columns]  # réordonner

X_test_values = X_test.values
y_test_values = y_test.values

# vote majoritaire
predictions = np.array([tree.predict(X_test_values) for tree in forest_trees])
y_pred = []
for i in range(X_test_values.shape[0]):
    counts = np.bincount(predictions[:, i])
    y_pred.append(np.argmax(counts))
y_pred = np.array(y_pred)

# ---------------------------
# Évaluation
# ---------------------------
print(classification_report(y_test_values, y_pred))
print("Accuracy:", accuracy_score(y_test_values, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test_values, y_pred))


IndexError: list index out of range