# 🧠 Projet Deep Learning — Classification des Cellules Sanguines Cancéreuses (PyTorch)

## 📌 1. Contexte du Projet
Vous êtes un développeur IA junior au sein d’un laboratoire biomédical spécialisé en imagerie médicale.
Objectif : Automatiser l’analyse d’images médicales liées à deux pathologies critiques :
- Détection de **tumeurs cérébrales** (object detection à partir d’IRM),
- Classification de **cellules sanguines cancéreuses** (leucémies) à partir de frottis sanguins.

Cette première partie se concentre sur la **classification des cellules sanguines cancéreuses avec PyTorch**.

## 🧾 2. Importation des Bibliothèques Nécessaires


In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

## 📂 3. Chargement des Images & Vérification des Extensions

- Extensions autorisées : `.jpeg`, `.jpg`, `.png`, `.bmp`
- Suppression des fichiers invalides
- Gestion des erreurs via `try-except`

In [None]:
data_dir = "../Data/Raw/Blood cell Cancer [ALL]"

valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

# Walk every class sub-folder of the raw dataset and delete each file whose
# extension is not listed in valid_extensions.
for folder in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, folder)

    if not os.path.isdir(folder_path):
        continue

    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)

        # A nested directory has no extension and os.remove() cannot delete it,
        # so only regular files are considered.
        if not os.path.isfile(file_path):
            continue

        # Lower-cased so that e.g. ".JPG" is accepted.
        ext = os.path.splitext(file)[1].lower()

        if ext in valid_extensions:
            print(f" Fichier avec extension valide : {file_path}")
            continue

        try:
            os.remove(file_path)
        except OSError as e:
            # Read-only or locked file: report it and keep cleaning the rest.
            print(f" Erreur lors de la suppression de {file_path}: {e}")
        else:
            print(f" Fichier supprimé : {file_path}")
print(" Suppression terminée.")

## 🔎 4. Exploration des Classes du Dataset
- Liste des dossiers (classes)
- Nombre d’images par classe (`countplot`)
- Affichage d’un échantillon d’images par classe

In [None]:
# Count the images of every class folder and record each image path together
# with its class label (the folder name).
labels = []
images = []
# Sorted listings make the order of `images`/`labels` independent of the
# filesystem, so a seeded split of these lists is reproducible across machines.
for folder in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    class_files = [
        os.path.join(folder_path, file)
        for file in sorted(os.listdir(folder_path))
        if file.lower().endswith(valid_extensions)
    ]
    images.extend(class_files)
    labels.extend([folder] * len(class_files))
    print(f"Classe {folder} : {len(class_files)} images")
# Blank separator line (the bare name `print` without parentheses did nothing).
print()

# Bar chart of the number of images per class.

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=labels, ax=ax)
ax.set_title("Nombre d'images par classe")
ax.set_xlabel("Classe")
ax.set_ylabel("Nombre d'images")
plt.show()

# Show one sample image per class, side by side.

classes = sorted(set(labels))
print(classes)

# First image path encountered for each class; setdefault keeps the earliest
# entry, so a single pass over the data is enough.
first_image_by_class = {}
for img_path, label in zip(images, labels):
    first_image_by_class.setdefault(label, img_path)

plt.figure(figsize=(12, 6))

for i, cls in enumerate(classes):
    img_path = first_image_by_class.get(cls)
    if img_path is None:
        # No image recorded for this class: leave its slot empty.
        continue
    plt.subplot(1, len(classes), i + 1)
    # The context manager closes the file once imshow has read the pixels.
    with Image.open(img_path) as img:
        plt.imshow(img)
    plt.title(cls)
    plt.axis("off")

plt.tight_layout()
plt.show()





## ✂️ 5. Division du Dataset en **Train / Validation / Test**
- Répartition : **70% / 15% / 15%**
- Vérification du nombre d’images par dossier

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split 

# 70 % train; the remaining 30 % is then halved into 15 % validation and
# 15 % test. `stratify` keeps the class proportions the same in every subset.
# NOTE: stratification raises ValueError if a class has too few images to be
# represented on both sides of a split.
x_train, x_temp, y_train, y_temp = train_test_split(
    images, labels,
    test_size=0.3, random_state=42, stratify=labels
)

x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp,
    test_size=0.5, random_state=42, stratify=y_temp
)


# Built with os.path.join so the path is valid on every OS and contains no
# backslash escape sequences.
base_path = os.path.join("..", "Data", "Processed")
train_path = os.path.join(base_path, "Train")
val_path = os.path.join(base_path, "Validation")
test_path = os.path.join(base_path, "Test")

for path in [train_path, val_path, test_path]:
    os.makedirs(path, exist_ok=True)

def copy_images_to_folder(image_paths, labels, destination_path):
    """Copy each image into ``destination_path/<label>/`` under its own file name.

    Args:
        image_paths: Iterable of source image paths.
        labels: Iterable of class labels, paired positionally with
            ``image_paths`` (pairing stops at the shorter of the two).
        destination_path: Root folder that receives one sub-folder per label.

    Returns:
        None. A copy that fails with an OS-level error is reported on stdout
        and the remaining images are still processed.
    """
    created_folders = set()
    for img_path, label in zip(image_paths, labels):
        class_folder = os.path.join(destination_path, label)
        # Create each class folder once instead of once per image.
        if class_folder not in created_folders:
            os.makedirs(class_folder, exist_ok=True)
            created_folders.add(class_folder)

        dest_path = os.path.join(class_folder, os.path.basename(img_path))

        try:
            shutil.copy2(img_path, dest_path)
        except OSError as e:
            # shutil.Error derives from OSError, so this covers copy failures
            # without hiding programming errors such as a TypeError.
            print(f"Erreur lors de la copie de {img_path}: {e}")

# Write the three splits to disk, one call per split.
for split_paths, split_labels, split_dir in (
    (x_train, y_train, train_path),
    (x_val, y_val, val_path),
    (x_test, y_test, test_path),
):
    copy_images_to_folder(split_paths, split_labels, split_dir)

# Report how many images each class has in each split folder.
print("Repartition par classe :")
image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')
for dataset_name, dataset_path in [("Train", train_path), ("Validation", val_path), ("Test", test_path)]:
    print(f"\n{dataset_name}:")
    if not os.path.exists(dataset_path):
        continue
    classes = [
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    ]
    for class_name in sorted(classes):
        class_path = os.path.join(dataset_path, class_name)
        num_images = sum(
            1 for f in os.listdir(class_path) if f.lower().endswith(image_suffixes)
        )
        print(f"   - {class_name}: {num_images} images")


## 🔄 6. Data Augmentation (Seulement sur Train)
- Transformations : `blur`, `noise`, `flip`
- Objectif : équilibrer les classes et augmenter la robustesse


In [None]:
from PIL import Image, ImageFilter
import numpy as np
import random

def add_noise(img, noise_level=20):
    """Return a copy of ``img`` with uniform integer noise added to every value.

    Args:
        img: Image accepted by ``np.array`` (a PIL image in this notebook).
        noise_level: Largest absolute offset added to a value; 0 returns the
            pixels unchanged.

    Returns:
        A new PIL image built from the noisy values, clipped to 0..255 and
        stored as uint8.
    """
    arr = np.array(img)
    # randint's upper bound is exclusive: the +1 makes the range symmetric
    # ([-noise_level, +noise_level]) and keeps noise_level=0 valid.
    noise = np.random.randint(-noise_level, noise_level + 1, arr.shape, dtype='int16')
    # Add in int16 so the sum cannot wrap around before it is clipped.
    noisy_arr = np.clip(arr.astype('int16') + noise, 0, 255).astype('uint8')
    return Image.fromarray(noisy_arr)

def augment_image(img_path, save_dir, augmentations):
    """Save augmented variants of one image into ``save_dir``.

    Each variant is written as ``<stem>_<kind><ext>``, where stem and
    extension come from ``img_path``.

    Args:
        img_path: Path of the source image.
        save_dir: Folder that receives the augmented files.
        augmentations: Container of the variants to produce; recognised
            values are ``'blur'``, ``'noise'`` and ``'flip'``. Other values
            are ignored.

    Returns:
        The number of augmented files written.
    """
    base_name, ext = os.path.splitext(os.path.basename(img_path))
    aug_count = 0

    # The context manager releases the file handle even if a save fails.
    with Image.open(img_path) as img:
        if 'blur' in augmentations:
            blurred = img.filter(ImageFilter.GaussianBlur(radius=2))
            blurred.save(os.path.join(save_dir, f"{base_name}_blur{ext}"))
            aug_count += 1

        if 'noise' in augmentations:
            noisy = add_noise(img)
            noisy.save(os.path.join(save_dir, f"{base_name}_noise{ext}"))
            aug_count += 1

        if 'flip' in augmentations:
            # Horizontal mirror.
            flipped = img.transpose(Image.FLIP_LEFT_RIGHT)
            flipped.save(os.path.join(save_dir, f"{base_name}_flip{ext}"))
            aug_count += 1

    return aug_count

import re

# File stems ending in one of these markers were generated by this notebook
# (augmented variants or oversampling duplicates). Skipping them keeps the cell
# safe to re-run: only original images are augmented.
generated_stem = re.compile(r"(_(blur|noise|flip)|_dup\d+)$")

# Augment every original image of every class in the training split.
for class_name in os.listdir(train_path):
    class_dir = os.path.join(train_path, class_name)
    if not os.path.isdir(class_dir):
        continue
    # Listed before any file is written, and stored under its own name so the
    # global `images` list used by the split cell is not overwritten.
    source_files = [
        f for f in os.listdir(class_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp'))
        and not generated_stem.search(os.path.splitext(f)[0])
    ]
    for img_file in source_files:
        img_path = os.path.join(class_dir, img_file)
        augment_image(img_path, class_dir, augmentations=['blur', 'noise', 'flip'])


import random
import shutil

def balance_classes_oversample(dataset_path):
    """Duplicate images so every class folder reaches the size of the largest.

    Each sub-folder of ``dataset_path`` is treated as one class. For a class
    with fewer images than the largest one, images are drawn at random with
    replacement and copied next to the originals as ``<stem>_dup<i><ext>``.

    Args:
        dataset_path: Folder that contains one sub-folder per class.

    Returns:
        None. Does nothing when ``dataset_path`` has no class folder; a class
        folder without any image is reported and left untouched.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')

    # Sorted listings make the seeded draw independent of filesystem order.
    class_images = {}
    for class_name in sorted(os.listdir(dataset_path)):
        class_dir = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_dir):
            continue
        class_images[class_name] = sorted(
            f for f in os.listdir(class_dir) if f.lower().endswith(image_suffixes)
        )

    if not class_images:
        return

    max_count = max(len(files) for files in class_images.values())

    for class_name, files in class_images.items():
        num_to_add = max_count - len(files)
        if num_to_add <= 0:
            continue
        if not files:
            # Nothing to duplicate from.
            print(f"Classe {class_name} vide : aucun duplicata possible")
            continue

        class_dir = os.path.join(dataset_path, class_name)
        # Private generator: same seed for every class, without reseeding the
        # global `random` module used elsewhere.
        rng = random.Random(42)
        images_to_duplicate = rng.choices(files, k=num_to_add)

        for idx, img_file in enumerate(images_to_duplicate):
            base_name, ext = os.path.splitext(img_file)
            # idx keeps the names unique when the same image is drawn twice.
            dst_path = os.path.join(class_dir, f"{base_name}_dup{idx}{ext}")
            shutil.copy2(os.path.join(class_dir, img_file), dst_path)
            



# Oversample the training split only. Validation and test keep their natural
# class distribution: duplicating their images would change what the
# evaluation metrics measure.
balance_classes_oversample(train_path)

# Report the number of images per class in each split.
for dataset_name, dataset_path in [("Train", train_path), ("Validation", val_path), ("Test", test_path)]:
    print(f"\n{dataset_name}:")
    if os.path.exists(dataset_path):
        classes = [d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))]
        for class_name in sorted(classes):
            class_path = os.path.join(dataset_path, class_name)
            num_images = len([f for f in os.listdir(class_path) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp'))])
            print(f"   - {class_name}: {num_images} images")



## 🧪 7. Préparation des Données avec `ImageFolder` & `Transforms`
- Resize
- ToTensor
- Normalize

In [None]:
from torchvision import datasets,transforms

# Folders of the train / test / validation splits.
train_dir = "../Data/Processed/Train"
test_dir = "../Data/Processed/Test"
validation_dir = "../Data/Processed/Validation"

# Preprocessing pipeline: resize, convert to tensor, then normalise each of the
# three channels with mean 0.5 and std 0.5.
resize_step = transforms.Resize((128,128))
tensor_step = transforms.ToTensor()
normalize_step = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                      std=[0.5, 0.5, 0.5])
transform = transforms.Compose([resize_step, tensor_step, normalize_step])

# One ImageFolder dataset per split, all sharing the same pipeline.
train_dataset, validation_dataset, test_dataset = (
    datasets.ImageFolder(root=split_dir, transform=transform)
    for split_dir in (train_dir, validation_dir, test_dir)
)

# Sanity check: size of each split and the detected class names.
for caption, split_dataset in (
    ("Nombre d'images dans le train :", train_dataset),
    ("Nombre d'images dans la validation :", validation_dataset),
    ("Nombre d'images dans le test :", test_dataset),
):
    print(caption, len(split_dataset))
print("Classes :", train_dataset.classes)


## 🚚 8. Création des DataLoaders
- Batch loading
- Shuffling

In [None]:

# Per-channel statistics passed to Normalize (the usual ImageNet values).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# The training pipeline and the validation/test pipeline apply the same steps;
# they stay separate objects so each can be changed independently.
transform_train = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])
transform_val_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

# Datasets read from the split folders.
train_dataset = datasets.ImageFolder(root=train_path, transform=transform_train)
val_dataset = datasets.ImageFolder(root=val_path, transform=transform_val_test)
test_dataset = datasets.ImageFolder(root=test_path, transform=transform_val_test)

# Only the training loader shuffles; validation and test keep a fixed order.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train dataset: {len(train_dataset)} images")
print(f"Validation dataset: {len(val_dataset)} images")
print(f"Test dataset: {len(test_dataset)} images")
print(f"Classes: {train_dataset.classes}")



## 🧠 9. Chargement du Modèle Pré-entraîné **GoogLeNet**
- Remplacement de la partie **Fully Connected (FC)** par un `nn.Sequential`

## ⚙️ 10. Configuration de l’Entraînement
- Learning Rate
- Loss Function
- Optimizer

## 🏋️ 11. Boucle d’Entraînement


## 📊 12. Évaluation & Test du Modèle
- Accuracy / Loss
- Matrice de confusion (optionnel)

## 💾 13. Sauvegarde du Modèle

## ✅ 14. Conclusion & Observations
