In [None]:
#!pip install mlflow

In [None]:
import kagglehub
import os
import random
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
import mlflow



In [None]:
"""if os.path.exists('data'):
  shutil.rmtree('data')"""

In [None]:
#!kaggle datasets download -d ghostbat101/lung-x-ray-image-clinical-text-dataset -p data

In [None]:
#!unzip -n data/lung-x-ray-image-clinical-text-dataset.zip -d data

On remplace les espaces par _ dans les noms des dossiers et fichiers

In [None]:
def replace_spaces_with_underscores(directory):
    """Rename every sub-directory and file under ``directory`` so that no name contains a space.

    Spaces are replaced by underscores. ``directory`` itself is not renamed.
    An entry whose underscore name already exists next to it is left untouched
    (and reported) instead of being nested inside / written over the existing one.

    Args:
        directory: Root folder to walk. A missing folder is silently ignored
            (``os.walk`` yields nothing for it).
    """
    # Bottom-up walk: the content of a folder is renamed before the folder
    # itself, so paths built from ``root`` are still valid at rename time.
    for root, dirs, files in os.walk(directory, topdown=False):
        for name in dirs + files:
            new_name = name.replace(' ', '_')
            if new_name == name:
                continue
            old_path = os.path.join(root, name)
            new_path = os.path.join(root, new_name)
            if os.path.exists(new_path):
                print(f"Skipped (target already exists): {old_path} -> {new_path}")
                continue
            shutil.move(old_path, new_path)
            print(f"Renamed: {old_path} -> {new_path}")

replace_spaces_with_underscores('data')

# Création d'un échantillon binaire déséquilibré avec 20 % des données : Main_dataset_Sample_Binaire_With_Ratio

In [None]:
# --- Sampling parameters ---------------------------------------------------
ratio_malade = 0.875  # target share of "Malades" images in the sample (>= 0.5 for imbalanced data)
test_ratio = 0.2  # 20% of data for testing
sample_fraction = 0.2  # share of the available "Normal" images kept in the sample
sampling_seed = 42  # fixed seed: re-running with the same settings selects the same files

# Path to the main dataset.
# NOTE(review): absolute, machine-specific path, whereas every other path in this
# notebook is relative to "data/" -- confirm before running on another machine.
source_dir = "/teamspace/studios/this_studio/jedha-final-project/tests/templates/data/Main_dataset"

# Output locations of the binary sample (train / test)
binary_sample_dir = "data/Main_dataset_Sample_Binaire_With_Ratio"
binary_sample_test_dir = "data/Main_dataset_Sample_Binaire_Test"

normal_dir = os.path.join(binary_sample_dir, "Normal")
malades_dir = os.path.join(binary_sample_dir, "Malades")
test_normal_dir = os.path.join(binary_sample_test_dir, "Normal")
test_malades_dir = os.path.join(binary_sample_test_dir, "Malades")

# os.makedirs also creates the two parent directories.
# NOTE: files already present in these folders are left in place; re-running
# with different ratios adds to them.
for output_dir in (normal_dir, malades_dir, test_normal_dir, test_malades_dir):
    os.makedirs(output_dir, exist_ok=True)

# Source classes merged into "Malades"
malades_classes = ['Chest_Changes', 'Degenerative_Infectious_Diseases', 'Encapsulated_Lesions',
                   'Higher_Density', 'Lower_Density', 'Mediastinal_Changes', 'Obstructive_Pulmonary_Diseases']

normal_classes = ['Normal']

# --- Target counts ---------------------------------------------------------
normal_available = len(os.listdir(os.path.join(source_dir, normal_classes[0])))
normal_total_images = int(normal_available * sample_fraction)
normal_test_count = int(test_ratio * normal_total_images)
normal_train_count = normal_total_images - normal_test_count

# Number of sick images needed so that ratio_malade of the sample is sick
malades_tokeep_images = int(normal_total_images * ratio_malade / (1 - ratio_malade))
malades_test_count = int(test_ratio * malades_tokeep_images)
malades_train_count = malades_tokeep_images - malades_test_count

# The sick images are drawn evenly from each source class
malades_test_per_class = malades_test_count // len(malades_classes)
malades_keep_per_class = malades_tokeep_images // len(malades_classes)


def _copy_images(file_names, src_dir, dst_dir):
    """Copy each named file from ``src_dir`` into ``dst_dir`` under the same name."""
    for file_name in file_names:
        shutil.copy(os.path.join(src_dir, file_name), os.path.join(dst_dir, file_name))


def _split_and_copy(class_name, test_count, keep_count, train_dir, test_dir, rng):
    """Shuffle one source class and copy its first ``keep_count`` files.

    The first ``test_count`` shuffled files go to ``test_dir``, the following
    ones (up to ``keep_count``) go to ``train_dir``. A warning is printed when
    the class holds fewer than ``keep_count`` files; the copy then proceeds
    with what is available.
    """
    class_path = os.path.join(source_dir, class_name)
    # Sorted first: os.listdir order is arbitrary, so the seed alone would not
    # make the selection reproducible.
    images = sorted(os.listdir(class_path))
    rng.shuffle(images)

    if len(images) < keep_count:
        print(
            f"pas assez d'images dans '{class_name}' ({len(images)} disponibles, "
            f"{keep_count} demandées) pour ces ratio de malades ({ratio_malade}) et de test "
            f"({test_ratio}) : runnez à nouveau la cellule en modifiant ces ratio"
        )

    _copy_images(images[test_count:keep_count], class_path, train_dir)
    _copy_images(images[:test_count], class_path, test_dir)


sampling_rng = random.Random(sampling_seed)

# Process "Normal" class
for class_name in normal_classes:
    _split_and_copy(class_name, normal_test_count, normal_total_images,
                    normal_dir, test_normal_dir, sampling_rng)

# Process "Malades" classes
# NOTE(review): all sick classes land in one folder under their original file
# name; two classes sharing a file name would overwrite each other -- verify
# that names are unique across classes.
for class_name in malades_classes:
    _split_and_copy(class_name, malades_test_per_class, malades_keep_per_class,
                    malades_dir, test_malades_dir, sampling_rng)

print("Les données ont été dupliquées :")
print(f"- Dans le répertoire {binary_sample_dir} avec les classes 'Normal' et 'Malades' pour l'entraînement.")
print(f"- Dans le répertoire {binary_sample_test_dir} avec les classes 'Normal' et 'Malades' pour les tests.")

In [None]:
# test du nombre d'images dans chaque dossier

def count_files_in_each_subdirectory(directory):
    """Map every folder under ``directory`` to the number of files it directly contains.

    Keys are paths relative to ``directory`` ('.' for the root itself).
    If walking raises, the error is printed and an empty dict is returned.
    """
    counts = {}
    try:
        for current_dir, _subdirs, file_names in os.walk(directory):
            counts[os.path.relpath(current_dir, directory)] = len(file_names)
    except Exception as e:
        print(f"Erreur : {e}")
        return {}
    return counts

# Example usage (other candidate: Main_dataset_Sample_Binaire_Test)
chemin_du_dossier = "data/Main_dataset_Sample_Binaire_With_Ratio"
fichiers_par_sous_dossier = count_files_in_each_subdirectory(chemin_du_dossier)

# Print one line per folder
for sous_dossier in fichiers_par_sous_dossier:
    nombre_de_fichiers = fichiers_par_sous_dossier[sous_dossier]
    print(f"{sous_dossier} contient {nombre_de_fichiers} fichier(s).")

# Démarrage du tracking MLFLOW

In [27]:
# MLflow tracking configuration for this training session
MLFLOW_SERVER_URI = 'https://david-rem-jedha-final-project-mlops.hf.space'
EXPERIMENT_NAME = 'binary' # 'binary' or 'multi'
TRAINER = 'sophie' # First name of the person who ran the training
MODEL_TYPE = 'vgg_20_data_aug_sur_mino_test_1epoch' # Kind of model being trained

mlflow.set_tracking_uri(MLFLOW_SERVER_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.tensorflow.autolog()

# The run is opened here and stays active until mlflow.end_run() is called
# in the last cell; every log_* call in between is attached to it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
mlflow.start_run(experiment_id = experiment.experiment_id)

<ActiveRun: >

# Preprocessing avec data augmentation de la minoritaire

In [2]:
# Base parameters
img_size = (128, 128)  # Size the images are resized to
batch_size = 32  # Batch size (used for augmentation and for model.fit)
minority_dir = "data/Main_dataset_Sample_Binaire_With_Ratio/Normal"  # Minority class
majority_dir = "data/Main_dataset_Sample_Binaire_With_Ratio/Malades"  # Majority class

In [None]:
# Load all the images of one class folder
def load_images_from_directory(directory, img_size):
    """Load every file of ``directory`` as an image array scaled to [0, 1].

    Args:
        directory: Folder whose direct files are all loaded (sub-folders are skipped).
        img_size: ``(height, width)`` every image is resized to.

    Returns:
        Array of shape ``(n_images, height, width, channels)``. Files are read
        in sorted name order so the result does not depend on the file system's
        listing order. An empty folder gives an array with zero rows but the
        same trailing dimensions, so it can still be concatenated with the
        other classes (3 channels assumed, i.e. load_img's default RGB mode).
    """
    images = []
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if os.path.isfile(file_path):  # skip sub-directories
            img = load_img(file_path, target_size=img_size)  # load and resize
            images.append(img_to_array(img) / 255.0)  # to NumPy, normalised
    if not images:
        return np.empty((0, *img_size, 3), dtype=np.float32)
    return np.array(images)

# Load the minority class ('Normal') into memory
x_normal = load_images_from_directory(minority_dir, img_size)
print(f"Nombre d'images dans la classe 'Normal' (minoritaire) : {x_normal.shape[0]}")

In [None]:
# Load the majority class ('Malades') into memory
x_malades = load_images_from_directory(majority_dir, img_size)
print(f"Nombre d'images dans la classe 'Malades' (majoritaire) : {x_malades.shape[0]}")

In [6]:
# Augmentation generator for the minority class:
# random rotation, brightness change, zoom and horizontal/vertical flips.
img_generator_for_mino = ImageDataGenerator(
    rotation_range=90,
    brightness_range=(0.5, 1.0),
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True
)

In [None]:
# Number of images that would have to be generated to balance the two classes perfectly
x_malades.shape[0] - x_normal.shape[0]

In [8]:
# Number of images to generate: half of the gap between the two classes,
# so the classes are brought closer together but not fully balanced.
num_augmented_samples = (x_malades.shape[0] - x_normal.shape[0]) // 2

In [None]:

# Generator over the minority class.
# x_normal is stored in [0, 1], but the brightness augmentation works on
# 8-bit pixel values: fed with [0, 1] data it truncates almost every pixel to 0.
# The images are therefore scaled back to [0, 255] for augmentation and each
# generated batch is normalised again afterwards.
generator = img_generator_for_mino.flow(
    x_normal * 255.0,  # Minority-class images, in pixel range
    y=None,  # No labels needed here
    batch_size=batch_size,
    shuffle=True
)

# Generate augmented images batch by batch until enough have been produced.
# Counting images (not batches) matters: the last batch of each pass over
# x_normal is smaller than batch_size.
x_augmented = []
n_generated = 0
while n_generated < num_augmented_samples and x_normal.shape[0] > 0:
    batch = next(generator) / 255.0
    x_augmented.append(batch)
    n_generated += batch.shape[0]

# Convert to a single NumPy array, trimmed to the requested number
if x_augmented:
    x_augmented = np.concatenate(x_augmented, axis=0)[:num_augmented_samples]
else:
    # Nothing to generate (classes already balanced, or no source image)
    x_augmented = np.empty((0,) + x_normal.shape[1:], dtype=x_normal.dtype)
print(f"Nombre d'images augmentées générées : {x_augmented.shape[0]}")


In [None]:
# One label vector per source array: 0 = 'Normal' (original and augmented), 1 = 'Malades'
y_normal = np.zeros(len(x_normal))
y_malades = np.ones(len(x_malades))
y_augmented = np.zeros(len(x_augmented))

# Stack images and labels in the same order so they stay aligned
image_parts = (x_normal, x_malades, x_augmented)
label_parts = (y_normal, y_malades, y_augmented)
x_train = np.concatenate(image_parts, axis=0)
y_train = np.concatenate(label_parts, axis=0)

print(f"Taille finale des données d'entraînement : {x_train.shape[0]}")

In [None]:
# Shuffle images and labels together so they stay aligned
x_train, y_train = shuffle(x_train, y_train, random_state=42)

print(f"Taille finale des données d'entraînement après mélange : {x_train.shape[0]}")

In [16]:
# Split the data into training (70%) and validation (30%) sets, stratified on the label.
# NOTE(review): x_train already contains the augmented copies of x_normal, so an
# original image and its augmented variants can end up on both sides of the
# split; validation scores may be optimistic. Consider splitting before augmenting.
# NOTE(review): x_train / y_train are rebound to the training subset, so running
# this cell twice splits the already-reduced training set again.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=42, stratify=y_train)

---

# Preprocessing

In [4]:
"""# Configuration de l'ImageDataGenerator avec preprocessing_function
img_generator = ImageDataGenerator(
    rescale=1/255., # Rescaling values from [0,255]->[0,1]
    rotation_range=180, # Angle range for random image rotation
    #width_shift_range=0.1, # Random shift of the image along width axis
    #height_shift_range=0.1, # Random shift of the image along height axis
    brightness_range=(0.5,1), # Random brightness modification
    #shear_range=0.1, # Random distortion of the image
    zoom_range=0.1, # Random zoom on the image
    #channel_shift_range=50.0, # Random hue modification
    horizontal_flip=True, # Randomly flips image horizontally
    vertical_flip=True, # Randomly flips image virtically
    #fill_mode='wrap',
    validation_split=0.2 # Portion of the data that can be saved for validation
)"""

In [6]:
"""# Chargement des ensembles
BATCH_SIZE = 64
img_size = (128,128)  
seed = 42"""

In [None]:
"""# Chargement des données  de train avec flow_from_directory de la classe minoritaire
img_generator_flow_train_normal = img_generator.flow_from_directory(
    directory="data/Main_dataset_Sample_Binaire_With_Ratio/Normal",  # chemin vers les données
    target_size=img_size,
    class_mode='categorical',
    batch_size=BATCH_SIZE, # The batch size of the produced batches
    shuffle = True ,#Whether to shuffle after all files have been selected once
    subset = "training"
)

# Chargement des données de validation avec flow_from_directory de la classe minoritaire
img_generator_flow_valid_normal = img_generator.flow_from_directory(
    directory="data/Main_dataset_Sample_Binaire_With_Ratio/Normal",  # chemin vers les données
    target_size=img_size,
    class_mode='categorical',
    batch_size=BATCH_SIZE, # The batch size of the produced batches
    shuffle = True, #Whether to shuffle after all files have been selected once
    subset = "validation"
)"""

# Modèle

In [17]:
# Number of output classes of the classifier head (binary: Normal / Malades)
CLASSES = 2
# CLASSES =7

In [18]:
# Pre-trained VGG19 convolutional base (ImageNet weights, no classification head),
# built for 128x128 three-channel inputs.
base_model = tf.keras.applications.VGG19(input_shape=(128, 128, 3), 
                                                     include_top=False,
                                                     weights = "imagenet",
                                                     name="vgg19",
                                                     #input_shape=None
                                                    )

2025-01-29 09:47:00.343995: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [None]:
#base_model.summary()

In [19]:
# Freeze the pre-trained base: only the layers added on top of it are trained
base_model.trainable = False

In [11]:

"""# Fine-tuning des dernières couches
#fine_tune_at = len(base_model.layers) - 20
#for layer in base_model.layers[:fine_tune_at]:
#    layer.trainable = False
fine_tune_at = 2
for layer in base_model.layers[-fine_tune_at:]:
    layer.trainable = True"""


In [20]:
# Classifier: frozen VGG19 base + small dense head.
model = tf.keras.Sequential([
    # Three-channel input: it must match both the VGG19 base (built with
    # input_shape=(128, 128, 3)) and the image arrays, which are loaded with
    # load_img's default colour mode.
    tf.keras.layers.Input(shape=(128, 128, 3)),
    base_model,  # pre-trained feature extractor
    tf.keras.layers.GlobalMaxPooling2D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # One probability per class; paired with sparse categorical cross-entropy
    tf.keras.layers.Dense(CLASSES, activation="softmax")
])

In [None]:
# Print the layer-by-layer summary of the assembled model
model.summary()

# Entraînement

In [21]:
# Learning-rate schedule: start at initial_learning_rate and multiply it by
# decay_rate every decay_steps optimizer steps (staircase=True makes the decay
# happen in discrete jumps).
initial_learning_rate = 0.001#0.001

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True)

# Alternative kept for reference (disabled): reduce the rate when val_loss plateaus.
# lr_onplateau = tf.keras.callbacks.ReduceLROnPlateau(
#     monitor='val_loss',
#     factor=0.1,
#     patience=10,
#     verbose=0,
#     mode='auto',
#     min_delta=0.0001,
#     cooldown=0,
#     min_lr=0.0,
# )

In [22]:
# Adam optimizer driven by the learning-rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Compile the model. The sparse loss takes integer-valued class labels (0 / 1)
# against the CLASSES-way softmax output.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    #loss=tf.keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True), #label_smoothing=0.5),
    metrics=['accuracy']
)

In [43]:
"""# Définir un callback d'early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',  # Surveiller la loss de validation
    patience=5,          # Nombre d'époques sans amélioration avant d'arrêter
    restore_best_weights=True  # Rétablir les poids du meilleur modèle
)"""

In [16]:
"""class_indices = img_generator_flow_train.class_indices
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(list(class_indices.values())),
    y=img_generator_flow_train.classes
)
class_weights = dict(enumerate(class_weights))"""

In [39]:
"""class_weights = {0: 0.5714285714285714, 1: 12.0}"""

In [13]:
"""# imgs, labels = next(iter(img_generator_flow_train))
# print(labels)
# print(pd.DataFrame(labels).value_counts())
class_weights = {}
class_weights['0.0'] = (1/60)*(64/2)
class_weights['1.0'] = (1/4)*(64/2)"""

In [None]:
#class_weights

In [23]:
# Number of training epochs.
# NOTE(review): MODEL_TYPE ends with "test_1epoch" while 10 epochs are run here --
# confirm which one is intended so the MLflow run is labelled correctly.
EPOCHS = 10

In [None]:
# Train the model on the in-memory arrays
history = model.fit(
    x_train,  # Training images
    y_train,  # Training labels
    epochs=EPOCHS,  # Number of epochs
    batch_size=batch_size,  # Batch size
    validation_data=(x_val, y_val),  # Evaluated at the end of each epoch
    shuffle=True,  # Reshuffle the training data at every epoch
)

In [None]:
"""# Entraîner le modèle avec early stopping
history = model.fit(
    #rgb_train_generator,
    img_generator_flow_train,
    validation_data=img_generator_flow_valid , #rgb_val_generator,
    epochs=EPOCHS,  # Plus d'époques pour laisser l'early stopping décider
    #steps_per_epoch=len(img_generator_flow_train),
    #validation_steps=len(img_generator_flow_valid),
    #callbacks=[early_stopping],
    class_weight= class_weights
)""" 

In [None]:
"""# 5 epochs de plus
history = model.fit(
    #rgb_train_generator,
    img_generator_flow_train,
    validation_data=img_generator_flow_valid , #rgb_val_generator,
    epochs=EPOCHS,  # Plus d'époques pour laisser l'early stopping décider
    #steps_per_epoch=len(img_generator_flow_train),
    #validation_steps=len(img_generator_flow_valid),
    #callbacks=[early_stopping],
    class_weight= class_weights 
)"""

# Sauvegarde du tracking MLFLOW

In [None]:
# Run-level parameters
mlflow.log_param("trainer", TRAINER)
#mlflow.log_param("epochs", EPOCHS)
mlflow.log_param("model_type", MODEL_TYPE)

# Store the trained model as a run artifact
mlflow.keras.log_model(model, "model")

# Log the training curves, one point per epoch and per metric
history = model.history
epoch_metrics = history.history
for epoch, _ in enumerate(epoch_metrics['loss']):
    for metric_name in ('loss', 'accuracy', 'val_loss', 'val_accuracy'):
        mlflow.log_metric(metric_name, epoch_metrics[metric_name][epoch], step=epoch)

In [None]:
# Class probabilities for the validation images; the predicted label is the
# most probable class.
# NOTE(review): evaluation below uses the validation split only; the separate
# data/Main_dataset_Sample_Binaire_Test folder is not read in the cells shown.
predictions = model.predict(x_val)
y_pred = np.argmax(predictions, axis=1)
#y_true = img_generator_flow_valid.classes

In [None]:
"""# prédictions en cherchant le threshold optimal pour la sigmoid (imbalanced categories)

seuils = np.arange(0.0, 1.0, 0.01)

#Fonction pour calculer la F1-score pour un seuil donné
def calculer_f1_score(y_true, y_pred_prob, seuil):
    y_pred = (y_pred_prob >= seuil).astype(int)
    return f1_score(y_true, y_pred)

#Trouver le seuil qui maximise la F1-score
f1_scores = [calculer_f1_score(y_true, y_pred, seuil) for seuil in seuils]
seuil_optimal = seuils[np.argmax(f1_scores)]

print("Seuil optimal :", seuil_optimal)
print("F1-score pour le seuil optimal :", max(f1_scores))"""

In [33]:
# Save the text classification report and attach it to the MLflow run
report = classification_report(y_val, y_pred)
with open("classification_report.txt", "w") as file:
    file.write(report)
mlflow.log_artifact('classification_report.txt', artifact_path="model")

In [None]:
# Show the same report in the notebook
print(classification_report(y_val, y_pred))

In [35]:
# Same report as a dict, so the aggregate scores can be logged as MLflow metrics
report = classification_report(y_val, y_pred, output_dict=True)
mlflow.log_metric('global_accuracy', report['accuracy'])

# 'macro avg' / 'weighted avg' rows -> macro_avg_* / weighted_avg_* metrics
for average_name in ('macro avg', 'weighted avg'):
    metric_prefix = average_name.replace(' ', '_')
    for score_name in ('precision', 'recall', 'f1-score', 'support'):
        metric_key = metric_prefix + '_' + score_name.replace('-', '_')
        mlflow.log_metric(metric_key, report[average_name][score_name])

In [49]:
# Per-class scores: the first CLASSES entries of the report dict are taken as the
# per-class rows (relies on the dict's insertion order). Each score is logged
# under its plain name (precision, recall, f1-score, support) and the class
# index is passed as the MLflow "step", so step N holds the value for class N.
for class_mesure in list(report.items())[:CLASSES]:
    for m_name, m_value in class_mesure[1].items():
        mlflow.log_metric(m_name, m_value, step=int(float(class_mesure[0])))

In [None]:
# Confusion matrix on the validation split (label 0 = Normal, 1 = Malades),
# saved as an image and attached to the MLflow run.
cm = confusion_matrix(y_val, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Normal", "Malades"])
disp.plot()
plt.title("Matrice de Confusion")
plt.savefig("confusion_matrix.png")
mlflow.log_artifact("confusion_matrix.png", artifact_path='model')

In [None]:
# Compute the ROC curve from the predicted probability of the positive class
# ('Malades' = label 1, i.e. column 1 of the softmax output). Using the hard
# argmax labels instead would collapse the curve to a single operating point.
y_score = predictions[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_score)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve, save it and attach it to the MLflow run
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance level
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.savefig("roc_curve.png")
mlflow.log_artifact("roc_curve.png", artifact_path='model')

In [None]:
# Close the MLflow run opened at the start of the notebook
mlflow.end_run()