<a href="https://colab.research.google.com/github/chrismayemba/Chest_XRay_Model/blob/main/Chest_X_ray_NIH_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from google.colab import drive

# Montez Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Chemin vers le répertoire où les images sont stockées
image_dir = '/content/drive/MyDrive/NIH Chest X-ray/sample/images'

# Chemin vers le fichier CSV des labels
label_file = '/content/drive/MyDrive/NIH Chest X-ray/sample_labels.csv'

def load_images_and_labels(image_dir, label_file):
    labels_df = pd.read_csv(label_file)
    images = []
    labels = []

    for _, row in labels_df.iterrows():
        img_path = os.path.join(image_dir, row['Image Index'])
        if os.path.exists(img_path):
            img = load_img(img_path, target_size=(128, 128), color_mode='grayscale')
            img_array = img_to_array(img)
            images.append(img_array)
            labels.append(row['Finding Labels'])
        else:
            print(f"Fichier non trouvé : {img_path}")

    return np.array(images), np.array(labels)

images, labels = load_images_and_labels(image_dir, label_file)


In [3]:
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

# Reshape data for the CNN
X_train = X_train.reshape(-1, 128, 128, 1)
X_test = X_test.reshape(-1, 128, 128, 1)

# Normalize data
X_train = X_train / 255.0
X_test = X_test / 255.0

In [5]:
# Build the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
])

# Convert labels to numerical format (assuming 'Finding Labels' contains strings like 'Yes' or 'No')
y_train_numerical = np.where(y_train == 'Yes', 1, 0)
y_test_numerical = np.where(y_test == 'Yes', 1, 0)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train_numerical, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_numerical) # Use y_test_numerical instead of y_test
print(f'Loss: {loss}, Accuracy: {accuracy}')

# Predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")

print('Confusion Matrix:')
print(confusion_matrix(y_test_numerical, y_pred)) # Use y_test_numerical for confusion matrix
print('Classification Report:')
print(classification_report(y_test_numerical, y_pred)) # Use y_test_numerical for classification report

Loss: 0.0, Accuracy: 1.0
Confusion Matrix:
[[1122]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1122

    accuracy                           1.00      1122
   macro avg       1.00      1.00      1.00      1122
weighted avg       1.00      1.00      1.00      1122



In [2]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [2]:
from google.colab import drive
import numpy as np
import pandas as pd
import os
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

# Mont Google Drive
drive.mount('/content/drive')

# Chemin vers le répertoire où les images sont stockées
image_dir = '/content/drive/MyDrive/NIH Chest X-ray/sample/images'

# Chemin vers le fichier CSV des labels
label_file = '/content/drive/MyDrive/NIH Chest X-ray/sample_labels.csv'

def load_images_and_labels(image_dir, label_file):
    labels_df = pd.read_csv(label_file)
    images = []
    labels = []

    for _, row in labels_df.iterrows():
        img_path = os.path.join(image_dir, row['Image Index'])
        if os.path.exists(img_path):
            img = load_img(img_path, target_size=(128, 128), color_mode='grayscale')
            img_array = img_to_array(img)
            images.append(img_array)
            labels.append(row['Finding Labels'])
        else:
            print(f"Fichier non trouvé : {img_path}")

    return np.array(images), np.array(labels)

images, labels = load_images_and_labels(image_dir, label_file)
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

# Convertir les données en dataset Hugging Face
train_data = Dataset.from_dict({"image": X_train, "label": y_train})
test_data = Dataset.from_dict({"image": X_test, "label": y_test})

# Charger le modèle et le tokenizer
model_name = "togethercomputer/dragonfly_med"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Préparer les données pour l'entraînement
def preprocess_function(examples):
    return tokenizer(examples["image"], padding="max_length", truncation=True)

train_dataset = train_data.map(preprocess_function, batched=True)
test_dataset = test_data.map(preprocess_function, batched=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


NameError: name 'load_img' is not defined

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

In [None]:
results = trainer.evaluate()
print(results)