# Chest X-ray pneumonia
---
See the [dataset homepage](https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia) for more information.

### Imports

In [1]:
from   datetime import datetime
import os
from   pathlib import Path
import time
import random

from   IPython.display import clear_output
import matplotlib.pyplot as plt
from   mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import pandas as pd
from   PIL import Image
import plotly.express as px
import plotly.graph_objects as go
from   plotly.subplots import make_subplots

import torch
assert torch.cuda.device_count() == 1, "Select GPU P100 in Settings > Accelerator from the right panel."
import torch.nn as nn
import torch.optim as optim
from   torch.utils.data import DataLoader

from   torchmetrics.classification import BinaryConfusionMatrix
from   sklearn.metrics import accuracy_score, confusion_matrix

import torchvision
from   torchvision.datasets import ImageFolder
import torchvision.transforms.functional as F
import torchvision.transforms as T
from   torchvision.utils import make_grid
from   torchvision.io import read_image, ImageReadMode
from   torchvision.models import efficientnet_b4, densenet121

torch.use_deterministic_algorithms(True)

pd.set_option('max_colwidth', 400)
pd.set_option('display.max_rows', 50)

!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-15b1a41c-7f55-77ba-223b-f035b205f63b)


In [2]:
def seed_everything(seed=0):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and CUDA), exports the
    `PYTHONHASHSEED` and `CUBLAS_WORKSPACE_CONFIG` environment variables, and
    switches cuDNN to deterministic mode with benchmarking turned off.

    Args:
        seed: integer seed shared by all generators (default 0).
    """
    # Environment variables
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
    })

    # Random number generators
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # cuDNN flags
    torch.backends.cudnn.deterministic = True
    # Added by the notebook author to check whether runs remain deterministic.
    torch.backends.cudnn.benchmark = False

### Global variables and persistent files

In [3]:
data_path = Path('../input/chest-xray-pneumonia/chest_xray/chest_xray/')
csv_path = Path("./chest_xray.csv")

# Build an index of the dataset (one row per .jpeg file) only when it has not
# been cached yet: every image has to be decoded to read its dimensions, which
# takes around 1 minute, so the result is stored in a csv file for future use.
if not csv_path.exists():
    # The last three path components are <subset>/<label>/<filename>;
    # read_image(...).size() supplies (channel, height, width).
    rows = [
        (path, *path.parts[-3:], *read_image(str(path)).size())
        for path in data_path.rglob("*.jpeg")
    ]
    df = pd.DataFrame(
        rows,
        columns=["path", "subset", "label", "filename", "channel", "height", "width"],
    ).assign(aspect_ratio=lambda frame: frame["width"] / frame["height"])
    df.to_csv(csv_path, index=False)

### Dataset analysis

In [4]:
# Read the cached dataset index (written to csv_path by the previous section)
# back into a pandas dataframe.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Total number of images in the dataset (one dataframe row per image)
df.shape[0]

In [None]:
# Number of images for every (label, subset) pair, shown as a table with one
# row per label and one column per subset.
samples_per_group = df.groupby(["label", "subset"]).size()
samples_per_group.unstack(level="subset")

In [None]:
# Share of the images that falls into each subset, printed as percentages
subset_share = df["subset"].value_counts(normalize=True)
with pd.option_context('display.float_format', "{:.2%}".format):
    print(subset_share)

In [None]:
# Share of the images that belongs to each class, printed as percentages
class_share = df["label"].value_counts(normalize=True)
with pd.option_context('display.float_format', "{:.0%}".format):
    print(class_share)

There is a clear class imbalance: far more samples are labelled PNEUMONIA than not.

In [None]:
# Class distribution restricted to the training set, printed as percentages
train_labels = df.loc[df["subset"] == "train", "label"]
with pd.option_context('display.float_format', "{:.0%}".format):
    print(train_labels.value_counts(normalize=True))

The training set distribution is similar to that of the whole dataset.

In [None]:
# Summary statistics (count, mean, std, quantiles...) of the image dimensions
# and aspect ratios, displayed with compact number formatting.
image_stats = df[["height", "width", "aspect_ratio"]].describe()
with pd.option_context('display.float_format', '{:g}'.format):
    display(image_stats)

In [None]:
# Number of images per channel count. Author's note: some images are stored
# as RGB although their 3 channels are actually equal.
channel_counts = df["channel"].value_counts()
channel_counts

Some images are grayscale, whereas the models require 3-channel input; this is fixed by loading the images with `read_image` and the `ImageReadMode.RGB` mode.

In [None]:
# Preview 4 randomly sampled images, read as single-channel tensors and
# resized to 224x224 so they can be tiled side by side.
resize = T.Resize((224,224), antialias=True)
sampled_paths = df["path"].sample(n=4)
imgs = [resize(read_image(str(p), ImageReadMode.GRAY)) for p in sampled_paths]

# make_grid returns a channel-first tensor; matplotlib expects channel-last
grid = make_grid(imgs, nrow=4).permute(1,2,0)

plt.figure(figsize=(25,25))
plt.imshow(grid)
plt.axis('off')
plt.show()

### Datasets and Dataloader

In [5]:
# torch.manual_seed(0)
# seed = torch.Generator()
# seed_everything()

# def seed_worker(worker_id):
#     worker_seed = torch.initial_seed() % 2**32
#     np.random.seed(worker_seed)
#     random.seed(worker_seed)

# g = torch.Generator()
# g.manual_seed(0)

def check_file(path):
    """Tell whether PIL is able to open the file at `path`.

    Intended as an `is_valid_file` callback for `ImageFolder`.

    Args:
        path: location of the candidate image file.

    Returns:
        True when `Image.open` succeeds, False when it raises.
    """
    try:
        # The context manager closes the file handle that Image.open keeps
        # open, instead of leaking one handle per checked file.
        with Image.open(path):
            return True
    except Exception:
        # Any failure to open means "not a valid image". Unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit are not swallowed, so a
        # long validation pass can still be interrupted.
        return False
    
def loader(path):
    """Read the image at `path` with torchvision's `read_image` in RGB mode.

    Intended as a `loader` callback for `ImageFolder`.

    Args:
        path: location of the image file.

    Returns:
        Whatever `read_image` returns for `ImageReadMode.RGB`.
    """
    return read_image(path, mode=ImageReadMode.RGB)

# Background on image resizing:
# https://tcapelle.github.io/pytorch/fastai/2021/02/26/image_resizing.html
#
# Both pipelines currently apply the same steps: resize to 224x224, convert to
# a tensor, then normalize with a single scalar mean/std that is applied to
# every channel. Options that were tried and are currently disabled:
#   - augmentation (training only):
#       T.RandomAffine(degrees=15, translate=(.2, .2), scale=(.8, 1.2))
#   - per-channel normalization:
#       T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
train_transform = T.Compose([
    T.Resize(size=(224, 224), antialias=True),
    T.ToTensor(),
    T.Normalize(mean=0.485, std=0.229),
])

eval_transform = T.Compose([
    T.Resize(size=(224, 224), antialias=True),
    T.ToTensor(),
    T.Normalize(mean=0.485, std=0.229),
])

# One ImageFolder per subset directory. The optional `is_valid_file=check_file`
# and `loader=loader` hooks are not used; ImageFolder's defaults apply.
train_data, val_data, test_data = (
    ImageFolder(root=data_path/subset, transform=transform)
    for subset, transform in (
        ("train", train_transform),
        ("val", eval_transform),
        ("test", eval_transform),
    )
)

# Only the training loader shuffles. No `worker_init_fn` / `generator` is
# passed to it (both were tried and are currently disabled).
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_data, batch_size=8, shuffle=False, num_workers=2)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=2)

In [None]:
# Show 24 randomly sampled images after they went through train_transform,
# laid out on a 3 x 8 grid.
paths = df["path"].sample(n=24)
trans = [train_transform(Image.open(image_path)) for image_path in paths]

fig = plt.figure(figsize=(25., 25.))
grid = ImageGrid(
    fig,
    111,
    nrows_ncols=(3, 8),
    axes_pad=.1,
)

for cell_index, im in enumerate(trans):
    ax = grid[cell_index]
    # tensors are channel-first; imshow expects channel-last
    ax.imshow(im.permute(1, 2, 0), cmap="gray")
    ax.axis("off")

plt.show()

### Training

In [6]:
# Seed all RNGs before the model is created: the weights are initialised here
# and then modified by every training iteration, so for reproducibility the
# model must be (re)built in the same cell as the training loop.
seed_everything()

device = torch.device("cuda")

# DenseNet-121 backbone whose classifier is replaced by a small MLP head with
# 2 outputs. LogSoftmax is the last layer, which pairs with NLLLoss below.
model = densenet121()
model.classifier = nn.Sequential(
    nn.Linear(1024,256),
    nn.Dropout(0.3),
    nn.ReLU(),
    nn.Linear(256,32),
    nn.Dropout(0.3),
    nn.ReLU(),
    nn.Linear(32,2),
    nn.LogSoftmax(dim=1),
)

# Alternative architecture (disabled):
# model = efficientnet_b4()
# model.classifier = nn.Sequential(
#     nn.Linear(1792, 1024),
#     nn.Dropout(0.3),
#     nn.ReLU(),
#     nn.Linear(1024, 256),
#     nn.Dropout(0.3),
#     nn.ReLU(),
#     nn.Linear(256, 32),
#     nn.Dropout(0.3),
#     nn.ReLU(),
#     nn.Linear(32,2),
#     nn.LogSoftmax(dim=1),
# )

model.to(device)

loss_fn = nn.NLLLoss()
learning_rate = 1.
optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)

epochs = 1
data = []  # one row of metrics per epoch, see the column names further down
for epoch in range(1, epochs+1):
    # ---- training pass ----
    train_loss = 0
    nb_train_batches = len(train_loader)
    train_acc = 0
    model.train()
    for batch, (image, target) in enumerate(train_loader, start=1):
        print(f"[Epoch {epoch:>2}] Train batch progress: {batch:>3}/{nb_train_batches}", end='\r')
        image = image.to(device)
        target = target.to(device)
        output = model(image)
        loss = loss_fn(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # loss is a batch mean: weight it by the batch size so that the
        # division by the dataset size below yields a per-sample average
        train_loss += loss.item() * image.size(0)
        _, preds = torch.max(output, 1)
        train_acc += (preds == target).float().sum().item()

    # ---- validation pass (no gradient tracking, eval mode) ----
    val_loss = 0
    nb_val_batches = len(val_loader)
    val_acc = 0
    model.eval()
    with torch.no_grad():
        for batch, (image, target) in enumerate(val_loader, start=1):
            print(f"[Epoch {epoch:>2}] Val batch progress: {batch:>3}/{nb_val_batches}", end='\r')
            image = image.to(device)
            target = target.to(device)
            output = model(image)
            loss = loss_fn(output, target)
            val_loss += loss.item() * image.size(0)
            _, preds = torch.max(output, 1)
            val_acc += (preds == target).float().sum().item()

    # turn the accumulated sums into per-sample averages
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    train_acc /= len(train_loader.dataset)
    val_acc /= len(val_loader.dataset)
    print(f"[Epoch {epoch:>2}] Train loss: {train_loss:.5f} | Train accuracy: {train_acc:>7.2%} | Val loss: {val_loss:.5f} | Val accuracy: {val_acc:>7.2%}")
    data.append([epoch, learning_rate, train_loss, train_acc, val_loss, val_acc])
    # NOTE: a leftover `if epoch == 2: break` used to sit here; it silently
    # capped training at 2 epochs whatever `epochs` was set to, so it is gone.

# e.g. "DenseNet_230101-111116": class name + timestamp identifies the run
model_name = model._get_name() + '_' + datetime.now().strftime("%y%m%d-%H%M%S")

# save model summary
# Stored under its own name rather than `df`, so the dataset index loaded
# earlier in the notebook is not overwritten by the training metrics.
run_history = pd.DataFrame(data, columns=["epoch", "learning_rate", "train_loss", "train_acc", "val_loss", "val_acc"])
Path("./runs").mkdir(exist_ok=True)
run_history.to_csv("./runs/" + model_name + ".csv", index=False)

# save model
# https://stackoverflow.com/questions/54746829/pytorch-whats-the-difference-between-state-dict-and-parameters
Path("./models").mkdir(exist_ok=True)
# torch.save(model.state_dict(), "./models/" + model_name + '_weights_' + '.pth') # parameters only
# NOTE(review): this pickles the whole model object, not just its weights;
# such a file should only ever be loaded back from a trusted source.
torch.save(model, "./models/" + model_name + '.pth')

[Epoch  1] Train loss: 0.27831 | Train accuracy:  88.32% | Val loss: 1.12007 | Val accuracy:  56.25%


[Epoch  1] Train loss: 0.27831 | Train accuracy:  88.32% | Val loss: 1.12007 | Val accuracy:  56.25%

### Compare models

In [None]:
# Available runs: one csv of per-epoch metrics per training run.
# rglob yields files in arbitrary order, so the list is sorted to keep the
# summary table and the plot colours stable from one session to the next.
runs = sorted(Path("./runs").rglob("*.csv"))

In [None]:
# Overview used to choose the best performing model for the test set: for
# each run, the number of epochs trained and the validation accuracy reached
# at the last epoch.
summary = []
for run_csv in runs:
    # Columns are read by name, so the result does not depend on column order,
    # and the loop no longer overwrites the global `epochs` training setting.
    last_row = pd.read_csv(run_csv).iloc[-1]
    summary.append([run_csv.stem, int(last_row["epoch"]), last_row["val_acc"]])
pd.DataFrame(summary, columns=["model", "epochs", "val_acc"])

In [None]:
# Compare all runs on a 2 x 2 grid: losses in the left column, accuracies in
# the right column, training metrics on the top row, validation on the bottom.
fig = make_subplots(rows=2, cols=2,
                    shared_xaxes=True,
                    vertical_spacing=0.02)

# (metric column in the run csv, subplot row, subplot column)
panels = [
    ("train_loss", 1, 1),
    ("val_loss", 2, 1),
    ("train_acc", 1, 2),
    ("val_acc", 2, 2),
]

nb_colors = len(px.colors.qualitative.Plotly)
for i, run in enumerate(runs):
    # Kept under its own name rather than `df`, so the dataset index loaded
    # earlier in the notebook is not overwritten by a run's metrics.
    run_df = pd.read_csv(run)
    # one colour per run, cycling through the palette when runs outnumber it
    color = px.colors.qualitative.Plotly[i%nb_colors]
    for metric_name, row, col in panels:
        fig.add_trace(
            go.Scatter(x=run_df["epoch"], y=run_df[metric_name], name=run.stem + "." + metric_name,
                       marker=dict(color=color), mode="lines"),
            row=row, col=col,
        )

for metric_name, row, col in panels:
    fig.update_yaxes(title_text=metric_name, row=row, col=col)
# only the bottom row carries the x axis title and integer epoch ticks
fig.update_xaxes(title_text="epoch", tick0=1, dtick=1, row=2, col=1)
fig.update_xaxes(title_text="epoch", tick0=1, dtick=1, row=2, col=2)
fig.update_layout(hovermode="x unified", margin=dict(l=0, r=0, t=0, b=0))

fig.show()

### Load model

In [None]:
# Load a complete model object saved by a previous training run.
# (Alternative when only the weights were saved: model.load_state_dict(torch.load()))
# NOTE(review): the file name below is hard-coded to one specific run and must
# be edited to match a file that actually exists in ./models.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
checkpoint_path = './models/DenseNet_230101-111116.pth'
model = torch.load(checkpoint_path)

In [None]:
# Evaluate the loaded model on the test set: accumulate the loss and a binary
# confusion matrix over all batches, then report average loss and accuracy.
test_loss = 0
nb_test_batches = len(test_loader)
metric = BinaryConfusionMatrix().to(device)

model.eval()
with torch.no_grad():
    for batch, (image, target) in enumerate(test_loader, start=1):
        print(f"Test batch progress: {batch:>3}/{nb_test_batches}", end='\r')
        image, target = image.to(device), target.to(device)
        output = model(image)
        # batch-mean loss weighted by batch size -> per-sample average below
        test_loss += loss_fn(output, target).item() * image.size(0)
        # predicted class = index of the largest output along dim 1
        metric.update(output.max(dim=1).indices, target)

clear_output()
test_loss /= len(test_loader.dataset)
conf_matrix = metric.compute().cpu()
# accuracy = correctly classified samples (diagonal) / all samples
nb_correct = conf_matrix.diag().sum()
nb_total = conf_matrix.sum()
accuracy = nb_correct / nb_total
print(f"Test loss: {test_loss:.5f}")
print(f"Accuracy: {accuracy:.2%}")

In [None]:
# Plot the confusion matrix, which shows True Positives (TP), False Positives
# (FP), True Negatives (TN) and False Negatives (FN).
# Set `normalized` to False to display raw counts instead of percentages.
normalized = True

# divide every row by its total so that each row sums to 1
row_totals = conf_matrix.sum(dim=1, keepdim=True)
conf_matrix_normalized = conf_matrix / row_totals

if normalized:
    matrix_to_plot, text_format = conf_matrix_normalized, ".2%"
else:
    matrix_to_plot, text_format = conf_matrix, True

class_names = train_data.classes
fig = px.imshow(
    matrix_to_plot,
    text_auto=text_format,
    labels=dict(x="<b>Prediction</b>", y="<b>Ground Truth</b>"),
    x=class_names,
    y=class_names,
    title="<b>Confusion matrix</b>",
    width=700,
)
fig.update_xaxes(side="top")
# optional: hide the colour scale with fig.update_coloraxes(showscale=False)
fig.show()

It makes sense to get a better accuracy on pneumonia: the dataset contains
many more pneumonia samples, so the model has really learned to recognise
pneumonia.