In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path

root_dir = '../input/a-covid-multiclass-dataset-of-ct-scans/New_Data_CoV2'
root_path = Path(root_dir)
covid_path = root_path / 'Covid'
healthy_path = root_path / 'Healthy'
other_path = root_path / 'Other'

images = list(root_path.rglob('*.png'))
patients = [p.parts[-2] for p in images]
target = [p.parts[-3] for p in images]

df = pd.DataFrame(np.array([images, patients, target]).T, columns=['Image path', 'Patient', 'target'])
df

Unnamed: 0,Image path,Patient,target
0,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (51),Covid
1,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (51),Covid
2,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (51),Covid
3,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (51),Covid
4,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (51),Covid
...,...,...,...
4166,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (14),Healthy
4167,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (14),Healthy
4168,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (14),Healthy
4169,../input/a-covid-multiclass-dataset-of-ct-scan...,Patient (14),Healthy


In [2]:
df.groupby('target').size()

target
Covid      2167
Healthy     757
Others     1247
dtype: int64

In [3]:
oversample = False
undersample = True
if oversample:
    healthy_rows_idxs = df[df['target'] == 'Healthy'].index.values.tolist()
    non_healthy_rows_idxs = df.index.values.tolist() 
    df = df.iloc[healthy_rows_idxs*2 + non_healthy_rows_idxs]
    
if undersample:
    non_healthy = df[df['target'] == 'Covid'].index.values
    non_healthy = np.random.choice(non_healthy, 750).tolist()
    
    non_covid = df[df['target'] != 'Covid'].index.values.tolist()
    
    df = df.iloc[non_covid + non_healthy]

In [4]:
df.groupby('target').size()

target
Covid       750
Healthy     757
Others     1247
dtype: int64

In [5]:
d = []
for n, ndf in  df.groupby(['target']):
    pnames = ndf.Patient.unique()
    d.append(pnames)
    
    
for i, dd in enumerate(d):
    print(f'd[{i}].shape = {dd.shape}')

np.intersect1d(d[0], d[1]).shape, np.intersect1d(d[0], d[2]).shape, np.intersect1d(d[2], d[1]).shape

d[0].shape = (80,)
d[1].shape = (50,)
d[2].shape = (80,)


((50,), (80,), (50,))

In [6]:
from sklearn.model_selection import train_test_split
train_patients, testval_patients = train_test_split(
    df.Patient.unique(),
    train_size = 0.8
)

test_patients, val_patients = train_test_split(
    testval_patients,
    train_size = 0.5
)

train_df = df[df['Patient'].isin(train_patients)]
test_df = df[df['Patient'].isin(test_patients)]
val_df = df[df['Patient'].isin(val_patients)]

In [7]:
train_patients.shape, val_patients.shape, test_patients.shape, train_df.shape,val_df.shape, test_df.shape

((64,), (8,), (8,), (2215, 3), (269, 3), (270, 3))

In [8]:
def infer(loader):
    gts = []
    predictions = []

    model.eval()
    with torch.no_grad():
        for images, labels in loader:
            out = model(images.to(device))
            gts.extend(labels.tolist())
            predictions.extend(out.argmax(1).tolist())

    from sklearn import metrics
    cm = metrics.confusion_matrix(gts, predictions, normalize='true')
    return gts, predictions, np.unique(gts, return_counts=True), cm


In [None]:
import torch
import torch.nn as nn
import matplotlib.image as mpimg

model = nn.Sequential(
    nn.Conv2d(3, 4, 3),
    nn.BatchNorm2d(4),
    nn.Conv2d(4, 16, 3),
    nn.BatchNorm2d(16),
    nn.ReLU(),
    nn.MaxPool2d(2),
    
    nn.Conv2d(16, 32, 3),
    nn.BatchNorm2d(32),
    nn.Conv2d(32, 32, 3),
    nn.BatchNorm2d(32),
    nn.ReLU(),
    nn.MaxPool2d(2),
    
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(32, 16),
    nn.BatchNorm1d(16),
    nn.ReLU(),
    nn.Linear(16, 3)
)

from torch.utils.data import Dataset, DataLoader

class_dict = {'Healthy': 0, 'Covid': 1, 'Others': 2}

class dataset(Dataset):
    def __init__(self, df, class_dict=class_dict):
        super().__init__()
        self.df = df
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx, :]
        image_path = row['Image path']
        target = class_dict[row['target']]

        image = mpimg.imread(image_path)[:,:,:3]
        image = image / 255.0 if image.max() > 1.0 else image
        return torch.Tensor(image).reshape(3, *image.shape[:2]), torch.Tensor([target])
    
def collate_fn(batch):
    dims = np.array([tuple(x[0].shape) for x in batch])
    max_dims = dims.max(0)

    out = torch.zeros(len(batch), *max_dims)
    labels = torch.zeros(len(batch))
    for i, (image, label) in enumerate(batch):
        out[i, :, :image.shape[1], :image.shape[2]] = image
        labels[i] = label
        
    return out, labels.long()

In [10]:
batch_size = 16
num_workers = 0
epochs = 10
lr = 1e-4
device = 'cuda'
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

train_set = dataset(train_df)
train_loader = DataLoader(train_set, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)

val_set = dataset(val_df)
val_loader = DataLoader(val_set, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)

test_set = dataset(test_df)
test_loader = DataLoader(test_set, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)

losses = []

for epoch in range(epochs):
    epoch_train_loss = 0
    model.train()
    for images, labels in train_loader:
        out = model(images.to(device))
        loss = criterion(out, labels.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_train_loss += loss.item() / len(train_loader)
                
    epoch_validation_loss = 0
    model.eval()
    with torch.no_grad():
        for images, labels in val_loader:
            out = model(images.to(device))
            loss = criterion(out, labels.to(device))

            epoch_validation_loss += loss.item() / len(val_loader)
    out = infer(val_loader)
    print(f'Epoch {epoch}:\t train loss {epoch_train_loss:0.4e}\t val loss {epoch_validation_loss:0.4e}')
    print(out[-1])
    losses.append([epoch_train_loss, epoch_validation_loss])

Epoch 0:	 train loss 1.1227e+00	 val loss 1.1175e+00
[[0.14084507 0.71830986 0.14084507]
 [0.13924051 0.59493671 0.26582278]
 [0.17647059 0.67226891 0.1512605 ]]
Epoch 1:	 train loss 1.1095e+00	 val loss 1.1078e+00
[[0.12676056 0.54929577 0.32394366]
 [0.10126582 0.48101266 0.41772152]
 [0.10084034 0.58823529 0.31092437]]
Epoch 2:	 train loss 1.1031e+00	 val loss 1.1064e+00
[[0.09859155 0.46478873 0.43661972]
 [0.12658228 0.39240506 0.48101266]
 [0.05882353 0.55462185 0.38655462]]
Epoch 3:	 train loss 1.0979e+00	 val loss 1.1041e+00
[[0.08450704 0.43661972 0.47887324]
 [0.07594937 0.36708861 0.55696203]
 [0.06722689 0.5210084  0.41176471]]
Epoch 4:	 train loss 1.0944e+00	 val loss 1.1054e+00
[[0.09859155 0.49295775 0.4084507 ]
 [0.08860759 0.29113924 0.62025316]
 [0.10084034 0.48739496 0.41176471]]
Epoch 5:	 train loss 1.0914e+00	 val loss 1.0999e+00
[[0.09859155 0.3943662  0.50704225]
 [0.05063291 0.25316456 0.69620253]
 [0.05882353 0.38655462 0.55462185]]
Epoch 6:	 train loss 1.0885e

In [11]:
len(train_set)

2215

In [12]:
def infer(loader):
    gts = []
    predictions = []

    model.eval()
    with torch.no_grad():
        for images, labels in loader:
            out = model(images.to(device))
            gts.extend(labels.tolist())
            predictions.extend(out.argmax(1).tolist())

    from sklearn import metrics
    cm = metrics.confusion_matrix(gts, predictions, normalize='true')
    return gts, predictions, np.unique(gts, return_counts=True), cm

out = infer(train_loader)
out[-1]

array([[0.03606557, 0.21639344, 0.74754098],
       [0.0762987 , 0.14448052, 0.77922078],
       [0.03336704, 0.19716886, 0.76946411]])

In [13]:
out = infer(val_loader)
out[-1]

array([[0.07042254, 0.29577465, 0.63380282],
       [0.03797468, 0.11392405, 0.84810127],
       [0.05042017, 0.18487395, 0.76470588]])

In [14]:
out = infer(test_loader)
out[-1]

array([[0.03947368, 0.14473684, 0.81578947],
       [0.09090909, 0.2       , 0.70909091],
       [0.10071942, 0.17266187, 0.72661871]])

In [15]:
loader = test_loader
gts = []
predictions = []

model.eval()
with torch.no_grad():
    for images, labels in loader:
        out = model(images.to(device))
        loss = criterion(out, labels.to(device))

        gts.extend(labels.tolist())
        predictions.extend(out.argmax(1).tolist())
        
from sklearn import metrics
print(metrics.confusion_matrix(gts, predictions, normalize='true'))
np.unique(gts, return_counts=True)

[[0.03947368 0.14473684 0.81578947]
 [0.09090909 0.2        0.70909091]
 [0.10071942 0.17266187 0.72661871]]


(array([0, 1, 2]), array([ 76,  55, 139]))

In [16]:
len(train_set), len(val_set),len(test_set)

(2215, 269, 270)

In [17]:
len(train_df), len(val_df),len(test_df)

(2215, 269, 270)

In [18]:
train_set.df.shape, val_set.df.shape,test_set.df.shape

((2215, 3), (269, 3), (270, 3))

In [19]:
target

['Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',
 'Covid',


In [20]:
# Calculate accuracy for all datasets
from sklearn import metrics

# Training set accuracy
train_gts, train_preds, _, _ = infer(train_loader)
train_accuracy = metrics.accuracy_score(train_gts, train_preds)

# Validation set accuracy
val_gts, val_preds, _, _ = infer(val_loader)
val_accuracy = metrics.accuracy_score(val_gts, val_preds)

# Test set accuracy
test_gts, test_preds, _, _ = infer(test_loader)
test_accuracy = metrics.accuracy_score(test_gts, test_preds)

print("=" * 50)
print("CLASSIFIER ACCURACY SUMMARY")
print("=" * 50)
print(f"Training Set Accuracy:   {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Validation Set Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")
print(f"Test Set Accuracy:       {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print("=" * 50)

CLASSIFIER ACCURACY SUMMARY
Training Set Accuracy:   0.3937 (39.37%)
Validation Set Accuracy: 0.3903 (39.03%)
Test Set Accuracy:       0.4259 (42.59%)
