In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

MAX_LISTED_FILES = 10  # enough to confirm the folder layout without flooding the cell output


def iter_input_files(root='/kaggle/input'):
    """Yield the full path of every file below `root`, walking all sub-directories.

    Yields nothing when `root` does not exist.
    """
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            yield os.path.join(dirname, filename)


# The input directory holds one .jpg per X-ray view, so only the first few paths are printed
# instead of the whole tree.
for listed, file_path in enumerate(iter_input_files()):
    if listed == MAX_LISTED_FILES:
        print(f'... (listing capped at {MAX_LISTED_FILES} files)')
        break
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid.csv
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/train.csv
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64611/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64545/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64711/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64612/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64684/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64648/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64682/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64587/study1/view1_frontal.jpg
/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/valid/patient64558/study1/view2_lateral.jpg
/

In [None]:
from fastai.vision.all import *
from pathlib import Path
import matplotlib.ticker as ticker
import seaborn as sns
from fastai.vision.data import *

# Load data

In [None]:
path = Path('/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/train')
# `ls` is a method that fastai adds to Path; it has to be *called* to list the directory.
# A bare `path.ls` only displays the bound method object.
path.ls()

# Inspect the data

**Exploring the label distribution**

*Used the same approach as in the lecture where we used the PlantDoc data set*

In [None]:
# Location of the training label table
train_path = Path(
    '/kaggle/input/chexpert-v10-small/CheXpert-v1.0-small/train.csv'
)

# Parse the table into a DataFrame
train_df = pd.read_csv(filepath_or_buffer=train_path)

In [None]:
# Label distribution, following the approach used in the lecture's PlantDoc notebook.
observation = ['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
               'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
               'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
               'Pleural Other', 'Fracture', 'Support Devices']

# For every observation, count the rows whose label is exactly 1.0 (positive).
# Rows with another value or an empty cell are not counted.
# Testing whether the *column name* occurs in train_df.columns (the earlier approach) gives 1
# for every observation, which is why that plot carried no information.
class_counts = {obs: int((train_df[obs] == 1.0).sum()) for obs in observation}

# Create the figure and its single axes
fig, ax = plt.subplots()

# One bar per observation; only the training table is loaded in this notebook
train_heights = list(class_counts.values())
ax.bar(list(class_counts.keys()), train_heights, label='TRAIN')

# Rotate the x-axis labels so the long observation names stay readable
plt.xticks(rotation=90)

# Set the x-axis limits to match the range of the x-axis labels
ax.set_xlim([-0.5, len(class_counts) - 0.5])

# Axis labels and title describe what is actually counted
ax.set_xlabel('Observations')
ax.set_ylabel('Number of images labelled positive')
ax.set_title('Positive labels per observation in the training set')

# Adding a legend
ax.legend()

# Display the plot
plt.show()

# Cleaning of the data

In [None]:
# Show the training table as loaded from train.csv, before any cleaning is applied
train_df

In [None]:
# Inspect the dtype and non-null count of every column; ideally all columns would have
# the same number of filled-in entries.
train_df.info()

In [None]:
# The table has gaps and -1.0 entries that must be resolved before training:
#  * -1.0 is not a typo: per the CheXpert label definition it marks an *uncertain* finding.
#    It is mapped to 1.0 here, i.e. uncertain findings are treated as positive.
#  * Empty cells (NaN) are mapped to 0.0, i.e. the observation is treated as not present.
# Both calls return a new frame, so the name is rebound instead of mutating with inplace=True.
train_df = train_df.replace(-1.0, 1.0).fillna(0.0)
train_df.head()

In [None]:
# Only frontal views are kept; images taken from the side (lateral) are left out.
is_frontal = train_df['Frontal/Lateral'] == 'Frontal'

# 'Age' and 'Sex' are assumed to have little to do with the model's ability to read the
# images, and 'AP/PA' is not used anywhere further down, so all three columns are removed.
train_df = train_df.loc[is_frontal].drop(columns=['Age', 'Sex', 'AP/PA'])

train_df.head()

In [None]:
# Cast every observation column from float to int in a single step (the float labels made the
# earlier diagrams harder to read). `observation` is the list of the 14 label column names
# defined in the label-distribution cell.
train_df = train_df.astype({obs: int for obs in observation})

train_df.head()

In [None]:
# Re-check dtypes and non-null counts after the cleaning and the int conversion above
train_df.info()

In [None]:
# TODO(author): the first version of this cell was adapted from an external source (the
# original comment broke off after "taken from") - add the citation.

# Directory that the CSV 'Path' values are joined onto.
# NOTE(review): assumes the 'Path' entries are relative to the dataset root, i.e. look like
# 'CheXpert-v1.0-small/train/patientXXXXX/studyN/viewN_frontal.jpg' - confirm with
# train_df['Path'].iloc[0].
DATA_ROOT = Path('/kaggle/input/chexpert-v10-small')


def get_data(df, size=224, batch_size=64, label_cols=None):
    """Build fastai DataLoaders for multi-label chest X-ray classification.

    Every row of `df` is one sample: the image is read from the 'Path' column and the
    target is the multi-hot vector made of the row's 0/1 values in `label_cols`.

    Args:
        df: cleaned label table with a 'Path' column and one 0/1 integer column per label.
        size: side length in pixels used for the resize and for the random crop.
        batch_size: number of images per batch.
        label_cols: names of the label columns; defaults to the notebook-level
            `observation` list.

    Returns:
        DataLoaders with a seeded random 80/20 train/validation split.
    """
    label_cols = list(observation if label_cols is None else label_cols)
    data_block = DataBlock(
        # The labels are already stored as one 0/1 column per finding and an image can show
        # several findings, so the target is an already-encoded multi-hot vector rather than
        # a single category.
        blocks=(ImageBlock, MultiCategoryBlock(encoded=True, vocab=label_cols)),
        # Rows of `df` are the items: x is the image file, y the label columns of that row.
        get_x=ColReader('Path', pref=DATA_ROOT),
        get_y=ColReader(label_cols),
        splitter=RandomSplitter(valid_pct=0.2, seed=42),
        item_tfms=Resize(size),
        # Batch transforms receive tensors, so the GPU variant of the random crop is used.
        batch_tfms=RandomResizedCropGPU(size, min_scale=0.7),
    )
    # The batch size must be given as `bs=`; the positional slots after the source are
    # `path` and `verbose`.
    return data_block.dataloaders(df, bs=batch_size)


dls = get_data(train_df, size=224, batch_size=64)

In [None]:
# Plot the first batch to see what the model will be trained on.
# `dls` is already built by get_data(); `data_block` only exists inside that function and no
# `df` is defined at notebook level, so rebuilding the loaders here raised a NameError.
dls.show_batch(max_n=16)

In [None]:
# Show a batch again, this time with show_batch's default number of images
dls.show_batch()

# Training a model

In [None]:
# train_df carries 14 independent 0/1 observation columns, so this is a multi-label problem.
# `accuracy` takes the argmax over classes and only suits single-label targets, whereas
# `accuracy_multi` thresholds each label's sigmoid output separately.
learn = vision_learner(dls, resnet34, metrics=accuracy_multi)


In [None]:
# Run the learning-rate finder and keep the suggestion of each of the four heuristics
# (available afterwards as lr_sugg.minimum, .steep, .valley and .slide)
lr_sugg = learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])

In [None]:
# Fine-tune the pretrained network: 6 is the epoch count passed to fine_tune, and the
# learning rate is the one proposed by the `slide` heuristic of the finder
learn.fine_tune(6, lr_sugg.slide)

In [None]:
# `interp` was never created in this notebook, so the plot call raised a NameError.
# The targets are 14 independent 0/1 observations (an image can show several findings at
# once), which a single class-vs-class confusion matrix cannot describe. One 2x2 matrix
# (actual vs. predicted) is therefore drawn per observation.
probs, targs = learn.get_preds()   # validation-set probabilities and multi-hot targets
preds = (probs > 0.5).long()       # 0.5 decision threshold for every label
targs = targs.long()

labels = list(dls.vocab)
n_cols = 4
n_rows = -(-len(labels) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 12))
axes = axes.ravel()

for idx, label in enumerate(labels):
    actual, predicted = targs[:, idx], preds[:, idx]
    # Rows: actual 0/1, columns: predicted 0/1
    cm = torch.stack([((actual == a) & (predicted == p)).sum()
                      for a in (0, 1) for p in (0, 1)]).reshape(2, 2)
    sns.heatmap(cm.numpy(), annot=True, fmt='d', cbar=False, cmap='Blues', ax=axes[idx])
    axes[idx].set(title=label, xlabel='Predicted', ylabel='Actual')

# Hide the unused panels of the grid
for ax in axes[len(labels):]:
    ax.axis('off')

fig.tight_layout()
plt.show()

# Credits
- Approach for label distribution:
https://github.com/HVL-ML/DAT255/blob/main/nbs/DAT255-1.1-asl-plantdoc.ipynb
