# Chest X Ray Pulmonary Diagnoses using DenseNet for Indian data
## 4A Semester project, Mahindra University
### Dipyaman Roy, Suchir R Punuru

---

## Imports

In [None]:
# Import necessary packages
import keras
from keras import backend as K
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import cv2
import sklearn
import shap
import os
import seaborn as sns
import time
import pickle

# Apply seaborn's default plotting theme to the figures drawn in this notebook.
sns.set()

# This sets a common size for all the figures we will draw.
# matplotlib interprets figure.figsize as [width, height] in inches.
plt.rcParams['figure.figsize'] = [10, 7]

## Data Exploration
We start by getting familiar with the chest X-ray images taken from the public [ChestX-ray8 dataset](https://arxiv.org/abs/1705.02315). The following cells use the images as classified with the methods outlined in [Holste et al. (2023)](https://arxiv.org/abs/2308.09180).

In [None]:
# Load the label tables for the train, validation and test splits (one CSV each).
train_df, valid_df, test_df = map(
    pd.read_csv,
    (
        "data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_train.csv",
        "data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_val.csv",
        "data/CXR8/PruneCXR/miccai2023_nih-cxr-lt_labels_test.csv",
    ),
)

# Summarise the size of the training table, then preview its first rows.
print(f'There are {train_df.shape[0]} rows and {train_df.shape[1]} columns in the train data frame')
train_df.head()

In [None]:
# Report the size of each split as the number of non-null `subj_id` entries.
# For the train split also report how many distinct subject ids there are
# (value_counts() yields one row per distinct id); a distinct count below the
# total means at least one subject contributes more than one row.
print(f"Train set: The total subject ids are {train_df['subj_id'].count()}, from those the unique ids are {train_df['subj_id'].value_counts().shape[0]} ")
print(f"Validation set: The total subject ids are {valid_df['subj_id'].count()}")
print(f"Test set: The total subject ids are {test_df['subj_id'].count()}")

### Preventing Data Leakage
It is worth noting that our dataset contains multiple images for each patient. This could be the case, for example, when a patient has taken multiple X-ray images at different times during their hospital visits. In our data splitting, we have ensured that the split is done on the patient level so that there is no data "leakage" between the train, validation, and test datasets.

def check_for_leakage(df1, df2, subj_col):
    """
    Return True if any subject appears in both df1 and df2.

    Args:
        df1 (dataframe): dataframe describing first dataset
        df2 (dataframe): dataframe describing second dataset
        subj_col (str): string name of column with subject IDs

    Returns:
        leakage (bool): True if there is leakage, otherwise False
    """
    # Index with the caller-supplied column name. The previous body used the
    # undefined name `subj_id`, which raised NameError (or silently picked up
    # an unrelated notebook global of that name).
    df1_subjects_unique = set(df1[subj_col])
    df2_subjects_unique = set(df2[subj_col])

    # Leakage exists as soon as the two subject sets share at least one ID;
    # the overlapping IDs themselves are not needed.
    leakage = not df1_subjects_unique.isdisjoint(df2_subjects_unique)

    return leakage