In [3]:
import pandas as pd
import glob
from matplotlib import pyplot as plt
import os
import cv2
import seaborn as sns
import tqdm
import numpy as np

In [4]:
image_path_list = glob.glob('JustRAIGS/images/*.*')
len(image_path_list)

101423

In [5]:
df = pd.read_csv('JustRAIGS_Train_labels.csv', sep=';')
len(df)

101423

# Generating Unique Patient IDs 

In [6]:
exception_count = 0
df['generated_patient_id'] = -1
for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    fellow_eye_id = row['Fellow Eye ID']
    try:
        if df[df['Eye ID'] == fellow_eye_id]['generated_patient_id'].values[0] == -1:
            df.loc[idx, 'generated_patient_id'] = idx
            # print('idx', idx)
            df.loc[df['Eye ID'] == fellow_eye_id, 'generated_patient_id'] = idx
    except Exception as e:
        exception_count += 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101423/101423 [23:27<00:00, 72.04it/s]


In [8]:
next_id = df['generated_patient_id'].max()+1
for idx, row in tqdm.tqdm(df[df['generated_patient_id'] == -1].iterrows()):
    df.loc[idx, 'generated_patient_id'] = next_id
    next_id = df['generated_patient_id'].max()+1

9101it [00:04, 2058.65it/s]


In [9]:
import numpy as np
np.unique(list(df['generated_patient_id']), return_counts=True)[1].mean()

1.835311787485071

In [13]:
df['filename'] = [f.split(os.sep)[-1] for f in image_path_list]

In [14]:
df['path'] = [os.path.join('JustRAIGS/images/', f.split(os.sep)[-1]) for f in image_path_list]

In [15]:
assert len(df[~df['generated_patient_id'].isna()]) == len(image_path_list)
assert len(image_path_list) == len(df)

In [53]:
# Save prepared df
df.to_csv('JustRAIGS_Train_labels_PREPARED.csv', index=False)

# Soft Label Transformation

In [16]:
df = pd.read_csv('JustRAIGS_Train_labels_PREPARED.csv')

# Transform GT labels

In [17]:
def get_label_consensus(g1, g2, g3):
    if g1 == g2:
        consensus = g1
    else:
        consensus = g3

    if consensus == 'U':
        consensus = g3
    return consensus

## Final Label, STR -> INT mapping
'NRG': 0<br>
'RG' : 1

In [35]:
ref_target_mapping =  {'NRG': 0, 'RG' : 1, }
df['Final Label'] = df['Final Label'].replace(ref_target_mapping)

## Final Label, smooth mapping
smooth label = (positive assesments) / (number of assesments)

In [19]:
def get_smooth_label_consensus(g1, g2, g3):
    RG_count = 0
    valid_label_count = 0
    
    for v in [g1, g2, g3]:
        if v == 'RG':
            RG_count += 1

        if v == 'RG' or v == 'NRG':
            valid_label_count += 1
    
    consensus = RG_count / valid_label_count
    return consensus
    
header_g1 = 'Label G1'
header_g2 = 'Label G2'
header_g3 = 'Label G3'

df['Smooth Final Label'] = df.apply(lambda x: get_smooth_label_consensus(x[header_g1], x[header_g2], x[header_g3]), axis=1)

In [20]:
set(df['Smooth Final Label'])

{0.0, 0.3333333333333333, 0.5, 0.6666666666666666, 1.0}

In [21]:
df.value_counts('Smooth Final Label')

Smooth Final Label
0.000000    94753
0.333333     3220
1.000000     2663
0.666667      554
0.500000      233
dtype: int64

## Justification Labels, eval mapping

In [22]:
def hamming_loss(true_labels, predicted_labels):
    """Calculate the Hamming loss for the given true and predicted labels."""
    # Convert to numpy arrays for efficient computation
    true_labels = np.array(true_labels)
    predicted_labels = np.array(predicted_labels)

    # Calculate the hamming distance that is basically the total number of mismatches
    Hamming_distance = np.sum(np.not_equal(true_labels, predicted_labels))
    print("Hamming distance", Hamming_distance)
    
    # Calculate the total number of labels
    total_corrected_labels= true_labels.size

    # Compute the Modified Hamming loss
    loss = Hamming_distance / total_corrected_labels
    return loss

In [23]:
G1_label_columns = ['G1 ANRS', 'G1 ANRI', 'G1 RNFLDS', 'G1 RNFLDI', 'G1 BCLVS', 'G1 BCLVI', 'G1 NVT', 'G1 DH', 'G1 LD', 'G1 LC']
G2_label_columns = ['G2 ANRS', 'G2 ANRI', 'G2 RNFLDS', 'G2 RNFLDI', 'G2 BCLVS', 'G2 BCLVI', 'G2 NVT', 'G2 DH', 'G2 LD', 'G2 LC']
G3_label_columns = ['G3 ANRS', 'G3 ANRI', 'G3 RNFLDS', 'G3 RNFLDS', 'G3 BCLVS', 'G3 BCLVI', 'G3 NVT', 'G3 DH', 'G3 LD', 'G3 LC']

# Just for checking on training dataset
#---------------------------------------
pred_labels=[1,1,0,0,0,0,0,0,1,1]

#pred_labels=[1,0,1,0,1,0,1,0,0,1]
#----------------------------------------


# If grader 3 labels are prsent evalution check
# row = df.loc[100]

# for grader 1 and 2, there is no grader 3
# row = df.loc[34]

def get_justification_labels_for_eval(row):
    G1_labels = row[G1_label_columns].values.tolist()
    G2_labels = row[G2_label_columns].values.tolist()
    G3_labels = row[G3_label_columns].values.tolist()
    
    if (row['Final Label'] == 'RG') or (row['Final Label'] == 1):
        if row['Label G3'] == 'RG':
            true_justification_labels = G3_labels
        else:
            # G1(j) != G2(j) -> 0

            # find features which have disaggrement
            disagreed_features = np.not_equal(G1_labels, G2_labels)
            
            # Select specific columns where disagreed_features is True
            true_justification_labels = np.array(G2_labels)
            true_justification_labels[disagreed_features] = 0

    else:
        return np.zeros(len(G1_label_columns))

    return true_justification_labels


def get_justification_labels_for_training_smooth(row, justification_for_positive_only=True):
    G1_labels = row[G1_label_columns].values.tolist()
    G2_labels = row[G2_label_columns].values.tolist()
    G3_labels = row[G3_label_columns].values.tolist()
    
    if (not justification_for_positive_only) or ((row['Final Label'] == 'RG') or (row['Final Label'] == 1)):
        true_justification_labels = np.nanmean(np.array([G1_labels, G2_labels, G3_labels]), 0)

    else:
        true_justification_labels = np.zeros(len(G1_label_columns))

    return true_justification_labels

In [24]:
new_column_data = df.apply(lambda x: 
                get_justification_labels_for_eval(
                    x, 
                ), axis=1, result_type='expand',
               ).rename(
    columns={0:'Eval ANRS', 
             1:'Eval ANRI', 
             2:'Eval RNFLDS', 
             3:'Eval RNFLDI', 
             4:'Eval BCLVS',
             5:'Eval BCLVI',
             6:'Eval NVT',
             7:'Eval DH',
             8:'Eval LD',
             9:'Eval LC'})
for col in new_column_data:
    print(col)
    df[col] = new_column_data[col]


new_column_data = df.apply(lambda x: 
                get_justification_labels_for_training_smooth(
                    x, 
                    justification_for_positive_only=True
                ), axis=1, result_type='expand'
               ).rename(
    columns={0:'Smooth Only RG ANRS', 
             1:'Smooth Only RG ANRI', 
             2:'Smooth Only RG RNFLDS', 
             3:'Smooth Only RG RNFLDI', 
             4:'Smooth Only RG BCLVS',
             5:'Smooth Only RG BCLVI',
             6:'Smooth Only RG NVT',
             7:'Smooth Only RG DH',
             8:'Smooth Only RG LD',
             9:'Smooth Only RG LC'})
for col in new_column_data:
    print(col)
    df[col] = new_column_data[col]

new_column_data = df.apply(lambda x: 
                get_justification_labels_for_training_smooth(
                    x, 
                    justification_for_positive_only=False
                ), axis=1, result_type='expand'
               ).rename(
    columns={0:'Smooth Include NRG ANRS', 
             1:'Smooth Include NRG ANRI', 
             2:'Smooth Include NRG RNFLDS', 
             3:'Smooth Include NRG RNFLDI', 
             4:'Smooth Include NRG BCLVS',
             5:'Smooth Include NRG BCLVI',
             6:'Smooth Include NRG NVT',
             7:'Smooth Include NRG DH',
             8:'Smooth Include NRG LD',
             9:'Smooth Include NRG LC'})
for col in new_column_data:
    print(col)
    df[col] = new_column_data[col]

Eval ANRS
Eval ANRI
Eval RNFLDS
Eval RNFLDI
Eval BCLVS
Eval BCLVI
Eval NVT
Eval DH
Eval LD
Eval LC
Smooth Only RG ANRS
Smooth Only RG ANRI
Smooth Only RG RNFLDS
Smooth Only RG RNFLDI
Smooth Only RG BCLVS
Smooth Only RG BCLVI
Smooth Only RG NVT
Smooth Only RG DH
Smooth Only RG LD
Smooth Only RG LC
Smooth Include NRG ANRS
Smooth Include NRG ANRI
Smooth Include NRG RNFLDS
Smooth Include NRG RNFLDI
Smooth Include NRG BCLVS
Smooth Include NRG BCLVI
Smooth Include NRG NVT
Smooth Include NRG DH
Smooth Include NRG LD
Smooth Include NRG LC


In [25]:
for cols in zip(
    ['Eval ANRS', 'Eval ANRI', 'Eval RNFLDS', 'Eval RNFLDI', 'Eval BCLVS',
       'Eval BCLVI', 'Eval NVT', 'Eval DH', 'Eval LD', 'Eval LC',
       ],
    ['Smooth Only RG ANRS', 'Smooth Only RG ANRI', 'Smooth Only RG RNFLDS',
       'Smooth Only RG RNFLDI', 'Smooth Only RG BCLVS', 'Smooth Only RG BCLVI',
       'Smooth Only RG NVT', 'Smooth Only RG DH', 'Smooth Only RG LD',
       'Smooth Only RG LC', ],
    ['Smooth Include NRG ANRS',
       'Smooth Include NRG ANRI', 'Smooth Include NRG RNFLDS',
       'Smooth Include NRG RNFLDI', 'Smooth Include NRG BCLVS',
       'Smooth Include NRG BCLVI', 'Smooth Include NRG NVT',
       'Smooth Include NRG DH', 'Smooth Include NRG LD',
       'Smooth Include NRG LC'],
):
    print(cols)
    df.loc[(df[cols[0]] == 1) & (df[cols[1]] < 1), cols[1]] = df[cols[0]]
    df.loc[(df[cols[0]] == 1) & (df[cols[2]] < 1), cols[2]] = df[cols[0]]

('Eval ANRS', 'Smooth Only RG ANRS', 'Smooth Include NRG ANRS')
('Eval ANRI', 'Smooth Only RG ANRI', 'Smooth Include NRG ANRI')
('Eval RNFLDS', 'Smooth Only RG RNFLDS', 'Smooth Include NRG RNFLDS')
('Eval RNFLDI', 'Smooth Only RG RNFLDI', 'Smooth Include NRG RNFLDI')
('Eval BCLVS', 'Smooth Only RG BCLVS', 'Smooth Include NRG BCLVS')
('Eval BCLVI', 'Smooth Only RG BCLVI', 'Smooth Include NRG BCLVI')
('Eval NVT', 'Smooth Only RG NVT', 'Smooth Include NRG NVT')
('Eval DH', 'Smooth Only RG DH', 'Smooth Include NRG DH')
('Eval LD', 'Smooth Only RG LD', 'Smooth Include NRG LD')
('Eval LC', 'Smooth Only RG LC', 'Smooth Include NRG LC')


In [None]:
# Save prepared df
df.to_csv('JustRAIGS_Train_labels_PREPARED.csv', index=False)

## Label Smoothing Result

In [34]:
df[[
    'Final Label', 
    'Label G1', 'Label G2','Label G3', 
    'G1 ANRS', 'G2 ANRS', 'G3 ANRS', 
    'Eval ANRS', 
    'Smooth Only RG ANRS',
    'Smooth Include NRG ANRS', 
]].drop_duplicates(
    [
        'G1 ANRS', 'G2 ANRS', 'G3 ANRS',
        'Eval ANRS', 
        'Smooth Only RG ANRS',
        'Smooth Include NRG ANRS', 
    ]).sort_values(
    by=[
        'Final Label', 
        'Label G1', 'Label G2','Label G3', 
        'G1 ANRS', 'G2 ANRS', 'G3 ANRS',
    ])

Unnamed: 0,Final Label,Label G1,Label G2,Label G3,G1 ANRS,G2 ANRS,G3 ANRS,Eval ANRS,Smooth Only RG ANRS,Smooth Include NRG ANRS
0,0,NRG,NRG,,0.0,0.0,,0.0,0.0,0.0
137,0,NRG,RG,NRG,0.0,1.0,0.0,0.0,0.0,0.333333
65,0,NRG,U,NRG,0.0,0.0,0.0,0.0,0.0,0.0
33,0,NRG,,NRG,0.0,,0.0,0.0,0.0,0.0
11,0,RG,NRG,NRG,1.0,0.0,0.0,0.0,0.0,0.333333
1285,0,RG,,NRG,1.0,,0.0,0.0,0.0,0.5
660,1,NRG,RG,RG,0.0,0.0,1.0,1.0,1.0,1.0
7119,1,NRG,,RG,0.0,,1.0,1.0,1.0,1.0
373,1,RG,NRG,RG,1.0,0.0,0.0,0.0,0.333333,0.333333
68,1,RG,NRG,RG,1.0,0.0,1.0,1.0,1.0,1.0
