In [None]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ast import literal_eval
from scipy.stats import ks_2samp
import itertools
from scipy.stats import anderson_ksamp, kruskal
from statsmodels.stats.multitest import multipletests

In [24]:
# Location of the CSV file that is read into `df` below.
file_path = '../densenet_test_embeddings.csv'
# Read the whole file into a DataFrame; the cleaning cell that follows starts from `df`.
df = pd.read_csv(file_path)


Data Cleaning

In [25]:
# Build the working frame `test` from `df` without mutating `df`.
#
# The previous version aliased the raw frame (`test = df`) and overwrote its
# 'embeddings' column in place. Re-running the cell then handed already-parsed
# values back to `ast.literal_eval`, which only accepts strings/AST nodes and
# raises. `drop` and `assign` both return new DataFrames, so `df` keeps the raw
# text and this cell can be executed any number of times.
test = (
    df
    .drop(columns=['path_to_image', 'path_to_dcm'])
    # Parse each serialized embedding (stored as text in the CSV) into a Python object.
    .assign(embeddings=lambda frame: frame['embeddings'].apply(ast.literal_eval))
)

In [26]:
# Row count before filtering, used only for the report printed below.
initial_size = test.shape[0]

# Keep only the rows whose parsed 'embeddings' value is exactly a Python list;
# the PCA step later stacks these values with np.array(...tolist()).
is_list_embedding = test['embeddings'].apply(type) == list

# `.copy()` turns the filtered selection into an independent DataFrame, so the
# column assignments made on `test` in later cells do not raise pandas'
# SettingWithCopyWarning.
test = test[is_list_embedding].copy()

final_size = test.shape[0]

print(f'Number of test removed rows = {initial_size - final_size}')

Number of test removed rows = 51


In [27]:
# Cut-off applied to the 'age' column to split rows into two age groups.
a = 70
# 1 when age is at or above the cut-off, 0 otherwise.
test['age_categ'] = test['age'].ge(a).astype(int)

In [28]:
# Display the cleaned frame, including the new 'age_categ' column.
test

Unnamed: 0,age,sex,race,insurance_type,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,embeddings,age_categ
0,78.0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,1,"[0.0029132624622434378, 0.1020001769065857, 0....",1
1,63.0,0,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[0.0014348188415169716, 0.0543656125664711, 0....",0
2,70.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"[0.001982336398214102, 0.040021587163209915, 0...",1
3,79.0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,"[0.001741771469824016, 0.0560498870909214, 0.1...",1
4,67.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,"[9.678312198957428e-05, 0.12247737497091293, 0...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40353,35.0,1,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,"[0.006717992480844259, 0.026951316744089127, 0...",0
40354,81.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,"[0.0, 0.06270646303892136, 0.24802044034004211...",1
40355,52.0,1,0,2,0,0,0,0,0,0,0,0,1,0,1,0,0,1,"[0.0, 0.024455098435282707, 0.1902244687080383...",0
40356,47.0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,"[0.0, 0.009000508114695549, 0.0751149877905845...",0


Kolmogorov-Smirnov Test

In [36]:
# Number of principal components to keep. It drives both the PCA model and the
# PCA_<k> column names, so the two can no longer drift apart.
N_PCA_COMPONENTS = 4
# Fixed seed for PCA. With scikit-learn's automatic solver selection, large
# inputs can be decomposed with a randomized SVD; without a seed the components
# (and every statistic computed from them below) may change between runs.
PCA_RANDOM_STATE = 0

# Step 1: Keep the rows flagged with at least one of the diseases of interest.
# `.copy()` makes `new_df` independent of `test`, so adding columns is safe.
diseases_of_interest = ['Cardiomegaly', 'Lung Opacity', 'Edema', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion']
new_df = test[test[diseases_of_interest].any(axis=1)].copy()

# Step 2: Stack the per-row embedding lists into one array and standardize it
# (assumes every embedding has the same length — TODO confirm).
embeddings = np.array(new_df['embeddings'].tolist())
scaler = StandardScaler()
embeddings_normalized = scaler.fit_transform(embeddings)

# Step 3: Apply PCA to the standardized embeddings
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE)
pca_result = pca.fit_transform(embeddings_normalized)

# Add one PCA_<k> column per component (1-based) to the dataframe
for i in range(pca_result.shape[1]):
    new_df[f'PCA_{i+1}'] = pca_result[:, i]


Sex

In [37]:
# Split the PCA-augmented frame by the two 'sex' codes.
# NOTE(review): the variable names assume 0 = male and 1 = female — confirm
# against the dataset's encoding.
group_male = new_df.loc[new_df['sex'] == 0]
group_female = new_df.loc[new_df['sex'] == 1]

# Principal-component columns compared between the two groups
pca_columns = ['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']

# Two-sample Kolmogorov-Smirnov test per component. ks_2samp's result unpacks
# to (statistic, p-value), which is paired with the two report labels.
ks_results = {
    pc: dict(zip(('KS Statistic', 'p-value'), ks_2samp(group_male[pc], group_female[pc])))
    for pc in pca_columns
}

# Display the results
ks_results


{'PCA_1': {'KS Statistic': 0.1074059710024966,
  'p-value': 3.603951537363955e-78},
 'PCA_2': {'KS Statistic': 0.16894257636112675,
  'p-value': 1.7357119833921794e-193},
 'PCA_3': {'KS Statistic': 0.07811850926789321,
  'p-value': 1.6541959868948176e-41},
 'PCA_4': {'KS Statistic': 0.17340901814870918,
  'p-value': 6.711258293494411e-204}}

Race

In [38]:
# Principal-component columns to compare
pca_columns = ['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']
# Distinct values found in the 'race' column
race_categories = new_df['race'].unique()

# Every unordered pair of race categories, built once and reused for each column
race_pairs = list(itertools.combinations(race_categories, 2))

# Layout: column -> {(column, "<a> vs <b>"): {'KS Statistic': ..., 'p-value': ...}}
ks_results_race = {}

for column in pca_columns:
    per_pair = {}

    for race_a, race_b in race_pairs:
        # Values of this component for each of the two race groups
        sample_a = new_df.loc[new_df['race'] == race_a, column]
        sample_b = new_df.loc[new_df['race'] == race_b, column]

        # Two-sample Kolmogorov-Smirnov test between the groups
        statistic, pvalue = ks_2samp(sample_a, sample_b)

        per_pair[column, f"{race_a} vs {race_b}"] = {'KS Statistic': statistic, 'p-value': pvalue}

    ks_results_race[column] = per_pair

# Display results
ks_results_race

{'PCA_1': {('PCA_1', '0 vs 1'): {'KS Statistic': 0.07548192343031945,
   'p-value': 6.941088698750058e-20},
  ('PCA_1', '0 vs 2'): {'KS Statistic': 0.04943049162583424,
   'p-value': 0.00010448734893743983},
  ('PCA_1', '1 vs 2'): {'KS Statistic': 0.0447984095646252,
   'p-value': 0.004911669193629346}},
 'PCA_2': {('PCA_2', '0 vs 1'): {'KS Statistic': 0.16202601019403207,
   'p-value': 1.8840222301002718e-90},
  ('PCA_2', '0 vs 2'): {'KS Statistic': 0.039887361980813774,
   'p-value': 0.0032443559990622834},
  ('PCA_2', '1 vs 2'): {'KS Statistic': 0.18235858893082924,
   'p-value': 1.055987780988705e-43}},
 'PCA_3': {('PCA_3', '0 vs 1'): {'KS Statistic': 0.07270673549912943,
   'p-value': 1.7657106057443392e-18},
  ('PCA_3', '0 vs 2'): {'KS Statistic': 0.020117603192403966,
   'p-value': 0.38500336923300915},
  ('PCA_3', '1 vs 2'): {'KS Statistic': 0.08600015920077281,
   'p-value': 5.047185001652804e-10}},
 'PCA_4': {('PCA_4', '0 vs 1'): {'KS Statistic': 0.19438729566519292,
   'p-va

Age

In [39]:
# Split rows by the binary 'age_categ' flag: 0 -> group_young, 1 -> group_old
group_young = new_df.loc[new_df['age_categ'] == 0]
group_old = new_df.loc[new_df['age_categ'] == 1]

# Principal-component columns to compare
pca_columns = ['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']

# NOTE: this rebinds `ks_results`, so the results of any earlier cell that used
# the same name are replaced.
ks_results = {}

# Two-sample Kolmogorov-Smirnov test per component between the two age groups
for pc_name in pca_columns:
    young_values, old_values = group_young[pc_name], group_old[pc_name]
    outcome = ks_2samp(young_values, old_values)
    # outcome[0] is the KS statistic, outcome[1] the p-value
    ks_results[pc_name] = {'KS Statistic': outcome[0], 'p-value': outcome[1]}

# Display the results
ks_results


{'PCA_1': {'KS Statistic': 0.13356417131991927,
  'p-value': 2.4511082506780252e-118},
 'PCA_2': {'KS Statistic': 0.03147198913622337,
  'p-value': 5.915427211757379e-07},
 'PCA_3': {'KS Statistic': 0.09125630647134664,
  'p-value': 2.3286575524836533e-55},
 'PCA_4': {'KS Statistic': 0.15392698688588674,
  'p-value': 3.126624002361944e-157}}

Insurance Type

In [40]:

# Principal-component columns to compare
pca_columns = ['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']
# Distinct values found in the 'insurance_type' column
insurance_categories = new_df['insurance_type'].unique()

# Layout: column -> {(column, "<a> vs <b>"): {'KS Statistic': ..., 'p-value': ...}}
ks_results_insurance = {}

for column in pca_columns:
    # The dict is new, so setdefault always creates the per-column mapping here
    comparisons = ks_results_insurance.setdefault(column, {})

    # Compare every unordered pair of insurance types
    for insurance_a, insurance_b in itertools.combinations(insurance_categories, 2):
        # Values of this component for each of the two insurance groups
        values_a = new_df.loc[new_df['insurance_type'] == insurance_a, column]
        values_b = new_df.loc[new_df['insurance_type'] == insurance_b, column]

        # Two-sample Kolmogorov-Smirnov test between the groups
        ks_stat, p_value = ks_2samp(values_a, values_b)

        comparisons[column, f"{insurance_a} vs {insurance_b}"] = {'KS Statistic': ks_stat, 'p-value': p_value}

# Display results
ks_results_insurance


{'PCA_1': {('PCA_1', '1 vs 2'): {'KS Statistic': 0.10781983564835101,
   'p-value': 1.0668023103607894e-59},
  ('PCA_1', '1 vs 0'): {'KS Statistic': 0.13184883297665606,
   'p-value': 7.965014794026655e-36},
  ('PCA_1', '2 vs 0'): {'KS Statistic': 0.02633727091807395,
   'p-value': 0.12516352164559258}},
 'PCA_2': {('PCA_2', '1 vs 2'): {'KS Statistic': 0.03551575608406543,
   'p-value': 7.54522307164046e-07},
  ('PCA_2', '1 vs 0'): {'KS Statistic': 0.06447297967113161,
   'p-value': 7.1121597438958685e-09},
  ('PCA_2', '2 vs 0'): {'KS Statistic': 0.03184330976904387,
   'p-value': 0.03499770477852689}},
 'PCA_3': {('PCA_3', '1 vs 2'): {'KS Statistic': 0.038584362013408424,
   'p-value': 5.252064744362301e-08},
  ('PCA_3', '1 vs 0'): {'KS Statistic': 0.056127227014570735,
   'p-value': 7.891170903665137e-07},
  ('PCA_3', '2 vs 0'): {'KS Statistic': 0.019838477860306316,
   'p-value': 0.40989481210438466}},
 'PCA_4': {('PCA_4', '1 vs 2'): {'KS Statistic': 0.1176509801079269,
   'p-value'

Kruskal-Wallis Test

In [49]:

# Principal-component columns to test
pca_columns = ['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']
# Distinct values of the 'race' column; the same set is used for every component
race_values = new_df['race'].unique()

kruskal_results = {}

# One Kruskal-Wallis test per component, with one sample per race category
for column in pca_columns:
    groups = [new_df.loc[new_df['race'] == race_value, column] for race_value in race_values]
    outcome = kruskal(*groups)
    kruskal_results[column] = {
        'Kruskal-Wallis Statistic': outcome.statistic,
        'p-value': outcome.pvalue,
    }

# Display results
kruskal_results


{'PCA_1': {'Kruskal-Wallis Statistic': 94.03916340320447,
  'p-value': 3.798876067499975e-21},
 'PCA_2': {'Kruskal-Wallis Statistic': 626.2866084929128,
  'p-value': 1.0083026783677734e-136},
 'PCA_3': {'Kruskal-Wallis Statistic': 118.90108257019965,
  'p-value': 1.5169035709838102e-26},
 'PCA_4': {'Kruskal-Wallis Statistic': 790.9687012263499,
  'p-value': 1.7511721816635826e-172}}

In [50]:
# Principal-component columns to compare
pca_columns = ['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']
race_categories = new_df['race'].unique()

# Layout: column -> {"<a> vs <b>": {'KS Statistic', 'p-value', 'BY-corrected p-value'}}
ks_results_race = {}
# Raw p-values of every test, in the order the tests are run
all_p_values = []
# The result dicts in that same order, so corrected values can be attached
# afterwards without regenerating the (column, pair) sequence
result_entries = []

for column in pca_columns:
    column_results = {}
    ks_results_race[column] = column_results

    # Compare every unordered pair of race categories
    for race_a, race_b in itertools.combinations(race_categories, 2):
        sample_a = new_df.loc[new_df['race'] == race_a, column]
        sample_b = new_df.loc[new_df['race'] == race_b, column]

        # Two-sample Kolmogorov-Smirnov test between the groups
        ks_stat, p_value = ks_2samp(sample_a, sample_b)

        entry = {'KS Statistic': ks_stat, 'p-value': p_value}
        column_results[f"{race_a} vs {race_b}"] = entry

        all_p_values.append(p_value)
        result_entries.append(entry)

# Benjamini-Yekutieli correction applied jointly to all collected p-values
_, by_corrected_p_values, _, _ = multipletests(all_p_values, method='fdr_by')

# Attach each corrected p-value to the result it was computed from
for entry, corrected in zip(result_entries, by_corrected_p_values):
    entry['BY-corrected p-value'] = corrected

# Display results
ks_results_race


{'PCA_1': {'0 vs 1': {'KS Statistic': 0.07548192343031945,
   'p-value': 6.941088698750058e-20,
   'BY-corrected p-value': 5.169518536408474e-19},
  '0 vs 2': {'KS Statistic': 0.04943049162583424,
   'p-value': 0.00010448734893743983,
   'BY-corrected p-value': 0.0004323283426141179},
  '1 vs 2': {'KS Statistic': 0.0447984095646252,
   'p-value': 0.004911669193629346,
   'BY-corrected p-value': 0.01662757558855529}},
 'PCA_2': {'0 vs 1': {'KS Statistic': 0.16202601019403207,
   'p-value': 1.8840222301002718e-90,
   'BY-corrected p-value': 3.5079107414600755e-89},
  '0 vs 2': {'KS Statistic': 0.039887361980813774,
   'p-value': 0.0032443559990622834,
   'BY-corrected p-value': 0.012081504216248342},
  '1 vs 2': {'KS Statistic': 0.18235858893082924,
   'p-value': 1.055987780988705e-43,
   'BY-corrected p-value': 1.3107810232096594e-42}},
 'PCA_3': {'0 vs 1': {'KS Statistic': 0.07270673549912943,
   'p-value': 1.7657106057443392e-18,
   'BY-corrected p-value': 1.0958744012751357e-17},
  '

In [44]:
# k-sample Anderson-Darling test for every PCA column and every pair of races.
# Needs `anderson_ksamp` and `itertools` from the import cell at the top of the
# notebook, so that cell must have been run in the current kernel.

# Defined here as well so the cell does not rely on a value left over from
# whichever other cell happened to run last.
pca_columns = ['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']

# Layout: column -> {"<a> vs <b>": {'AD Statistic': ..., 'p-value': ...}}
ad_results = {}

for column in pca_columns:
    ad_results[column] = {}
    for race_pair in itertools.combinations(new_df['race'].unique(), 2):
        race_1, race_2 = race_pair
        group_race_1 = new_df[new_df['race'] == race_1][column]
        group_race_2 = new_df[new_df['race'] == race_2][column]
        # anderson_ksamp returns (statistic, critical_values, p-value) in that
        # order. The earlier unpacking `ad_stat, p_value, _` stored the array
        # of critical values under 'p-value'.
        # Note: with the default method SciPy bounds the reported p-value to
        # the range [0.001, 0.25] and warns when the bound is hit.
        ad_stat, _, p_value = anderson_ksamp([group_race_1, group_race_2])
        ad_results[column][f"{race_1} vs {race_2}"] = {'AD Statistic': ad_stat, 'p-value': p_value}

# Display results
ad_results


NameError: name 'anderson_ksamp' is not defined