In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import seaborn as sns

# Disable column-width truncation so long string cells are displayed in full.
pd.set_option('display.max_colwidth', None)



# Demographics

In [None]:
# Load the two pre-processed tables used by the demographics section.
# Both are later joined on the 'MRN' column; cohort_df supplies the
# 'outcome' column used to split cohorts below.
demo_df = pd.read_csv('processed_data/demo.csv')
cohort_df = pd.read_csv('processed_data/cohort.csv')

In [None]:
# Preview the first rows of the demographics table.
demo_df.head()

In [None]:
# Preview the first rows of the cohort table.
cohort_df.head()

In [None]:
# Left join: every row of demo_df is kept. Rows whose MRN has no match in
# cohort_df get NaN in the cohort columns, so they count toward the overall
# totals but match neither `outcome == 1` nor `outcome == 0` below.
# NOTE(review): assumes MRN is unique in cohort_df; duplicates would
# multiply demo rows -- confirm.
merged_demo_df = demo_df.merge(cohort_df, how = 'left', on = 'MRN')

In [None]:
# Preview the joined demographics + cohort table.
merged_demo_df.head()

In [None]:
def calculate_demographics(df, condition):
    """Count sex and race/ethnicity categories among the rows selected by `condition`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 0/1 indicator columns ``sx_birth_male``,
        ``sx_birth_female``, ``race_white``, ``race_black``, ``race_asian``,
        ``race_nhpi``, ``race_aian``, ``race_other`` and ``ethnicity_hispanic``.
    condition : boolean mask
        Row selector applied as ``df[condition]``.

    Returns
    -------
    (dict, dict)
        Counts per category (including ``'N'``, the number of selected rows)
        and the same categories expressed as a percentage of ``'N'``.
        Every percentage is ``0`` when no rows are selected.
    """
    cohort = df[condition]
    n_rows = len(cohort)

    def flagged(column):
        # Indicator columns mark membership with the value 1.
        return cohort[column] == 1

    # "Non-Hispanic" means the Hispanic flag is anything other than 1,
    # which also covers missing values.
    not_hispanic = cohort['ethnicity_hispanic'] != 1

    counts = {'N': n_rows}
    counts['Male'] = flagged('sx_birth_male').sum()
    counts['Female'] = flagged('sx_birth_female').sum()
    counts['Non-Hispanic White'] = (flagged('race_white') & not_hispanic).sum()
    counts['Non-Hispanic Black'] = (flagged('race_black') & not_hispanic).sum()
    # Asian and NHPI flags are pooled into one category.
    counts['Non-Hispanic Asian'] = (
        (flagged('race_asian') | flagged('race_nhpi')) & not_hispanic
    ).sum()
    counts['Hispanic'] = flagged('ethnicity_hispanic').sum()
    # AIAN and "other" flags are pooled into one category.
    counts['Other Race/Ethnicity'] = (
        (flagged('race_aian') | flagged('race_other')) & not_hispanic
    ).sum()

    if n_rows > 0:
        shares = {label: (count / n_rows) * 100 for label, count in counts.items()}
    else:
        shares = {label: 0 for label in counts}
    return counts, shares

# Row selectors for the three cohorts reported in the demographics table.
everyone_mask = np.ones(len(merged_demo_df), dtype=bool)
glaucoma_mask = merged_demo_df['outcome'] == 1
non_glaucoma_mask = merged_demo_df['outcome'] == 0

total_demographics, total_percentages = calculate_demographics(merged_demo_df, everyone_mask)
glaucoma_demographics, glaucoma_percentages = calculate_demographics(merged_demo_df, glaucoma_mask)
non_glaucoma_demographics, non_glaucoma_percentages = calculate_demographics(merged_demo_df, non_glaucoma_mask)



In [None]:
# Assemble the demographics table: for each cohort a count column followed by
# its percentage column. Row labels come from the category keys of the dicts.
cohort_summaries = [
    ('Total Population', total_demographics, total_percentages),
    ('Glaucoma Patients', glaucoma_demographics, glaucoma_percentages),
    ('Non-Glaucoma Patients', non_glaucoma_demographics, non_glaucoma_percentages),
]

table_columns = {}
for cohort_name, cohort_counts, cohort_shares in cohort_summaries:
    table_columns[cohort_name] = cohort_counts
    table_columns[f'{cohort_name} %'] = cohort_shares

demographics_df = pd.DataFrame(table_columns)

demographics_df

In [None]:
# Age mean and standard deviation overall and per outcome group.
# Series.std() uses the sample standard deviation (ddof=1).
ages = merged_demo_df['age']
outcome = merged_demo_df['outcome']
glaucoma_ages = ages[outcome == 1]
non_glaucoma_ages = ages[outcome == 0]

overall_mean, overall_sd = ages.mean(), ages.std()
glaucoma_mean, glaucoma_sd = glaucoma_ages.mean(), glaucoma_ages.std()
non_glaucoma_mean, non_glaucoma_sd = non_glaucoma_ages.mean(), non_glaucoma_ages.std()

# Single-row summary table labelled 'Age'.
age_df = pd.DataFrame(
    {
        'Overall Mean': [overall_mean],
        'Overall SD': [overall_sd],
        'Glaucoma Mean': [glaucoma_mean],
        'Glaucoma SD': [glaucoma_sd],
        'Non-Glaucoma Mean': [non_glaucoma_mean],
        'Non-Glaucoma SD': [non_glaucoma_sd],
    },
    index=['Age'],
)

age_df


In [None]:
def calculate_specific_groups(df):
    """Count the five race/ethnicity groups over every row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 0/1 indicator columns ``race_white``, ``race_black``,
        ``race_asian``, ``race_nhpi``, ``race_aian``, ``race_other`` and
        ``ethnicity_hispanic``.

    Returns
    -------
    (dict, dict)
        Group counts, and the same groups as a percentage of ``len(df)``
        (all ``0`` when `df` is empty).
    """
    n_total = len(df)

    # "Non-Hispanic" is any row whose Hispanic flag is not 1 (missing included).
    non_hispanic = df['ethnicity_hispanic'] != 1

    group_masks = {
        'Non-Hispanic White': (df['race_white'] == 1) & non_hispanic,
        'Non-Hispanic Black': (df['race_black'] == 1) & non_hispanic,
        # Asian and NHPI flags are pooled.
        'Non-Hispanic Asian': ((df['race_asian'] == 1) | (df['race_nhpi'] == 1)) & non_hispanic,
        'Hispanic': df['ethnicity_hispanic'] == 1,
        # AIAN and "other" flags are pooled.
        'Other Race/Ethnicity': ((df['race_aian'] == 1) | (df['race_other'] == 1)) & non_hispanic,
    }
    groups = {label: mask.sum() for label, mask in group_masks.items()}

    percentages = {}
    for label, count in groups.items():
        percentages[label] = (count / n_total) * 100 if n_total > 0 else 0

    return groups, percentages

# Calculate the values
# Race/ethnicity breakdown over the whole merged cohort; `groups` also feeds
# the pie chart below.
groups, percentages = calculate_specific_groups(merged_demo_df)

print("Number of individuals in each group:", groups, sep="\n")
print("\nPercentage of individuals in each group:", percentages, sep="\n")



In [None]:
# Pie chart of the race/ethnicity groups, with the legend drawn in its own
# (invisible) axes to the right so it never overlaps the wedges.
labels = list(groups)
sizes = [groups[label] for label in labels]
total = sum(sizes)
# Wedge shares are relative to the sum of the group counts, not to the number
# of rows in the cohort.
pct = [100 * size / total for size in sizes]
# NOTE(review): this palette is never passed to ax.pie() and has 4 entries
# for 5 groups; the chart uses matplotlib's default colour cycle.
colors = ['#4c78a8', '#f58518', '#54a24b', '#e45756']  # optional palette

legend_entries = [
    f'{l}: {p:.1f}% (n={s})' for l, p, s in zip(labels, pct, sizes)
]

fig = plt.figure(figsize=(6.4, 3.6), layout='constrained')
gs = fig.add_gridspec(ncols=2, width_ratios=[1.0, 0.55])

ax = fig.add_subplot(gs[0])
legax = fig.add_subplot(gs[1])
legax.axis('off')

pie_artists = ax.pie(sizes, startangle=90, counterclock=False)
wedges = pie_artists[0]
ax.axis('equal')
ax.set_title('Race/Ethnicity')

legax.legend(wedges, legend_entries, loc='center', frameon=False, fontsize=12)

fig.savefig('figures/pie_grid.png', bbox_inches='tight', pad_inches=0.02)
plt.show()

In [None]:
# `demographics_df` was already built from the same six dictionaries in the
# demographics section above, so it is not rebuilt here.
# Display the table in full: it has eight rows, and `.head()` would show only
# the first five, hiding three of the race/ethnicity rows.
demographics_df

In [None]:
# The category names ('N', 'Male', 'Female', ...) live in the DataFrame index,
# so the index must be written out; with index=False the exported rows would
# be unlabeled.
demographics_df.to_csv('final_demo_table.csv', index=True, index_label='Characteristic')

In [None]:
# Histogram of patient age across the whole merged cohort (20 bins).
age_ax = merged_demo_df['age'].plot(kind='hist', bins=20, edgecolor='black', figsize=(6,4))
age_ax.set_xlabel('Age')
age_ax.set_title('Age Distribution')

age_fig = age_ax.get_figure()
age_fig.tight_layout()
age_fig.savefig('figures/age_distro.png')
plt.show()


In [None]:
# Export the one-row age summary. index=False omits the 'Age' row label, so
# the file contains only the six statistic columns.
age_df.to_csv('final_age_table.csv', index=False)

In [None]:
# Test-set metrics, one row per model; the 'Model' column is parsed by
# parse_model_info() further down.
# NOTE(review): the same file is read again into `test_metrics_df` in a later cell.
csv_file = 'test_metrics_4_21.csv' # post thresholding
df = pd.read_csv(csv_file)

In [None]:
# Preview the raw test-metrics table.
df.head()

In [None]:
def parse_model_info(model):
    """Extract the layer count and training-data percentage from a model name.

    The name must contain a ``-<layers>-<percentage>pct.csv`` fragment, e.g.
    ``'...-4-25pct.csv'`` -> ``(4, 25)``. The percentage digits are optional:
    ``'...-4-pct.csv'`` is interpreted as 100 percent.

    Parameters
    ----------
    model : str
        Model identifier / file name as stored in the ``'Model'`` column.

    Returns
    -------
    tuple of (int, int)
        ``(layers, percentage)``. Callers in this notebook store the first
        element in a column named ``'Unfrozen_Layers'``.

    Raises
    ------
    ValueError
        If `model` does not contain the expected fragment. (Previously this
        surfaced as an opaque ``AttributeError`` on ``None``.)
    """
    pattern_with_pct = r'-(\d+)-(\d+)?pct\.csv'

    match_with_pct = re.search(pattern_with_pct, model)
    if match_with_pct is None:
        raise ValueError(
            f"Model name {model!r} does not match the expected "
            f"'-<layers>-<percentage>pct.csv' pattern"
        )

    layers = int(match_with_pct.group(1))
    percentage_text = match_with_pct.group(2)
    # A missing percentage (e.g. '-4-pct.csv') means the full training set.
    percentage = int(percentage_text) if percentage_text is not None else 100
    return layers, percentage


In [None]:
# Combine the test metrics with the threshold chosen for each model.
test_metrics_df = pd.read_csv('test_metrics_4_21.csv')
thresholds_df = pd.read_csv('thresholds_f1s_4_21.csv')

# Threshold rows carry 'val' in the model name where the metric rows carry
# 'test'; rewrite the names so the two tables share a join key.
thresholds_df['Model'] = thresholds_df['Model'].str.replace('val', 'test')

# Split each model name into its layer count and training-data percentage.
parsed_model_info = test_metrics_df['Model'].apply(
    lambda model_name: pd.Series(parse_model_info(model_name))
)
test_metrics_df[['Unfrozen_Layers', 'Percentage']] = parsed_model_info

# Identifier columns first, then every remaining metric column in its
# existing order.
leading_columns = ['Model', 'Unfrozen_Layers', 'Percentage']
metric_columns = [col for col in test_metrics_df.columns if col not in leading_columns]
columns_order = leading_columns + metric_columns

sorted_df = test_metrics_df.sort_values(by=['Unfrozen_Layers', 'Percentage'])
sorted_df = sorted_df[columns_order].reset_index(drop=True)

# Left join keeps every metrics row; models without a threshold row get NaN.
threshold_lookup = thresholds_df[['Model', 'Best Threshold']]
merged_df = sorted_df.merge(threshold_lookup, on='Model', how='left')



In [None]:
# Preview the metrics table after sorting and attaching 'Best Threshold'.
merged_df.head()

In [None]:
# merged_df.to_csv('frozen_autoencoders_merged_table.csv', index=False)

In [None]:
# Export the merged metrics without the model-name column, rounded to three
# decimal places.
drop_model_df = merged_df.drop(columns='Model')
rounded_table = drop_model_df.round(3)
rounded_table.to_csv('rounded_4_21_merged_table.csv', index=False)

In [None]:
# Add the parsed layer count and training-data percentage to `df` in place
# (same parsing as applied to `test_metrics_df` above).
df[['Unfrozen_Layers', 'Percentage']] = df['Model'].apply(lambda x: pd.Series(parse_model_info(x)))


In [None]:
# Spot-check that the parsed columns agree with the model names.
df[['Model', 'Unfrozen_Layers', 'Percentage' ]].head()

In [None]:
# Re-derive `sorted_df` from `df`: order rows by layer count then percentage
# and move those two columns to the front (this rebinds the `sorted_df` and
# `columns_order` names used in the earlier merge cell).
key_columns = ['Unfrozen_Layers', 'Percentage']
remaining_columns = [col for col in df.columns if col not in key_columns]
columns_order = key_columns + remaining_columns

sorted_df = (
    df.sort_values(by=key_columns)[columns_order]
    .reset_index(drop=True)
)


In [None]:
# Keep every column except the model name.
non_model_columns = sorted_df.columns != 'Model'
selected_sorted_df = sorted_df.loc[:, non_model_columns]

# Rows with a layer count of 0 are left out of the plots below.
has_unfrozen_layers = selected_sorted_df['Unfrozen_Layers'].ne(0)
exclude_0_unfrozen_df = selected_sorted_df[has_unfrozen_layers]

In [None]:
# AUROC grid: one row per training-data percentage, one column per layer count.
# DataFrame.pivot requires each (Percentage, Unfrozen_Layers) pair to be unique.
pivot_table = exclude_0_unfrozen_df.pivot(
    index='Percentage',
    columns='Unfrozen_Layers',
    values='AUROC',
)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pivot_table, annot=True, fmt=".3f", cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title("Glaucoma Pre-Screening Model AUROC", fontsize=20)
heatmap_ax.set_xlabel("Trainable Layers", fontsize=16)
heatmap_ax.set_ylabel("Percent of Total Training Data", fontsize=16)
# Flip the y-axis so the smallest percentage sits at the bottom.
heatmap_ax.invert_yaxis()
heatmap_fig.savefig('figures/heatmap.tiff', format='tiff')
plt.show()


In [None]:
def _decorate_auroc_panel(panel_ax, xlabel, title, panel_letter):
    """Apply the axis labels, title, grid, tick size and corner letter shared by both panels."""
    panel_ax.set_xlabel(xlabel, fontsize=20)
    panel_ax.set_ylabel('Performance (AUROC)', fontsize=20)
    panel_ax.set_title(title, fontsize=24)
    panel_ax.grid(True)
    panel_ax.tick_params(axis='both', labelsize=14)
    # Bold panel letter just outside the top-left corner of the axes.
    panel_ax.text(-0.1, 1.1, panel_letter, transform=panel_ax.transAxes,
                  fontsize=24, fontweight='bold', va='top', ha='right')


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Rows with a layer count of 0 are excluded from both panels.
rows_with_layers = selected_sorted_df[selected_sorted_df['Unfrozen_Layers'] != 0]

# Panel A: AUROC against layer count, one line per training-data percentage.
for train_pct in selected_sorted_df['Percentage'].unique():
    pct_rows = rows_with_layers[rows_with_layers['Percentage'] == train_pct]
    ax1.plot(pct_rows['Unfrozen_Layers'], pct_rows['AUROC'], marker='o',
             label=f'{train_pct}% Training Data')
ax1.legend(fontsize=14)
_decorate_auroc_panel(
    ax1,
    'Number of Trainable Layers',
    'Trainable Layers vs. Performance (AUROC)',
    'A',
)
# Tick every second layer count, starting from 0.
max_layers = int(selected_sorted_df['Unfrozen_Layers'].max())
ax1.set_xticks(range(0, max_layers + 2, 2))

# Panel B: AUROC against training-data percentage, one line per layer count.
layer_counts = [n for n in selected_sorted_df['Unfrozen_Layers'].unique() if n != 0]
for n_layers in layer_counts:
    layer_rows = selected_sorted_df[selected_sorted_df['Unfrozen_Layers'] == n_layers]
    ax2.plot(layer_rows['Percentage'], layer_rows['AUROC'], marker='o',
             label=f'{n_layers} Unfrozen Layers')
ax2.legend(loc='upper left', fontsize=14)
_decorate_auroc_panel(
    ax2,
    'Percent of Training Data',
    'Percent of Training Data vs. Performance (AUROC)',
    'B',
)

fig.tight_layout()
fig.savefig('figures/layers_and_trains_size.tiff', format='tiff')
plt.show()

