In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from lifelines import CoxPHFitter

sys.path.append(os.path.abspath('..'))
from utils.utils_constants import (VESSEL_NEPTUNE_PAT_INFO_W_SCORE_W_FEATURE_PATH as  VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH,
                                   DISEASE_TYPES, ARTERY_TYPES,
)

# Ordinal encoding for the categorical severity labels. Later cells apply it to
# the 'ArterioSclerosis' and 'ArterialHyalinosis' columns; labels that are not
# keys of this dict become NaN when mapped.
SEVERITY_MAPPING = {
    '0:absent': 0,
    '1:mild (1-25%)': 1,
    '2:moderate (26-50%)': 2,
    '3:severe (>50%)': 3,
}

# Column groups reused across the notebook: rows missing any DEMOGRAPHICS or
# CLININCAL_DATA value are dropped after loading, and their concatenation is the
# base covariate set passed to the Cox models.
DEMOGRAPHICS = ['PAT_Sex', 'PAT_Race', 'PAT_Hispanic', 'PAT_AgeV3']
# NOTE(review): 'CLININCAL' looks like a misspelling of 'CLINICAL'. The name is
# referenced by later cells, so rename it everywhere at once or leave it alone.
CLININCAL_DATA = ['PAT_Cohort', 'eGFRatBx', 'UPCRatBx']
# The two severity-graded columns that are encoded with SEVERITY_MAPPING and used
# as a candidate feature set in the Cox analysis cells.
DESCRIPTOR = ['ArterioSclerosis', "ArterialHyalinosis"]

In [2]:
def binary_map(x, positive_value):
    """Encode a single value as a 0/1 indicator.

    Args:
        x: The value to encode.
        positive_value: The value that should be encoded as 1.

    Returns:
        1 when ``x`` compares equal to ``positive_value``; 0 for anything
        else (a NaN never compares equal, so it also yields 0).
    """
    if x == positive_value:
        return 1
    return 0

In [3]:
# Build the path of the aggregated-feature CSV by inserting the suffix in front
# of the ".csv" extension of the configured base path.
suffix = "_measurements_exclude_hya_manual"
agg_feature_path = VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH.replace(".csv", suffix + ".csv")

# Load the table and keep only rows that have every demographic and clinical
# field; the count printed below is taken after this filter.
required_columns = DEMOGRAPHICS + CLININCAL_DATA
pat_df = pd.read_csv(agg_feature_path).dropna(subset=required_columns)

total_samples_before_qc = pat_df.shape[0]
print("Number of Samples before QC:", total_samples_before_qc)
print(pat_df['PAT_Cohort'].value_counts())

Number of Samples before QC: 243
PAT_Cohort
4 - FSGS    134
2 - MCD     109
Name: count, dtype: int64


In [4]:
# Quality-control filter: keep only rows whose 'Num_All_Arteries' is positive.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from. Later cells assign whole columns into pat_df; without the copy,
# pandas versions that do not use copy-on-write flag those assignments with
# SettingWithCopyWarning because pat_df is the result of a boolean-mask slice.
pat_df = pat_df[pat_df["Num_All_Arteries"] > 0].copy()
total_samples = len(pat_df)
print("Number of Samples:", total_samples, f'with {total_samples_before_qc - total_samples} excluded')

Number of Samples: 225 with 18 excluded


In [5]:
# Cohort: show the raw label counts, then recode PAT_Cohort in place as
# 1 = '2 - MCD' and 0 = every other label.
print(pat_df['PAT_Cohort'].value_counts())
pat_df['PAT_Cohort'] = pat_df['PAT_Cohort'].map(lambda label: binary_map(label, '2 - MCD'))
mcd_count = pat_df['PAT_Cohort'].sum()
fsgs_count = total_samples - mcd_count  # everything that is not MCD is reported as FSGS
mcd_percent = (mcd_count / total_samples) * 100
fsgs_percent = (fsgs_count / total_samples) * 100
print(f"Percentage of MCD: {mcd_count} ({mcd_percent:.2f}%), FSGS: {fsgs_count} ({fsgs_percent:.2f}%)")

# Sex: same pattern, recoded as 1 = '2: Female' and 0 = every other label.
print(pat_df['PAT_Sex'].value_counts())
pat_df['PAT_Sex'] = pat_df['PAT_Sex'].map(lambda label: binary_map(label, '2: Female'))
female_count = pat_df['PAT_Sex'].sum()
male_count = total_samples - female_count
female_percent = (female_count / total_samples) * 100
male_percent = (male_count / total_samples) * 100
print(f"Percentage of Males: {male_count} ({male_percent:.2f}%), Females: {female_count} ({female_percent:.2f}%)")

# Age: number and share of rows with PAT_AgeV3 strictly below 18.
under_18_count = (pat_df['PAT_AgeV3'] < 18).sum()
under_18_percent = (under_18_count / total_samples) * 100
print(f"Number of samples under 18: {under_18_count}, Percentage of samples under 18: {under_18_percent:.2f}%")


PAT_Cohort
4 - FSGS    127
2 - MCD      98
Name: count, dtype: int64
Percentage of MCD: 98 (43.56%), FSGS: 127 (56.44%)
PAT_Sex
1: Male      130
2: Female     95
Name: count, dtype: int64
Percentage of Males: 130 (57.78%), Females: 95 (42.22%)
Number of samples under 18: 106, Percentage of samples under 18: 47.11%


In [6]:
# Reporting categories for race. Raw labels that are not keys of this dict map
# to NaN and are therefore left out of the value_counts() table below.
race_mapping = {
    '2: Asian/Asian American': 'Asian or Asian American',
    '3: Black/African American': 'Black or African American',
    '5: White/Caucasian': 'White or Caucasian',
    '0: Multi-Racial': 'Others (multiracial or unknown)',
    '97: Unknown': 'Others (multiracial or unknown)'
}

# Grouped labels are stored in a new column; counts and shares are relative to
# the post-QC sample size.
pat_df['Mapped_Race'] = pat_df['PAT_Race'].map(race_mapping)
new_race_counts = pat_df['Mapped_Race'].value_counts()
new_race_percentages = (new_race_counts / total_samples) * 100

# One "label & count (share%)" row per category, '&'-separated so the rows can
# be pasted into a LaTeX table.
print("Race distribution:")
for (race, count), percentage in zip(new_race_counts.items(), new_race_percentages):
    print(f"{race} & {count} ({percentage:.2f}%)")

# For modelling, PAT_Race is replaced by a 0/1 indicator of '3: Black/African American'.
pat_df['PAT_Race'] = pat_df['PAT_Race'].map(lambda label: binary_map(label, '3: Black/African American'))


# Ethnicity: show raw counts, then recode as 1 = '1: Hispanic or Latino', 0 = otherwise.
print(pat_df['PAT_Hispanic'].value_counts())
pat_df['PAT_Hispanic'] = pat_df['PAT_Hispanic'].map(lambda label: binary_map(label, '1: Hispanic or Latino'))
hispanic_count = pat_df['PAT_Hispanic'].sum()
hispanic_percent = (hispanic_count / total_samples) * 100
print(f"Percentage of Hispanic or Latino: {hispanic_count} ({hispanic_percent:.2f}%)")

Race distribution:
White or Caucasian & 118 (52.44%)
Black or African American & 59 (26.22%)
Others (multiracial or unknown) & 25 (11.11%)
Asian or Asian American & 23 (10.22%)
PAT_Hispanic
2: Not Hispanic or Latino    168
1: Hispanic or Latino         53
97: Unknown                    4
Name: count, dtype: int64
Percentage of Hispanic or Latino: 53 (23.56%)


In [7]:
# Median and quartiles of eGFR at biopsy. The IQR width is kept as a variable;
# the printed "IQR" is the (25th, 75th) percentile pair.
egfr_median = np.median(pat_df['eGFRatBx'])
egfr_25th, egfr_75th = np.percentile(pat_df['eGFRatBx'], [25, 75])
egfr_iqr = egfr_75th - egfr_25th

# Same statistics for UPCR at biopsy.
upcr_median = np.median(pat_df['UPCRatBx'])
upcr_25th, upcr_75th = np.percentile(pat_df['UPCRatBx'], [25, 75])
upcr_iqr = upcr_75th - upcr_25th

# Report with two decimal places.
print(f"Median eGFR: {egfr_median:.2f}, IQR: ({egfr_25th:.2f}, {egfr_75th:.2f})")
print(f"Median UPCR: {upcr_median:.2f}, IQR: ({upcr_25th:.2f}, {upcr_75th:.2f})")

Median eGFR: 84.76, IQR: (54.11, 105.69)
Median UPCR: 3.20, IQR: (1.15, 8.67)


In [8]:
# Replace the severity labels of both descriptor columns by their ordinal codes.
# Labels that are not keys of SEVERITY_MAPPING (and missing values) become NaN.
for severity_column in ('ArterioSclerosis', 'ArterialHyalinosis'):
    pat_df[severity_column] = pat_df[severity_column].map(SEVERITY_MAPPING)
# Function to calculate count and percentage and return a formatted DataFrame
def calculate_distribution(column):
    """Tabulate how often each value occurs in a Series.

    Missing values are counted as their own category.

    Args:
        column: pandas Series to summarise.

    Returns:
        DataFrame indexed by the distinct values (most frequent first) with a
        'Count' column and a 'Percentage' column holding strings such as
        '53.33%' (share of all rows, two decimals).
    """
    counts = column.value_counts(dropna=False)
    # Share of all rows, NaN rows included, expressed in percent.
    shares = counts / counts.sum() * 100
    return pd.DataFrame({
        'Count': counts,
        'Percentage': shares.map(lambda share: f"{share:.2f}%"),
    })

# Tabulate both descriptor columns with calculate_distribution().
arterio_sclerosis_result, arterial_hyalinosis_result = (
    calculate_distribution(pat_df[descriptor_name])
    for descriptor_name in ('ArterioSclerosis', 'ArterialHyalinosis')
)

# Print each table under its heading (heading and table on separate lines).
print("ArterioSclerosis Distribution:", arterio_sclerosis_result, sep="\n")
print("ArterialHyalinosis Distribution:", arterial_hyalinosis_result, sep="\n")


ArterioSclerosis Distribution:
                  Count Percentage
ArterioSclerosis                  
0.0                 120     53.33%
1.0                  46     20.44%
NaN                  27     12.00%
2.0                  19      8.44%
3.0                  13      5.78%
ArterialHyalinosis Distribution:
                    Count Percentage
ArterialHyalinosis                  
0.0                   150     66.67%
1.0                    40     17.78%
NaN                    24     10.67%
2.0                    11      4.89%


In [9]:
# Standardise the numeric covariates in place. The scaler is fitted on the
# full post-QC frame, before any per-analysis row filtering.
covariates_to_normalize = ['PAT_AgeV3', 'eGFRatBx', 'UPCRatBx', 'ArterioSclerosis', 'ArterialHyalinosis']  # add numerical columns here
scaler = StandardScaler()
scaler.fit(pat_df[covariates_to_normalize])
pat_df[covariates_to_normalize] = scaler.transform(pat_df[covariates_to_normalize])

# Survival outcome columns: the duration is coerced to numeric (unparseable
# entries become NaN), and the event flag is recoded from its labels to 0/1.
# A label outside event_codes maps to NaN, which makes astype(int) raise.
duration_col, event_col = 'DaysBXtoESRDorEGFR40_LR', 'ESRDorEGFR40BX_LR'
event_codes = {'1: Yes': 1, '0: No': 0}
pat_df[duration_col] = pd.to_numeric(pat_df[duration_col], errors='coerce')
pat_df[event_col] = pat_df[event_col].map(event_codes).astype(int)

In [10]:
columns_of_interest = ['Num_Arterioles', 'Num_Interlobular_Arteries', 'Num_Arcuate_Arteries']
artery_counts = pat_df[columns_of_interest]

# Per column: how many rows have a value above zero ...
greater_than_zero_counts = artery_counts.gt(0).sum()

# ... and the column total.
sum_values = artery_counts.sum()

# Side-by-side table, one row per artery-count column.
result = pd.DataFrame({
    'Count of >0': greater_than_zero_counts,
    'Sum of Values': sum_values
})

print(result)

                           Count of >0  Sum of Values
Num_Arterioles                     217         1499.0
Num_Interlobular_Arteries          196          686.0
Num_Arcuate_Arteries                84          131.0


In [11]:
# Measurement names retained for each artery type.
selected_features_by_type = {'Arterioles': {'Hyalinosis Area Ratio',
  'Intima Area Ratio',
  'Intima Median',
  'Log Artery Area',
  'Lumen Area Ratio'},
 'Interlobular Arteries': {'Intima Area Ratio', 'Ratio Average'},
 'Arcuate Arteries': {'Intima Area Ratio', 'Ratio Median'}}

# Column names of the aggregated measurements / severity scores per artery type.
agged_features_by_type = {}
agged_scores_by_type = {}

for artery_type in ARTERY_TYPES:
    # Aggregated measurement columns: "<Agg>_<Feature>_in_<Artery_Type>", spaces -> "_".
    # The feature names live in sets of strings, whose iteration order depends on
    # per-process hash randomisation; sorting makes the resulting column order
    # (and hence the covariate order handed to the models) identical on every
    # kernel restart.
    agged_features = []
    for feature_name in sorted(selected_features_by_type[artery_type]):
        for agg_metric in ["Max", "Median", "75th"]:
            agged_feature = '_'.join([agg_metric, feature_name, "in", artery_type])
            agged_features.append(agged_feature.replace(" ", "_"))
    agged_features_by_type[artery_type] = agged_features

    # Severity score columns: "<Agg>_<Disease>_Severity_in_<Artery_Type>".
    score_features = []
    for disease in DISEASE_TYPES:
        # No hyalinosis score columns are generated for arcuate arteries.
        if artery_type == "Arcuate Arteries" and disease == "Hyalinosis":
            continue
        for agg_metric in ["Max", "Median", "75th"]:
            score_feature = '_'.join([agg_metric, disease, "Severity", "in", artery_type])
            score_features.append(score_feature.replace(" ", "_"))
    agged_scores_by_type[artery_type] = score_features
    

In [12]:
# Standardise the aggregated measurement and score columns, fitting a fresh
# scaler for each artery type.
for artery_type in ARTERY_TYPES:
    agg_features, score_features = agged_features_by_type[artery_type], agged_scores_by_type[artery_type]
    columns_to_scale = agg_features + score_features
    scaler = StandardScaler()
    scaler.fit(pat_df[columns_to_scale])
    pat_df[columns_to_scale] = scaler.transform(pat_df[columns_to_scale])

In [13]:
def cox_model_analysis(df, base, features, penalizer=0.01, l1_ratio=1, coef_threshold=0.1):
    """Screen candidate features with a penalised Cox model, then fit base + survivors.

    Stage 1 (skipped when ``features`` is empty): fit a Cox model on
    ``features`` alone and keep those whose absolute coefficient exceeds
    ``coef_threshold``, ordered by decreasing absolute coefficient.
    Stage 2: fit a Cox model on ``base`` plus the kept features and print its
    concordance index.

    Args:
        df: DataFrame holding the covariates plus the duration column
            'DaysBXtoESRDorEGFR40_LR' and the event column 'ESRDorEGFR40BX_LR'.
        base: List of covariate names that always enter the final model.
        features: List of candidate covariate names to screen.
        penalizer: Penalty strength passed to ``CoxPHFitter`` in both stages.
        l1_ratio: L1/L2 mixing passed to ``CoxPHFitter`` in both stages.
        coef_threshold: Minimum absolute stage-1 coefficient for a candidate
            to be kept.

    Returns:
        Tuple ``(cph, selected_features)``: the fitted stage-2 ``CoxPHFitter``
        and the list of kept feature names (a new list; the caller's
        ``features`` is never returned or modified).

    NOTE(review): the concordance is read from the fitter right after fitting,
    so it presumably describes the same rows used for screening and fitting
    and is likely optimistic -- confirm before reporting it as performance.
    """
    def _fit(covariates):
        # Covariates are handed to lifelines as a "+"-joined formula.
        fitter = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
        fitter.fit(
            df,
            duration_col='DaysBXtoESRDorEGFR40_LR',
            event_col='ESRDorEGFR40BX_LR',
            formula="+".join(covariates),
        )
        return fitter

    if len(features) > 0:
        # Rank candidates by |coef| without adding a scratch column to the summary.
        abs_coef = _fit(features).summary['coef'].abs().sort_values(ascending=False)
        selected_features = abs_coef[abs_coef > coef_threshold].index.tolist()
    else:
        selected_features = list(features)

    cph = _fit(base + selected_features)
    print(f"Cox Model Concordance: {cph.concordance_index_:.2f}")
    return cph, selected_features

In [14]:
def get_cox_stat_univariate(df, base, feature):
    """Fit a Cox model on ``base + [feature]`` and print the feature's hazard ratio.

    Args:
        df: DataFrame holding the covariates plus the duration column
            'DaysBXtoESRDorEGFR40_LR' and the event column 'ESRDorEGFR40BX_LR'.
        base: List of adjustment covariate names; an empty list gives the
            unadjusted estimate.
        feature: Name of the covariate whose statistics are reported.

    Prints one line with the hazard ratio, its 95% interval and the p-value,
    prefixed with "Unadjusted" when ``base`` is empty and "adjusted" otherwise.
    Returns nothing.
    """
    covariates = base + [feature]
    fitter = CoxPHFitter(penalizer=0.01, l1_ratio=1)
    fitter.fit(
        df,
        duration_col='DaysBXtoESRDorEGFR40_LR',
        event_col='ESRDorEGFR40BX_LR',
        formula="+".join(covariates),
    )

    # Summary row of the feature of interest.
    stats = fitter.summary.loc[feature]
    hr = stats["exp(coef)"]
    lower_95 = stats["exp(coef) lower 95%"]
    upper_95 = stats["exp(coef) upper 95%"]
    p = stats["p"]

    label = "Unadjusted" if len(base) == 0 else "adjusted"
    print(f"{label} {feature}: HR {hr:.2f} ({lower_95:.2f}-{upper_95:.2f}), p: {p:.3f}")

def cox_single_analysis(pat_df_selected, selected_features):
    """Print unadjusted and adjusted hazard ratios for each selected feature.

    For every name in ``selected_features`` two models are fitted via
    ``get_cox_stat_univariate``: first without adjustment covariates, then
    adjusted for ``DEMOGRAPHICS + CLININCAL_DATA``.

    Args:
        pat_df_selected: DataFrame passed through to the model fits.
        selected_features: Iterable of covariate names to report on.
    """
    adjustment_sets = ([], DEMOGRAPHICS + CLININCAL_DATA)
    for feature in selected_features:
        for base in adjustment_sets:
            get_cox_stat_univariate(pat_df_selected, base, feature)

In [15]:
# --- Arcuate arteries ---
agged_features = agged_features_by_type["Arcuate Arteries"]
score_features = agged_scores_by_type["Arcuate Arteries"]

# Complete-case subset: every model in this cell is fitted on the same rows.
base_covariates = DEMOGRAPHICS + CLININCAL_DATA
complete_case_columns = base_covariates + DESCRIPTOR + score_features + agged_features
pat_df_selected = pat_df.dropna(subset=complete_case_columns)
print(pat_df_selected.shape)

# Base covariates only, then base + descriptor columns.
cph, _ = cox_model_analysis(pat_df_selected, base_covariates, [])
cph, _ = cox_model_analysis(pat_df_selected, base_covariates, DESCRIPTOR)

# Base + screened severity scores, followed by per-feature hazard ratios.
cph, selected_score_features = cox_model_analysis(pat_df_selected, base_covariates, score_features)
cox_single_analysis(pat_df_selected, selected_score_features)

# Base + screened aggregated measurements, followed by per-feature hazard ratios.
cph, selected_agged_features = cox_model_analysis(pat_df_selected, base_covariates, agged_features)
cox_single_analysis(pat_df_selected, selected_agged_features)

(71, 147)
Cox Model Concordance: 0.69
Cox Model Concordance: 0.72
Cox Model Concordance: 0.75
Unadjusted Median_Arteriosclerosis_Severity_in_Arcuate_Arteries: HR 0.82 (0.52-1.31), p: 0.409
adjusted Median_Arteriosclerosis_Severity_in_Arcuate_Arteries: HR 0.68 (0.34-1.39), p: 0.291
Unadjusted Max_Arteriosclerosis_Severity_in_Arcuate_Arteries: HR 0.95 (0.59-1.53), p: 0.828
adjusted Max_Arteriosclerosis_Severity_in_Arcuate_Arteries: HR 0.87 (0.48-1.57), p: 0.645
Cox Model Concordance: 0.74
Unadjusted 75th_Intima_Area_Ratio_in_Arcuate_Arteries: HR 0.69 (0.41-1.16), p: 0.163
adjusted 75th_Intima_Area_Ratio_in_Arcuate_Arteries: HR 0.54 (0.27-1.09), p: 0.084
Unadjusted Median_Ratio_Median_in_Arcuate_Arteries: HR 0.71 (0.43-1.17), p: 0.174
adjusted Median_Ratio_Median_in_Arcuate_Arteries: HR 0.57 (0.29-1.12), p: 0.103


In [16]:
# --- Interlobular arteries ---
agged_features = agged_features_by_type["Interlobular Arteries"]
score_features = agged_scores_by_type["Interlobular Arteries"]

# Complete-case subset: every model in this cell is fitted on the same rows.
base_covariates = DEMOGRAPHICS + CLININCAL_DATA
complete_case_columns = base_covariates + DESCRIPTOR + score_features + agged_features
pat_df_selected = pat_df.dropna(subset=complete_case_columns)
print(pat_df_selected.shape)

# Base covariates only, then base + descriptor columns.
cph, _ = cox_model_analysis(pat_df_selected, base_covariates, [])
cph, _ = cox_model_analysis(pat_df_selected, base_covariates, DESCRIPTOR)

# Base + screened severity scores, followed by per-feature hazard ratios.
cph, selected_score_features = cox_model_analysis(pat_df_selected, base_covariates, score_features)
cox_single_analysis(pat_df_selected, selected_score_features)

# Base + screened aggregated measurements, followed by per-feature hazard ratios.
cph, selected_agged_features = cox_model_analysis(pat_df_selected, base_covariates, agged_features)
cox_single_analysis(pat_df_selected, selected_agged_features)


(174, 147)
Cox Model Concordance: 0.70
Cox Model Concordance: 0.72
Cox Model Concordance: 0.71
Unadjusted Median_Arteriosclerosis_Severity_in_Interlobular_Arteries: HR 0.97 (0.72-1.32), p: 0.861
adjusted Median_Arteriosclerosis_Severity_in_Interlobular_Arteries: HR 0.88 (0.63-1.25), p: 0.487
Unadjusted Max_Arteriosclerosis_Severity_in_Interlobular_Arteries: HR 1.07 (0.81-1.42), p: 0.629
adjusted Max_Arteriosclerosis_Severity_in_Interlobular_Arteries: HR 1.02 (0.73-1.41), p: 0.928
Cox Model Concordance: 0.72
Unadjusted 75th_Ratio_Average_in_Interlobular_Arteries: HR 0.95 (0.70-1.30), p: 0.742
adjusted 75th_Ratio_Average_in_Interlobular_Arteries: HR 0.92 (0.65-1.29), p: 0.617
Unadjusted Max_Intima_Area_Ratio_in_Interlobular_Arteries: HR 1.08 (0.81-1.44), p: 0.579
adjusted Max_Intima_Area_Ratio_in_Interlobular_Arteries: HR 1.04 (0.73-1.46), p: 0.841
Unadjusted Median_Intima_Area_Ratio_in_Interlobular_Arteries: HR 1.00 (0.74-1.37), p: 0.975
adjusted Median_Intima_Area_Ratio_in_Interlobular

In [17]:
# --- Arterioles ---
agged_features = agged_features_by_type["Arterioles"]
score_features = agged_scores_by_type["Arterioles"]

# Complete-case subset: every model in this cell is fitted on the same rows.
base_covariates = DEMOGRAPHICS + CLININCAL_DATA
complete_case_columns = base_covariates + DESCRIPTOR + score_features + agged_features
pat_df_selected = pat_df.dropna(subset=complete_case_columns)
print(pat_df_selected.shape)

# Base covariates only, then base + descriptor columns.
cph, _ = cox_model_analysis(pat_df_selected, base_covariates, [])
cph, _ = cox_model_analysis(pat_df_selected, base_covariates, DESCRIPTOR)

# Base + screened severity scores, followed by per-feature hazard ratios.
cph, selected_score_features = cox_model_analysis(pat_df_selected, base_covariates, score_features)
cox_single_analysis(pat_df_selected, selected_score_features)

# Base + screened aggregated measurements, followed by per-feature hazard ratios.
cph, selected_agged_features = cox_model_analysis(pat_df_selected, base_covariates, agged_features)
cox_single_analysis(pat_df_selected, selected_agged_features)

(192, 147)
Cox Model Concordance: 0.70
Cox Model Concordance: 0.70
Cox Model Concordance: 0.73
Unadjusted 75th_Hyalinosis_Severity_in_Arterioles: HR 1.06 (0.80-1.40), p: 0.693
adjusted 75th_Hyalinosis_Severity_in_Arterioles: HR 1.01 (0.74-1.38), p: 0.949
Unadjusted Median_Hyalinosis_Severity_in_Arterioles: HR 1.26 (1.01-1.57), p: 0.043
adjusted Median_Hyalinosis_Severity_in_Arterioles: HR 1.26 (0.99-1.60), p: 0.057
Unadjusted Max_Hyalinosis_Severity_in_Arterioles: HR 1.24 (0.92-1.66), p: 0.154
adjusted Max_Hyalinosis_Severity_in_Arterioles: HR 1.23 (0.87-1.73), p: 0.238
Unadjusted Max_Arteriosclerosis_Severity_in_Arterioles: HR 1.32 (1.05-1.65), p: 0.016
adjusted Max_Arteriosclerosis_Severity_in_Arterioles: HR 1.32 (1.04-1.67), p: 0.023
Cox Model Concordance: 0.75
Unadjusted 75th_Log_Artery_Area_in_Arterioles: HR 1.00 (0.99-1.01), p: 0.999
adjusted 75th_Log_Artery_Area_in_Arterioles: HR 1.00 (1.00-1.00), p: 0.999
Unadjusted 75th_Lumen_Area_Ratio_in_Arterioles: HR 0.73 (0.53-1.01), p: 0