# Imports

In [None]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.io import imsave
from skimage.transform import resize
import seaborn as sns
from utils import *
from statsmodels.formula.api import logit

import warnings
warnings.filterwarnings("ignore", category=UserWarning)


## Dataframe


In [None]:
# Tables read from the processed_data directory: the base table plus the
# sex / health-insurance / race variants.
df = pd.read_csv("processed_data/chexpert_plus_240401_cleaned.csv")
df_bis = pd.read_csv("processed_data/chexpert_plus_240401_cleaned_sex.csv")
df_cleaned_health_insurance = pd.read_csv(
    "processed_data/chexpert_plus_240401_cleaned_health_insurance.csv"
)
df_cleaned_race = pd.read_csv("processed_data/chexpert_plus_240401_cleaned_race.csv")

# Tables read from the final_data directory (the "_label" files), in the same
# base / sex / health-insurance / race arrangement.
df_labeled = pd.read_csv("final_data/chexpert_plus_240401_cleaned_label.csv")
df_cleaned_sex_label = pd.read_csv("final_data/chexpert_plus_240401_cleaned_label_sex.csv")
df_cleaned_health_insurance_label = pd.read_csv(
    "final_data/chexpert_plus_240401_cleaned_label_health.csv"
)
df_cleaned_race_label = pd.read_csv("final_data/chexpert_plus_240401_cleaned_label_race.csv")

In [None]:
# Size of the base table: row count, distinct patient ids (a missing id would
# count as one value), and number of rows per value of the 'split' column.
print('Total number of acquisitions:', df.shape[0])
print('Total number of patients:', df['deid_patient_id'].nunique(dropna=False))
print('Train/Valid:', df['split'].value_counts())

In [None]:
# Same size summary for the table loaded from the "_sex" file (df_bis).
print('Total number of acquisitions:', df_bis.shape[0])
print('Total number of patients:', df_bis['deid_patient_id'].nunique(dropna=False))
print('Train/Valid:', df_bis['split'].value_counts())


In [None]:
# Same size summary for the health-insurance table.
print('Total number of acquisitions:', df_cleaned_health_insurance.shape[0])
print('Total number of patients:', df_cleaned_health_insurance['deid_patient_id'].nunique(dropna=False))
print('Train/Valid:', df_cleaned_health_insurance['split'].value_counts())

In [None]:
# Same size summary for the race table.
print('Total number of acquisitions:', df_cleaned_race.shape[0])
print('Total number of patients:', df_cleaned_race['deid_patient_id'].nunique(dropna=False))
print('Train/Valid:', df_cleaned_race['split'].value_counts())

## Image examples

In [None]:
# img_f = imread("/data4/CheXpert/CheXpert-v1.0/train/patient04528/study1/view1_frontal.jpg")
# img_l = imread("/data4/CheXpert/CheXpert-v1.0/train/patient04528/study1/view2_lateral.jpg")

# fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# ax[0].imshow(img_f, cmap='gray')
# ax[0].axis('off')
# ax[0].set_title(f'Frontal \n {img_f.shape[0]}x{img_f.shape[1]}')
# ax[1].imshow(img_l, cmap='gray')
# ax[1].axis('off')
# ax[1].set_title(f'Lateral \n {img_l.shape[0]}x{img_l.shape[1]}')

In [None]:
# # Resized
# img_f_resized = resize(img_f, (224, 224), preserve_range=True)
# img_l_resized = resize(img_l, (224, 224), preserve_range=True)

# fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# ax[0].imshow(img_f_resized, cmap='gray')
# ax[0].axis('off')
# ax[0].set_title(f'Frontal \n {img_f_resized.shape[0]}x{img_f_resized.shape[1]}')
# ax[1].imshow(img_l_resized, cmap='gray')
# ax[1].axis('off')
# ax[1].set_title(f'Lateral \n {img_l_resized.shape[0]}x{img_l_resized.shape[1]}')


## Tabular Exploration

In [None]:
def plot_data_distribution(df, title='Data Distribution'):
    """Render a 4x4 overview figure for sex, race, insurance type and age.

    Panel layout (row, column):
        row 0: ``plot_categorical`` for sex, race and insurance type;
               the fourth panel is switched off.
        row 1: ``plot_age_distribution`` split by sex, race and insurance
               type, with the unsplit version in the fourth column.
        row 2: ``plot_age_boxplot`` in the same column arrangement as row 1.
        row 3: ``plot_insurance_percentage`` by sex and by race; the last
               two panels are switched off.

    Args:
        df: Table with 'sex', 'race' and 'insurance_type' columns (indexed
            directly here). The plotting helpers receive the whole table and
            may read further columns -- TODO confirm against utils.
        title: Text placed as the figure-level title.

    Returns:
        None. The figure is laid out with ``plt.tight_layout()`` and
        displayed with ``plt.show()``.
    """
    base = get_colors(3)

    # One value -> colour mapping per attribute. The key order doubles as the
    # display order handed to the box plots and percentage plots below.
    sex_palette = {'Male': base[0], 'Female': base[1]}
    race_palette = {'White': base[0], 'Black': base[1], 'Asian': base[2]}
    insurance_palette = {
        'Private Insurance': base[0],
        'Medicare': base[1],
        'Medicaid': base[2],
    }

    def _non_missing(column):
        # Sum of the value counts, i.e. the number of non-missing entries.
        return df[column].value_counts().sum()

    fig, grid = plt.subplots(4, 4, figsize=(15, 15))
    fig.suptitle(title)

    # Row 0 -- categorical plots
    plot_categorical(df, 'sex', grid[0, 0], sex_palette, _non_missing('sex'))
    plot_categorical(df, 'race', grid[0, 1], race_palette, _non_missing('race'), rotation=45)
    plot_categorical(
        df, 'insurance_type', grid[0, 2], insurance_palette,
        _non_missing('insurance_type'), rotation=45,
    )

    # Row 1 -- age distributions (unsplit version sits in the last column)
    plot_age_distribution(df, grid[1, 3])
    plot_age_distribution(df, grid[1, 0], hue='sex', palette=sex_palette)
    plot_age_distribution(df, grid[1, 1], hue='race', palette=race_palette)
    plot_age_distribution(df, grid[1, 2], hue='insurance_type', palette=insurance_palette)

    # Row 2 -- age box plots (no explicit order is given for insurance type)
    plot_age_boxplot(df, grid[2, 3])
    plot_age_boxplot(df, grid[2, 0], x='sex', hue='sex',
                     palette=sex_palette, order=list(sex_palette))
    plot_age_boxplot(df, grid[2, 1], x='race', hue='race',
                     palette=race_palette, order=list(race_palette))
    plot_age_boxplot(df, grid[2, 2], x='insurance_type', hue='insurance_type',
                     palette=insurance_palette)

    # Row 3 -- insurance percentage by sex and by race
    plot_insurance_percentage(df, grid[3, 0], 'sex',
                              hue_order=list(sex_palette), palette=sex_palette)
    plot_insurance_percentage(df, grid[3, 1], 'race',
                              hue_order=list(race_palette), palette=race_palette)

    # Hide the panels that carry no plot.
    for unused in ((0, 3), (3, 2), (3, 3)):
        grid[unused].axis('off')

    plt.tight_layout()
    plt.show()

# Overview figure for the base table. The panels cover sex, race, insurance
# type and age -- no ethnicity column is plotted, so the title names
# insurance type instead.
plot_data_distribution(df, title='All samples: Distribution of Sex, Race, Insurance Type, and Age')


In [None]:
# Summary statistics for every column of df, numeric and non-numeric alike
# (include='all'); displayed as the cell output.
df.describe(include='all')

In [None]:
# Keep only the numeric columns (string columns such as paths or categorical
# values are left out).
numeric_cols = df.select_dtypes(include=[np.number])

# Pairwise correlation matrix of those columns.
corr_matrix = numeric_cols.corr()

# Annotated heatmap on an enlarged canvas; font sizes are raised for the cell
# values, the tick labels and the title.
plt.figure(figsize=(15, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    annot_kws={"size": 12},
)
plt.title('Correlation Matrix of Numeric Features', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Split the sex-labelled table into two sub-frames on the value of 'sex'.
# NOTE(review): this assumes 'sex' is integer-coded at this point, with
# 0 = male and 1 = female (inferred from the variable names) -- confirm
# against the CSV; string-valued entries would leave both frames empty.
df_cleaned_sex_label_male = df_cleaned_sex_label.loc[df_cleaned_sex_label['sex'] == 0]
df_cleaned_sex_label_female = df_cleaned_sex_label.loc[df_cleaned_sex_label['sex'] == 1]

In [None]:
# Row counts of the two sub-frames (messages read "number of men" /
# "number of women").
print(f'Anzahl der Männer: {len(df_cleaned_sex_label_male)}')
print(f'Anzahl der Frauen: {len(df_cleaned_sex_label_female)}')

In [None]:
# Names of the finding/label columns, in display order.
disease_columns = [
    'No Finding',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
]




In [None]:
# Cross table between sex (rows) and insurance type (columns).
health_by_sex = pd.crosstab(df['sex'], df['insurance_type'])
print(health_by_sex)

# Heatmap of the counts. The table holds insurance types, not health
# conditions, so the title says so.
sns.heatmap(health_by_sex, annot=True, fmt='d')
plt.title('Insurance Type by Sex')
plt.show()

In [None]:
# List of finding/label columns to tabulate.
disease_columns = ['No Finding', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion',
                   'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
                   'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
                   'Fracture', 'Support Devices']

# Rows with at least one truthy (non-zero) label.
filtered_df = df[df[disease_columns].any(axis=1)]

# Count of positive (== 1) labels per sex, one column per finding.
# The comparison is done once for all columns and the boolean frame is summed
# per sex. Unlike filling an empty DataFrame column by column -- where the
# first assigned Series fixes the index and later Series are reindexed to it,
# silently dropping any sex absent from the first finding -- every sex present
# in filtered_df gets a row, and a combination with no positives shows 0.
disease_by_sex = (
    (filtered_df[disease_columns] == 1)
    .groupby(filtered_df['sex'])
    .sum()
    .astype(int)
)

print(disease_by_sex)

In [None]:
# Heatmap of the per-sex finding counts (rows: sex, columns: finding).
plt.figure(figsize=(12, 8))
sns.heatmap(
    disease_by_sex,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    cbar=True,
    annot_kws={"size": 12},
)
plt.title('Disease Frequency by Sex', fontsize=16)
plt.xlabel('Disease', fontsize=14)
plt.ylabel('Sex', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() only appended a stray "None" line.
df_cleaned_sex_label.info()

In [None]:
# Re-read the sex-labelled table from disk.
df_cleaned_sex_label = pd.read_csv("final_data/chexpert_plus_240401_cleaned_label_sex.csv")

# Convert the 'sex' column to lower-case strings.
sex_as_text = df_cleaned_sex_label['sex'].astype(str)
df_cleaned_sex_label['sex'] = sex_as_text.str.lower()

# Column block running from 'Enlarged Cardiomediastinum' through
# 'Support Devices' (label-based slice, both ends included).
relevant_columns = df_cleaned_sex_label.loc[:, 'Enlarged Cardiomediastinum':'Support Devices']

# Per-sex sum of every column in that block.
disease_counts = (
    df_cleaned_sex_label
    .groupby('sex')[relevant_columns.columns]
    .sum()
)

# Show the result.
print(disease_counts)

In [None]:


# Convert the 'sex' column to lower-case strings.
sex_as_text = df_cleaned_sex_label['sex'].astype(str)
df_cleaned_sex_label['sex'] = sex_as_text.str.lower()

# Column block from 'Enlarged Cardiomediastinum' through 'Support Devices'.
relevant_columns = df_cleaned_sex_label.loc[:, 'Enlarged Cardiomediastinum':'Support Devices']

# Per-sex sum of every column in that block.
disease_counts = (
    df_cleaned_sex_label
    .groupby('sex')[relevant_columns.columns]
    .sum()
)

# Grouped bar chart: one group per column of the block, one bar per sex value.
ax = disease_counts.T.plot(kind='bar', figsize=(14, 7), width=0.8)

# Write each bar's height (no decimals) slightly above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.0f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 5),
        textcoords='offset points',
        fontsize=8,
    )

plt.title('Absolute values from the disease')
plt.xlabel('diseases')
plt.ylabel('Absolute values from diseases')
plt.xticks(rotation=45, ha='right')
plt.legend(title='sex', loc='upper right')
plt.tight_layout()
plt.show()


In [None]:
# Display the column labels of df.
df.columns

In [None]:
# Convert df's 'sex' column to lower-case strings (this overwrites the column
# in df, so every later cell sees the lower-cased values).
df['sex'] = df['sex'].astype(str).str.lower()

# Column block from 'Enlarged Cardiomediastinum' through 'Support Devices'.
relevant_columns = df.loc[:, 'Enlarged Cardiomediastinum':'Support Devices']

# Number of rows per sex value, and per-sex sums of the block's columns.
sex_counts = df['sex'].value_counts()
disease_counts = df.groupby('sex')[relevant_columns.columns].sum()

# Divide each sex's sums by that sex's row count (aligned on the sex index).
normalized_disease_counts = disease_counts.div(sex_counts, axis=0)

ax = normalized_disease_counts.T.plot(kind='bar', figsize=(14, 7), width=0.8)

# Write each bar's height (two decimals) slightly above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.2f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 5),
        textcoords='offset points',
        fontsize=8,
    )

# Title, axis labels and legend.
plt.title('Relative values of diseases by sex (normalized by total number of males and females)')
plt.xlabel('Diseases')
plt.ylabel('Relative values (normalized by sex)')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Sex', loc='upper right')
plt.tight_layout()

plt.show()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() only appended a stray "None" line.
df_labeled.info()

In [None]:
# Rows of df_labeled whose 'Enlarged Cardiomediastinum' value equals 1.
has_finding = df_labeled['Enlarged Cardiomediastinum'] == 1
filtered_data = df_labeled[has_finding]

# Histogram (30 bins) of the non-missing ages in that subset.
plt.figure(figsize=(10, 6))
plt.hist(
    filtered_data['age'].dropna(),
    bins=30,
    color='skyblue',
    edgecolor='black',
)

plt.title('Age distribution for Enlarged Cardiomediastinum')
plt.xlabel('age')
plt.ylabel('frequency')
plt.grid(axis='y', alpha=0.75)

plt.tight_layout()

# Display the plot.
plt.show()

In [None]:
# Display the column labels of df.
df.columns

In [None]:
# Rows of df whose 'Enlarged Cardiomediastinum' value equals 1.
filtered_data = df[df['Enlarged Cardiomediastinum'] == 1]

# Histogram (30 bins) of the non-missing ages; keep the bin counts and bar
# patches so the bars can be annotated.
plt.figure(figsize=(10, 6))
n, bins, patches = plt.hist(
    filtered_data['age'].dropna(),
    bins=30,
    color='skyblue',
    edgecolor='black',
)

# Put each bin's count, as an integer, centred above its bar.
for bin_count, bar in zip(n, patches):
    bar_center = bar.get_x() + bar.get_width() / 2
    plt.text(bar_center, bin_count, f'{int(bin_count)}',
             ha='center', va='bottom', fontsize=10)

plt.title('Age distribution for Enlarged Cardiomediastinum')
plt.xlabel('age')
plt.ylabel('frequency')
plt.grid(axis='y', alpha=0.75)

plt.tight_layout()

# Display the plot.
plt.show()


In [None]:
import plotly.express as px

In [None]:
# Replace spaces in df's column names with underscores. This renames the
# columns of df itself, so cells that use the spaced names only work before
# this point. Presumably done so the names can be used in formula strings
# further down -- confirm.
df.columns = df.columns.str.replace(" ", "_")

# Finding columns (underscore names) to average.
diseases = [
    'Enlarged_Cardiomediastinum', 'Cardiomegaly', 'Lung_Opacity', 'Lung_Lesion',
    'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax',
    'Pleural_Effusion', 'Pleural_Other', 'Fracture',
]

# Mean of each finding column per insurance type, reshaped to long form
# (one row per insurance type / finding pair).
avg_by_insurance = (
    df.groupby("insurance_type", as_index=False)[diseases]
    .mean()
    .melt(
        id_vars=["insurance_type"],
        var_name="disease_type",
        value_name="average_sickness_rate",
    )
)

# Grouped bars, labelled with the value to two decimals above each bar.
fig = px.bar(
    avg_by_insurance,
    x="disease_type",
    y="average_sickness_rate",
    color="insurance_type",
    barmode="group",
    text="average_sickness_rate",
)
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

fig.show()

In [None]:
# Mean of each finding column per sex, reshaped to long form.
avg_by_sex = (
    df.groupby("sex", as_index=False)[diseases]
    .mean()
    .melt(
        id_vars=["sex"],
        var_name="disease_type",
        value_name="average_sickness_rate",
    )
)

# Grouped bar chart; left as the cell's last expression so it is displayed.
px.bar(
    avg_by_sex,
    x="disease_type",
    y="average_sickness_rate",
    color="sex",
    barmode="group",
)


In [None]:
import pandas as pd
import plotly.express as px

# Replace spaces in df's column names with underscores (renames df's columns).
df.columns = df.columns.str.replace(" ", "_")

# Finding columns (underscore names) to average.
diseases = [
    'Enlarged_Cardiomediastinum',
    'Cardiomegaly',
    'Lung_Opacity',
    'Lung_Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural_Effusion',
    'Pleural_Other',
    'Fracture',
]

# Mean of each finding column per insurance type, reshaped to long form
# (one row per insurance type / finding pair) for plotting.
avg_by_insurance = (
    df.groupby("insurance_type", as_index=False)[diseases]
    .mean()
    .melt(
        id_vars=["insurance_type"],
        var_name="disease_type",
        value_name="average_sickness_rate",
    )
)

# Grouped bars; the `text` column supplies the value labels.
fig = px.bar(
    avg_by_insurance,
    x="disease_type",
    y="average_sickness_rate",
    color="insurance_type",
    barmode="group",
    text="average_sickness_rate",
)

# Value labels: two decimals, drawn outside (above) the bars.
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

# Figure title, axis titles and legend title.
layout_text = dict(
    title='Average Sickness Rate by Insurance Type and Disease',
    xaxis_title='Disease Type',
    yaxis_title='Average Sickness Rate',
    legend_title='Insurance Type',
)
fig.update_layout(**layout_text)

# Render the figure.
fig.show()

In [None]:
# Mean of each finding column per race, reshaped to long form.
avg_by_race = (
    df.groupby("race", as_index=False)[diseases]
    .mean()
    .melt(
        id_vars=["race"],
        var_name="disease_type",
        value_name="average_sickness_rate",
    )
)

# Grouped bar chart; left as the cell's last expression so it is displayed.
px.bar(
    avg_by_race,
    x="disease_type",
    y="average_sickness_rate",
    color="race",
    barmode="group",
)


In [None]:
# Age violins (with an inner box plot) per insurance type, coloured by race,
# with one facet row per sex value.
px.violin(
    df,
    x="insurance_type",
    y="age",
    color="race",
    facet_row="sex",
    box=True,
)

In [None]:
# Age violins (with an inner box plot) per sex, coloured by race.
px.violin(
    df,
    x="sex",
    y="age",
    color="race",
    box=True,
)

In [None]:
from statsmodels.formula.api import logit

In [None]:
# Display the list of finding columns that the model-fitting cell iterates over.
diseases

In [None]:
# One logistic regression per finding column: the finding is the response,
# with sex and insurance type as categorical terms plus age. Fitted results
# are stored under the finding's name.
# (The bare `df[diseases]` expression that used to sit here was removed: its
# value was built and discarded without being displayed or used.)
models = {}

for disease in diseases:
    models[disease] = logit(data=df, formula=f"{disease} ~ C(sex) + C(insurance_type) + age").fit()
    

In [None]:
# Display the summary of the fitted result stored under the "Edema" key.
models["Edema"].summary()

In [None]:
# Build the Edema model specification; no .fit() is called here, so the cell
# displays the unfitted model object. The formula has no placeholders, so it
# is a plain string rather than an f-string.
logit(data=df, formula="Edema ~ C(sex) + C(insurance_type) + age")