In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Input tables: image-level metadata and per-study finding labels.
# NOTE(review): both locations are absolute, machine-specific paths.
META_CSV = '/opt/NAS3/EmoryCXRv1/Tables/emory_cxr_v1_metadata_DEID.csv'
FINDINGS_CSV = '/home/fli40/Data/Datathon2023_CXR/FINAL_FINDINGS_ANON.csv'

df_meta = pd.read_csv(META_CSV)
df_finding = pd.read_csv(FINDINGS_CSV)

In [None]:
#df_meta.columns

In [None]:
#df_finding.columns

In [None]:
# Normalise the findings table in three steps:
#   1. missing entries become 0,
#   2. entries equal to -1 become NaN,
#   3. every row that still holds a NaN (i.e. held a -1) is dropped.
# NOTE(review): -1 presumably marks an uncertain label -- confirm. The
# replacements run over every column of the table, so a -1 in any column
# removes the row, not only a -1 in the pathologies used later.
df_finding = (
    df_finding
    .replace(np.nan, 0)
    .replace(-1, np.nan)
    .dropna()
)
df_finding

In [None]:
# Finding columns carried from the findings table into the merged cohort.
PATHOLOGIES = [
    'Support Devices',
    'Pneumothorax',
    'No Finding',
]

In [None]:
# Join image metadata with the selected finding labels on the study
# accession number (default inner join), then remove exact duplicate rows.
meta_cols = ['AccessionNumber_anon', 'empi_anon', 'image_path', 'sex', 'RACE', 'age', 'ImageView_Frontal']
label_cols = ['AccessionNumber_anon'] + PATHOLOGIES

df = df_meta[meta_cols].merge(df_finding[label_cols], on='AccessionNumber_anon')
df = df.drop_duplicates()
df

In [None]:
## Select only one image per subject.
# Keep rows flagged as frontal views, then keep the first remaining row for
# each `empi_anon` value and renumber the index from 0.
# The steps are chained instead of using inplace=True: dropping duplicates
# in place on a boolean-filtered frame mutates a slice and makes pandas emit
# SettingWithCopyWarning.
df = (
    df[df['ImageView_Frontal'] == 1]
    .drop_duplicates(subset=['empi_anon'])
    .reset_index(drop=True)
)

In [None]:
# Give the demographic and path columns the capitalised names used below.
column_renames = {'image_path': 'Path', 'sex': 'Sex', 'RACE': 'Race', 'age': 'Age'}
df = df.rename(columns=column_renames)

In [None]:
# Collapse the raw race strings into a small set of categories.
# Keys are matched verbatim against the data (note the double space in
# 'African American  or Black'), so they must not be reformatted.
# Fix: 'Alaskan Native' used to map to 'Asian'; it now maps to 'Other', in
# line with 'American Indian or Alaskan Native'.
race_map = {
    'Caucasian or White': 'White',
    'African American  or Black': 'Black',
    'Asian': 'Asian',
    'Asian American': 'Asian',
    'Hispanic': 'Hispanic',
    'Unknown, Unavailable or Unreported': 'Other',
    'Multiple': 'Other',
    'Patient Declines': 'Other',
    'Native Hawaiian or Other Pacific Islander': 'Other',
    'Not Recorded': 'Other',
    'American Indian or Alaskan Native': 'Other',
    'Alaskan Native': 'Other',
}

In [None]:
# Recode raw race strings through race_map; values that are not keys of the
# mapping are left unchanged.
df['Race'] = df['Race'].replace(to_replace=race_map)

In [None]:
# Restrict the cohort to the Black and White categories.
# .copy() makes the result an independent frame: a new column is assigned to
# it afterwards, which on a bare boolean slice raises SettingWithCopyWarning.
df = df[df['Race'].isin(['Black', 'White'])].copy()

In [None]:
# Bin Age into two groups. pd.cut uses right-closed intervals by default, so
# the bins are (0, 60] and (60, 130]; ages outside them get NaN.
# NOTE(review): the first label reads '18-60' but the bin spans (0, 60].
age_edges = [0, 60, 130]
age_labels = ['18-60', '60+']
df['Age_group'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels)

In [None]:
# Display the current cohort table for a visual check.
df

In [None]:
# Pneumothorax label counts within each Sex / Race / Support Devices /
# No Finding stratum. Add 'Age_group' to `strata` to stratify by age as well:
#df.groupby(['Sex', 'Race', 'Age_group', 'Support Devices', 'No Finding'])['Pneumothorax'].value_counts()
strata = ['Sex', 'Race', 'Support Devices', 'No Finding']
df.groupby(strata)['Pneumothorax'].value_counts()

In [None]:
# Pneumothorax label counts for each Support Devices / No Finding combination.
device_and_finding = ['Support Devices', 'No Finding']
df.groupby(device_and_finding)['Pneumothorax'].value_counts()

In [None]:
def combine_categories(df, cat):
    """Build one label per row by joining several columns with '_'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the columns to combine.
    cat : list of str
        Column names, in the order they should appear in the label.
        Must contain at least one name.

    Returns
    -------
    pandas.Series
        String labels such as ``'<col1>_<col2>'``, indexed like ``df``.
        Each value is converted with ``astype(str)`` before joining.
    """
    as_text = [df[name].astype(str) for name in cat]

    label = as_text[0]
    for part in as_text[1:]:
        label = label + '_' + part
    return label

In [None]:
# Two cohorts taken from the filtered table:
#   * df_no_finding: 'No Finding' == 1 and 'Support Devices' == 0
#   * df_PTX:        'Pneumothorax' == 1 and 'Support Devices' == 1
# .copy() detaches each cohort from `df`: a 'group' column is assigned to
# both afterwards, which on a bare boolean slice raises SettingWithCopyWarning.
no_finding_mask = (df['No Finding'] == 1.0) & (df['Support Devices'] == 0.0)
ptx_mask = (df['Pneumothorax'] == 1.0) & (df['Support Devices'] == 1.0)

df_no_finding = df[no_finding_mask].copy()
df_PTX = df[ptx_mask].copy()

In [None]:
# Tag every row of both cohorts with a '<Sex>_<Race>' label, used as the
# stratum key when sampling.
for cohort in (df_no_finding, df_PTX):
    cohort['group'] = combine_categories(cohort, ['Sex', 'Race'])

In [None]:
# Rows available per Sex/Race group in the no-finding cohort.
df_no_finding['group'].value_counts()

In [None]:
# Rows available per Sex/Race group in the pneumothorax cohort.
df_PTX['group'].value_counts()

In [None]:
# Draw 282 rows without replacement from every Sex/Race group of the
# pneumothorax cohort. random_state is fixed so that Restart & Run All
# reproduces the same sample (and therefore the same exported dataset).
# NOTE(review): 282 is presumably the size of the smallest group -- confirm;
# sampling raises if any group has fewer rows than this.
df_PTX_sampled = df_PTX.groupby('group').sample(n=282, replace=False, random_state=42)
df_PTX_sampled

In [None]:
# Draw 282 rows (without replacement, the default) from every Sex/Race group
# of the no-finding cohort. random_state is fixed so that Restart & Run All
# reproduces the same sample (and therefore the same exported dataset).
# NOTE(review): 282 is presumably the size of the smallest group -- confirm;
# sampling raises if any group has fewer rows than this.
df_no_finding_sampled = df_no_finding.groupby('group').sample(n=282, random_state=42)
df_no_finding_sampled

In [None]:
# Stack the two sampled cohorts row-wise; original index values are kept.
sampled_frames = [df_PTX_sampled, df_no_finding_sampled]
df_sampled = pd.concat(sampled_frames, axis=0)

In [None]:
# Balance check: sampled rows per Race value.
df_sampled['Race'].value_counts()

In [None]:
# Balance check: sampled rows per Sex value.
df_sampled['Sex'].value_counts()

In [None]:
# Sampled rows per age group (age was not a sampling stratum).
df_sampled['Age_group'].value_counts()

In [None]:
# Sampled rows per Pneumothorax label value.
df_sampled['Pneumothorax'].value_counts()

In [None]:
# Sampled rows per 'Support Devices' label value.
df_sampled['Support Devices'].value_counts()

In [None]:
# Sampled rows per 'No Finding' label value.
df_sampled['No Finding'].value_counts()

In [None]:
# Drop the first five '/'-separated components of every image path and keep
# the remainder.
# NOTE(review): this cell overwrites Path with its own output, so running it
# a second time strips five more components -- run it exactly once.
N_LEADING_PARTS = 5
path_parts = df_sampled['Path'].str.split('/')
df_sampled['Path'] = ['/'.join(parts[N_LEADING_PARTS:]) for parts in path_parts]
df_sampled

In [None]:
# Write the sampled dataset to the working directory without the index column.
OUTPUT_CSV = 'Data1.csv'
df_sampled.to_csv(OUTPUT_CSV, index=False)