In [89]:
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np 

In [90]:
trdf = pd.read_csv('/data/ddmg/slabs/CheXpert-v1.0/train.csv')
vdf = pd.read_csv('/data/ddmg/slabs/CheXpert-v1.0/valid.csv')
df = trdf.append(vdf)
del trdf, vdf

In [91]:
df = df[((df['No Finding'] == 1) | (df['Pneumonia'] == 1))]

In [92]:
df.Pneumonia.value_counts(dropna=False, normalize=True)

NaN    0.745556
1.0    0.212429
0.0    0.042015
Name: Pneumonia, dtype: float64

In [93]:
df['patient'] = df.Path.str.extract(r'(patient)(\d+)')[1]
df['study'] = df.Path.str.extract(r'(study)(\d+)')[1].astype(int)
df['uid'] = df['patient'] + "_" + df['study'].astype(str)
df = df[['uid', 'patient', 'study', 'Sex', 'Frontal/Lateral', 'Pneumonia', 'Path']]

current_uid = len(df.uid.unique())
print(f'Total uids {current_uid}')

Total uids 21237


In [94]:
# get the main outcome
df['y0'] = df['Pneumonia'].copy()
df.y0.fillna(0, inplace = True)
df.y0[(df.y0 == -1)] = 1
df.y0.value_counts(dropna = False, normalize = True)

0.0    0.787571
1.0    0.212429
Name: y0, dtype: float64

In [95]:
# study_outcome = df.groupby('uid').agg({'y0':'max'}).reset_index()
# df.drop('y0', axis = 1, inplace = True)
# df = df.merge(study_outcome, on = ['uid'])

# df = df[(df.y0 >= 0)]
# df.drop('Pneumonia', axis = 1, inplace = True)

# print(f'Lost {100*(current_uid - len(df.uid.unique()))/current_uid:.3f}% because of uncertain labels')
# current_uid = len(df.uid.unique())

In [96]:
# get the auxiliary label 
df = df[(df.Sex != 'Unknown')]
df['y1'] = (df.Sex == 'Male').astype(int)
df.drop('Sex', axis = 1, inplace = True)

print(f'Lost {100*(current_uid - len(df.uid.unique()))/current_uid:.3f}% because of unknown sex')
current_uid = len(df.uid.unique())


Lost 0.005% because of unknown sex


In [97]:
# keep only studies with frontal views
df['frontal'] = (df['Frontal/Lateral'] == 'Frontal').astype(int)
# df.drop(['study_ever_frontal','patient_ever_frontal'], axis=1, errors='ignore', inplace =True)
# max_frontal= df.groupby(['patient', 'study']).agg({'frontal': 'max'}).reset_index()
# max_frontal.columns = ['patient', 'study', 'ever_frontal']
# df = df.merge(max_frontal, on = ['patient', 'study'])
df = df[(df.frontal ==1)]

print(f'Lost {100*(current_uid - len(df.uid.unique()))/current_uid:.3f}% because they dont have frontal views')
current_uid = len(df.uid.unique())



Lost 0.019% because they dont have frontal views


In [98]:
df.drop_duplicates(subset=['uid'], inplace = True)
print(f'Lost {100*(current_uid - df.shape[0])/current_uid:.3f}% because they have duplicates')
current_uid = len(df.uid.unique())

Lost 0.000% because they have duplicates


In [99]:
df.drop(['Frontal/Lateral', 'frontal', 'Pneumonia'], axis = 1, inplace = True)
df.head()

Unnamed: 0,uid,patient,study,Path,y0,y1
0,00001_1,1,1,CheXpert-v1.0/train/patient00001/study1/view1_...,0.0,0
5,00004_1,4,1,CheXpert-v1.0/train/patient00004/study1/view1_...,0.0,0
7,00005_1,5,1,CheXpert-v1.0/train/patient00005/study1/view1_...,0.0,1
11,00006_1,6,1,CheXpert-v1.0/train/patient00006/study1/view1_...,0.0,0
18,00010_1,10,1,CheXpert-v1.0/train/patient00010/study1/view1_...,0.0,0


In [100]:
df.y0.value_counts(normalize=True)

0.0    0.784099
1.0    0.215901
Name: y0, dtype: float64

In [101]:
df.to_csv('/data/ddmg/slabs/chexpert/clean_data.csv', index = False)

# ---- END ----# 

In [10]:
df.columns

Index(['uid', 'patient', 'study', 'Path', 'y0', 'y1'], dtype='object')

In [10]:
dfr = pd.read_csv(f'/data/ddmg/slabs/chexpert/clean_data.csv')
dfr.head()

Unnamed: 0,uid,patient,study,Path,y0,y1
0,00001_1,1,1,CheXpert-v1.0-small/train/patient00001/study1/...,0.0,0
1,00002_2,2,2,CheXpert-v1.0-small/train/patient00002/study2/...,0.0,0
2,00002_1,2,1,CheXpert-v1.0-small/train/patient00002/study1/...,0.0,0
3,00003_1,3,1,CheXpert-v1.0-small/train/patient00003/study1/...,0.0,1
4,00004_1,4,1,CheXpert-v1.0-small/train/patient00004/study1/...,0.0,0


In [11]:
rng = np.random.RandomState(0)
test_candidates = rng.choice(df.patient.unique(), 
                size = int(len(df.patient.unique())*0.3), replace = False).tolist()
train_candidates = list(set(df.patient.unique()) - set(test_candidates))
assert len(test_candidates) + len(train_candidates) == len(df.patient.unique())

In [12]:
train_candidates_df = df[(df.patient.isin(train_candidates))]
test_candidates_df = df[(df.patient.isin(test_candidates))]

In [13]:
def get_skewed_data_old(cand_df, rng):
    if rng is None:
        rng = np.random.RandomState(0)
        
    ids_11 = cand_df.uid[((cand_df.sex==1) &  (cand_df.y==1))].tolist()
    ids_10_size = int(0.1 * (len(ids_11)/0.9))
    ids_10_candidates = cand_df.uid[((cand_df.sex==1) &  (cand_df.y==0))].tolist()
    ids_10 = rng.choice(ids_10_candidates, size = ids_10_size, replace =False).tolist()

    ids_01 = cand_df.uid[((cand_df.sex==0) &  (cand_df.y==1))].tolist()
    ids_00_size = int(0.9 * (len(ids_01)/0.1))
    ids_00_candidates = cand_df.uid[((cand_df.sex==0) &  (cand_df.y==0))].tolist()
    ids_00 = rng.choice(ids_00_candidates, size = ids_00_size, replace =False).tolist()

    ids = ids_11 + ids_10 + ids_00 + ids_01
    sk_df = cand_df[(cand_df.uid.isin(ids))]
    return sk_df



def get_skewed_data(cand_df, rng):
    py1_v0 = py0_v1 =  0.9 
    py0_v0 = py1_v1 =  1 - py0_v1

    py1 = 0.3
    py0 = 1 - py1

    pv0 = 0.75
    pv1 = 1 - pv0
    pv1_y1 = pv1 * py1_v1/py1
    pv1_y0 = pv1 * py0_v1/py0

    pv0_y1 = pv0 * py1_v0/py1
    pv0_y0 = pv0 * py0_v0/py0
    


def get_unskewed_data(cand_df, skewed_data, rng):
    if rng is None:
        rng = np.random.RandomState(0)

    n_y1 = skewed_data[(skewed_data.y == 1)].shape[0]
    n_y0 = skewed_data[(skewed_data.y == 0)].shape[0]

    ids_0 = rng.choice(
        cand_df.uid[(cand_df.y==0)], 
        size = n_y0, replace=False).tolist() 
    ids_1 = rng.choice(
        cand_df.uid[(cand_df.y==1)], 
        size = n_y1, replace=False).tolist() 

    ids = ids_0 + ids_1 
    usk_df = cand_df[(cand_df.uid.isin(ids))]
    return usk_df
    
    

In [14]:
tr_sk_df = get_skewed_data(train_candidates_df, rng)
tr_usk_df = get_unskewed_data(train_candidates_df, tr_sk_df, rng)


In [56]:
tr_usk_df.sex.value_counts(normalize=True)

1    0.584308
0    0.415692
Name: sex, dtype: float64

In [84]:
tr_sk_df.shape[0] * 0.75

11738.25

In [59]:
pd.crosstab(tr_sk_df.sex, tr_sk_df.y, normalize = 'columns')

y,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.983345,0.421477
1,0.016655,0.578523


In [18]:
pd.crosstab(tr_usk_df.sex, tr_usk_df.y, normalize = 'index')

y,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.79127,0.20873
1,0.796173,0.203827


In [19]:
ts_sk_df = get_skewed_data(test_candidates_df, rng)
ts_usk_df = get_unskewed_data(test_candidates_df, ts_sk_df, rng)

In [20]:
ts_sk_df.y.value_counts(normalize=True)

0.0    0.794384
1.0    0.205616
Name: y, dtype: float64

In [21]:
ts_usk_df.y.value_counts(normalize=True)

0.0    0.794384
1.0    0.205616
Name: y, dtype: float64

In [22]:
pd.crosstab(ts_sk_df.sex, ts_sk_df.y, normalize = 'index')

y,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.9,0.1
1,0.099542,0.900458


In [23]:
pd.crosstab(ts_usk_df.sex, ts_usk_df.y, normalize = 'index')

y,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.790681,0.209319
1,0.797008,0.202992


In [24]:
weights_df = pd.DataFrame([
    {'sex':1, 'y':1, 'weight': 0.1 * 0.3}, 
    {'sex':0, 'y':1, 'weight': 0.9 * 0.3}, 
    {'sex':1, 'y':0, 'weight': 0.9 * 0.7}, 
    {'sex':0, 'y':0, 'weight': 0.1 * 0.7}, 
    
    
])

In [25]:
test_candidates_df = test_candidates_df.merge(weights_df, on=['sex', 'y'])

In [26]:
df_oskew = test_candidates_df.sample(weights=test_candidates_df.weight, n=10000, replace = False)

In [27]:
pd.crosstab(df_oskew.sex, df_oskew.y, normalize='index')

y,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.919865,0.080135
1,0.999232,0.000768


In [28]:
df_oskew.y.value_counts(normalize=True)

0.0    0.9922
1.0    0.0078
Name: y, dtype: float64

In [29]:
ts_sk_df.Path[(ts_sk_df.sex ==0)].iloc[0]

'CheXpert-v1.0-small/train/patient00001/study1/view1_frontal.jpg'

In [29]:
test_candidates_df.shape

(51120, 7)

In [44]:
np.random.choice(test_candidates_df.uid.tolist(), size = 1000, p=test_candidates_df.weight.tolist() )

ValueError: probabilities do not sum to 1

In [81]:
# pv1_y0 = pv0_y1 =  0.9 
# pv1_y1 = pv0_y0 =  1 - pv1_y0

# py1 = 0.3
# py0 = 1 - py1

# pv1 = pv1_y0 * py0 + pv1_y1 * py1
# pv0 = 1 - pv1 

# py1_v1 = (pv1_y1 * py1) / pv1
# py0_v1 = (pv1_y0 * py0) / pv1

# py1_v0 = (pv0_y1 * py1) / pv0
# py0_v0 = (pv0_y0 * py0) / pv0

py1_v0 = py0_v1 =  0.1 
py0_v0 = py1_v1 =  1 - py0_v1

py1 = 0.3
py0 = 1 - py1

pv0 = 0.75
pv1 = 1 - pv0
pv1_y1 = pv1 * py1_v1/py1
pv1_y0 = pv1 * py0_v1/py0

pv0_y1 = pv0 * py1_v0/py1
pv0_y0 = pv0 * py0_v0/py0



In [86]:
train_only_df = train_candidates_df.sample(n = int(0.75*train_candidates_df.shape[0]))


In [90]:
train_only_df.shape[0]

90785

In [89]:
n = 5000
df11 = train_only_df[
    ((train_only_df.sex==1) & (train_only_df.y==1))].sample(n=int(n* py1_v1 * pv1))
df01 = train_only_df[
    ((train_only_df.sex==0) & (train_only_df.y==1))].sample(n=int(n* py1_v0 * pv0))
df10 = train_only_df[
    ((train_only_df.sex==1) & (train_only_df.y==0))].sample(n=int(n* py0_v1 * pv1))
df00 = train_only_df[
    ((train_only_df.sex==0) & (train_only_df.y==0))].sample(n=int(n* py0_v0 * pv0))


In [77]:
test_oskew = pd.concat([df11, df01, df10, df00])


In [78]:
test_oskew.y.value_counts(normalize=True)

0.0    0.700348
1.0    0.299652
Name: y, dtype: float64

In [66]:
pd.crosstab(test_oskew.sex, test_oskew.y, normalize = 'index')

y,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.90107,0.09893
1,0.096774,0.903226


In [48]:
weights_df.sum()

sex       2.0
y         2.0
weight    1.0
dtype: float64

In [224]:
pv1

1.5558823529411765

In [160]:
pd.crosstab(df.sex, df.y, normalize='index')

y,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.973005,0.026995
1,0.973638,0.026362


In [175]:
(0.9 / 0.3)/ ((0.9 / 0.3) + (0.1 /0.7))

0.9545454545454546

In [271]:
3 * 0.25

0.75

In [283]:
(0.1 * 0.3) + (0.9 * 0.3) + (0.1*0.7) + (0.9 * 0.7)

1.0

In [310]:
1 - ((0.9 * 0.3) + (0.1 * 0.7) )

0.6599999999999999