In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes so edits to imported .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [3]:
def _read_flat_list(csv_path):
    """Read the CSV at ``csv_path`` and return all of its cells as one flat list."""
    return pd.read_csv(csv_path).to_numpy().ravel().tolist()


# Split membership lists (presumably patient IDs -- they are compared against
# `studyid` further down).
train_set = _read_flat_list('../data/train_set.csv')
val_set = _read_flat_list('../data/val_set.csv')
train_val_set = [*train_set, *val_set]

internal_test_set = _read_flat_list('../data/internal_test_set.csv')
external_test_set = _read_flat_list('../data/external_test_set.csv')

useable_patients = _read_flat_list('../data/all_study_patients.csv')

# Load the outcome export, strip the hyphens from `studyid` so the IDs can be
# matched against `useable_patients`, then keep only the study patients.
secondary_outcome_df = (
    pd.read_csv('../data/HEALAnalysisDatabase_DATA_LABELS_2022-05-10_1416.csv')
    .assign(studyid=lambda frame: frame['studyid'].str.replace('-', ''))
    .loc[lambda frame: frame['studyid'].isin(useable_patients)]
)

In [4]:
# Outcome columns whose label distributions are inspected in the next cell.
secondary_outcome_cols = (
    # severity grade, composite outcomes and death flag
    ['hieseverity', 'primary_all', 'secondary_all', 'death24m']
    # NDI indicator columns
    + ['anyndi', 'mod2sevndi', 'sevndi']
)

In [5]:
# Show how many patients carry each label, one outcome column at a time,
# with a dashed separator line after every table.
for outcome_col in secondary_outcome_cols:
    label_counts = secondary_outcome_df[outcome_col].value_counts()
    print(label_counts, '-----', sep='\n')

hieseverity
Moderate    323
Severe       91
Name: count, dtype: int64
-----
primary_all
No death or NDI    216
Death or NDI       198
Name: count, dtype: int64
-----
secondary_all
Normal                    216
Moderate to Severe NDI    111
Mild NDI                   51
Died                       36
Name: count, dtype: int64
-----
death24m
No     378
Yes     36
Name: count, dtype: int64
-----
anyndi
No NDI     210
Any NDI    157
Name: count, dtype: int64
-----
mod2sevndi
No Moderate to Severe NDI    254
Moderate to Severe NDI       112
Name: count, dtype: int64
-----
sevndi
No Severe NDI    314
Severe NDI        53
Name: count, dtype: int64
-----


In [6]:
# Study IDs of patients flagged as excluded from the mITT analysis
# (hyphens stripped so they are comparable with `useable_patients`).
no_mitt = secondary_outcome_df.loc[secondary_outcome_df['mittflag'] == 'Do not include in mITT analysis', 'studyid'].str.replace('-', '')

# Sanity check: list any useable patient that is flagged for exclusion.
# `x in series` tests the Series *index labels*, not its values, so membership
# has to be checked against the values explicitly; otherwise this list is
# empty regardless of which IDs are flagged.
no_mitt_ids = set(no_mitt)
[x for x in useable_patients if x in no_mitt_ids]

[]

In [7]:
# Need to change outcomes to binary
def _encode_binary(labels, negative, positive):
    """Encode a two-label outcome column as 0/1.

    Parameters
    ----------
    labels : pd.Series
        Column of outcome labels; missing values are allowed.
    negative, positive : str
        Labels that are encoded as 0 and 1 respectively.

    Returns
    -------
    pd.Series
        Nullable ``Int64`` series on the same index as ``labels``; missing
        labels stay missing.

    Raises
    ------
    ValueError
        If ``labels`` contains a non-missing value other than the two
        expected labels (instead of silently leaving it un-encoded).
    """
    unexpected = set(labels.dropna().unique()) - {negative, positive}
    if unexpected:
        raise ValueError(f'Unexpected labels in {labels.name!r}: {unexpected}')
    return labels.map({negative: 0, positive: 1}).astype('Int64')


# Start from the identifier and the categorical outcome; the frame keeps the
# index of `secondary_outcome_df`, so the columns added below align row by row.
new_secondary_outcome_df = secondary_outcome_df[['studyid', 'secondary_all']].copy()

# One mislabel which can be seen on GMFCS to confirm severe NDI
new_secondary_outcome_df.loc[new_secondary_outcome_df['studyid'] == 'CLA204', 'secondary_all'] = 'Moderate to Severe NDI'

# Plain two-label columns: direct 0/1 encoding.
new_secondary_outcome_df['severe_hieseverity'] = _encode_binary(secondary_outcome_df['hieseverity'], 'Moderate', 'Severe')
new_secondary_outcome_df['death24m'] = _encode_binary(secondary_outcome_df['death24m'], 'No', 'Yes')

# Composite "death or NDI" outcomes: start from the NDI flag, force every
# patient who died to 1, then fill what is still missing with 0.
# NOTE(review): the fill assumes the remaining unlabelled patients are mild or
# normal (as stated by the original comment) -- confirm against the data.
died = new_secondary_outcome_df['secondary_all'] == 'Died'
composite_outcomes = {
    'death_or_mod2sevndi': ('mod2sevndi', 'No Moderate to Severe NDI', 'Moderate to Severe NDI'),
    'death_or_sevndi': ('sevndi', 'No Severe NDI', 'Severe NDI'),
}
for new_col, (source_col, negative, positive) in composite_outcomes.items():
    encoded = _encode_binary(secondary_outcome_df[source_col], negative, positive)
    new_secondary_outcome_df[new_col] = encoded.mask(died, 1).fillna(0)

In [8]:
# Load the existing outcomes table and keep only rows whose `studyid` belongs
# to a study patient.
outcomes_df = (
    pd.read_csv('../data/outcomes_updated.csv')
    .loc[lambda frame: frame['studyid'].isin(useable_patients)]
)

In [9]:
# Attach the binary secondary outcomes to the existing outcomes table.
# Inner join on `studyid`: only patients present in both frames are kept.
outcomes_df = pd.merge(outcomes_df, new_secondary_outcome_df, on='studyid', how='inner')

In [10]:
# Persist the merged outcomes; the DataFrame index is not written to the file.
outcomes_df.to_csv(
    path_or_buf='../data/outcomes_with_secondary_updated.csv',
    index=False,
)