In [1]:
import pandas as pd

# Label columns that are dropped from master_labels (when present) before the
# table is simplified and written out.
exclude_list = [
    'Prostate cancer', 'Ovarian cancer', 'Breast cancer', 'Esophageal cancer',
    'Lung cancer', 'Pancreatic cancer', 'Colorectal cancer', 'Liver cancer', 'Brain cancer'
]

def simplify_dataframe(df: pd.DataFrame, n_rows: int = 50) -> pd.DataFrame:
    """Build a placeholder table that keeps only the layout of ``df``.

    The result has the same columns (in the same order) as ``df`` and at most
    ``n_rows`` rows. The first column is overwritten with the running numbers
    1..N, and every other column is filled with the value found in its first
    row. The input frame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Source table.
    n_rows : int, default 50
        Maximum number of leading rows to keep (passed to ``DataFrame.head``).

    Returns
    -------
    pd.DataFrame
        A new frame. If ``df`` has no rows or no columns, the (empty) truncated
        copy is returned unchanged, since there is no first row to replicate.
    """
    # Truncate first, then copy: only the retained rows are duplicated instead
    # of deep-copying the whole input just to discard most of it.
    df_out = df.head(n_rows).copy()

    # Nothing to renumber or replicate without at least one row and one column;
    # indexing into an empty frame would raise IndexError.
    if df_out.empty:
        return df_out

    n_kept = len(df_out)

    # First column: 1 to N
    first_col = df_out.columns[0]
    df_out[first_col] = range(1, n_kept + 1)

    # Other columns: repeat first row's value
    for col in df_out.columns[1:]:
        first_value = df_out[col].iloc[0]
        df_out[col] = [first_value] * n_kept

    return df_out

# Load the five source tables from the staging folder
real_prot = pd.read_csv('./original_files_REMOVE/measured_proteomics_random.csv')
syn_prot = pd.read_csv('./original_files_REMOVE/RABIT_proteomics_random.csv')
ehr_rep = pd.read_csv('./original_files_REMOVE/ehr_representations_random.csv')
master_labels = pd.read_csv('./original_files_REMOVE/master_labels_random.csv')
pred_time = pd.read_csv('./original_files_REMOVE/pred_time_random.csv')

# Strip the excluded label columns; names that are not present are skipped
master_labels = master_labels.drop(columns=exclude_list, errors='ignore')

# Run every table through the same simplification, in a fixed order
(
    real_prot_simplified,
    syn_prot_simplified,
    ehr_rep_simplified,
    master_labels_simplified,
    pred_time_simplified,
) = [
    simplify_dataframe(table)
    for table in (real_prot, syn_prot, ehr_rep, master_labels, pred_time)
]

# Write each simplified table next to the notebook, without the index column
for _frame, _dest in (
    (real_prot_simplified, './measured_proteomics_randomized.csv'),
    (syn_prot_simplified, './RABIT_proteomics_randomized.csv'),
    (ehr_rep_simplified, './ehr_representations_randomized.csv'),
    (master_labels_simplified, './master_labels_randomized.csv'),
    (pred_time_simplified, './pred_time_randomized.csv'),
):
    _frame.to_csv(_dest, index=False)


In [3]:
# Show the simplified table built from measured_proteomics_random.csv.
# NOTE(review): the saved output below has 10 rows although simplify_dataframe
# defaults to n_rows=50, and the execution counts are not sequential --
# presumably the source has only 10 rows or the output is stale; confirm with
# Restart Kernel -> Run All.
real_prot_simplified

Unnamed: 0,eid,aarsd1_protein,abhd14b_protein,abl1_protein,acaa1_protein,acan_protein,ace2_protein,acox1_protein,acp5_protein,acp6_protein,...,zfyve19_protein,zhx2_protein,znf174_protein,znf75d_protein,znf830_protein,znrd2_protein,znrf4_protein,zp3_protein,zp4_protein,zpr1_protein
0,1,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
1,2,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
2,3,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
3,4,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
4,5,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
5,6,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
6,7,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
7,8,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
8,9,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056
9,10,-0.06195,0.3955,0.091,-0.3971,0.1943,0.53835,-0.068,0.0998,0.9434,...,,,0.1447,,,,,-4.5514,-0.1796,0.5056


In [4]:
# Show the simplified table built from RABIT_proteomics_random.csv.
syn_prot_simplified

Unnamed: 0,patient_ids,aarsd1_prediction,abhd14b_prediction,abl1_prediction,acaa1_prediction,acan_prediction,ace2_prediction,acox1_prediction,acp5_prediction,acp6_prediction,...,zfyve19_prediction,zhx2_prediction,znf174_prediction,znf75d_prediction,znf830_prediction,znrd2_prediction,znrf4_prediction,zp3_prediction,zp4_prediction,zpr1_prediction
0,1,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
1,2,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
2,3,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
3,4,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
4,5,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
5,6,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
6,7,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
7,8,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
8,9,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659
9,10,-0.00308,-0.024954,0.052223,-0.053578,0.000485,-0.280643,0.019757,-0.094659,-0.165657,...,-0.063092,0.002289,0.073093,0.037236,0.338925,-0.022038,0.030668,-1.029901,0.037572,0.260659


In [5]:
# Show the simplified table built from ehr_representations_random.csv.
ehr_rep_simplified

Unnamed: 0,patient_ids,labeling_time,data_0,data_1,data_2,data_3,data_4,data_5,data_6,data_7,...,data_758,data_759,data_760,data_761,data_762,data_763,data_764,data_765,data_766,data_767
0,1,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
1,2,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
2,3,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
3,4,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
4,5,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
5,6,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
6,7,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
7,8,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
8,9,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746
9,10,2009-08-01,-0.319,2.057,0.547,-1.115,0.7925,-0.938,-0.0756,0.5884,...,1.402,-0.3215,-0.633,-0.8774,-0.4966,-1.514,-0.308,-0.2428,2.072,-0.1746


In [6]:
# Show the simplified table built from master_labels_random.csv, after the
# columns named in exclude_list were dropped.
master_labels_simplified

Unnamed: 0,person_id,Leukemia,Non-Hodgkin lymphoma,Type 2 diabetes_earliest,Ischemic heart disease_earliest,Cerebrovascular diseases_earliest,"Emphysema, COPD_earliest",Chronic liver diseases_earliest,Chronic kidney diseases_earliest,All-cause dementia_earliest,Alzheimer’s disease_earliest,Parkinson’s disease and parkinsonism_earliest,Rheumatoid arthritis_earliest,Macular degeneration_earliest,Osteoporosis_earliest,Osteoarthritis_earliest
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Show the simplified table built from pred_time_random.csv.
pred_time_simplified

Unnamed: 0,patient_id,prediction_time,label_type,value
0,1,2009-01-17 00:00:00,boolean,True
1,2,2009-01-17 00:00:00,boolean,True
2,3,2009-01-17 00:00:00,boolean,True
3,4,2009-01-17 00:00:00,boolean,True
4,5,2009-01-17 00:00:00,boolean,True
5,6,2009-01-17 00:00:00,boolean,True
6,7,2009-01-17 00:00:00,boolean,True
7,8,2009-01-17 00:00:00,boolean,True
8,9,2009-01-17 00:00:00,boolean,True
9,10,2009-01-17 00:00:00,boolean,True
