In [2]:
import pandas as pd
import os

CLINICAL_PATH = "../datasets/clinical_data"

### Create clinical_df DataFrame with days_to_death from TXT files:

In [40]:
def import_xml(file_path):
    temp_df = pd.read_xml(file_path, parser="etree")

    if 'days_to_death' in temp_df.columns:
        # Remove rows in which all columns are NaN
        temp_df = temp_df[["days_to_death"]]
    
        temp_df = temp_df.dropna(how='all')
        
        if temp_df.empty:
            print(f"File {file_path} has no valid age data.")

        return temp_df
    return None

def main():
    clinical_df = pd.DataFrame(columns=["folder_name", "file_name", "days_to_death"])
    for subfolder in os.listdir(f"{CLINICAL_PATH}"):
        file_xml_or_annotation_or_orm = 0
        for file in os.listdir(f"{CLINICAL_PATH}/{subfolder}"):
            if file.endswith(".xml") and "annotations" not in file and "org_omf" not in file:
                file_xml_or_annotation_or_orm = 1
                selected_data = import_xml(f"{CLINICAL_PATH}/{subfolder}/{file}")
            
                if selected_data is not None:
                    selected_data.insert(0, "file_name", file)
                    selected_data.insert(0, "folder_name", subfolder)

                    # Safe concatenation avoiding empty DataFrames or with NaN values
                    if not selected_data.empty and not selected_data.isna().all().all():
                        clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)
                        
    return clinical_df

clinical_death_df = main()
clinical_death_df.shape

File ../datasets/clinical_data/00049989-fa21-48fb-8dda-710c0dd5932e/nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml has no valid age data.
File ../datasets/clinical_data/004b6bd4-19d0-4b40-99ef-1a76313fe7a5/nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml has no valid age data.
File ../datasets/clinical_data/00a5e81c-cd67-483f-9d99-3c733b2ead38/nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml has no valid age data.
File ../datasets/clinical_data/014f5ae1-5862-4165-9a3b-bba7bb08a527/nationwidechildrens.org_clinical.TCGA-C8-A12P.xml has no valid age data.
File ../datasets/clinical_data/01a962ea-a87f-49fa-9a27-7273a39f64a9/nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml has no valid age data.
File ../datasets/clinical_data/02726df0-5db3-4b3d-a3ca-8a1d8a0e130e/nationwidechildrens.org_clinical.TCGA-AN-A0FW.xml has no valid age data.
File ../datasets/clinical_data/02865683-19cc-4600-be9b-555c34c20db3/nationwidechildrens.org_clinical.TCGA-D8-A140.xml has no valid age data.
File ../datas

  clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)


File ../datasets/clinical_data/0b49c0d2-ac97-4446-aeff-f756ec145b8e/nationwidechildrens.org_clinical.TCGA-AR-A0TV.xml has no valid age data.
File ../datasets/clinical_data/0c5baab4-37cb-48c6-88d8-56be866d8632/nationwidechildrens.org_clinical.TCGA-AR-A24P.xml has no valid age data.
File ../datasets/clinical_data/0c75afc8-dba8-436e-b467-21b987cb63f7/nationwidechildrens.org_clinical.TCGA-EW-A1IZ.xml has no valid age data.
File ../datasets/clinical_data/0cc3d450-2c60-4cb0-a073-d92dc979fa5e/nationwidechildrens.org_clinical.TCGA-BH-A18F.xml has no valid age data.
File ../datasets/clinical_data/0d37a02a-252e-49dd-84ef-51c9d06ec43d/nationwidechildrens.org_clinical.TCGA-A7-A26H.xml has no valid age data.
File ../datasets/clinical_data/0da89962-24e4-4dba-abce-854ab68843a1/nationwidechildrens.org_clinical.TCGA-AN-A0FZ.xml has no valid age data.
File ../datasets/clinical_data/0e5052c4-7eca-427e-9a3b-dc22bbd01be4/nationwidechildrens.org_clinical.TCGA-A7-A4SD.xml has no valid age data.
File ../datas

(75, 3)

In [12]:
clinical_death_df.head()

Unnamed: 0,folder_name,file_name,days_to_death
0,021d8f80-3db0-4f60-b404-a71e115102ca,nationwidechildrens.org_clinical.TCGA-BH-A18K.xml,2763.0
1,0a9561ad-0d8e-4776-be26-c9d7b2825045,nationwidechildrens.org_clinical.TCGA-B6-A0IG.xml,4456.0
2,0ecfe554-fdc5-41f3-b287-66080d24d1bd,nationwidechildrens.org_clinical.TCGA-BH-A1ET.xml,2520.0
3,1bd2a479-da63-4cee-a722-31570b97c254,nationwidechildrens.org_clinical.TCGA-BH-A1EY.xml,538.0
4,1bf445b0-0b0d-4ba5-884a-ff70300b8795,nationwidechildrens.org_clinical.TCGA-AR-A0U2.xml,2551.0


In [15]:
clinical_death_df[clinical_death_df['days_to_death'] > 0].shape

(75, 3)

In [39]:
clinical_death_df['days_to_death'].describe()

count      75.000000
mean     1466.960000
std      1064.430179
min         1.000000
25%       575.000000
50%      1142.000000
75%      2317.000000
max      4456.000000
Name: days_to_death, dtype: float64

### Create clinical_df DataFrame with days_to_last_followup from TXT files:

In [13]:
def import_xml(file_path):
    temp_df = pd.read_xml(file_path, parser="etree")

    if 'days_to_last_followup' in temp_df.columns:
        # Remove rows in which all columns are NaN
        temp_df = temp_df[["days_to_last_followup"]]
    
        temp_df = temp_df.dropna(how='all')
        
        if temp_df.empty:
            print(f"File {file_path} has no valid age data.")

        return temp_df
    return None

def main():
    clinical_df = pd.DataFrame(columns=["folder_name", "file_name", "days_to_last_followup"])
    for subfolder in os.listdir(f"{CLINICAL_PATH}"):
        file_xml_or_annotation_or_orm = 0
        for file in os.listdir(f"{CLINICAL_PATH}/{subfolder}"):
            if file.endswith(".xml") and "annotations" not in file and "org_omf" not in file:
                file_xml_or_annotation_or_orm = 1
                selected_data = import_xml(f"{CLINICAL_PATH}/{subfolder}/{file}")
            
                if selected_data is not None:
                    selected_data.insert(0, "file_name", file)
                    selected_data.insert(0, "folder_name", subfolder)

                    # Safe concatenation avoiding empty DataFrames or with NaN values
                    if not selected_data.empty and not selected_data.isna().all().all():
                        clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)
                        
    return clinical_df

clinical_followup_df = main()
clinical_followup_df.shape

  clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)


File ../datasets/clinical_data/021d8f80-3db0-4f60-b404-a71e115102ca/nationwidechildrens.org_clinical.TCGA-BH-A18K.xml has no valid age data.
File ../datasets/clinical_data/0a9561ad-0d8e-4776-be26-c9d7b2825045/nationwidechildrens.org_clinical.TCGA-B6-A0IG.xml has no valid age data.
File ../datasets/clinical_data/0ecfe554-fdc5-41f3-b287-66080d24d1bd/nationwidechildrens.org_clinical.TCGA-BH-A1ET.xml has no valid age data.
File ../datasets/clinical_data/1bd2a479-da63-4cee-a722-31570b97c254/nationwidechildrens.org_clinical.TCGA-BH-A1EY.xml has no valid age data.
File ../datasets/clinical_data/1bf445b0-0b0d-4ba5-884a-ff70300b8795/nationwidechildrens.org_clinical.TCGA-AR-A0U2.xml has no valid age data.
File ../datasets/clinical_data/2438eb6e-278a-4c4a-b93d-e7fa50cb1080/nationwidechildrens.org_clinical.TCGA-EW-A1P8.xml has no valid age data.
File ../datasets/clinical_data/26fe7667-2972-4904-bf30-e12f9d8b6a33/nationwidechildrens.org_clinical.TCGA-BH-A18L.xml has no valid age data.
File ../datas

(696, 3)

In [14]:
clinical_followup_df.head()

Unnamed: 0,folder_name,file_name,days_to_last_followup
0,00049989-fa21-48fb-8dda-710c0dd5932e,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,1918.0
1,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,1309.0
2,00a5e81c-cd67-483f-9d99-3c733b2ead38,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,238.0
3,014f5ae1-5862-4165-9a3b-bba7bb08a527,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,0.0
4,01a962ea-a87f-49fa-9a27-7273a39f64a9,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,212.0


In [17]:
clinical_followup_df[clinical_followup_df['days_to_last_followup'] > 0].shape

(654, 3)

In [18]:
clinical_followup_df['days_to_last_followup'].describe()

count     696.000000
mean      739.989943
std       949.046203
min         0.000000
25%       118.000000
50%       394.000000
75%      1117.750000
max      6796.000000
Name: days_to_last_followup, dtype: float64