In [1]:
import pandas as pd
import os

CLINICAL_PATH = "../datasets/clinical_data"

# Create clinical_df DataFrame with age_at_initial_pathologic_diagnosis from XML files:

 ## Clinical data
 
We want to create a DataFrame clinical_df with the following columns:
- **folder_name**: the name of the subfolder in clinical_data
- **file_name**: the name of the XML file
- **age_at_initial_pathologic_diagnosis**: the value from the XML file

In [2]:
def import_xml(file_path):
    temp_df = pd.read_xml(file_path, parser="etree")
    
    if 'age_at_initial_pathologic_diagnosis' in temp_df.columns:
        temp_df = temp_df[["age_at_initial_pathologic_diagnosis", "days_to_death", "days_to_last_followup"]]
    
        temp_df = temp_df.dropna(how='all')
        
        if temp_df.empty:
            print(f"File {file_path} has no valid data.")
        return temp_df
    return None


def main():
    clinical_df = pd.DataFrame(columns=["folder_name", "file_name", "age_at_initial_pathologic_diagnosis", "days_to_death", "days_to_last_followup"])
    for subfolder in os.listdir(f"{CLINICAL_PATH}"):
        file_xml_or_annotation_or_orm = 0
        for file in os.listdir(f"{CLINICAL_PATH}/{subfolder}"):
            if file.endswith(".xml") and "annotations" not in file and "org_omf" not in file:
                file_xml_or_annotation_or_orm = 1
                selected_data = import_xml(f"{CLINICAL_PATH}/{subfolder}/{file}")
            
                if selected_data is not None:
                    selected_data.insert(0, "file_name", file)
                    selected_data.insert(0, "folder_name", subfolder)

                    # Safe concatenation avoiding empty DataFrames or with NaN values
                    if not selected_data.empty and not selected_data.isna().all().all():
                        clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)
                        
    return clinical_df

clinical_df = main()
clinical_df.shape

  clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)


(771, 5)

In [3]:
import xml.etree.ElementTree as ET

def import_xml(file_path, file, subfolder):
    tree = ET.parse(file_path)
    root = tree.getroot()
            
    ns = {
        'admin': 'http://tcga.nci/bcr/xml/administration/2.7',
        'shared': 'http://tcga.nci/bcr/xml/shared/2.7',
        'clin_shared': 'http://tcga.nci/bcr/xml/clinical/shared/2.7',
        'stage': 'http://tcga.nci/bcr/xml/clinical/shared/stage/2.7'
    }
    
    age_elem = root.find('.//clin_shared:age_at_initial_pathologic_diagnosis', ns)
    age_at_initial_pathologic_diagnosis = int(age_elem.text) if (age_elem is not None and age_elem.text) else None
    
    death_elem = root.find('.//clin_shared:days_to_death', ns)
    days_to_death = int(death_elem.text) if (death_elem is not None and death_elem.text) else None
    
    followup_elem = root.find('.//clin_shared:days_to_last_followup', ns)
    days_to_last_followup = int(followup_elem.text) if (followup_elem is not None and followup_elem.text) else None
        
    vital_status = root.find('.//clin_shared:vital_status', ns).text
    
    pathologic_stage_elem = root.find('.//stage:pathologic_stage', ns)
    pathologic_stage = pathologic_stage_elem.text if (pathologic_stage_elem is not None and pathologic_stage_elem.text) else None
        
    records.append({"file_name": file,
                    "folder_name": subfolder,
                    'days_to_death': days_to_death, 
                    'vital_status': vital_status, 
                    'pathologic_stage': pathologic_stage,
                    'age_at_initial_pathologic_diagnosis': age_at_initial_pathologic_diagnosis,
                    'days_to_last_followup': days_to_last_followup
                    })
    return records

records = []
clinical_df = pd.DataFrame(columns=["folder_name", "file_name", "age_at_initial_pathologic_diagnosis", "days_to_death", "days_to_last_followup", "vital_status", "pathologic_stage"])
    
for subfolder in os.listdir(f"{CLINICAL_PATH}"):
        file_xml_or_annotation_or_orm = 0
        for file in os.listdir(f"{CLINICAL_PATH}/{subfolder}"):
            if file.endswith(".xml") and "annotations" not in file and "org_omf" not in file:
                selected_data = import_xml(f"{CLINICAL_PATH}/{subfolder}/{file}", file, subfolder)

clinical_df = pd.DataFrame(records)
clinical_df = clinical_df.dropna(how='all')

print(clinical_df.head())

                                           file_name  \
0  nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml   
1  nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml   
2  nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml   
3  nationwidechildrens.org_clinical.TCGA-C8-A12P.xml   
4  nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml   

                            folder_name  days_to_death vital_status  \
0  00049989-fa21-48fb-8dda-710c0dd5932e            NaN        Alive   
1  004b6bd4-19d0-4b40-99ef-1a76313fe7a5            NaN        Alive   
2  00a5e81c-cd67-483f-9d99-3c733b2ead38            NaN        Alive   
3  014f5ae1-5862-4165-9a3b-bba7bb08a527            NaN        Alive   
4  01a962ea-a87f-49fa-9a27-7273a39f64a9            NaN        Alive   

  pathologic_stage  age_at_initial_pathologic_diagnosis  days_to_last_followup  
0        Stage IIA                                   71                 1918.0  
1          Stage I                                   53                 13

In [4]:
clinical_df.shape

(771, 7)

In [5]:
clinical_df['Death'] = clinical_df['vital_status'].map({'Alive': 0, 'Dead': 1})

In [6]:
clinical_df.drop(columns=['vital_status'], inplace=True)

In [7]:
clinical_df.head()

Unnamed: 0,file_name,folder_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,00049989-fa21-48fb-8dda-710c0dd5932e,,Stage IIA,71,1918.0,0
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,,Stage I,53,1309.0,0
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,00a5e81c-cd67-483f-9d99-3c733b2ead38,,Stage IIB,59,238.0,0
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,014f5ae1-5862-4165-9a3b-bba7bb08a527,,Stage IIB,55,0.0,0
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,01a962ea-a87f-49fa-9a27-7273a39f64a9,,Stage IIA,64,212.0,0


In [8]:
clinical_df[["age_at_initial_pathologic_diagnosis"]].describe()

Unnamed: 0,age_at_initial_pathologic_diagnosis
count,771.0
mean,57.413748
std,13.18485
min,26.0
25%,48.0
50%,58.0
75%,66.0
max,90.0


In [9]:
print("-------------------------------------\n"
      "\t\tNaN values check:")
# Check is there are any NaN values in the DataFrame
print(clinical_df.isna().sum())
print("-------------------------------------")

-------------------------------------
		NaN values check:
file_name                                0
folder_name                              0
days_to_death                          696
pathologic_stage                         8
age_at_initial_pathologic_diagnosis      0
days_to_last_followup                   75
Death                                    0
dtype: int64
-------------------------------------


In [10]:
print("-------------------------------------")
# Check if all folder_name values are unique
print(f"-> All folder names are unique? {clinical_df['folder_name'].nunique() == len(clinical_df)}")

# Check if all file_name values are unique
print(f"-> All file names are unique? {clinical_df['file_name'].nunique() == len(clinical_df)}")
print("-------------------------------------")

-------------------------------------
-> All folder names are unique? True
-> All file names are unique? True
-------------------------------------


In [11]:
# Convert days_to_death None value into -1
clinical_df['days_to_death'] = clinical_df['days_to_death'].fillna(-1)
clinical_df.head()

Unnamed: 0,file_name,folder_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,00049989-fa21-48fb-8dda-710c0dd5932e,-1.0,Stage IIA,71,1918.0,0
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,-1.0,Stage I,53,1309.0,0
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,00a5e81c-cd67-483f-9d99-3c733b2ead38,-1.0,Stage IIB,59,238.0,0
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,014f5ae1-5862-4165-9a3b-bba7bb08a527,-1.0,Stage IIB,55,0.0,0
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,01a962ea-a87f-49fa-9a27-7273a39f64a9,-1.0,Stage IIA,64,212.0,0


In [12]:
# Convert days_to_last_followup None value into days_to_death value
clinical_df['days_to_last_followup'] = clinical_df.apply(
    lambda row: row['days_to_death'] if pd.isna(row['days_to_last_followup']) else row['days_to_last_followup'],
    axis=1
)
clinical_df.head()

Unnamed: 0,file_name,folder_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,00049989-fa21-48fb-8dda-710c0dd5932e,-1.0,Stage IIA,71,1918.0,0
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,-1.0,Stage I,53,1309.0,0
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,00a5e81c-cd67-483f-9d99-3c733b2ead38,-1.0,Stage IIB,59,238.0,0
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,014f5ae1-5862-4165-9a3b-bba7bb08a527,-1.0,Stage IIB,55,0.0,0
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,01a962ea-a87f-49fa-9a27-7273a39f64a9,-1.0,Stage IIA,64,212.0,0


In [13]:
def find_non_contributing_subfolders():
    all_subfolders = set(os.listdir(f"{CLINICAL_PATH}"))
    
    # Take the subfolders that contributed to clinical_df
    contributing_subfolders = set(clinical_df['folder_name'].unique())
    
    # Folders that did not contribute
    non_contributing_subfolders = all_subfolders - contributing_subfolders
    
    return non_contributing_subfolders

non_contributing_subfolders = find_non_contributing_subfolders()
print("Subfolders don't contribute:", non_contributing_subfolders)
print("\n-----------------------------------------------------")
print(f"-> Do not contribute to the dataframe {len(non_contributing_subfolders)} subfolders.")
print("-----------------------------------------------------")

Subfolders don't contribute: {'d583e2e8-32e6-4846-ae69-6fb1880075d9', 'aad15c34-a100-4834-a83e-52a0bb0a82f3', '0b2ccc49-bc42-498d-bf21-752f99c3d160', '8162d394-8b64-4da2-9f5b-d164c54b9608', 'ff679dd3-62be-4332-8157-0a83ffb8516a', '0a40467f-9495-4c5b-b56e-3347a3ee0572', 'b6e8df85-fcd5-47e3-b2aa-13894cbc4326', '323149a5-b38e-4cc3-adb2-fcad02347ddd', '62d4515f-a30b-4b1a-b2dd-c8bf9476e803', 'f434b393-9e32-4ca2-aa1a-3b0de36b5ffe', '4a3c7925-3ae9-44ca-b9d9-62ba78bcba33', '2ee83b95-d19c-4ae4-bb5c-bad54e9ecbe2', '8e004ced-76ea-4bdf-a100-9b6509158a7c', 'e094828e-082a-4da2-9a66-f1172af28288', 'b2235f20-5387-4548-b6d0-7d092d60bf83', 'e1cb423c-a7bc-44d8-b460-cf2cda56af90', 'cae7b60a-b248-46b2-b2c4-0a6892c2538c', '024bde93-ff69-4d1f-b301-c053e8c594f5', '22f38342-3f1a-47e5-9025-c19719fa21d0', 'a1018c1b-c6c4-4399-86f7-a0a9bdfe37ce', '890c2a98-131e-426c-8435-e18626643cdb', 'a88c168e-4bba-4bd2-9c0c-77934444cc1c', '5025cc06-de19-4fcc-9c89-c4f7d6450b2c', 'ee698dd2-1add-4bba-83d7-b2d14e8f08e7', 'b74b115b-

**771 rows** = 827 (total subfolders) - 47 (omf files) - 9 (org_clinical_radiation_brca txt files)

# Clinical JSON

In [14]:
df_clinical_json = pd.read_json('../datasets/clinical_data(json&manifest)/files.2024-12-12.json')
print(f"Data cinical JSON shape: {df_clinical_json.shape}")
df_clinical_json.columns

Data cinical JSON shape: (827, 9)


Index(['data_format', 'cases', 'access', 'file_name', 'file_id', 'data_type',
       'data_category', 'file_size', 'annotations'],
      dtype='object')

In [15]:
print(f"Column 'data_category' unique values: {df_clinical_json['data_category'].unique()}")
print(f"Column 'data_format' unique values: {df_clinical_json['data_format'].unique()}")

Column 'data_category' unique values: ['Clinical']
Column 'data_format' unique values: ['BCR XML' 'BCR OMF XML' 'BCR Biotab']


In [16]:
print(df_clinical_json[['cases', 'file_name']].shape)
df_clinical_json[['cases', 'file_name']].head()

(827, 2)


Unnamed: 0,cases,file_name
0,[{'case_id': 'e4fc0909-f284-4471-866d-d8967b6a...,nationwidechildrens.org_clinical.TCGA-E2-A14P.xml
1,[{'case_id': '87281a89-91d2-44f7-9f80-668567ad...,nationwidechildrens.org_clinical.TCGA-EW-A1J6.xml
2,[{'case_id': 'b8a615f9-d19b-4b09-8ec8-0674e5c6...,nationwidechildrens.org_clinical.TCGA-C8-A12N.xml
3,[{'case_id': '3b01d064-8c00-4972-9f07-407eac8e...,nationwidechildrens.org_clinical.TCGA-BH-A0HX.xml
4,[{'case_id': '8785012f-f73e-4d68-87cf-1d804af3...,nationwidechildrens.org_clinical.TCGA-A7-A13D.xml


In [17]:
# The 'cases' column contains lists of dictionaries
cases_expanded = df_clinical_json['cases'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else {})
unique_projects = pd.json_normalize(cases_expanded)['project.project_id'].unique()

print("Column 'cases' is a dict, with project IDs value:", unique_projects)

Column 'cases' is a dict, with project IDs value: ['TCGA-BRCA']


In [18]:
# Extract the 'case_id' value from the list of dictionaries in the 'cases' column
df_clinical_json['case_id'] = df_clinical_json['cases'].apply(
    lambda x: x[0]['case_id'] if isinstance(x, list) and len(x) > 0 and 'case_id' in x[0] else None
)

print(df_clinical_json[['case_id', 'file_name']].head())

                                case_id  \
0  e4fc0909-f284-4471-866d-d8967b6adcbc   
1  87281a89-91d2-44f7-9f80-668567ad5c72   
2  b8a615f9-d19b-4b09-8ec8-0674e5c648cd   
3  3b01d064-8c00-4972-9f07-407eac8e7534   
4  8785012f-f73e-4d68-87cf-1d804af32782   

                                           file_name  
0  nationwidechildrens.org_clinical.TCGA-E2-A14P.xml  
1  nationwidechildrens.org_clinical.TCGA-EW-A1J6.xml  
2  nationwidechildrens.org_clinical.TCGA-C8-A12N.xml  
3  nationwidechildrens.org_clinical.TCGA-BH-A0HX.xml  
4  nationwidechildrens.org_clinical.TCGA-A7-A13D.xml  


# Merge clinical_df with df_clinical_json on file_name

In [19]:
merged_df = pd.merge(clinical_df, df_clinical_json[['case_id', 'file_name']], on='file_name', how='inner')
merged_df.head()

Unnamed: 0,file_name,folder_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,00049989-fa21-48fb-8dda-710c0dd5932e,-1.0,Stage IIA,71,1918.0,0,378778d2-b331-4867-a93b-c64028c8b4c7
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,-1.0,Stage I,53,1309.0,0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,00a5e81c-cd67-483f-9d99-3c733b2ead38,-1.0,Stage IIB,59,238.0,0,3e775c99-ceda-4246-8d6f-0f58ca5097c8
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,014f5ae1-5862-4165-9a3b-bba7bb08a527,-1.0,Stage IIB,55,0.0,0,abdc76db-f85e-4337-a57e-6d098789da03
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,01a962ea-a87f-49fa-9a27-7273a39f64a9,-1.0,Stage IIA,64,212.0,0,fbee40f1-d6d8-4156-8d42-36e09bb9f095


In [20]:
merged_df.shape

(771, 8)

# miRNA_seq

In [21]:
def import_txt(file_path):
    temp_df = pd.read_csv(file_path, sep="\t")
    if 'reads_per_million_miRNA_mapped' in temp_df.columns:
        temp_df = temp_df[["miRNA_ID", "read_count", "reads_per_million_miRNA_mapped"]]
        #temp_df = temp_df[["read_count"]]
        temp_df = temp_df.dropna(how='all')
        return temp_df
    return None

def miRNA_process():
    count_subfolders = 0
    # Initialize an empty DataFrame to store the results
    miRNA_df1 = pd.DataFrame(columns=["folder_name", "file_name", "miRNA_ID", "read_count", "reads_per_million_miRNA_mapped"])
    #miRNA_df1 = pd.DataFrame(columns=["folder_name", "file_name", "read_count"])
    for subfolder in os.listdir("../datasets/miRNA_seq"):
        count_subfolders += 1
        for file in os.listdir(f"../datasets/miRNA_seq/{subfolder}"):
            if file.endswith(".txt") and file != "annotations.xml":
                selected_data = import_txt(f"../datasets/miRNA_seq/{subfolder}/{file}")

                if selected_data is not None:
                    selected_data.insert(0, "file_name", file)
                    selected_data.insert(0, "folder_name", subfolder)

                    # Safe concatenation avoiding empty DataFrames or with NaN values
                    if not selected_data.empty and not selected_data.isna().all().all():
                        miRNA_df1 = pd.concat([miRNA_df1, selected_data], ignore_index=True)
    print(f"Subfolders: {count_subfolders}")
    return miRNA_df1
miRNA_seq_df = miRNA_process()

  miRNA_df1 = pd.concat([miRNA_df1, selected_data], ignore_index=True)


Subfolders: 767


In [22]:
miRNA_seq_df.head()

Unnamed: 0,folder_name,file_name,miRNA_ID,read_count,reads_per_million_miRNA_mapped
0,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7a-1,7173,5652.188302
1,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7a-2,7453,5872.823005
2,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7a-3,7475,5890.158589
3,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7b,18096,14259.305663
4,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7c,1947,1534.199167


In [23]:
simplified_miRNA_df = miRNA_seq_df.groupby(
    ['folder_name', 'file_name']
).agg({
    'miRNA_ID': list,
    'read_count': list,
    'reads_per_million_miRNA_mapped': list}).reset_index()

"""simplified_miRNA_df = miRNA_seq_df.groupby(
    ['folder_name', 'file_name']
).agg({'read_count': list}).reset_index()"""

simplified_miRNA_df.shape

(767, 5)

# miRNA JSON

In [24]:
df_miRNA_json = pd.read_json('../datasets/miRNA(json&manifest)/files.2024-12-12.json')
print(df_miRNA_json.columns)

Index(['data_format', 'cases', 'access', 'file_name', 'file_id', 'data_type',
       'data_category', 'experimental_strategy', 'platform', 'file_size',
       'annotations'],
      dtype='object')


In [25]:
print(df_miRNA_json[['cases', 'file_name']].head())
print(df_miRNA_json['cases'].head())
print(f"'data_category' unique: {df_miRNA_json['data_category'].unique()}")
print(df_miRNA_json['data_format'].unique())
print(df_miRNA_json.shape)

                                               cases  \
0  [{'case_id': '1c3610f7-e0aa-48d7-9a27-0dbaf6e2...   
1  [{'case_id': '241fffc8-4250-4cfa-b2e7-e68c33ae...   
2  [{'case_id': 'e5aae05a-478e-4a55-a27c-12b2b4be...   
3  [{'case_id': 'b7f74ae1-6f58-447c-be50-a7666eb1...   
4  [{'case_id': 'e7a00d67-2c26-4d1f-bd17-35f659e8...   

                                           file_name  
0  a8c7bebe-a450-4a3e-b891-e2d4ac578b04.mirbase21...  
1  97cb5037-7eea-4ba7-86c3-d13dd91e5b1e.mirbase21...  
2  6bd02673-dd3f-42ed-9997-e371b09f22ee.mirbase21...  
3  f846c0ad-9bbd-46a5-a678-fc3c05cdcc41.mirbase21...  
4  fe712c27-8dbb-4531-afb3-10999e319a7b.mirbase21...  
0    [{'case_id': '1c3610f7-e0aa-48d7-9a27-0dbaf6e2...
1    [{'case_id': '241fffc8-4250-4cfa-b2e7-e68c33ae...
2    [{'case_id': 'e5aae05a-478e-4a55-a27c-12b2b4be...
3    [{'case_id': 'b7f74ae1-6f58-447c-be50-a7666eb1...
4    [{'case_id': 'e7a00d67-2c26-4d1f-bd17-35f659e8...
Name: cases, dtype: object
'data_category' unique: ['Trans

In [26]:
print(df_miRNA_json[['cases', 'file_name']].shape)
df_miRNA_json[['cases', 'file_name']].head(1)

(767, 2)


Unnamed: 0,cases,file_name
0,[{'case_id': '1c3610f7-e0aa-48d7-9a27-0dbaf6e2...,a8c7bebe-a450-4a3e-b891-e2d4ac578b04.mirbase21...


In [27]:
# Column 'cases' contains lists of dictionaries
cases_expanded = df_miRNA_json['cases'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else {})
unique_projects = pd.json_normalize(cases_expanded)['project.project_id'].unique()

print("Unique project IDs:", unique_projects)

Unique project IDs: ['TCGA-BRCA']


In [28]:
# Extract the 'case_id' value from the list of dictionaries in the 'cases' column
df_miRNA_json['case_id'] = df_miRNA_json['cases'].apply(
    lambda x: x[0]['case_id'] if isinstance(x, list) and len(x) > 0 and 'case_id' in x[0] else None
)

print(df_miRNA_json[['case_id', 'file_name']].head())

                                case_id  \
0  1c3610f7-e0aa-48d7-9a27-0dbaf6e244f9   
1  241fffc8-4250-4cfa-b2e7-e68c33ae07dc   
2  e5aae05a-478e-4a55-a27c-12b2b4be302a   
3  b7f74ae1-6f58-447c-be50-a7666eb19d9a   
4  e7a00d67-2c26-4d1f-bd17-35f659e88bc1   

                                           file_name  
0  a8c7bebe-a450-4a3e-b891-e2d4ac578b04.mirbase21...  
1  97cb5037-7eea-4ba7-86c3-d13dd91e5b1e.mirbase21...  
2  6bd02673-dd3f-42ed-9997-e371b09f22ee.mirbase21...  
3  f846c0ad-9bbd-46a5-a678-fc3c05cdcc41.mirbase21...  
4  fe712c27-8dbb-4531-afb3-10999e319a7b.mirbase21...  


# Merge miRNA_df1 with df_miRNA_json on file_name

In [29]:
miRNA_merged_df = pd.merge(simplified_miRNA_df, df_miRNA_json[['case_id', 'file_name']], on='file_name', how='inner')
miRNA_merged_df.head()

Unnamed: 0,folder_name,file_name,miRNA_ID,read_count,reads_per_million_miRNA_mapped,case_id
0,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[7173, 7453, 7475, 18096, 1947, 554, 788, 3822...","[5652.188302, 5872.823005, 5890.158589, 14259....",e6b79d7a-ed6b-459a-b040-d142616e7ab4
1,016db033-3cec-4c63-b90f-0428da475a63,f46e1aef-c572-4651-964d-a0a9bc7a1128.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[10401, 10268, 10321, 25140, 632, 668, 1657, 2...","[8395.099993, 8287.749902, 8330.52851, 20291.5...",09c2bc35-c21f-4aa4-ac30-0d8db02ad811
2,0173d27e-ff23-42b2-afb9-9b867ace3efc,bc1018a6-62f3-4386-a545-9fa8a31a2e96.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[30248, 30184, 30357, 223998, 1272, 907, 1259,...","[19889.387965, 19847.305155, 19961.06025, 1472...",08de63a2-7b76-43c3-80dc-df748b1d81bc
3,0195917c-c127-4523-aff3-9e64ebdd4363,75134cfe-bc9d-4838-9a88-2ef2200b34ec.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[47017, 47002, 46859, 159551, 7356, 2186, 4169...","[13142.168087, 13137.975295, 13098.004007, 445...",d071c16b-7cee-45ed-8ec9-612418143815
4,046b15a8-38d3-41fc-b55f-04dd591a8e14,ad7d6f10-1dff-4bc8-a8c3-0b6ce7b45366.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[9585, 9548, 9638, 18789, 6743, 2383, 1126, 21...","[10511.654941, 10471.07787, 10569.778855, 2060...",49717f75-0f2d-4e1c-9a12-f1cd7877b80a


In [30]:
miRNA_merged_df.shape

(767, 6)

# Merge the two merged DataFrames on case_id

In [31]:
final_merged_df = pd.merge(merged_df, miRNA_merged_df, on='case_id', suffixes=('_clinical', '_miRNA'))
final_merged_df.head()

Unnamed: 0,file_name_clinical,folder_name_clinical,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id,folder_name_miRNA,file_name_miRNA,miRNA_ID,read_count,reads_per_million_miRNA_mapped
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,00049989-fa21-48fb-8dda-710c0dd5932e,-1.0,Stage IIA,71,1918.0,0,378778d2-b331-4867-a93b-c64028c8b4c7,c2cf4ba8-f4c8-4be0-b61a-2e3bd8119638,ceed6c37-0a34-42e8-98a6-5e3626e62c2f.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[13756, 13807, 13949, 55698, 5797, 518, 3747, ...","[7988.580442, 8018.197889, 8100.662153, 32345...."
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,-1.0,Stage I,53,1309.0,0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda,735ece50-9155-418b-8e31-64d1bfaf153a,6b6e80ce-d104-477a-91be-d69717c8c8ce.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[37711, 37303, 37662, 44231, 14405, 1889, 3169...","[11857.268807, 11728.98354, 11841.861998, 1390..."
2,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,014f5ae1-5862-4165-9a3b-bba7bb08a527,-1.0,Stage IIB,55,0.0,0,abdc76db-f85e-4337-a57e-6d098789da03,d6c77415-a3c1-4627-a0ab-dacdac3db507,c0405ebb-1311-40b2-a5c4-522d2cc62988.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[10731, 10926, 10792, 14125, 2622, 462, 698, 3...","[7918.38843, 8062.27863, 7963.400236, 10422.81..."
3,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,01a962ea-a87f-49fa-9a27-7273a39f64a9,-1.0,Stage IIA,64,212.0,0,fbee40f1-d6d8-4156-8d42-36e09bb9f095,a8804277-1e01-4cb0-9a1a-a637ca5852d9,b56bae60-552c-48bf-a04d-999aa7cbde6f.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[70280, 70637, 70972, 67833, 5167, 1629, 18954...","[14251.643715, 14324.037522, 14391.970087, 137..."
4,nationwidechildrens.org_clinical.TCGA-BH-A18K.xml,021d8f80-3db0-4f60-b404-a71e115102ca,2763.0,Stage I,46,2763.0,1,50619f8c-10aa-464a-a227-90a7aa6ffd43,1d47e720-1a02-45f4-b0dc-99861916e3e1,b34c0985-ac8d-44a9-8b10-559279fda5f4.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[12807, 12718, 13064, 26120, 4490, 1775, 11051...","[4818.534597, 4785.049036, 4915.228857, 9827.4..."


In [32]:
final_merged_df.shape

(767, 13)

In [33]:
final_merged_df.isna().sum()

file_name_clinical                     0
folder_name_clinical                   0
days_to_death                          0
pathologic_stage                       7
age_at_initial_pathologic_diagnosis    0
days_to_last_followup                  0
Death                                  0
case_id                                0
folder_name_miRNA                      0
file_name_miRNA                        0
miRNA_ID                               0
read_count                             0
reads_per_million_miRNA_mapped         0
dtype: int64

In [34]:
final_merged_df.columns

Index(['file_name_clinical', 'folder_name_clinical', 'days_to_death',
       'pathologic_stage', 'age_at_initial_pathologic_diagnosis',
       'days_to_last_followup', 'Death', 'case_id', 'folder_name_miRNA',
       'file_name_miRNA', 'miRNA_ID', 'read_count',
       'reads_per_million_miRNA_mapped'],
      dtype='object')

## Righe non mergiate

In [35]:
# Merge con outer join per includere tutte le righe
all_rows_df = pd.merge(merged_df, miRNA_merged_df, on='case_id', suffixes=('_clinical', '_miRNA'), how='outer')

# Retrieve rows with NaN values in any column
non_matching_rows_df = all_rows_df[all_rows_df.isna().any(axis=1)]

non_matching_rows_df.shape

(23, 13)

In [36]:
non_matching_rows_df.head(10)

Unnamed: 0,file_name_clinical,folder_name_clinical,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id,folder_name_miRNA,file_name_miRNA,miRNA_ID,read_count,reads_per_million_miRNA_mapped
34,nationwidechildrens.org_clinical.TCGA-A8-A08F.xml,051ca785-e7be-4ac3-8757-1c9ace8acf7e,-1.0,Stage IIIC,59,549.0,0,0a017f15-1c6b-45e7-8d55-e0a71df1b2e8,,,,,
72,nationwidechildrens.org_clinical.TCGA-B6-A0X1.xml,492170e4-90b8-4560-89ff-effa9eeb5194,-1.0,,48,5677.0,0,178b2c48-c07d-422e-ae17-8bcfd996ad51,,,,,
76,nationwidechildrens.org_clinical.TCGA-AR-A0U1.xml,48da1f05-062b-4f2d-9936-5064d9a1a9f3,-1.0,Stage IIB,36,2134.0,0,17d9e646-6ab3-40b3-a0bc-2c834d3c3213,,,,,
133,nationwidechildrens.org_clinical.TCGA-AO-A03P.xml,affbf824-7c87-4dac-96ed-b98edeee7775,-1.0,Stage IIB,54,2576.0,0,2d4c778c-7f77-4f0a-8261-2086accf15fd,,,,,
188,nationwidechildrens.org_clinical.TCGA-BH-A1FD.xml,f4ea8ea9-c4f2-4f93-a259-29d49be19ef8,1009.0,Stage I,68,1009.0,1,3c8b5af9-c34d-43c2-b8c9-39ea11e44fa6,,,,,
193,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,00a5e81c-cd67-483f-9d99-3c733b2ead38,-1.0,Stage IIB,59,238.0,0,3e775c99-ceda-4246-8d6f-0f58ca5097c8,,,,,
206,nationwidechildrens.org_clinical.TCGA-D8-A141.xml,a423f8fd-044c-4231-8a2c-fd59968a0ab3,-1.0,,40,113.0,0,42f6f503-4cf6-4a8e-b5fe-b44bccf6b38b,58df0564-7b33-4237-9867-c3426979fd94,494315de-dc27-480c-b09c-b37cbe1bd7ab.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[45413, 45280, 45685, 113674, 7661, 1986, 3420...","[15495.213554, 15449.833082, 15588.021739, 387..."
278,nationwidechildrens.org_clinical.TCGA-D8-A13Z.xml,ab9d072b-f0ad-470c-ab84-e1276f4629ca,-1.0,,51,210.0,0,5a17dcd9-5ced-4a69-8069-23c7fd0649d1,98c6443e-64f7-438c-bbe1-bea65458e98b,4a43baff-f305-4fd4-a221-464f68d40918.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[27211, 27190, 27312, 60619, 7190, 2453, 2920,...","[8545.89994, 8539.30467, 8577.62005, 19038.032..."
344,nationwidechildrens.org_clinical.TCGA-A8-A06X.xml,4b81e1c7-f651-4326-a9b7-d9ff18ccefdb,943.0,Stage IIB,77,943.0,1,70931617-b3df-4a12-8e3f-2b2307602f48,,,,,
372,nationwidechildrens.org_clinical.TCGA-A8-A08C.xml,032c6c78-a0f7-4695-a93e-3f42747353a8,-1.0,Stage IIA,65,608.0,0,78e1da41-127c-4e9c-aaaa-77a0d94c31d0,,,,,


# Save in csv file

In [37]:
# Select relevant columns to save
#final_merged_df_to_save = final_merged_df[['case_id', 'age_at_initial_pathologic_diagnosis', 'reads_per_million_miRNA_mapped']]
final_merged_df_to_save = final_merged_df.drop(columns=['folder_name_clinical', 'file_name_clinical', 'folder_name_miRNA', 'file_name_miRNA'])

final_merged_df_to_save.to_csv('../datasets/preprocessed/clinical_miRNA(RC_RPM).csv', index=False)

In [None]:
"""final_merged_df = final_merged_df_to_save.copy()
# Expand the lists in 'reads_per_million_miRNA_mapped' into separate columns
reads_df = pd.DataFrame(final_merged_df["reads_per_million_miRNA_mapped"].tolist()).fillna(0)

# Rename the columns appropriately
reads_df.columns = [f"miRNA_{i+1}" for i in range(reads_df.shape[1])]

# Unisci con le altre colonne (case_id, et√†, ecc.)
#df_finale = pd.concat([final_merged_df.drop(columns=["reads_per_million_miRNA_mapped"]), reads_df], axis=1)
df_finale = pd.concat([final_merged_df.drop(columns=["read_count"]), reads_df], axis=1)

print(df_finale.head())
df_finale.to_csv('../datasets/preprocessed/clinical_miRNA_splitted(reads_per_million).csv', index=False)"""

In [38]:
final_merged_df_to_save.shape

(767, 9)