In [3]:
import pandas as pd
import os

CLINICAL_PATH = "../datasets/clinical_data"

# Create clinical_df DataFrame with age_at_initial_pathologic_diagnosis from XML files:

 ## Clinical data
 
We want to create a DataFrame clinical_df with the following columns:
- **folder_name**: the name of the subfolder in clinical_data
- **file_name**: the name of the XML file
- **age_at_initial_pathologic_diagnosis**: the value from the XML file

In [4]:
def import_xml(file_path):
    """Read one clinical XML file and keep only the age-at-diagnosis column.

    The file is parsed with ``pd.read_xml`` using the ``etree`` parser.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A single-column DataFrame (``age_at_initial_pathologic_diagnosis``)
        with all-NaN rows removed, or ``None`` when the parsed frame lacks
        either ``days_to_birth`` or ``age_at_initial_pathologic_diagnosis``.
        The returned frame may be empty; a message is printed in that case.
    """
    age_col = "age_at_initial_pathologic_diagnosis"
    parsed = pd.read_xml(file_path, parser="etree")

    # Both columns must be present for the file to be considered usable.
    if not {"days_to_birth", age_col}.issubset(parsed.columns):
        return None

    ages = parsed[[age_col]].dropna(how="all")
    if ages.empty:
        print(f"File {file_path} has no valid age data.")
    return ages


def main():
    """Build the clinical DataFrame from the XML files under ``CLINICAL_PATH``.

    Every subfolder of ``CLINICAL_PATH`` is scanned for ``.xml`` files whose
    name contains neither ``annotations`` nor ``org_omf``. Each such file is
    parsed with ``import_xml`` and its rows are tagged with the folder and
    file they came from.

    Returns:
        DataFrame with columns ``folder_name``, ``file_name`` and
        ``age_at_initial_pathologic_diagnosis``. When no file contributes a
        row, an empty DataFrame with those columns is returned.
    """
    columns = ["folder_name", "file_name", "age_at_initial_pathologic_diagnosis"]
    # Collect the per-file frames and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop is quadratic, and concatenating
    # onto an empty frame triggers a pandas FutureWarning.
    frames = []

    for subfolder in os.listdir(f"{CLINICAL_PATH}"):
        subfolder_path = f"{CLINICAL_PATH}/{subfolder}"
        # A stray file next to the data folders would make os.listdir raise.
        if not os.path.isdir(subfolder_path):
            continue

        for file in os.listdir(subfolder_path):
            if not file.endswith(".xml") or "annotations" in file or "org_omf" in file:
                continue

            selected_data = import_xml(f"{subfolder_path}/{file}")
            # None: required columns missing. Empty: no usable age value.
            # (Once the two string columns are inserted a non-empty frame can
            # never be all-NaN, so checking emptiness is sufficient.)
            if selected_data is None or selected_data.empty:
                continue

            selected_data.insert(0, "file_name", file)
            selected_data.insert(0, "folder_name", subfolder)
            frames.append(selected_data)

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

clinical_df = main()
clinical_df.shape

  clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)


(771, 3)

In [5]:
clinical_df.head()

Unnamed: 0,folder_name,file_name,age_at_initial_pathologic_diagnosis
0,00049989-fa21-48fb-8dda-710c0dd5932e,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,71.0
1,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,53.0
2,00a5e81c-cd67-483f-9d99-3c733b2ead38,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,59.0
3,014f5ae1-5862-4165-9a3b-bba7bb08a527,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,55.0
4,01a962ea-a87f-49fa-9a27-7273a39f64a9,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,64.0


In [6]:
clinical_df[["age_at_initial_pathologic_diagnosis"]].describe()

Unnamed: 0,age_at_initial_pathologic_diagnosis
count,771.0
mean,57.413748
std,13.18485
min,26.0
25%,48.0
50%,58.0
75%,66.0
max,90.0


In [7]:
print("-------------------------------------\n"
      "\t\tNaN values check:")
# Count the NaN values in each column of clinical_df
print(clinical_df.isna().sum())
print("-------------------------------------")

-------------------------------------
		NaN values check:
folder_name                            0
file_name                              0
age_at_initial_pathologic_diagnosis    0
dtype: int64
-------------------------------------


In [8]:
# A column has no duplicates when its number of distinct values equals the row count.
total_rows = len(clinical_df)
folder_names_unique = clinical_df['folder_name'].nunique() == total_rows
file_names_unique = clinical_df['file_name'].nunique() == total_rows

print("-------------------------------------")
print(f"-> All folder names are unique? {folder_names_unique}")
print(f"-> All file names are unique? {file_names_unique}")
print("-------------------------------------")

-------------------------------------
-> All folder names are unique? True
-> All file names are unique? True
-------------------------------------


In [9]:
def find_non_contributing_subfolders():
    """Return the entries of ``CLINICAL_PATH`` that have no row in ``clinical_df``.

    Reads the module-level ``CLINICAL_PATH`` and ``clinical_df``.

    Returns:
        Set of names listed in ``CLINICAL_PATH`` that never appear in the
        ``folder_name`` column of ``clinical_df``.
    """
    entries_on_disk = os.listdir(f"{CLINICAL_PATH}")
    contributing = set(clinical_df['folder_name'].unique())
    return {entry for entry in entries_on_disk if entry not in contributing}

non_contributing_subfolders = find_non_contributing_subfolders()
print("Subfolders don't contribute:", non_contributing_subfolders)
print("\n-----------------------------------------------------")
print(f"-> Do not contribute to the dataframe {len(non_contributing_subfolders)} subfolders.")
print("-----------------------------------------------------")

Subfolders don't contribute: {'100659db-c917-4318-ab86-192cc46a32f3', '671069e5-4730-4c23-8adf-79576bb1843f', '024bde93-ff69-4d1f-b301-c053e8c594f5', '967f57a0-b6d6-4b6f-8882-6cb524a461ca', 'ff679dd3-62be-4332-8157-0a83ffb8516a', '403b5cef-8173-47c7-b56a-cc94dcfbb2e3', '0b2ccc49-bc42-498d-bf21-752f99c3d160', '5af9bc05-ecb6-4031-9039-a5164bb91aed', 'aad15c34-a100-4834-a83e-52a0bb0a82f3', 'a3af7034-da31-47d0-8cb7-47eb8a0dc102', '415848eb-7256-474f-9c80-ecdfec0f0a9d', 'b74b115b-1a48-4dad-a219-d5f5c76d9bbd', 'ee698dd2-1add-4bba-83d7-b2d14e8f08e7', 'a81b89be-b50e-4e63-9821-21aa9e8ff2be', '2495917b-a873-4b71-8ff3-1832fd6daba6', '1e3fc77b-4c6a-4406-a808-7d35d56cf1a1', 'cd6d0af9-6c41-4f71-81a6-103b11e50b93', '4a3c7925-3ae9-44ca-b9d9-62ba78bcba33', '7322ca0d-71ef-4eaa-bbfd-224cdb896684', '1e6b79ff-9787-4cbe-b19d-ebabb6b43589', '8162d394-8b64-4da2-9f5b-d164c54b9608', 'e1cb423c-a7bc-44d8-b460-cf2cda56af90', 'b6e8df85-fcd5-47e3-b2aa-13894cbc4326', '0a40467f-9495-4c5b-b56e-3347a3ee0572', 'cae7b60a-

**771 rows** = 827 (total subfolders) - 47 (omf files) - 9 (org_clinical_radiation_brca txt files)

# Clinical JSON

In [10]:
# Load the clinical JSON metadata into a DataFrame, then show its shape and columns.
df_clinical_json = pd.read_json('../datasets/clinical_data(json&manifest)/files.2024-12-12.json')
print(f"Data clinical JSON shape: {df_clinical_json.shape}")
df_clinical_json.columns

Data cinical JSON shape: (827, 9)


Index(['data_format', 'cases', 'access', 'file_name', 'file_id', 'data_type',
       'data_category', 'file_size', 'annotations'],
      dtype='object')

In [11]:
print(f"Column 'data_category' unique values: {df_clinical_json['data_category'].unique()}")
print(f"Column 'data_format' unique values: {df_clinical_json['data_format'].unique()}")

Column 'data_category' unique values: ['Clinical']
Column 'data_format' unique values: ['BCR XML' 'BCR OMF XML' 'BCR Biotab']


In [12]:
print(df_clinical_json[['cases', 'file_name']].shape)
df_clinical_json[['cases', 'file_name']].head()

(827, 2)


Unnamed: 0,cases,file_name
0,[{'case_id': 'e4fc0909-f284-4471-866d-d8967b6a...,nationwidechildrens.org_clinical.TCGA-E2-A14P.xml
1,[{'case_id': '87281a89-91d2-44f7-9f80-668567ad...,nationwidechildrens.org_clinical.TCGA-EW-A1J6.xml
2,[{'case_id': 'b8a615f9-d19b-4b09-8ec8-0674e5c6...,nationwidechildrens.org_clinical.TCGA-C8-A12N.xml
3,[{'case_id': '3b01d064-8c00-4972-9f07-407eac8e...,nationwidechildrens.org_clinical.TCGA-BH-A0HX.xml
4,[{'case_id': '8785012f-f73e-4d68-87cf-1d804af3...,nationwidechildrens.org_clinical.TCGA-A7-A13D.xml


In [13]:
# The 'cases' column contains lists of dictionaries
cases_expanded = df_clinical_json['cases'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else {})
unique_projects = pd.json_normalize(cases_expanded)['project.project_id'].unique()

print("Column 'cases' is a dict, with project IDs value:", unique_projects)

Column 'cases' is a dict, with project IDs value: ['TCGA-BRCA']


In [14]:
# Extract the 'case_id' value from the list of dictionaries in the 'cases' column
df_clinical_json['case_id'] = df_clinical_json['cases'].apply(
    lambda x: x[0]['case_id'] if isinstance(x, list) and len(x) > 0 and 'case_id' in x[0] else None
)

print(df_clinical_json[['case_id', 'file_name']].head())

                                case_id  \
0  e4fc0909-f284-4471-866d-d8967b6adcbc   
1  87281a89-91d2-44f7-9f80-668567ad5c72   
2  b8a615f9-d19b-4b09-8ec8-0674e5c648cd   
3  3b01d064-8c00-4972-9f07-407eac8e7534   
4  8785012f-f73e-4d68-87cf-1d804af32782   

                                           file_name  
0  nationwidechildrens.org_clinical.TCGA-E2-A14P.xml  
1  nationwidechildrens.org_clinical.TCGA-EW-A1J6.xml  
2  nationwidechildrens.org_clinical.TCGA-C8-A12N.xml  
3  nationwidechildrens.org_clinical.TCGA-BH-A0HX.xml  
4  nationwidechildrens.org_clinical.TCGA-A7-A13D.xml  


# Merge clinical_df with df_clinical_json on file_name

In [15]:
# Attach case_id to every clinical row by matching on file_name.
# Inner join: a clinical row whose file_name is absent from df_clinical_json is dropped.
merged_df = pd.merge(clinical_df, df_clinical_json[['case_id', 'file_name']], on='file_name', how='inner')
merged_df.head()

Unnamed: 0,folder_name,file_name,age_at_initial_pathologic_diagnosis,case_id
0,00049989-fa21-48fb-8dda-710c0dd5932e,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,71.0,378778d2-b331-4867-a93b-c64028c8b4c7
1,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,53.0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda
2,00a5e81c-cd67-483f-9d99-3c733b2ead38,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,59.0,3e775c99-ceda-4246-8d6f-0f58ca5097c8
3,014f5ae1-5862-4165-9a3b-bba7bb08a527,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,55.0,abdc76db-f85e-4337-a57e-6d098789da03
4,01a962ea-a87f-49fa-9a27-7273a39f64a9,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,64.0,fbee40f1-d6d8-4156-8d42-36e09bb9f095


In [16]:
merged_df.shape

(771, 4)

# miRNA_seq

In [17]:
def import_txt(file_path):
    """Read one tab-separated file and keep only the reads-per-million column.

    Args:
        file_path: Path of the tab-separated text file to read.

    Returns:
        A single-column DataFrame (``reads_per_million_miRNA_mapped``) with
        all-NaN rows removed, or ``None`` when the file has no such column.
    """
    wanted = "reads_per_million_miRNA_mapped"
    table = pd.read_csv(file_path, sep="\t")
    if wanted not in table.columns:
        return None
    return table[[wanted]].dropna(how="all")

def miRNA_process():
    """Build a long DataFrame of reads-per-million values from the miRNA files.

    Every subfolder of ``../datasets/miRNA_seq`` is scanned for ``.txt``
    files. Each one is read with ``import_txt`` and its rows are tagged with
    the folder and file they came from. The number of subfolders visited is
    printed.

    Returns:
        DataFrame with columns ``folder_name``, ``file_name`` and
        ``reads_per_million_miRNA_mapped`` (one row per value read). When no
        file contributes a row, an empty DataFrame with those columns is
        returned.
    """
    base_dir = "../datasets/miRNA_seq"
    columns = ["folder_name", "file_name", "reads_per_million_miRNA_mapped"]
    # Collect the per-file frames and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop is quadratic, and concatenating
    # onto an empty frame triggers a pandas FutureWarning.
    frames = []
    count_subfolders = 0

    for subfolder in os.listdir(base_dir):
        subfolder_path = f"{base_dir}/{subfolder}"
        # A stray file next to the data folders would make os.listdir raise,
        # and it is not a subfolder, so it is not counted either.
        if not os.path.isdir(subfolder_path):
            continue
        count_subfolders += 1

        for file in os.listdir(subfolder_path):
            # The ".txt" suffix test already excludes "annotations.xml".
            if not file.endswith(".txt"):
                continue

            selected_data = import_txt(f"{subfolder_path}/{file}")
            # None: column missing. Empty: no non-NaN value in the file.
            # (Once the two string columns are inserted a non-empty frame can
            # never be all-NaN, so checking emptiness is sufficient.)
            if selected_data is None or selected_data.empty:
                continue

            selected_data.insert(0, "file_name", file)
            selected_data.insert(0, "folder_name", subfolder)
            frames.append(selected_data)

    print(f"Subfolders: {count_subfolders}")
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)
miRNA_seq_df = miRNA_process()

  miRNA_df1 = pd.concat([miRNA_df1, selected_data], ignore_index=True)


Subfolders: 767


In [18]:
miRNA_seq_df.head()

Unnamed: 0,folder_name,file_name,reads_per_million_miRNA_mapped
0,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,5652.188302
1,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,5872.823005
2,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,5890.158589
3,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,14259.305663
4,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,1534.199167


In [19]:
# Collapse the long table to one row per (folder_name, file_name); the values
# of each group are gathered into a single Python list.
group_keys = ['folder_name', 'file_name']
simplified_miRNA_df = miRNA_seq_df.groupby(group_keys, as_index=False).agg(
    {'reads_per_million_miRNA_mapped': list}
)

simplified_miRNA_df.shape

(767, 3)

# miRNA JSON

In [20]:
df_miRNA_json = pd.read_json('../datasets/miRNA(json&manifest)/files.2024-12-12.json')
print(df_miRNA_json.columns)

Index(['data_format', 'cases', 'access', 'file_name', 'file_id', 'data_type',
       'data_category', 'experimental_strategy', 'platform', 'file_size',
       'annotations'],
      dtype='object')


In [21]:
print(df_miRNA_json[['cases', 'file_name']].head())
print(df_miRNA_json['cases'].head())
print(f"'data_category' unique: {df_miRNA_json['data_category'].unique()}")
print(df_miRNA_json['data_format'].unique())
print(df_miRNA_json.shape)

                                               cases  \
0  [{'case_id': '1c3610f7-e0aa-48d7-9a27-0dbaf6e2...   
1  [{'case_id': '241fffc8-4250-4cfa-b2e7-e68c33ae...   
2  [{'case_id': 'e5aae05a-478e-4a55-a27c-12b2b4be...   
3  [{'case_id': 'b7f74ae1-6f58-447c-be50-a7666eb1...   
4  [{'case_id': 'e7a00d67-2c26-4d1f-bd17-35f659e8...   

                                           file_name  
0  a8c7bebe-a450-4a3e-b891-e2d4ac578b04.mirbase21...  
1  97cb5037-7eea-4ba7-86c3-d13dd91e5b1e.mirbase21...  
2  6bd02673-dd3f-42ed-9997-e371b09f22ee.mirbase21...  
3  f846c0ad-9bbd-46a5-a678-fc3c05cdcc41.mirbase21...  
4  fe712c27-8dbb-4531-afb3-10999e319a7b.mirbase21...  
0    [{'case_id': '1c3610f7-e0aa-48d7-9a27-0dbaf6e2...
1    [{'case_id': '241fffc8-4250-4cfa-b2e7-e68c33ae...
2    [{'case_id': 'e5aae05a-478e-4a55-a27c-12b2b4be...
3    [{'case_id': 'b7f74ae1-6f58-447c-be50-a7666eb1...
4    [{'case_id': 'e7a00d67-2c26-4d1f-bd17-35f659e8...
Name: cases, dtype: object
'data_category' unique: ['Trans

In [22]:
print(df_miRNA_json[['cases', 'file_name']].shape)
df_miRNA_json[['cases', 'file_name']].head(1)

(767, 2)


Unnamed: 0,cases,file_name
0,[{'case_id': '1c3610f7-e0aa-48d7-9a27-0dbaf6e2...,a8c7bebe-a450-4a3e-b891-e2d4ac578b04.mirbase21...


In [23]:
# Column 'cases' contains lists of dictionaries
cases_expanded = df_miRNA_json['cases'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else {})
unique_projects = pd.json_normalize(cases_expanded)['project.project_id'].unique()

print("Unique project IDs:", unique_projects)

Unique project IDs: ['TCGA-BRCA']


In [24]:
# Extract the 'case_id' value from the list of dictionaries in the 'cases' column
df_miRNA_json['case_id'] = df_miRNA_json['cases'].apply(
    lambda x: x[0]['case_id'] if isinstance(x, list) and len(x) > 0 and 'case_id' in x[0] else None
)

print(df_miRNA_json[['case_id', 'file_name']].head())

                                case_id  \
0  1c3610f7-e0aa-48d7-9a27-0dbaf6e244f9   
1  241fffc8-4250-4cfa-b2e7-e68c33ae07dc   
2  e5aae05a-478e-4a55-a27c-12b2b4be302a   
3  b7f74ae1-6f58-447c-be50-a7666eb19d9a   
4  e7a00d67-2c26-4d1f-bd17-35f659e88bc1   

                                           file_name  
0  a8c7bebe-a450-4a3e-b891-e2d4ac578b04.mirbase21...  
1  97cb5037-7eea-4ba7-86c3-d13dd91e5b1e.mirbase21...  
2  6bd02673-dd3f-42ed-9997-e371b09f22ee.mirbase21...  
3  f846c0ad-9bbd-46a5-a678-fc3c05cdcc41.mirbase21...  
4  fe712c27-8dbb-4531-afb3-10999e319a7b.mirbase21...  


# Merge simplified_miRNA_df with df_miRNA_json on file_name

In [25]:
# Attach case_id to every per-file miRNA row by matching on file_name.
# Inner join: a row whose file_name is absent from df_miRNA_json is dropped.
miRNA_merged_df = pd.merge(simplified_miRNA_df, df_miRNA_json[['case_id', 'file_name']], on='file_name', how='inner')
miRNA_merged_df.head()

Unnamed: 0,folder_name,file_name,reads_per_million_miRNA_mapped,case_id
0,01626fb9-a7f7-4324-97f2-ef2fee03f3c7,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,"[5652.188302, 5872.823005, 5890.158589, 14259....",e6b79d7a-ed6b-459a-b040-d142616e7ab4
1,016db033-3cec-4c63-b90f-0428da475a63,f46e1aef-c572-4651-964d-a0a9bc7a1128.mirbase21...,"[8395.099993, 8287.749902, 8330.52851, 20291.5...",09c2bc35-c21f-4aa4-ac30-0d8db02ad811
2,0173d27e-ff23-42b2-afb9-9b867ace3efc,bc1018a6-62f3-4386-a545-9fa8a31a2e96.mirbase21...,"[19889.387965, 19847.305155, 19961.06025, 1472...",08de63a2-7b76-43c3-80dc-df748b1d81bc
3,0195917c-c127-4523-aff3-9e64ebdd4363,75134cfe-bc9d-4838-9a88-2ef2200b34ec.mirbase21...,"[13142.168087, 13137.975295, 13098.004007, 445...",d071c16b-7cee-45ed-8ec9-612418143815
4,046b15a8-38d3-41fc-b55f-04dd591a8e14,ad7d6f10-1dff-4bc8-a8c3-0b6ce7b45366.mirbase21...,"[10511.654941, 10471.07787, 10569.778855, 2060...",49717f75-0f2d-4e1c-9a12-f1cd7877b80a


In [26]:
miRNA_merged_df.shape

(767, 4)

# Merge the two merged DataFrames on case_id

In [27]:
# Join the clinical and miRNA tables on case_id. No `how` is given, so pd.merge
# performs its default inner join: a case present in only one table is dropped.
# The shared column names (folder_name, file_name) receive the two suffixes.
final_merged_df = pd.merge(merged_df, miRNA_merged_df, on='case_id', suffixes=('_clinical', '_miRNA'))
final_merged_df.head()

Unnamed: 0,folder_name_clinical,file_name_clinical,age_at_initial_pathologic_diagnosis,case_id,folder_name_miRNA,file_name_miRNA,reads_per_million_miRNA_mapped
0,00049989-fa21-48fb-8dda-710c0dd5932e,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,71.0,378778d2-b331-4867-a93b-c64028c8b4c7,c2cf4ba8-f4c8-4be0-b61a-2e3bd8119638,ceed6c37-0a34-42e8-98a6-5e3626e62c2f.mirbase21...,"[7988.580442, 8018.197889, 8100.662153, 32345...."
1,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,53.0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda,735ece50-9155-418b-8e31-64d1bfaf153a,6b6e80ce-d104-477a-91be-d69717c8c8ce.mirbase21...,"[11857.268807, 11728.98354, 11841.861998, 1390..."
2,014f5ae1-5862-4165-9a3b-bba7bb08a527,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,55.0,abdc76db-f85e-4337-a57e-6d098789da03,d6c77415-a3c1-4627-a0ab-dacdac3db507,c0405ebb-1311-40b2-a5c4-522d2cc62988.mirbase21...,"[7918.38843, 8062.27863, 7963.400236, 10422.81..."
3,01a962ea-a87f-49fa-9a27-7273a39f64a9,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,64.0,fbee40f1-d6d8-4156-8d42-36e09bb9f095,a8804277-1e01-4cb0-9a1a-a637ca5852d9,b56bae60-552c-48bf-a04d-999aa7cbde6f.mirbase21...,"[14251.643715, 14324.037522, 14391.970087, 137..."
4,021d8f80-3db0-4f60-b404-a71e115102ca,nationwidechildrens.org_clinical.TCGA-BH-A18K.xml,46.0,50619f8c-10aa-464a-a227-90a7aa6ffd43,1d47e720-1a02-45f4-b0dc-99861916e3e1,b34c0985-ac8d-44a9-8b10-559279fda5f4.mirbase21...,"[4818.534597, 4785.049036, 4915.228857, 9827.4..."


In [28]:
final_merged_df.shape

(767, 7)

In [29]:
final_merged_df.isna().sum()

folder_name_clinical                   0
file_name_clinical                     0
age_at_initial_pathologic_diagnosis    0
case_id                                0
folder_name_miRNA                      0
file_name_miRNA                        0
reads_per_million_miRNA_mapped         0
dtype: int64

In [30]:
final_merged_df.columns

Index(['folder_name_clinical', 'file_name_clinical',
       'age_at_initial_pathologic_diagnosis', 'case_id', 'folder_name_miRNA',
       'file_name_miRNA', 'reads_per_million_miRNA_mapped'],
      dtype='object')

## Rows that were not merged

In [31]:
# Outer join so that rows from both tables are kept, whether or not they match
all_rows_df = pd.merge(merged_df, miRNA_merged_df, on='case_id', suffixes=('_clinical', '_miRNA'), how='outer')

# Keep the rows with a NaN in any column, i.e. those with no counterpart in the other table
# NOTE(review): a NaN already present in an input column would also be selected here — confirm inputs are NaN-free
non_matching_rows_df = all_rows_df[all_rows_df.isna().any(axis=1)]

non_matching_rows_df.shape

(16, 7)

In [32]:
non_matching_rows_df.head(10)

Unnamed: 0,folder_name_clinical,file_name_clinical,age_at_initial_pathologic_diagnosis,case_id,folder_name_miRNA,file_name_miRNA,reads_per_million_miRNA_mapped
34,051ca785-e7be-4ac3-8757-1c9ace8acf7e,nationwidechildrens.org_clinical.TCGA-A8-A08F.xml,59.0,0a017f15-1c6b-45e7-8d55-e0a71df1b2e8,,,
72,492170e4-90b8-4560-89ff-effa9eeb5194,nationwidechildrens.org_clinical.TCGA-B6-A0X1.xml,48.0,178b2c48-c07d-422e-ae17-8bcfd996ad51,,,
76,48da1f05-062b-4f2d-9936-5064d9a1a9f3,nationwidechildrens.org_clinical.TCGA-AR-A0U1.xml,36.0,17d9e646-6ab3-40b3-a0bc-2c834d3c3213,,,
133,affbf824-7c87-4dac-96ed-b98edeee7775,nationwidechildrens.org_clinical.TCGA-AO-A03P.xml,54.0,2d4c778c-7f77-4f0a-8261-2086accf15fd,,,
188,f4ea8ea9-c4f2-4f93-a259-29d49be19ef8,nationwidechildrens.org_clinical.TCGA-BH-A1FD.xml,68.0,3c8b5af9-c34d-43c2-b8c9-39ea11e44fa6,,,
193,00a5e81c-cd67-483f-9d99-3c733b2ead38,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,59.0,3e775c99-ceda-4246-8d6f-0f58ca5097c8,,,
344,4b81e1c7-f651-4326-a9b7-d9ff18ccefdb,nationwidechildrens.org_clinical.TCGA-A8-A06X.xml,77.0,70931617-b3df-4a12-8e3f-2b2307602f48,,,
372,032c6c78-a0f7-4695-a93e-3f42747353a8,nationwidechildrens.org_clinical.TCGA-A8-A08C.xml,65.0,78e1da41-127c-4e9c-aaaa-77a0d94c31d0,,,
375,a41fdc1d-3836-4965-8a19-fab3364e35c1,nationwidechildrens.org_clinical.TCGA-A2-A0YJ.xml,39.0,796b593f-5886-4a27-bf94-7ea8df325e1a,,,
405,4697c0de-da15-4daa-8b39-c2f4a1fd7be9,nationwidechildrens.org_clinical.TCGA-A2-A0YE.xml,48.0,84a88e8d-1e65-4b88-aae6-bf4c7a7e4c33,,,


# Save to a CSV file

In [33]:
# Select relevant columns to save
final_merged_df_to_save = final_merged_df[['case_id', 'age_at_initial_pathologic_diagnosis', 'reads_per_million_miRNA_mapped']]
# NOTE(review): 'reads_per_million_miRNA_mapped' holds Python lists (built with agg(list) earlier
# in the notebook), so each cell is written to the CSV as the list's string representation.
final_merged_df_to_save.to_csv('../datasets/preprocessed/clinical_miRNA_ageAtInitDiagnosis.csv', index=False)

In [36]:
# Work on a copy of the 3-column frame selected for saving.
# NOTE(review): this rebinds final_merged_df, replacing the 7-column merge result built earlier.
final_merged_df = final_merged_df_to_save.copy()
# Expand the lists in 'reads_per_million_miRNA_mapped' into separate columns
# (one column per list position); any missing cell, e.g. from a shorter list, becomes 0.
reads_df = pd.DataFrame(final_merged_df["reads_per_million_miRNA_mapped"].tolist()).fillna(0)

# Name the columns by position: miRNA_1 ... miRNA_N
# NOTE(review): assumes every file lists its miRNAs in the same order — confirm against the source files
reads_df.columns = [f"miRNA_{i+1}" for i in range(reads_df.shape[1])]

# Join with the other columns (case_id, age, etc.)
df_finale = pd.concat([final_merged_df.drop(columns=["reads_per_million_miRNA_mapped"]), reads_df], axis=1)

print(df_finale.head())
df_finale.to_csv('../datasets/preprocessed/clinical_miRNA_ageAtInitDiagnosis_splitted.csv', index=False)

                                case_id  age_at_initial_pathologic_diagnosis  \
0  378778d2-b331-4867-a93b-c64028c8b4c7                                 71.0   
1  b343bfe0-7c23-4c6a-8c84-9ee39db2ecda                                 53.0   
2  abdc76db-f85e-4337-a57e-6d098789da03                                 55.0   
3  fbee40f1-d6d8-4156-8d42-36e09bb9f095                                 64.0   
4  50619f8c-10aa-464a-a227-90a7aa6ffd43                                 46.0   

        miRNA_1       miRNA_2       miRNA_3       miRNA_4      miRNA_5  \
0   7988.580442   8018.197889   8100.662153  32345.736656  3366.516489   
1  11857.268807  11728.983540  11841.861998  13907.317668  4529.287401   
2   7918.388430   8062.278630   7963.400236  10422.815821  1934.769776   
3  14251.643715  14324.037522  14391.970087  13755.431817  1047.783766   
4   4818.534597   4785.049036   4915.228857   9827.447776  1689.327738   

      miRNA_6      miRNA_7      miRNA_8  ...  miRNA_1872  miRNA_1873  \
0 

In [35]:
final_merged_df_to_save.shape

(767, 3)