In [27]:
import pandas as pd
import os

CLINICAL_PATH = "../../datasets/clinical_data"
CLINICAL_JSON_PATH = '../../datasets/clinical_data(json&manifest)/files.2024-12-12.json'
miRNA_PATH = '../../datasets/miRNA_seq'
miRNA_JSON_PATH = '../../datasets/miRNA(json&manifest)/files.2024-12-12.json'

mRNA_PATH = '../../datasets/RNAseq'
mRNA_JSON_PATH = '../../datasets/RNAseq(json&manifest)/files.2025-10-21.json'

FINAL_PATH = '../../datasets/preprocessed/clinical_miRNA(RC_RPM).csv'
FINAL_mRNA_PATH = '../../datasets/preprocessed/clinical_mRNA.csv'
FINAL_mRNA_protein_coding_PATH = '../../datasets/preprocessed/clinical_mRNA(protein_coding).csv'

 # Clinical data

In [28]:
def import_xml(file_path):
    temp_df = pd.read_xml(file_path, parser="etree")
    
    if 'age_at_initial_pathologic_diagnosis' in temp_df.columns:
        temp_df = temp_df[["age_at_initial_pathologic_diagnosis", "days_to_death", "days_to_last_followup"]]
    
        temp_df = temp_df.dropna(how='all')
        
        if temp_df.empty:
            print(f"File {file_path} has no valid data.")
        return temp_df
    return None


def main():
    clinical_df = pd.DataFrame(columns=["file_name", "age_at_initial_pathologic_diagnosis", "days_to_death", "days_to_last_followup"])
    for subfolder in os.listdir(f"{CLINICAL_PATH}"):
        file_xml_or_annotation_or_orm = 0
        for file in os.listdir(f"{CLINICAL_PATH}/{subfolder}"):
            if file.endswith(".xml") and "annotations" not in file and "org_omf" not in file:
                file_xml_or_annotation_or_orm = 1
                selected_data = import_xml(f"{CLINICAL_PATH}/{subfolder}/{file}")
            
                if selected_data is not None:
                    selected_data.insert(0, "file_name", file)

                    # Safe concatenation avoiding empty DataFrames or with NaN values
                    if not selected_data.empty and not selected_data.isna().all().all():
                        clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)
                        
    return clinical_df

clinical_df = main()
clinical_df.shape

  clinical_df = pd.concat([clinical_df, selected_data], ignore_index=True)


(771, 4)

# Clinical Data

Collect the following data from XML files
- **folder_name**: the name of the subfolder in clinical_data
- **file_name**: the name of the XML file
- **age_at_initial_pathologic_diagnosis**
- **days_to_death**
- **days_to_last_followup**
- **vital_status**
- **pathologic_stage**

To access to some information we have to use the namespaces defined in the XML files.

In [29]:
import xml.etree.ElementTree as ET

def import_xml(file_path, file):
    tree = ET.parse(file_path)
    root = tree.getroot()
            
    ns = {
        'admin': 'http://tcga.nci/bcr/xml/administration/2.7',
        'shared': 'http://tcga.nci/bcr/xml/shared/2.7',
        'clin_shared': 'http://tcga.nci/bcr/xml/clinical/shared/2.7',
        'stage': 'http://tcga.nci/bcr/xml/clinical/shared/stage/2.7'
    }
    
    age_elem = root.find('.//clin_shared:age_at_initial_pathologic_diagnosis', ns)
    age_at_initial_pathologic_diagnosis = int(age_elem.text) if (age_elem is not None and age_elem.text) else None
    
    death_elem = root.find('.//clin_shared:days_to_death', ns)
    days_to_death = int(death_elem.text) if (death_elem is not None and death_elem.text) else None
    
    followup_elem = root.find('.//clin_shared:days_to_last_followup', ns)
    days_to_last_followup = int(followup_elem.text) if (followup_elem is not None and followup_elem.text) else None
        
    vital_status = root.find('.//clin_shared:vital_status', ns).text
    
    pathologic_stage_elem = root.find('.//stage:pathologic_stage', ns)
    pathologic_stage = pathologic_stage_elem.text if (pathologic_stage_elem is not None and pathologic_stage_elem.text) else None
        
    records.append({"file_name": file,
                    'days_to_death': days_to_death, 
                    'vital_status': vital_status, 
                    'pathologic_stage': pathologic_stage,
                    'age_at_initial_pathologic_diagnosis': age_at_initial_pathologic_diagnosis,
                    'days_to_last_followup': days_to_last_followup
                    })
    return records

records = []
clinical_df = pd.DataFrame(columns=["file_name", "age_at_initial_pathologic_diagnosis", "days_to_death", "days_to_last_followup", "vital_status", "pathologic_stage"])
    
for subfolder in os.listdir(f"{CLINICAL_PATH}"):
        file_xml_or_annotation_or_orm = 0
        for file in os.listdir(f"{CLINICAL_PATH}/{subfolder}"):
            if file.endswith(".xml") and "annotations" not in file and "org_omf" not in file:
                selected_data = import_xml(f"{CLINICAL_PATH}/{subfolder}/{file}", file)

clinical_df = pd.DataFrame(records)
clinical_df = clinical_df.dropna(how='all')

print(clinical_df.head())

                                           file_name  days_to_death  \
0  nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml            NaN   
1  nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml            NaN   
2  nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml            NaN   
3  nationwidechildrens.org_clinical.TCGA-C8-A12P.xml            NaN   
4  nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml            NaN   

  vital_status pathologic_stage  age_at_initial_pathologic_diagnosis  \
0        Alive        Stage IIA                                   71   
1        Alive          Stage I                                   53   
2        Alive        Stage IIB                                   59   
3        Alive        Stage IIB                                   55   
4        Alive        Stage IIA                                   64   

   days_to_last_followup  
0                 1918.0  
1                 1309.0  
2                  238.0  
3                    0.0  
4    

In [4]:
clinical_df.shape

(771, 6)

### Mapping Vital Status column

In [30]:
clinical_df['Death'] = clinical_df['vital_status'].map({'Alive': 0, 'Dead': 1})
clinical_df.drop(columns=['vital_status'], inplace=True)
clinical_df.head()

Unnamed: 0,file_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,,Stage IIA,71,1918.0,0
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,,Stage I,53,1309.0,0
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,,Stage IIB,59,238.0,0
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,,Stage IIB,55,0.0,0
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,,Stage IIA,64,212.0,0


#### Handling NaN values in days_to_death and days_to_last_followup
- For `days_to_death`, replace NaN values with -1 to indicate that the patient is alive.
- For `days_to_last_followup`, replace NaN values with the corresponding value from `days_to_death`.

In [31]:
# Convert days_to_death None value into -1
clinical_df['days_to_death'] = clinical_df['days_to_death'].fillna(-1)
clinical_df.head()

Unnamed: 0,file_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,-1.0,Stage IIA,71,1918.0,0
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,-1.0,Stage I,53,1309.0,0
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,-1.0,Stage IIB,59,238.0,0
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,-1.0,Stage IIB,55,0.0,0
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,-1.0,Stage IIA,64,212.0,0


In [32]:
# Convert days_to_last_followup None value into days_to_death value
clinical_df['days_to_last_followup'] = clinical_df.apply(
    lambda row: row['days_to_death'] if pd.isna(row['days_to_last_followup']) else row['days_to_last_followup'],
    axis=1
)
clinical_df.head()

Unnamed: 0,file_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,-1.0,Stage IIA,71,1918.0,0
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,-1.0,Stage I,53,1309.0,0
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,-1.0,Stage IIB,59,238.0,0
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,-1.0,Stage IIB,55,0.0,0
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,-1.0,Stage IIA,64,212.0,0


## Clinical JSON

In [33]:
df_clinical_json = pd.read_json(CLINICAL_JSON_PATH)
df_clinical_json.shape

(827, 9)

In [34]:
# Extract the 'case_id' value from the list of dictionaries in the 'cases' column
df_clinical_json['case_id'] = df_clinical_json['cases'].apply(
    lambda x: x[0]['case_id'] if isinstance(x, list) and len(x) > 0 and 'case_id' in x[0] else None
)

print(df_clinical_json[['case_id', 'file_name']].head())

                                case_id  \
0  e4fc0909-f284-4471-866d-d8967b6adcbc   
1  87281a89-91d2-44f7-9f80-668567ad5c72   
2  b8a615f9-d19b-4b09-8ec8-0674e5c648cd   
3  3b01d064-8c00-4972-9f07-407eac8e7534   
4  8785012f-f73e-4d68-87cf-1d804af32782   

                                           file_name  
0  nationwidechildrens.org_clinical.TCGA-E2-A14P.xml  
1  nationwidechildrens.org_clinical.TCGA-EW-A1J6.xml  
2  nationwidechildrens.org_clinical.TCGA-C8-A12N.xml  
3  nationwidechildrens.org_clinical.TCGA-BH-A0HX.xml  
4  nationwidechildrens.org_clinical.TCGA-A7-A13D.xml  


## Merge clinical data with clinical JSON on file_name

In [35]:
merged_df = pd.merge(clinical_df, df_clinical_json[['case_id', 'file_name']], on='file_name', how='inner')
merged_df.head()

Unnamed: 0,file_name,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,-1.0,Stage IIA,71,1918.0,0,378778d2-b331-4867-a93b-c64028c8b4c7
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,-1.0,Stage I,53,1309.0,0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,-1.0,Stage IIB,59,238.0,0,3e775c99-ceda-4246-8d6f-0f58ca5097c8
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,-1.0,Stage IIB,55,0.0,0,abdc76db-f85e-4337-a57e-6d098789da03
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,-1.0,Stage IIA,64,212.0,0,fbee40f1-d6d8-4156-8d42-36e09bb9f095


In [11]:
merged_df.shape

(771, 7)

# miRNA_seq

In [36]:
def import_txt(file_path):
    temp_df = pd.read_csv(file_path, sep="\t")
    if 'reads_per_million_miRNA_mapped' in temp_df.columns:
        temp_df = temp_df[["miRNA_ID", "read_count", "reads_per_million_miRNA_mapped"]]
        #temp_df = temp_df[["read_count"]]
        temp_df = temp_df.dropna(how='all')
        return temp_df
    return None

def miRNA_process():
    count_subfolders = 0
    # Initialize an empty DataFrame to store the results
    miRNA_df1 = pd.DataFrame(columns=["file_name", "miRNA_ID", "read_count", "reads_per_million_miRNA_mapped"])
    for subfolder in os.listdir(miRNA_PATH):
        count_subfolders += 1
        for file in os.listdir(f"{miRNA_PATH}/{subfolder}"):
            if file.endswith(".txt") and file != "annotations.xml":
                selected_data = import_txt(f"{miRNA_PATH}/{subfolder}/{file}")

                if selected_data is not None:
                    selected_data.insert(0, "file_name", file)

                    # Safe concatenation avoiding empty DataFrames or with NaN values
                    if not selected_data.empty and not selected_data.isna().all().all():
                        miRNA_df1 = pd.concat([miRNA_df1, selected_data], ignore_index=True)
    print(f"Subfolders: {count_subfolders}")
    return miRNA_df1
miRNA_seq_df = miRNA_process()

  miRNA_df1 = pd.concat([miRNA_df1, selected_data], ignore_index=True)


Subfolders: 767


In [13]:
miRNA_seq_df.head()

Unnamed: 0,file_name,miRNA_ID,read_count,reads_per_million_miRNA_mapped
0,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7a-1,7173,5652.188302
1,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7a-2,7453,5872.823005
2,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7a-3,7475,5890.158589
3,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7b,18096,14259.305663
4,e7b7bf36-aa58-4dc0-8548-a28c11d5060f.mirbase21...,hsa-let-7c,1947,1534.199167


In [37]:
aggr_miRNA_df = miRNA_seq_df.groupby(
    ['file_name']
).agg({
    'miRNA_ID': list,
    'read_count': list,
    'reads_per_million_miRNA_mapped': list}).reset_index()

aggr_miRNA_df.shape

(767, 4)

## miRNA JSON

In [38]:
df_miRNA_json = pd.read_json(miRNA_JSON_PATH)
df_miRNA_json.shape

(767, 11)

In [39]:
# Extract the 'case_id' value from the list of dictionaries in the 'cases' column
df_miRNA_json['case_id'] = df_miRNA_json['cases'].apply(
    lambda x: x[0]['case_id'] if isinstance(x, list) and len(x) > 0 and 'case_id' in x[0] else None
)

print(df_miRNA_json[['case_id', 'file_name']].head())

                                case_id  \
0  1c3610f7-e0aa-48d7-9a27-0dbaf6e244f9   
1  241fffc8-4250-4cfa-b2e7-e68c33ae07dc   
2  e5aae05a-478e-4a55-a27c-12b2b4be302a   
3  b7f74ae1-6f58-447c-be50-a7666eb19d9a   
4  e7a00d67-2c26-4d1f-bd17-35f659e88bc1   

                                           file_name  
0  a8c7bebe-a450-4a3e-b891-e2d4ac578b04.mirbase21...  
1  97cb5037-7eea-4ba7-86c3-d13dd91e5b1e.mirbase21...  
2  6bd02673-dd3f-42ed-9997-e371b09f22ee.mirbase21...  
3  f846c0ad-9bbd-46a5-a678-fc3c05cdcc41.mirbase21...  
4  fe712c27-8dbb-4531-afb3-10999e319a7b.mirbase21...  


## Merge miRNA data with miRNA JSON on file_name

In [40]:
miRNA_merged_df = pd.merge(aggr_miRNA_df, df_miRNA_json[['case_id', 'file_name']], on='file_name', how='inner')
miRNA_merged_df.head()

Unnamed: 0,file_name,miRNA_ID,read_count,reads_per_million_miRNA_mapped,case_id
0,004b1938-87ab-4bf2-a19b-d2953d0e81a0.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[8696, 8866, 8610, 25859, 3897, 413, 749, 1585...","[6540.401419, 6668.261153, 6475.719437, 19448....",ac18b3a3-8d52-4e35-8625-673171a7fd92
1,00574710-29e7-4df3-a26d-7cdfd19aef31.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[32828, 33168, 33378, 84658, 5858, 7423, 5588,...","[6888.809148, 6960.156629, 7004.224191, 17765....",a82d0a57-4383-473d-b334-d13b278404b1
2,00cdf835-e345-4027-a68e-64db83d1a01f.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[26447, 26270, 26248, 53153, 6383, 2119, 3226,...","[6982.72655, 6935.993741, 6930.185144, 14033.8...",5580b21a-2cdb-4777-ad79-6e06654144f5
3,010de2d6-ab41-4779-923a-1c288296e267.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[24190, 24366, 24483, 43932, 11270, 1693, 3807...","[7884.689914, 7942.056819, 7980.192772, 14319....",4dfc233b-ed03-4825-8089-e04cdee99996
4,0139dd64-61be-43c9-a0cd-b1a2d2b2196c.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[9720, 9903, 9884, 39952, 1102, 509, 1467, 194...","[9625.744708, 9806.970148, 9788.154392, 39564....",53d0af48-d7ea-42c9-8695-c614b89b415b


In [18]:
miRNA_merged_df.shape

(767, 5)

# Merge Clinical data and miRNA data on case_id

In [41]:
final_clinical_miRNA_df = pd.merge(merged_df, miRNA_merged_df, on='case_id', suffixes=('_clinical', '_miRNA'))
final_clinical_miRNA_df.head()

Unnamed: 0,file_name_clinical,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id,file_name_miRNA,miRNA_ID,read_count,reads_per_million_miRNA_mapped
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,-1.0,Stage IIA,71,1918.0,0,378778d2-b331-4867-a93b-c64028c8b4c7,ceed6c37-0a34-42e8-98a6-5e3626e62c2f.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[13756, 13807, 13949, 55698, 5797, 518, 3747, ...","[7988.580442, 8018.197889, 8100.662153, 32345...."
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,-1.0,Stage I,53,1309.0,0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda,6b6e80ce-d104-477a-91be-d69717c8c8ce.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[37711, 37303, 37662, 44231, 14405, 1889, 3169...","[11857.268807, 11728.98354, 11841.861998, 1390..."
2,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,-1.0,Stage IIB,55,0.0,0,abdc76db-f85e-4337-a57e-6d098789da03,c0405ebb-1311-40b2-a5c4-522d2cc62988.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[10731, 10926, 10792, 14125, 2622, 462, 698, 3...","[7918.38843, 8062.27863, 7963.400236, 10422.81..."
3,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,-1.0,Stage IIA,64,212.0,0,fbee40f1-d6d8-4156-8d42-36e09bb9f095,b56bae60-552c-48bf-a04d-999aa7cbde6f.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[70280, 70637, 70972, 67833, 5167, 1629, 18954...","[14251.643715, 14324.037522, 14391.970087, 137..."
4,nationwidechildrens.org_clinical.TCGA-BH-A18K.xml,2763.0,Stage I,46,2763.0,1,50619f8c-10aa-464a-a227-90a7aa6ffd43,b34c0985-ac8d-44a9-8b10-559279fda5f4.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[12807, 12718, 13064, 26120, 4490, 1775, 11051...","[4818.534597, 4785.049036, 4915.228857, 9827.4..."


In [20]:
final_clinical_miRNA_df.shape

(767, 11)

In [21]:
final_clinical_miRNA_df.isna().sum()

file_name_clinical                     0
days_to_death                          0
pathologic_stage                       7
age_at_initial_pathologic_diagnosis    0
days_to_last_followup                  0
Death                                  0
case_id                                0
file_name_miRNA                        0
miRNA_ID                               0
read_count                             0
reads_per_million_miRNA_mapped         0
dtype: int64

## Analysis on rows not merged

In [122]:
# Merge con outer join per includere tutte le righe
all_rows_df = pd.merge(merged_df, miRNA_merged_df, on='case_id', suffixes=('_clinical', '_miRNA'), how='outer')

# Retrieve rows with NaN values in any column
non_matching_rows_df = all_rows_df[all_rows_df.isna().any(axis=1)]

non_matching_rows_df.shape

(23, 13)

In [123]:
non_matching_rows_df.head(10)

Unnamed: 0,file_name_clinical,folder_name_clinical,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id,folder_name_miRNA,file_name_miRNA,miRNA_ID,read_count,reads_per_million_miRNA_mapped
34,nationwidechildrens.org_clinical.TCGA-A8-A08F.xml,051ca785-e7be-4ac3-8757-1c9ace8acf7e,-1.0,Stage IIIC,59,549.0,0,0a017f15-1c6b-45e7-8d55-e0a71df1b2e8,,,,,
72,nationwidechildrens.org_clinical.TCGA-B6-A0X1.xml,492170e4-90b8-4560-89ff-effa9eeb5194,-1.0,,48,5677.0,0,178b2c48-c07d-422e-ae17-8bcfd996ad51,,,,,
76,nationwidechildrens.org_clinical.TCGA-AR-A0U1.xml,48da1f05-062b-4f2d-9936-5064d9a1a9f3,-1.0,Stage IIB,36,2134.0,0,17d9e646-6ab3-40b3-a0bc-2c834d3c3213,,,,,
133,nationwidechildrens.org_clinical.TCGA-AO-A03P.xml,affbf824-7c87-4dac-96ed-b98edeee7775,-1.0,Stage IIB,54,2576.0,0,2d4c778c-7f77-4f0a-8261-2086accf15fd,,,,,
188,nationwidechildrens.org_clinical.TCGA-BH-A1FD.xml,f4ea8ea9-c4f2-4f93-a259-29d49be19ef8,1009.0,Stage I,68,1009.0,1,3c8b5af9-c34d-43c2-b8c9-39ea11e44fa6,,,,,
193,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,00a5e81c-cd67-483f-9d99-3c733b2ead38,-1.0,Stage IIB,59,238.0,0,3e775c99-ceda-4246-8d6f-0f58ca5097c8,,,,,
206,nationwidechildrens.org_clinical.TCGA-D8-A141.xml,a423f8fd-044c-4231-8a2c-fd59968a0ab3,-1.0,,40,113.0,0,42f6f503-4cf6-4a8e-b5fe-b44bccf6b38b,58df0564-7b33-4237-9867-c3426979fd94,494315de-dc27-480c-b09c-b37cbe1bd7ab.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[45413, 45280, 45685, 113674, 7661, 1986, 3420...","[15495.213554, 15449.833082, 15588.021739, 387..."
278,nationwidechildrens.org_clinical.TCGA-D8-A13Z.xml,ab9d072b-f0ad-470c-ab84-e1276f4629ca,-1.0,,51,210.0,0,5a17dcd9-5ced-4a69-8069-23c7fd0649d1,98c6443e-64f7-438c-bbe1-bea65458e98b,4a43baff-f305-4fd4-a221-464f68d40918.mirbase21...,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[27211, 27190, 27312, 60619, 7190, 2453, 2920,...","[8545.89994, 8539.30467, 8577.62005, 19038.032..."
344,nationwidechildrens.org_clinical.TCGA-A8-A06X.xml,4b81e1c7-f651-4326-a9b7-d9ff18ccefdb,943.0,Stage IIB,77,943.0,1,70931617-b3df-4a12-8e3f-2b2307602f48,,,,,
372,nationwidechildrens.org_clinical.TCGA-A8-A08C.xml,032c6c78-a0f7-4695-a93e-3f42747353a8,-1.0,Stage IIA,65,608.0,0,78e1da41-127c-4e9c-aaaa-77a0d94c31d0,,,,,


# Save in csv file

In [44]:
#final_merged_df_to_save = encoded_final_df.drop(columns=['file_name_clinical', 'file_name_miRNA'])
final_merged_df_to_save = final_clinical_miRNA_df.drop(columns=['file_name_clinical', 'file_name_miRNA'])
final_merged_df_to_save.to_csv(FINAL_PATH, index=False)

In [46]:
final_merged_df_to_save.shape

(767, 9)

In [48]:
final_merged_df_to_save.head()

Unnamed: 0,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id,miRNA_ID,read_count,reads_per_million_miRNA_mapped
0,-1.0,Stage IIA,71,1918.0,0,378778d2-b331-4867-a93b-c64028c8b4c7,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[13756, 13807, 13949, 55698, 5797, 518, 3747, ...","[7988.580442, 8018.197889, 8100.662153, 32345...."
1,-1.0,Stage I,53,1309.0,0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[37711, 37303, 37662, 44231, 14405, 1889, 3169...","[11857.268807, 11728.98354, 11841.861998, 1390..."
2,-1.0,Stage IIB,55,0.0,0,abdc76db-f85e-4337-a57e-6d098789da03,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[10731, 10926, 10792, 14125, 2622, 462, 698, 3...","[7918.38843, 8062.27863, 7963.400236, 10422.81..."
3,-1.0,Stage IIA,64,212.0,0,fbee40f1-d6d8-4156-8d42-36e09bb9f095,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[70280, 70637, 70972, 67833, 5167, 1629, 18954...","[14251.643715, 14324.037522, 14391.970087, 137..."
4,2763.0,Stage I,46,2763.0,1,50619f8c-10aa-464a-a227-90a7aa6ffd43,"[hsa-let-7a-1, hsa-let-7a-2, hsa-let-7a-3, hsa...","[12807, 12718, 13064, 26120, 4490, 1775, 11051...","[4818.534597, 4785.049036, 4915.228857, 9827.4..."


# mRNA seq

In [None]:
def import_txt(file_path):
    dtype_map = {
        "gene_id": "string",
        "gene_name": "string",
        "gene_type": "category",
        "unstranded": "Int64",
        "tpm_unstranded": "float64",
        "fpkm_unstranded": "float64"
    }
    usecols = ['gene_id', 'gene_name', 'gene_type', 'unstranded', 'tpm_unstranded', 'fpkm_unstranded']
    temp_df = pd.read_csv(file_path, sep="\t", comment="#", usecols=usecols, dtype=dtype_map)
    return temp_df.dropna(how="all")

def mRNA_process():
    all_dfs = []
    i = 0
    for subfolder in os.listdir(mRNA_PATH):
        for file in os.listdir(f"{mRNA_PATH}/{subfolder}"):
            if file.endswith("augmented_star_gene_counts.tsv"):
                print(f"{i} | Processing file: {mRNA_PATH}/{subfolder}/{file}")
                i += 1
                selected_data = import_txt(f"{mRNA_PATH}/{subfolder}/{file}")

                if selected_data is not None and not selected_data.empty:
                    selected_data.insert(0, "file_name", file)
                    all_dfs.append(selected_data)
    df = pd.concat(all_dfs, ignore_index=True)
    return df

mRNA_df = mRNA_process()
mRNA_df.shape

0 | Processing file: ../../datasets/RNAseq/0022cd20-f64f-4773-b9ff-a3de0b71b259/8d1641ea-7552-4d23-9298-094e0056386a.rna_seq.augmented_star_gene_counts.tsv
1 | Processing file: ../../datasets/RNAseq/00469928-b243-4cae-acd7-134508e99ceb/2f51534b-248b-4999-bc3f-e42a2e98332e.rna_seq.augmented_star_gene_counts.tsv
2 | Processing file: ../../datasets/RNAseq/0094f9d0-45ec-4aad-bca0-71c60bdd7113/cafc9e36-c5f0-45df-ad03-16210ff0d870.rna_seq.augmented_star_gene_counts.tsv
3 | Processing file: ../../datasets/RNAseq/010e405c-b91d-4046-898e-105d5830d9a9/7135f14b-e84f-4ebf-8d95-b2a3c843fd4d.rna_seq.augmented_star_gene_counts.tsv
4 | Processing file: ../../datasets/RNAseq/017d71aa-0999-4d8e-9cb4-88b9013e61eb/c6683fc6-49f4-4e53-94ac-251799c2c638.rna_seq.augmented_star_gene_counts.tsv
5 | Processing file: ../../datasets/RNAseq/02e08315-4d34-421c-acc0-129b1e1de38b/4a88d54f-c88c-4ffd-84c9-069b53f2cb28.rna_seq.augmented_star_gene_counts.tsv
6 | Processing file: ../../datasets/RNAseq/03891509-3109-450d-85

In [None]:
# Drop rows with gene_id in N_unmapped, N_multimapping, N_noFeature, N_ambiguous
mRNA_df = mRNA_df[~mRNA_df['gene_id'].isin(['N_unmapped', 'N_multimapping', 'N_noFeature', 'N_ambiguous'])].reset_index(drop=True)
mRNA_df.head()

#### mRNA seq - only protein_coding

In [25]:
protein_coding_df = mRNA_df[mRNA_df["gene_type"] == "protein_coding"]
protein_coding_df.shape

(15710094, 7)

#### Aggregating mRNA data by file_name

In [12]:
aggr_mRNA_df = mRNA_df.groupby( ['file_name']).agg({
    'gene_id': list,
    'gene_name': list,
    'gene_type': list,
    'unstranded': list,
    'tpm_unstranded': list,
    'fpkm_unstranded': list
}).reset_index()

aggr_mRNA_df.shape

(787, 7)

In [10]:
aggr_mRNA_df.head()

Unnamed: 0,folder_name,file_name,gene_id,gene_name,gene_type,unstranded,tpm_unstranded,fpkm_unstranded
0,0022cd20-f64f-4773-b9ff-a3de0b71b259,8d1641ea-7552-4d23-9298-094e0056386a.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[2443, 144, 2322, 1466, 409, 1179, 11555, 2770...","[28.535, 5.169, 101.9253, 11.2845, 3.6297, 18....","[8.0216, 1.4531, 28.6527, 3.1723, 1.0204, 5.19..."
1,00469928-b243-4cae-acd7-134508e99ceb,2f51534b-248b-4999-bc3f-e42a2e98332e.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[3508, 7, 2421, 839, 744, 98, 532, 4516, 1896,...","[54.5178, 0.3343, 141.3968, 8.5928, 8.7852, 2....","[16.2355, 0.0996, 42.1081, 2.559, 2.6162, 0.60..."
2,0094f9d0-45ec-4aad-bca0-71c60bdd7113,cafc9e36-c5f0-45df-ad03-16210ff0d870.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[2890, 4, 4025, 2769, 663, 909, 6896, 3864, 31...","[30.4434, 0.1295, 159.3406, 19.2226, 5.3065, 1...","[7.6974, 0.0327, 40.2883, 4.8603, 1.3417, 3.24..."
3,010e405c-b91d-4046-898e-105d5830d9a9,7135f14b-e84f-4ebf-8d95-b2a3c843fd4d.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[3456, 22, 1779, 2176, 864, 250, 1100, 1109, 1...","[47.8819, 0.9367, 92.6273, 19.8679, 9.0951, 4....","[12.5362, 0.2452, 24.2513, 5.2017, 2.3813, 1.2..."
4,017d71aa-0999-4d8e-9cb4-88b9013e61eb,c6683fc6-49f4-4e53-94ac-251799c2c638.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[8943, 10, 2621, 1804, 2349, 1050, 736, 1765, ...","[115.377, 0.3965, 127.0776, 15.338, 23.026, 18...","[33.626, 0.1156, 37.0361, 4.4702, 6.7108, 5.29..."


In [26]:
aggr_mRNA_protein_coding_df = protein_coding_df.groupby( ['file_name']).agg({
    'gene_id': list,
    'gene_name': list,
    'gene_type': list,
    'unstranded': list,
    'tpm_unstranded': list,
    'fpkm_unstranded': list
}).reset_index()

aggr_mRNA_protein_coding_df.shape

(787, 7)

## mRNA JSON

In [13]:
df_mRNA_json = pd.read_json(mRNA_JSON_PATH)
df_mRNA_json.shape

(787, 11)

In [14]:
df_mRNA_json[['cases', 'file_name']].head(1)

Unnamed: 0,cases,file_name
0,[{'case_id': 'a76774fe-7298-4d68-a2b2-c1bad93f...,2e64abe2-6024-4d28-9e09-560ce2a9fd15.rna_seq.a...


In [15]:
# Extract the 'case_id' value from the list of dictionaries in the 'cases' column
df_mRNA_json['case_id'] = df_mRNA_json['cases'].apply(
    lambda x: x[0]['case_id'] if isinstance(x, list) and len(x) > 0 and 'case_id' in x[0] else None
)

print(df_mRNA_json[['case_id', 'file_name']].head())

                                case_id  \
0  a76774fe-7298-4d68-a2b2-c1bad93f0c31   
1  f3ba71f9-25f3-4784-bf2d-3aa522a0cba8   
2  4dfc233b-ed03-4825-8089-e04cdee99996   
3  2fe854b4-bac9-4801-9a1c-ee99388e8082   
4  30047d32-0833-4aca-98d1-3b9ab7122863   

                                           file_name  
0  2e64abe2-6024-4d28-9e09-560ce2a9fd15.rna_seq.a...  
1  1d468785-141d-40ca-acb1-d6d85a8c9d7b.rna_seq.a...  
2  748e4eaa-2b96-4dce-a903-c7df733d7f50.rna_seq.a...  
3  e614fbb4-7574-4704-9525-c0aea4c10fc6.rna_seq.a...  
4  ad8c55ae-46df-43cd-8cb5-c049e3019959.rna_seq.a...  


## Merge mRNA_df data with mRNA JSON on file_name

In [17]:
mRNA_merged_df = pd.merge(aggr_mRNA_df, df_mRNA_json[['case_id', 'file_name']], on='file_name', how='inner')
mRNA_merged_df.head()

Unnamed: 0,file_name,gene_id,gene_name,gene_type,unstranded,tpm_unstranded,fpkm_unstranded,case_id
0,00a26384-1b1c-4db4-9664-75fb9b3febdb.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[3888, 9, 1458, 2135, 440, 486, 1251, 1380, 14...","[76.5053, 0.5442, 107.8173, 27.6859, 6.5783, 1...","[19.6654, 0.1399, 27.7141, 7.1166, 1.6909, 3.2...",1d38d356-d126-4476-94d0-26616b9375b1
1,01661d94-fc16-4456-95cf-a5fa4e1e196c.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[1372, 27, 2135, 2385, 1075, 171, 1869, 2060, ...","[17.6389, 1.0668, 103.153, 20.207, 10.5009, 2....","[5.0966, 0.3082, 29.805, 5.8386, 3.0341, 0.852...",844ad251-3ee4-42f6-ad00-59a7a1670eea
2,021d9e06-9d27-400c-8776-08e89c817b46.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[2971, 24, 1956, 599, 210, 1199, 2451, 2001, 1...","[34.1691, 0.8483, 84.5407, 4.54, 1.8351, 18.49...","[13.4042, 0.3328, 33.1645, 1.781, 0.7199, 7.25...",b343bfe0-7c23-4c6a-8c84-9ee39db2ecda
3,02423743-43b4-494b-acd9-1cd3bcd3d395.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[1445, 11, 4161, 3274, 1114, 516, 3498, 3532, ...","[17.1757, 0.4018, 185.8704, 25.6461, 10.0608, ...","[4.4653, 0.1045, 48.3221, 6.6674, 2.6156, 2.13...",719082cc-1ebe-4a51-a659-85a59db1d77d
4,02a4ccea-1464-4664-bc44-bb6569a34608.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[457, 0, 1949, 4082, 1351, 135, 2478, 1894, 16...","[7.0782, 0.0, 113.4449, 41.6654, 15.8987, 2.80...","[1.9628, 0.0, 31.4592, 11.5542, 4.4088, 0.7777...",3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9


#### Only protein_coding

In [27]:
mRNA_merged_protein_df = pd.merge(aggr_mRNA_protein_coding_df, df_mRNA_json[['case_id', 'file_name']], on='file_name', how='inner')
mRNA_merged_protein_df.head()

Unnamed: 0,file_name,gene_id,gene_name,gene_type,unstranded,tpm_unstranded,fpkm_unstranded,case_id
0,00a26384-1b1c-4db4-9664-75fb9b3febdb.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[3888, 9, 1458, 2135, 440, 486, 1251, 1380, 14...","[76.5053, 0.5442, 107.8173, 27.6859, 6.5783, 1...","[19.6654, 0.1399, 27.7141, 7.1166, 1.6909, 3.2...",1d38d356-d126-4476-94d0-26616b9375b1
1,01661d94-fc16-4456-95cf-a5fa4e1e196c.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[1372, 27, 2135, 2385, 1075, 171, 1869, 2060, ...","[17.6389, 1.0668, 103.153, 20.207, 10.5009, 2....","[5.0966, 0.3082, 29.805, 5.8386, 3.0341, 0.852...",844ad251-3ee4-42f6-ad00-59a7a1670eea
2,021d9e06-9d27-400c-8776-08e89c817b46.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[2971, 24, 1956, 599, 210, 1199, 2451, 2001, 1...","[34.1691, 0.8483, 84.5407, 4.54, 1.8351, 18.49...","[13.4042, 0.3328, 33.1645, 1.781, 0.7199, 7.25...",b343bfe0-7c23-4c6a-8c84-9ee39db2ecda
3,02423743-43b4-494b-acd9-1cd3bcd3d395.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[1445, 11, 4161, 3274, 1114, 516, 3498, 3532, ...","[17.1757, 0.4018, 185.8704, 25.6461, 10.0608, ...","[4.4653, 0.1045, 48.3221, 6.6674, 2.6156, 2.13...",719082cc-1ebe-4a51-a659-85a59db1d77d
4,02a4ccea-1464-4664-bc44-bb6569a34608.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[457, 0, 1949, 4082, 1351, 135, 2478, 1894, 16...","[7.0782, 0.0, 113.4449, 41.6654, 15.8987, 2.80...","[1.9628, 0.0, 31.4592, 11.5542, 4.4088, 0.7777...",3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9


## Merge Clinical data and mRNA data on case_id

In [18]:
final_mRNA_df = pd.merge(merged_df, mRNA_merged_df, on='case_id', suffixes=('_clinical', '_mRNA'))
final_mRNA_df.head()

Unnamed: 0,file_name_clinical,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id,file_name_mRNA,gene_id,gene_name,gene_type,unstranded,tpm_unstranded,fpkm_unstranded
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,-1.0,Stage IIA,71,1918.0,0,378778d2-b331-4867-a93b-c64028c8b4c7,66941b24-9c8f-4657-a4eb-8cc267e38bdc.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[4901, 2, 2691, 2350, 791, 119, 896, 1752, 150...","[79.8332, 0.1001, 164.7321, 25.2268, 9.7898, 2...","[23.7455, 0.0298, 48.9978, 7.5034, 2.9119, 0.7..."
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,-1.0,Stage I,53,1309.0,0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda,021d9e06-9d27-400c-8776-08e89c817b46.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[2971, 24, 1956, 599, 210, 1199, 2451, 2001, 1...","[34.1691, 0.8483, 84.5407, 4.54, 1.8351, 18.49...","[13.4042, 0.3328, 33.1645, 1.781, 0.7199, 7.25..."
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,-1.0,Stage IIB,59,238.0,0,3e775c99-ceda-4246-8d6f-0f58ca5097c8,1ec0a33c-5b2e-4078-8226-48026f7d5c82.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[4951, 198, 3646, 1556, 1679, 171, 389, 6036, ...","[59.1275, 7.2669, 163.636, 12.2462, 15.2351, 2...","[15.8198, 1.9443, 43.7814, 3.2765, 4.0762, 0.7..."
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,-1.0,Stage IIB,55,0.0,0,abdc76db-f85e-4337-a57e-6d098789da03,bae57be0-48af-438d-88fa-cbaaf16b3d33.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[3834, 0, 3161, 1411, 637, 468, 4472, 2588, 16...","[52.339, 0.0, 162.1673, 12.6939, 6.6071, 8.568...","[15.7116, 0.0, 48.681, 3.8106, 1.9834, 2.5723,..."
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,-1.0,Stage IIA,64,212.0,0,fbee40f1-d6d8-4156-8d42-36e09bb9f095,e4559121-091f-41f2-9559-2abb1bbfd229.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[1555, 21, 4679, 2352, 1122, 529, 2247, 2042, ...","[21.6296, 0.8977, 244.5886, 21.5601, 11.8579, ...","[5.9762, 0.248, 67.5792, 5.957, 3.2763, 2.7268..."


In [37]:
mRNA_merged_df.shape

(787, 9)

In [38]:
mRNA_merged_df.isna().sum()

folder_name        0
file_name          0
gene_id            0
gene_name          0
gene_type          0
unstranded         0
tpm_unstranded     0
fpkm_unstranded    0
case_id            0
dtype: int64

## Save mRNA data in csv file

In [20]:
#final_mRNA_df_to_save = encoded_final_df.drop(columns=['file_name_clinical', 'file_name_mRNA'])
final_mRNA_df_to_save = final_mRNA_df.drop(columns=['file_name_clinical', 'file_name_mRNA'])
final_mRNA_df_to_save.shape

(773, 22)

In [22]:
final_mRNA_df_to_save.to_csv(FINAL_mRNA_PATH, index=False)

## Merge Clinical data and mRNA protein_coding data

In [28]:
final_mRNA_protein_coding_df = pd.merge(merged_df, mRNA_merged_protein_df, on='case_id', suffixes=('_clinical', '_mRNA'))
final_mRNA_protein_coding_df.head()

Unnamed: 0,file_name_clinical,days_to_death,pathologic_stage,age_at_initial_pathologic_diagnosis,days_to_last_followup,Death,case_id,file_name_mRNA,gene_id,gene_name,gene_type,unstranded,tpm_unstranded,fpkm_unstranded
0,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,-1.0,Stage IIA,71,1918.0,0,378778d2-b331-4867-a93b-c64028c8b4c7,66941b24-9c8f-4657-a4eb-8cc267e38bdc.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[4901, 2, 2691, 2350, 791, 119, 896, 1752, 150...","[79.8332, 0.1001, 164.7321, 25.2268, 9.7898, 2...","[23.7455, 0.0298, 48.9978, 7.5034, 2.9119, 0.7..."
1,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,-1.0,Stage I,53,1309.0,0,b343bfe0-7c23-4c6a-8c84-9ee39db2ecda,021d9e06-9d27-400c-8776-08e89c817b46.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[2971, 24, 1956, 599, 210, 1199, 2451, 2001, 1...","[34.1691, 0.8483, 84.5407, 4.54, 1.8351, 18.49...","[13.4042, 0.3328, 33.1645, 1.781, 0.7199, 7.25..."
2,nationwidechildrens.org_clinical.TCGA-D8-A1JM.xml,-1.0,Stage IIB,59,238.0,0,3e775c99-ceda-4246-8d6f-0f58ca5097c8,1ec0a33c-5b2e-4078-8226-48026f7d5c82.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[4951, 198, 3646, 1556, 1679, 171, 389, 6036, ...","[59.1275, 7.2669, 163.636, 12.2462, 15.2351, 2...","[15.8198, 1.9443, 43.7814, 3.2765, 4.0762, 0.7..."
3,nationwidechildrens.org_clinical.TCGA-C8-A12P.xml,-1.0,Stage IIB,55,0.0,0,abdc76db-f85e-4337-a57e-6d098789da03,bae57be0-48af-438d-88fa-cbaaf16b3d33.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[3834, 0, 3161, 1411, 637, 468, 4472, 2588, 16...","[52.339, 0.0, 162.1673, 12.6939, 6.6071, 8.568...","[15.7116, 0.0, 48.681, 3.8106, 1.9834, 2.5723,..."
4,nationwidechildrens.org_clinical.TCGA-S3-A6ZF.xml,-1.0,Stage IIA,64,212.0,0,fbee40f1-d6d8-4156-8d42-36e09bb9f095,e4559121-091f-41f2-9559-2abb1bbfd229.rna_seq.a...,"[ENSG00000000003.15, ENSG00000000005.6, ENSG00...","[TSPAN6, TNMD, DPM1, SCYL3, C1orf112, FGR, CFH...","[protein_coding, protein_coding, protein_codin...","[1555, 21, 4679, 2352, 1122, 529, 2247, 2042, ...","[21.6296, 0.8977, 244.5886, 21.5601, 11.8579, ...","[5.9762, 0.248, 67.5792, 5.957, 3.2763, 2.7268..."


In [29]:
# Save protein_coding mRNA data in csv file
final_mRNA_protein_coding_df.to_csv(FINAL_mRNA_protein_coding_PATH, index=False)