# **Data Loading and creating dataframes**

In [27]:
import pandas as pd
import os
import json

In [28]:
# path for the dataset folder
# Elodie's path : 
folder = "/Users/elodiehusson/Desktop/dataset_DL"

# ajoutez vos chemins à vous, mais sans les noms de fichiers, juste le dossier parent : 


### **Load data**

In [29]:
clinical = pd.read_csv("../data/clinical.tsv", sep="\t", index_col="cases.case_id")
gene_name = pd.read_csv(f"{folder}/gdc_download_20251125_142547.268493/00a1a02a-2b45-4065-81c0-dd886efe8464/9a69296a-e334-4387-8a3e-43201d647d2d.rna_seq.augmented_star_gene_counts.tsv", sep="\t", comment="#", index_col="gene_id", usecols=["gene_id", "gene_name"])
gene_name[4:].to_csv("../data/gene_name.tsv", sep="\t", index=True) 

### **Creation of the expression matrix**
-> concat every file from every patient. we only keep their 'fpkm_uq_unstranded' column. 

In [30]:
# takes around 1 minute to run

expression = []

for root, dirs, files in os.walk(folder):
    for f in files:
        if f.endswith(".tsv"):
            path = os.path.join(root, f) # full path to the file

            try:
                df = pd.read_csv(path, sep="\t", comment="#")
                sample_id = f.split(".")[0]   # sample name based on filename

                # Add only the expression column for this sample
                expression.append(
                    df.set_index("gene_id")[["fpkm_uq_unstranded"]]
                    .rename(columns={"fpkm_uq_unstranded": f})
                )

            except Exception:
                pass

# Concatenate all expression columns horizontally
expr_matrix = pd.concat(expression, axis=1)
expr_matrix = expr_matrix[4:].T.copy()
expr_matrix

gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
990d59a1-18bd-4903-b1c9-f4d8b9edf980.rna_seq.augmented_star_gene_counts.tsv,12.7498,0.0361,22.1672,2.0347,0.7283,2.1849,26.4351,18.4802,2.8694,11.9485,...,0.0,0.0000,0.0468,0.0,0.4373,0.0000,4.6207,0.0,0.0000,0.4287
fc853d38-8069-41b0-af9c-77925a3f8063.rna_seq.augmented_star_gene_counts.tsv,16.3237,0.0490,33.2109,2.1767,0.9475,9.8380,42.9446,17.0994,4.8779,13.5060,...,0.0,0.0000,0.1332,0.0,0.0000,0.0045,3.8706,0.0,0.0188,0.3335
7091c2c6-682c-4c5e-810e-fa254b3a20bc.rna_seq.augmented_star_gene_counts.tsv,11.1951,0.0404,32.0691,2.6413,0.7896,6.2225,16.8074,14.3013,5.0269,11.6965,...,0.0,0.0000,0.0942,0.0,0.0000,0.0000,3.8857,0.0,0.0124,0.1421
bafc3122-091b-4648-bc51-8e6c72e47b6a.rna_seq.augmented_star_gene_counts.tsv,19.8536,0.0138,35.4080,2.3813,0.7640,3.4978,13.4795,17.3233,4.3141,12.4163,...,0.0,0.0000,0.0536,0.0,0.0000,0.0000,3.7747,0.0,0.0127,0.2545
0f6e2216-6762-4c82-aa09-a9b36a475392.rna_seq.augmented_star_gene_counts.tsv,11.9501,0.0000,26.5317,2.7781,0.6816,6.5881,11.0855,15.4334,4.1267,11.1938,...,0.0,0.0000,0.0639,0.0,0.0000,0.0000,3.6067,0.0,0.0086,0.1969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2df38eb8-5350-4951-9159-a8add6474efe.rna_seq.augmented_star_gene_counts.tsv,21.5420,0.1269,35.5278,1.7214,0.9467,4.7013,41.1976,27.1338,5.5665,9.2689,...,0.0,0.0000,0.0329,0.0,0.0000,0.0000,5.4264,0.0,0.0053,0.4763
a61ada75-0759-45d0-9cb0-f08847b16a5d.rna_seq.augmented_star_gene_counts.tsv,24.5194,0.0263,37.2317,1.5829,0.7970,5.0245,26.1152,27.1348,5.5435,10.6860,...,0.0,0.0000,0.0273,0.0,0.0000,0.0000,4.3526,0.0,0.0020,0.1387
4fa0416d-79d1-4b68-a146-1956bcaf49b7.rna_seq.augmented_star_gene_counts.tsv,20.4696,0.0338,29.0052,2.3045,0.8117,2.6664,22.5726,17.3186,3.6015,12.8668,...,0.0,0.0000,0.0307,0.0,0.0000,0.0000,4.3269,0.0,0.0104,0.3420
185ecfd0-680c-4874-8667-5d7543ec562c.rna_seq.augmented_star_gene_counts.tsv,13.3239,0.0000,34.2054,1.3185,0.2376,4.7621,26.6526,18.8882,2.6144,7.3573,...,0.0,0.3477,0.0220,0.0,0.0000,0.0000,3.9138,0.0,0.0022,0.6209


### **Gene name association**
If we need to associate a gene_id to a gene_name, we can use this dataframe :

In [31]:
gene_name[4:]

Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSG00000000003.15,TSPAN6
ENSG00000000005.6,TNMD
ENSG00000000419.13,DPM1
ENSG00000000457.14,SCYL3
ENSG00000000460.17,C1orf112
...,...
ENSG00000288669.1,AC008763.4
ENSG00000288670.1,AL592295.6
ENSG00000288671.1,AC006486.3
ENSG00000288674.1,AL391628.1


### **Creating map of patients' ID from *metadata.json***
We have various names for each sample, for each patient. We use this mapping to link every IDs with each other, from the .json file. 

In [32]:
json_path = "../data/metadata.cart.2025-11-25.json"

with open(json_path) as f:
    meta = json.load(f)

mapping = {
    entry["file_name"]: {
        "file_name" : entry["file_name"],
        "file_id": entry["file_id"],
        "case_id": entry["associated_entities"][0]["case_id"]
    }
    for entry in meta
}


Adding the right index : it is the "case_id". It will later be useful to use the clinical data

In [33]:
def get_case_id(filename):
    return mapping[filename]["case_id"]

expr_matrix.index = expr_matrix.index.map(get_case_id)
expr_matrix.index.name = "case_id"

expr_matrix.to_csv(f"{folder}/expression_matrix.tsv", sep="\t", index=True)
expr_matrix

gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78d14f86-896a-4f98-9274-b2e8e387add9,12.7498,0.0361,22.1672,2.0347,0.7283,2.1849,26.4351,18.4802,2.8694,11.9485,...,0.0,0.0000,0.0468,0.0,0.4373,0.0000,4.6207,0.0,0.0000,0.4287
a253b21e-ba9b-4928-95d1-89b59c868794,16.3237,0.0490,33.2109,2.1767,0.9475,9.8380,42.9446,17.0994,4.8779,13.5060,...,0.0,0.0000,0.1332,0.0,0.0000,0.0045,3.8706,0.0,0.0188,0.3335
754866ea-d5ca-451a-98e4-6e0c430d3652,11.1951,0.0404,32.0691,2.6413,0.7896,6.2225,16.8074,14.3013,5.0269,11.6965,...,0.0,0.0000,0.0942,0.0,0.0000,0.0000,3.8857,0.0,0.0124,0.1421
5be6c0d9-6ad7-482f-bd00-0432702df495,19.8536,0.0138,35.4080,2.3813,0.7640,3.4978,13.4795,17.3233,4.3141,12.4163,...,0.0,0.0000,0.0536,0.0,0.0000,0.0000,3.7747,0.0,0.0127,0.2545
6ee95dc8-2bac-4691-afa2-8222e07506ad,11.9501,0.0000,26.5317,2.7781,0.6816,6.5881,11.0855,15.4334,4.1267,11.1938,...,0.0,0.0000,0.0639,0.0,0.0000,0.0000,3.6067,0.0,0.0086,0.1969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3bfa52af-cb40-45d8-9bdc-591985aae7fb,21.5420,0.1269,35.5278,1.7214,0.9467,4.7013,41.1976,27.1338,5.5665,9.2689,...,0.0,0.0000,0.0329,0.0,0.0000,0.0000,5.4264,0.0,0.0053,0.4763
2106f0cd-cb3c-4568-bdc5-197ee11f2337,24.5194,0.0263,37.2317,1.5829,0.7970,5.0245,26.1152,27.1348,5.5435,10.6860,...,0.0,0.0000,0.0273,0.0,0.0000,0.0000,4.3526,0.0,0.0020,0.1387
f00fd40a-1254-4300-baf9-88528c453a2e,20.4696,0.0338,29.0052,2.3045,0.8117,2.6664,22.5726,17.3186,3.6015,12.8668,...,0.0,0.0000,0.0307,0.0,0.0000,0.0000,4.3269,0.0,0.0104,0.3420
6e1c9ac8-eead-4db9-8f30-ef3591daf78b,13.3239,0.0000,34.2054,1.3185,0.2376,4.7621,26.6526,18.8882,2.6144,7.3573,...,0.0,0.3477,0.0220,0.0,0.0000,0.0000,3.9138,0.0,0.0022,0.6209


### **Clinical informations**

In [34]:
clinical.index.name = "case_id"
clinical

Unnamed: 0_level_0,project.project_id,cases.consent_type,cases.days_to_consent,cases.days_to_lost_to_followup,cases.disease_type,cases.index_date,cases.lost_to_followup,cases.primary_site,cases.submitter_id,demographic.age_at_index,...,treatments.treatment_duration,treatments.treatment_effect,treatments.treatment_effect_indicator,treatments.treatment_frequency,treatments.treatment_id,treatments.treatment_intent_type,treatments.treatment_or_therapy,treatments.treatment_outcome,treatments.treatment_outcome_duration,treatments.treatment_type
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00a02e62-e1ab-467a-91b3-5f526dd2251a,TCGA-THCA,Informed Consent,-1,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-EL-A3N3,53,...,'--,'--,'--,'--,2b8e17b0-6707-4709-9037-c638f4f8f186,'--,yes,'--,'--,"Surgery, NOS"
00a02e62-e1ab-467a-91b3-5f526dd2251a,TCGA-THCA,Informed Consent,-1,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-EL-A3N3,53,...,'--,'--,'--,'--,7ed22d10-3b18-4830-a56f-c8795a2ef581,'--,no,'--,'--,"Pharmaceutical Therapy, NOS"
00a02e62-e1ab-467a-91b3-5f526dd2251a,TCGA-THCA,Informed Consent,-1,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-EL-A3N3,53,...,'--,'--,'--,'--,c3ed53b6-dce1-4606-8b8d-4e3c3f3b53cf,'--,no,'--,'--,"Radiation Therapy, NOS"
00a02e62-e1ab-467a-91b3-5f526dd2251a,TCGA-THCA,Informed Consent,-1,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-EL-A3N3,53,...,'--,'--,'--,'--,27b01177-7543-4858-bdf9-80800b9b277b,Adjuvant,no,'--,'--,"Radiation, External Beam"
00a02e62-e1ab-467a-91b3-5f526dd2251a,TCGA-THCA,Informed Consent,-1,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-EL-A3N3,53,...,'--,'--,'--,'--,2a00daab-4ebe-5684-b9c4-1c1ad098b87b,'--,yes,Unknown,'--,"Radiation, Systemic"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fef9c64f-5959-4da0-aaa2-66b56fc7b4c3,TCGA-THCA,Informed Consent,-50,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-EM-A3FQ,19,...,'--,'--,'--,'--,eb7de920-247f-423a-827c-17354c369002,Adjuvant,yes,Not Reported,'--,"Pharmaceutical Therapy, NOS"
fffdb1d9-58d1-425c-ac12-1e1e5f443bf7,TCGA-THCA,Informed Consent,38,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-DJ-A4V0,36,...,'--,'--,'--,'--,6150a36e-03bb-4e2f-98e5-ac8a73166c4e,Adjuvant,no,'--,'--,"Radiation, External Beam"
fffdb1d9-58d1-425c-ac12-1e1e5f443bf7,TCGA-THCA,Informed Consent,38,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-DJ-A4V0,36,...,'--,'--,'--,'--,7d525d1a-f73e-5861-9eea-ab045f3a751f,Adjuvant,no,'--,'--,"Pharmaceutical Therapy, NOS"
fffdb1d9-58d1-425c-ac12-1e1e5f443bf7,TCGA-THCA,Informed Consent,38,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Thyroid gland,TCGA-DJ-A4V0,36,...,'--,'--,'--,'--,855b2a8d-4c27-4ede-a58a-b45925e44d22,Adjuvant,no,'--,'--,I-131 Radiation Therapy


### **Duplicates**

#### **Duplicates in the expression matrix**

In [35]:
dup_expr = expr_matrix.index.duplicated().sum()
print("There are", dup_expr, f"duplicated case_ids in expression matrix \nIt means that {dup_expr} patients had 2 samples sequenced")

There are 67 duplicated case_ids in expression matrix 
It means that 67 patients had 2 samples sequenced


In [36]:
dup_clinical = clinical.index.duplicated().sum()
print("There are", dup_clinical, f"duplicated case_ids in expression matrix \nIt means that {dup_clinical} patients had multiple clinical entries, related to their treatment types (4 duplicates actually)")


There are 1829 duplicated case_ids in expression matrix 
It means that 1829 patients had multiple clinical entries, related to their treatment types (4 duplicates actually)
