In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split

# **Chargement de la matrice d'expression et des métadonnées**

In [112]:
# Read the THCA expression matrix (first CSV column used as the index) and
# transpose it in one step, so that each row is a sample and each column a gene.
X = pd.read_csv("../data/THCA_expression_matrix_final.csv", index_col=0).T

In [113]:
# Preview the first rows of the transposed matrix (samples as rows, genes as columns).
X.head()

Unnamed: 0,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
ID=TCGA-DJ-A2Q6_AGE=39ANS_SEX=female_STAGE=Stade_I_MUT=BRAF_TYPE=PTC,6156,7,2540,1452,297,1075,26323,3481,3414,4228,...,0,0,26,0,0,0,714,0,7,31
ID=TCGA-FK-A3SE_AGE=31ANS_SEX=female_STAGE=Stade_I_MUT=NO_CANONICAL_DRIVER_TYPE=PTC,7191,3,3815,767,276,1069,18403,3861,2562,5715,...,0,0,10,0,0,0,598,0,5,22
ID=TCGA-DJ-A2QA_AGE=57ANS_SEX=female_STAGE=Stade_III_MUT=BRAF_TYPE=PTC,2219,1,735,305,28,303,4710,2995,495,618,...,0,0,3,0,0,0,144,0,0,31
ID=TCGA-FY-A2QD_AGE=62ANS_SEX=female_STAGE=Stage_NA_MUT=NO_CANONICAL_DRIVER_TYPE=PTC,5185,0,2594,970,135,71,2986,2033,2469,2901,...,0,2,9,0,0,0,603,0,2,11
ID=TCGA-EL-A3GR_AGE=32ANS_SEX=female_STAGE=Stade_I_MUT=BRAF_TYPE=PTC,4466,3,1496,820,208,610,25601,2051,1894,2378,...,0,0,5,0,0,0,365,0,3,12


### ajout des metadata à la matrice d'expression

In [114]:
# Sample identifiers look like
#   ID=..._AGE=39ANS_SEX=female_STAGE=Stade_I_MUT=BRAF_TYPE=PTC
# and carry the clinical metadata that is parsed below.
labels = pd.Index(X.index)   # index = patients

# Age in years, taken from the "AGE=<digits>ANS" field (NaN when the field is absent).
age = (
    labels
    .str.extract(r"AGE=(\d+)ANS")[0]
    .astype(float))

# Sex encoded as 1 = male, 0 = female (NaN for any other value).
sex = (
    labels
    .str.extract(r"SEX=([a-zA-Z]+)")[0]
    .str.lower()
    .map({"male": 1, "female": 0}))

# WARNING: avoid putting the metadata below into X_full.
# They are only meant to be added when a specific model needs them.

# Stage is a two-token value ("Stade_I", "Stage_NA", ...), hence the two
# underscore-separated groups in the pattern.
stage = (
    labels
    .str.extract(r"STAGE=([^_]+_[^_]+)")[0])  # do not add to X_full

# Ordinal encoding of the stage; values missing from the map (e.g. "Stage_NA") become NaN.
stage_map = {
    "Stade_I": 0,
    "Stade_II": 1,
    "Stade_III": 2,
    "Stade_IV": 3
}  # do not add to X_full
stage_enc = stage.map(stage_map)

# Mutation values may themselves contain underscores ("NO_CANONICAL_DRIVER"),
# so capture everything up to the following "_TYPE=" field (or the end of the
# identifier) instead of stopping at the first underscore, which truncated
# such values to "NO".
mutation = (
    labels
    .str.extract(r"MUT=(.+?)(?:_TYPE=|$)")[0])  # do not add to X_full


# Expression matrix plus the two metadata columns used as model features.
# `.values` is used because the parsed Series carry a positional index, not
# the patient index of X.
X_full = X.copy()
X_full["age"] = age.values
X_full["sex"] = sex.values

X_full

Unnamed: 0,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1,age,sex
ID=TCGA-DJ-A2Q6_AGE=39ANS_SEX=female_STAGE=Stade_I_MUT=BRAF_TYPE=PTC,6156,7,2540,1452,297,1075,26323,3481,3414,4228,...,26,0,0,0,714,0,7,31,39.0,0
ID=TCGA-FK-A3SE_AGE=31ANS_SEX=female_STAGE=Stade_I_MUT=NO_CANONICAL_DRIVER_TYPE=PTC,7191,3,3815,767,276,1069,18403,3861,2562,5715,...,10,0,0,0,598,0,5,22,31.0,0
ID=TCGA-DJ-A2QA_AGE=57ANS_SEX=female_STAGE=Stade_III_MUT=BRAF_TYPE=PTC,2219,1,735,305,28,303,4710,2995,495,618,...,3,0,0,0,144,0,0,31,57.0,0
ID=TCGA-FY-A2QD_AGE=62ANS_SEX=female_STAGE=Stage_NA_MUT=NO_CANONICAL_DRIVER_TYPE=PTC,5185,0,2594,970,135,71,2986,2033,2469,2901,...,9,0,0,0,603,0,2,11,62.0,0
ID=TCGA-EL-A3GR_AGE=32ANS_SEX=female_STAGE=Stade_I_MUT=BRAF_TYPE=PTC,4466,3,1496,820,208,610,25601,2051,1894,2378,...,5,0,0,0,365,0,3,12,32.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ID=TCGA-EL-A3TB_AGE=47ANS_SEX=female_STAGE=Stade_III_MUT=NO_CANONICAL_DRIVER_TYPE=PTC,4742,3,2172,1171,245,372,5861,3069,1740,2560,...,29,0,0,0,377,0,3,23,47.0,0
ID=TCGA-BJ-A45C_AGE=78ANS_SEX=male_STAGE=Stade_III_MUT=NO_CANONICAL_DRIVER_TYPE=PTC,3753,4,1491,962,166,184,1663,2433,1244,1713,...,59,0,0,0,489,0,17,3,78.0,1
ID=TCGA-DJ-A3UX_AGE=47ANS_SEX=female_STAGE=Stade_I_MUT=BRAF_TYPE=PTC,4488,5,2061,1151,250,775,9620,2795,2425,2753,...,24,0,0,0,539,0,6,11,47.0,0
ID=TCGA-DJ-A2PX_AGE=55ANS_SEX=female_STAGE=Stade_I_MUT=NO_CANONICAL_DRIVER_TYPE=PTC,6131,6,2439,1213,242,1076,8511,3439,3132,2428,...,8,0,0,0,458,0,4,35,55.0,0


# **Définition de la cible qu'on cherche à prédire**

TODO : voir si on ne peut pas cumuler plusieurs prédictions par la suite.

Choisissez la variable que vous voulez prédire en affectant à `choix` l'un des noms présents dans `liste_des_prédictions` :

In [146]:
# Prediction targets handled by the notebook.
liste_des_prédictions = [
    "stade_tumoral",
    "type_tumoral",
    "mutation",
]

# Target selected for this run; expected to be one of the names listed above.
choix = "stade_tumoral"

### _**préparation des inputs + labels du modèle**_

Pour prédire le stade tumoral, on prédit 1 état parmi 2 possibilités :
- **0** : stade précoce (stades I et II)
- **1** : stade avancé (stades III et IV)

Il est utile de regrouper les stades en 2 catégories car il y a peu de patients dans les stades III et IV : cela permet d'avoir un dataset plus équilibré pour l'entraînement du modèle.


Ici, j'ai créé une fonction qui réunit les 3 prédictions que l'on voudrait faire. 
Pour chaque prédiction, la variable cible est identifiée (comme label).

In [147]:
from sklearn.preprocessing import LabelEncoder

def build_task(choix, X):
    """Build the feature matrix and target vector for one prediction task.

    All targets are parsed from the sample identifiers stored in ``X.index``
    (``..._STAGE=Stade_I_MUT=BRAF_TYPE=PTC``), so the function only depends on
    its arguments and not on variables computed in other cells.

    Parameters
    ----------
    choix : str
        One of ``"stade_tumoral"``, ``"type_tumoral"`` or ``"mutation"``.
    X : pandas.DataFrame
        Feature table whose index holds the sample identifiers.

    Returns
    -------
    X_out : numpy.ndarray
        Feature values. For ``"stade_tumoral"`` the rows whose stage is
        ``"Stage_NA"`` or cannot be parsed are removed.
    labels : numpy.ndarray
        Integer target, row-aligned with ``X_out``. For ``"stade_tumoral"``:
        0 = early (any stage other than III/IV), 1 = late (stage III or IV).
    encoder : LabelEncoder or None
        Fitted encoder for ``"type_tumoral"`` / ``"mutation"``; ``None`` for
        ``"stade_tumoral"``.

    Raises
    ------
    ValueError
        If ``choix`` is not one of the supported task names.
    """
    labels_index = pd.Index(X.index)
    X_out = X.copy()

    if choix == "stade_tumoral":
        # Parse the stage from the identifiers of the frame that was passed in,
        # so rows stay aligned even if X is a subset or has been reordered.
        stade = pd.Series(
            labels_index.str.extract(r"STAGE=([^_]+_[^_]+)")[0].values,
            index=X_out.index,
        )

        # Keep only patients with a known stage. Unparseable stages (NaN) are
        # dropped as well; otherwise they would silently be labelled "early".
        # The mask is applied positionally to be safe with duplicate labels.
        mask = (stade.notna() & (stade != "Stage_NA")).to_numpy(dtype=bool)
        X_out = X_out.loc[mask]

        # early (0) vs late (1)
        labels = stade[mask].isin(
            ["Stade_III", "Stade_IV"]
        ).astype(int).values

        encoder = None

    elif choix == "type_tumoral":
        tumor_type = pd.Series(
            labels_index.str.extract(r"TYPE=([^_]+)")[0].values,
            index=X_out.index,
        )

        encoder = LabelEncoder()
        labels = encoder.fit_transform(tumor_type)

    elif choix == "mutation":
        # Mutation names may contain underscores ("NO_CANONICAL_DRIVER"):
        # capture up to the next "_TYPE=" field (or the end of the identifier)
        # rather than stopping at the first underscore.
        mutation = pd.Series(
            labels_index.str.extract(r"MUT=(.+?)(?:_TYPE=|$)")[0].values,
            index=X_out.index,
        )

        encoder = LabelEncoder()
        labels = encoder.fit_transform(mutation)

    else:
        raise ValueError(
            "le choix doit etre 'stade_tumoral', 'type_tumoral' ou 'mutation'"
        )

    return X_out.values, labels, encoder

# Build the feature array, target vector and (optional) label encoder for the selected task.
# NOTE: this rebinds `labels`, which previously held the patient index, to the target array.
X_out, labels, encoder = build_task(choix, X_full)

In [158]:
# Show the encoded target vector returned by build_task for the selected task.
print(f"les labels de {choix} : \n{labels}")

les labels de stade_tumoral : 
[0 0 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1
 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1
 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 1 0
 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0
 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1
 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1
 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 1
 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 0 0
 1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0]


# **Preprocessing**

Il faut réduire le nombre de features (gènes) pour éviter l'overfitting et améliorer les performances du modèle. On peut utiliser des techniques comme la sélection de caractéristiques basée sur l'importance des gènes, ou des méthodes de réduction de dimensionnalité comme la PCA.

In [148]:
import numpy as np

# Split expression from metadata: the last two columns are the `age` and `sex`
# columns appended to X_full; everything before them is gene expression.
X_meta = X_out[:, -2:]   # age, sex
X_expr = X_out[:, :-2]   # genes

# Variance of each gene across samples.
variances = np.var(X_expr, axis=0)

# Keep the k most variable genes (author's note: standard value in RNA-seq).
# argsort is ascending, so the last k positions are the highest variances.
k = 5000
top_idx = variances.argsort()[-k:]

X_expr_filt = X_expr.take(top_idx, axis=1)

# Put the metadata columns back after the selected genes.
X_reduced = np.concatenate((X_expr_filt, X_meta), axis=1)

print(X_reduced.shape)

(460, 5002)


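**Question ouverte : faut-il normaliser avant le split ?** En principe non : le scaler doit être ajusté sur le jeu d'entraînement uniquement, puis appliqué à la validation et au test, sinon de l'information fuit vers les jeux d'évaluation.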

In [None]:
# Log-transform the expression counts only. The filtered matrix is used (not
# X.values) so that rows match `labels`, from which some patients may have
# been removed by build_task; age/sex are not counts and are left as they are.
X_log = np.hstack([np.log1p(X_expr_filt), X_meta])

# Standardise every feature (zero mean, unit variance).
# NOTE(review): this scaler sees all samples; for model evaluation, fit the
# scaler on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_log)

# Encode the target returned by build_task as consecutive integers and derive
# the number of classes for the output layer. (`Y` was never defined.)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)
n_classes = len(label_encoder.classes_)

# **Split les données train et test**

In [None]:
# Features given to the model. Here: the highly variable genes plus age/sex.
# Swap in another matrix (same row order as y_encoded) to try other criteria.
X_selection = X_reduced

# Split train/validation/test (70 % / 15 % / 15 %), stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_selection, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Fit the scaler on the training set only, then apply it to the other splits,
# so that no statistic from validation/test leaks into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Full matrix scaled with the train-fitted scaler (kept for later inspection).
X_scaled = scaler.transform(X_selection)

print(f"\nTailles des ensembles:")
print(f"Train: {X_train.shape[0]}")
print(f"Validation: {X_val.shape[0]}")
print(f"Test: {X_test.shape[0]}")


Modèle dense à 4 couches cachées (512 → 256 → 128 → 64 neurones), suivies d'une couche de sortie softmax

In [None]:


# Number of input features, taken from the training matrix
# (`input_dim` was previously used without being defined).
input_dim = X_train.shape[1]

# Width of each hidden Dense layer, in order.
hidden_units = [512, 256, 128, 64]

# Input followed by a batch-normalisation of the raw features.
model_layers = [
    layers.Input(shape=(input_dim,)),
    layers.BatchNormalization(),
]

# Each hidden block is Dense(relu, L2 = 0.001) -> Dropout(0.3); every block
# except the last one is followed by BatchNormalization.
for position, units in enumerate(hidden_units):
    model_layers.append(
        layers.Dense(units, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.001))
    )
    model_layers.append(layers.Dropout(0.3))
    if position < len(hidden_units) - 1:
        model_layers.append(layers.BatchNormalization())

# Output layer: one softmax unit per class.
model_layers.append(layers.Dense(n_classes, activation='softmax'))

model = keras.Sequential(model_layers)

In [None]:
# Configure training: Adam optimiser, integer-label cross-entropy
# (targets are class indices, not one-hot vectors) and accuracy as the metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()


In [None]:
# Stop training once the validation loss has not improved for 20 epochs and
# restore the weights of the best epoch.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=20,
    verbose=1,
)

In [None]:
# Train the model, monitoring the validation split and stopping early via
# `early_stop`. The returned History object is kept so that the learning
# curves (history.history) can be inspected or plotted afterwards.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=16,
    callbacks=[early_stop],
    verbose=1
)

In [None]:
# Class probabilities for the test set, then the most probable class per sample.
y_pred_proba = model.predict(X_test)
y_pred = y_pred_proba.argmax(axis=1)

In [None]:
#Faire une matrice de confusion