# Prédiction du modèle sur les données normales TSG3 (test)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Directory holding the TSG3 CSV files. The trailing slash matters: the file
# paths used by the loading cells are built by plain string concatenation.
data_dir = 'data/'

## Dataset d'entraînement X_train

In [3]:
# Training table: semicolon-separated CSV whose first column becomes the row index.
data_train = pd.read_csv(
    data_dir + 'TSG3_train_2386_samples_882_genes.csv',
    sep=';',
    index_col=0,
)

In [4]:
# Split the training table: every column except 'target' is a feature,
# and 'target' itself is the label to predict.
X_train = data_train.drop(columns='target')
y_train = data_train['target']
print(X_train.shape)

(2386, 882)


## Dataset de test X_test

In [5]:
# Test table: semicolon-separated CSV whose first column becomes the row index.
data_test = pd.read_csv(
    data_dir + 'TSG3_test_796_samples_1685_genes.csv',
    sep=';',
    index_col=0,
)

In [6]:
# Labels come from the 'target' column. The features are restricted to, and
# ordered like, the training columns so that both matrices line up for the
# scaler and the classifier (a missing training column raises a KeyError).
y_test = data_test['target']
X_test = data_test.drop(columns=['target'])[X_train.columns]
print(X_test.shape)

(796, 882)


## Normalisation des données

In [7]:
# Standardise the features. The scaling statistics are estimated on the
# training set only and then reused unchanged for the test set, so nothing
# from the test samples leaks into preprocessing.
# StandardScaler is unsupervised: fit() ignores labels, so none are passed.
scaler = StandardScaler().fit(X_train)

# Re-wrap the transformed values in DataFrames so the original row index and
# column names are kept alongside the scaled data.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

## Prédiction

In [8]:
# Fixed seed handed to the classifier so that training is repeatable.
random_state = 0

# Multi-layer perceptron with three hidden layers of 100 units each.
classifier = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=500,
    random_state=random_state,
)
# fit() is kept as the last expression so the fitted estimator is echoed
# as the cell output.
classifier.fit(X_train_scaled, y_train)

MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500, random_state=0)

In [9]:
# Predict labels for both splits with the fitted classifier.
y_train_pred, y_test_pred = [
    classifier.predict(features) for features in (X_train_scaled, X_test_scaled)
]

In [10]:
# Share of samples whose predicted label equals the true one, per split,
# printed with three decimals.
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
print('Accuracy train', format(accuracy_train, '.3f'))
print('Accuracy test', format(accuracy_test, '.3f'))

Accuracy train 1.000
Accuracy test 0.981


## Erreurs de prédiction

In [11]:
# One row per test sample: true label, predicted label, and a
# 'real -> predicted' tag used to group the confusions.
# y_test_pred is the raw predict() output and carries no index of its own,
# so its values are matched to y_test's rows by position.
results = pd.DataFrame({'real': y_test, 'predicted': y_test_pred})
results['type'] = results['real'] + ' -> ' + results['predicted']

In [12]:
# Keep only the misclassified test samples, report how many there are,
# then preview the first few.
errors = results.loc[results['real'].ne(results['predicted'])]
print(errors.shape)
errors.head()

(15, 3)


Unnamed: 0_level_0,real,predicted,type
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-A6-2680-11A,colon,stomach,colon -> stomach
E-MTAB-2836-smoothmuscle_8c,muscles,stomach,muscles -> stomach
TCGA-V5-A7RE-11A,esophagus,colon,esophagus -> colon
GTEX-QVJO-0011-R9A-SM-2S1QH,central_nervous_system,brain,central_nervous_system -> brain
GTEX-WFON-1826-SM-3GILG,breast,connective_tissues,breast -> connective_tissues


In [13]:
# Count the misclassifications per 'real -> predicted' pair, most frequent first.
# Several pairs share the same count, so 'type' is added as an explicit
# secondary key: sorting on 'size' alone uses a non-stable algorithm by default
# and leaves the relative order of tied rows unspecified.
sample_size = (
    errors.groupby(['type'])
    .size()
    .reset_index(name='size')
    .sort_values(by=['size', 'type'], ascending=[False, True])
    .reset_index(drop=True)
)
sample_size

Unnamed: 0,type,size
0,central_nervous_system -> brain,2
1,connective_tissues -> breast,2
2,esophagus -> colon,2
3,artery -> connective_tissues,1
4,blood -> bronchus_lung,1
5,breast -> connective_tissues,1
6,colon -> stomach,1
7,esophagus -> stomach,1
8,kidney -> breast,1
9,muscles -> stomach,1
