# Datasets d'entraînement et de test

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Données TSG3

In [18]:
# Folder holding the input CSV files; the generated train/test CSVs are written here too.
# Keep the trailing slash: file names are appended directly to this string.
data_dir = "data/"

In [19]:
# Load the TSG3 expression table (semicolon-separated CSV, one row per gene).
expression_path = data_dir + 'expression_data_tsg3_TSG3_log_rpkm_3686_samples_2095_genes.csv'
data = pd.read_csv(expression_path, sep=';')

# Label every row "<gene symbol>@<gene id>".
gene_ids = data['id_gene'].astype('string')
data.index = data['gene_symbols'] + '@' + gene_ids
data.head()

Unnamed: 0,id_gene,gene_symbols,TCGA-06-0675-11A,TCGA-06-0678-11A,TCGA-06-0680-11A,TCGA-06-0681-11A,TCGA-06-AABW-11A,TCGA-22-4593-11A,TCGA-22-4609-11A,TCGA-22-5471-11A,...,SAMN03465416,SAMN03465418,SAMN03465419,SAMN03465420,SAMN03465421,SAMN04457469,SAMN04457471,SAMN04457472,SAMN04457473,SAMN04457474
NAT2@10,10,NAT2,0.117494,0.086131,0.224822,0.060679,0.11228,0.123072,0.195492,0.055526,...,0.105042,0.010828,0.008827,0.0,0.110499,0.105486,0.170669,0.0,0.109082,0.0
AANAT@15,15,AANAT,0.432958,0.25697,0.375798,0.322697,0.257526,0.044545,0.05718,0.044857,...,0.027212,0.068284,0.012425,0.03813,0.006933,0.982768,2.425997,0.737861,1.339205,0.719802
ABCA4@24,24,ABCA4,0.263762,0.246034,0.207565,0.201676,0.037162,0.439872,0.488587,0.211794,...,0.0284,0.056777,0.06858,0.063234,0.027516,0.280279,0.029935,0.174345,0.106336,0.296033
ACRV1@56,56,ACRV1,0.591774,0.340917,0.48066,0.548058,0.081401,0.037889,0.047338,0.02238,...,0.03905,0.240472,0.0,0.0,0.00542,5.124957,4.171272,3.861805,0.021013,5.561771
ADH1A@124,124,ADH1A,0.0,0.0,0.0,0.012044,0.0,0.215065,0.422064,0.261947,...,0.451805,0.664812,0.343527,0.412227,0.828346,0.402624,0.140627,0.223142,0.377207,0.219072


## Annotations TSG3

In [20]:
# Load the per-sample annotations; the first CSV column becomes the row index.
annotations_path = data_dir + 'expression_data_tsg3_3686_samples_20982_genes__targets.csv'
expgroup = pd.read_csv(annotations_path, sep=';', index_col=0)
expgroup.head()

Unnamed: 0_level_0,tissue_group_level1,tissue_group_level2,tissue_group_level3,source,tissue_status,tissue_stage,id_topology,topology,id_topology_group,topology_group
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
TCGA-06-0675-11A,brain,neuro,soma,TCGA,normal,adult,C71.9,"Brain, NOS",C71,BRAIN
TCGA-06-0678-11A,brain,neuro,soma,TCGA,normal,adult,C71.9,"Brain, NOS",C71,BRAIN
TCGA-06-0680-11A,brain,neuro,soma,TCGA,normal,adult,C71.9,"Brain, NOS",C71,BRAIN
TCGA-06-0681-11A,brain,neuro,soma,TCGA,normal,adult,C71.9,"Brain, NOS",C71,BRAIN
TCGA-06-AABW-11A,brain,neuro,soma,TCGA,normal,adult,C71.9,"Brain, NOS",C71,BRAIN


## Définir les cibles

In [21]:
# Annotation column that provides the tissue group.
level = 'tissue_group_level1'

# Build the label "<tissue group>@<tissue stage>" for every sample.
tissue_stage = expgroup['tissue_stage']
expgroup['target'] = expgroup[level] + '@' + tissue_stage

# Fold the adult rectum/anus samples into the colon group,
# both in the label and in the tissue-group column itself.
is_ra = expgroup['target'].eq('rectum_anus@adult')
expgroup.loc[is_ra, 'target'] = 'colon@adult'
expgroup.loc[is_ra, level] = 'colon'
expgroup['target'].head()

id_sample
TCGA-06-0675-11A    brain@adult
TCGA-06-0678-11A    brain@adult
TCGA-06-0680-11A    brain@adult
TCGA-06-0681-11A    brain@adult
TCGA-06-AABW-11A    brain@adult
Name: target, dtype: object

## Sélectionner les échantillons avec un effectif suffisant dans chaque groupe

In [22]:
# Number of samples per target label, largest groups first, with a fresh 0..n-1 index.
sample_size = (
    expgroup.groupby(['target'])
    .size()
    .reset_index(name='size')
    .sort_values(by=['size'], ascending=False)
    .reset_index(drop=True)
)

In [23]:
# Minimum number of samples a target group must have to be kept.
min_size = 75
is_large_enough = sample_size['size'].ge(min_size)
selected_targets = sample_size.loc[is_large_enough]
print(len(selected_targets), selected_targets['target'].tolist())
selected_targets

20 ['brain@adult', 'esophagus@adult', 'bronchus_lung@adult', 'artery@adult', 'heart@adult', 'breast@adult', 'thyroid_gland@adult', 'blood@adult', 'skin@adult', 'fibroblast@adult', 'muscles@adult', 'connective_tissues@adult', 'colon@adult', 'kidney@adult', 'central_nervous_system@adult', 'stomach@adult', 'prostate@adult', 'liver@adult', 'uterus@adult', 'testis@adult']


Unnamed: 0,target,size
0,brain@adult,329
1,esophagus@adult,240
2,bronchus_lung@adult,237
3,artery@adult,214
4,heart@adult,177
5,breast@adult,175
6,thyroid_gland@adult,170
7,blood@adult,163
8,skin@adult,158
9,fibroblast@adult,150


In [24]:
# Index labels of the samples whose target belongs to one of the retained groups.
in_selected_group = expgroup['target'].isin(selected_targets['target'])
selected_samples = expgroup.loc[in_selected_group].index

In [25]:
# Labels of the retained samples: the tissue-group column (without the "@<stage>" suffix).
y = expgroup[level].loc[selected_samples]

In [26]:
# Feature matrix: pick the retained sample columns, transpose so that rows are
# samples and columns are genes, then drop every gene that has a missing value.
X = (
    data[y.index]
    .T
    .rename_axis(index='id_sample')
    .dropna(axis=1)
)
print('X', X.shape)
X.head()

X (3182, 1685)


Unnamed: 0_level_0,NAT2@10,AANAT@15,ABCA4@24,ACRV1@56,ADH1A@124,ADH4@127,ACAN@176,AHSG@197,ALPG@251,AMY2A@279,...,LOC102724804@102724804,RGS2-AS1@102724954,CASC23@103581031,LINC01337@103689917,LINC01269@103695436,SMAD1-AS1@104326058,LINC01320@104355288,LINC01525@104355292,FOXP1-AS1@104502416,LOC104613533@104613533
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-06-0675-11A,0.117494,0.432958,0.263762,0.591774,0.0,0.021825,0.251254,0.152161,0.0,0.211134,...,0.075353,0.0,0.0,0.0,0.086933,0.0,0.01335,0.0,0.038025,0.0
TCGA-06-0678-11A,0.086131,0.25697,0.246034,0.340917,0.0,0.0,0.280192,0.096434,0.0,0.118625,...,0.0,0.0,0.018646,0.0,0.047921,0.0,0.021715,0.0,0.0,0.0
TCGA-06-0680-11A,0.224822,0.375798,0.207565,0.48066,0.0,0.0,0.202792,0.130594,0.0,0.088475,...,0.135361,0.0,0.0,0.15953,0.0,0.0,0.048389,0.0,0.034917,0.050643
TCGA-06-0681-11A,0.060679,0.322697,0.201676,0.548058,0.012044,0.01109,0.235127,0.174686,0.0,0.137156,...,0.0,0.0,0.0,0.060769,0.0,0.055688,0.0,0.0,0.148178,0.0
TCGA-06-AABW-11A,0.11228,0.257526,0.037162,0.081401,0.0,0.0,0.318676,0.07665,0.0,0.094436,...,0.029217,0.0,0.0,0.068512,0.0,0.0,0.015271,0.0,0.085616,0.0


In [27]:
# Ordered list of the feature (gene) column names of X.
features = X.columns.tolist()

## Train / test

In [28]:
# Hold out a quarter of the samples for testing; the split is stratified on the
# labels and uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/4,
    stratify=y,
    random_state=0,
)

In [29]:
# Training table: the label column first, followed by the gene features.
# `assign` returns a new frame, so `X_train` is left untouched. Wrapping it with
# `pd.DataFrame(X_train)` does not copy by default, so adding a column through
# that wrapper may modify `X_train` itself or emit SettingWithCopyWarning,
# depending on the pandas version.
train = X_train.assign(target=y_train.loc[X_train.index])[['target', *features]]
train.head()

Unnamed: 0_level_0,target,NAT2@10,AANAT@15,ABCA4@24,ACRV1@56,ADH1A@124,ADH4@127,ACAN@176,AHSG@197,ALPG@251,...,LOC102724804@102724804,RGS2-AS1@102724954,CASC23@103581031,LINC01337@103689917,LINC01269@103695436,SMAD1-AS1@104326058,LINC01320@104355288,LINC01525@104355292,FOXP1-AS1@104502416,LOC104613533@104613533
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-85-7710-11A,bronchus_lung,0.163748,0.766694,0.767072,0.022848,0.13966,0.040948,0.107942,0.0,1.406033,...,0.161115,0.0,0.0,0.214748,0.864863,0.0,0.012611,0.0,0.070998,0.0
TCGA-DD-A3A8-11A,liver,4.003451,0.0,0.110885,0.026463,8.568801,9.615037,0.0,10.921722,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.401312,0.0,0.0,0.0
GTEX-PX3G-0326-SM-2I3EO,heart,0.052237,0.04533,0.061543,0.015814,0.023372,0.016306,0.149207,0.04063,0.007055,...,0.0,0.0,0.0,0.187641,0.0,0.0,0.0,0.0,0.0,0.0
GTEX-XLM4-0011-R3B-SM-4AT6E,brain,0.108329,0.350975,0.085389,0.486369,0.029594,0.041027,0.150139,0.117122,0.011917,...,0.0,0.0,0.0,0.066263,0.0,0.0,0.036595,0.0,0.062718,0.0
GTEX-RM2N-0726-SM-48FD5,esophagus,0.047108,0.176655,0.084551,0.074389,0.234415,0.75613,0.049303,0.280086,0.0,...,0.0,0.074918,0.0,0.180624,2.556904,0.0,0.0,0.0,0.0,0.0


In [30]:
# The file name records the number of samples (rows) and of gene features.
train_path = f"{data_dir}TSG3_train_{train.shape[0]}_samples_{len(features)}_genes.csv"
train.to_csv(train_path, sep=';', index=True)

In [31]:
# Test table: the label column first, followed by the gene features.
# `assign` returns a new frame, so `X_test` is left untouched. Wrapping it with
# `pd.DataFrame(X_test)` does not copy by default, so adding a column through
# that wrapper may modify `X_test` itself or emit SettingWithCopyWarning,
# depending on the pandas version.
test = X_test.assign(target=y_test.loc[X_test.index])[['target', *features]]
test.head()

Unnamed: 0_level_0,target,NAT2@10,AANAT@15,ABCA4@24,ACRV1@56,ADH1A@124,ADH4@127,ACAN@176,AHSG@197,ALPG@251,...,LOC102724804@102724804,RGS2-AS1@102724954,CASC23@103581031,LINC01337@103689917,LINC01269@103695436,SMAD1-AS1@104326058,LINC01320@104355288,LINC01525@104355292,FOXP1-AS1@104502416,LOC104613533@104613533
id_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-P4QT-1826-SM-2S1NJ,artery,0.0,0.132711,0.088478,0.029691,0.824019,0.735319,0.96907,0.0,0.0,...,3.423182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-B0-5703-11A,kidney,1.10636,0.051905,1.585207,0.013022,0.050369,0.344621,0.841954,0.095169,0.0,...,0.054291,0.0,0.0,0.064307,0.0,0.0,3.638585,0.055542,0.08039,0.0
GTEX-O5YV-1626-SM-2YUNJ,artery,0.024283,0.114085,0.078008,0.021795,0.444564,0.248989,4.624142,0.0,0.0,...,2.432673,0.0,0.0,0.106661,0.0,0.0,0.0,0.0,0.0,0.0
GTEX-Q2AG-1026-SM-33HBW,bronchus_lung,0.054009,0.092268,0.06721,0.068271,0.082892,0.058188,0.066382,0.02116,0.22326,...,0.146237,0.032765,0.0,0.040878,0.0,0.0,0.037286,0.0,0.0,0.0
GTEX-QDVJ-1826-SM-2S1P3,connective_tissues,0.077663,0.131928,0.170842,0.0,1.290073,1.738014,0.290937,0.0,0.021063,...,0.141583,0.340781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# The file name records the number of samples (rows) and of gene features.
test_path = f"{data_dir}TSG3_test_{test.shape[0]}_samples_{len(features)}_genes.csv"
test.to_csv(test_path, sep=';', index=True)