In [3]:
# -*- coding: utf-8 -*-
__author__ = 'maoss2'
import random
import h5py
import re
import os
import seaborn as sns
import pandas as pd
import numpy as np

from glob import glob
from copy import deepcopy

# ******************************************** Global Values Section ***************************************************
project_path_on_is2 = '/is2/projects/JC_Cancers/TCGA_raw/'
ids_pattern = re.compile(r'(TCGA-\w+-\w+)', re.U | re.M | re.I)

label_file_triple_all = "/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/labels_for_triple_negatives_all.tsv"
new_label_file = '/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/clinical_views_labels_and_proba_copie.csv'

methyl_450_file = "/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/methylome_450.tsv"
methyl_27_file = "/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/methylome_27.tsv"
rnaseq_genes_file = "/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/rnaseq_genes.tsv"
rnaseq_isoforms_file = "/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/rnaseq_isoforms.tsv"
snp_file = "/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/snp.tsv"
mirna_file = "/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/mirna.tsv"
new_clinical_file = '/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/new_clinical_view_copie.tsv'
old_clinical_file = '/Users/maoss2/PycharmProjects/BRCA_experiments_and_paper/datasets/datasets_repository/clinical_view.tsv'
# **********************************************************************************************************************



labels = pd.read_csv(new_label_file,sep=',', index_col="bcr_patient_barcode")

y_labels = labels['phenotype_TN']
index_pos = labels.loc[labels['phenotype_TN'] == 1].index.values
index_neg = labels.loc[labels['phenotype_TN'] == -1].index.values
name = 'BRCA_triple_neg_new_labels_unbalanced_mean.h5'

In [4]:
labels.replace(['LA', 'LB', 'HER2++', 'TN'], [1, 2, 3, 4],
               inplace=True)  # IMPORTANT POUR DECODER LEUR SIGNIFICATION
labels.replace(['NON TN', 'BASAL NON TN'], [1, 2],
               inplace=True)  # IMPORTANT POUR DECODER LEUR SIGNIFICATION
labels_phenotype_normal = labels['phenotype']
labels_phenotype_diablo = labels['phenotype_DIABLO']
labels_phenotype_diablo_TN_and_basal = labels['phenotype_DIABLO_TN_AND_BASAL']
er_positive_probability = labels['er_status_ihc_Percent_Positive']
pr_positive_probability = labels['pr_status_ihc_percent_Positive']
her_positive_probability = labels['her2_ihc_percent_Positive']
# Save to file
y_labels.to_hdf(name, "labels")
labels_phenotype_normal.to_hdf(name, 'label_normal_phenotype')
labels_phenotype_diablo.to_hdf(name, 'label_diablo_phenotype')
labels_phenotype_diablo_TN_and_basal.to_hdf(name, 'label_diablo_TN_and_basal_phenotype')
er_positive_probability.to_hdf(name, 'er_positive_probability')
pr_positive_probability.to_hdf(name, 'pr_positive_probability')
her_positive_probability.to_hdf(name, 'her_positive_probability')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->values] [items->None]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [5]:
methylation_27 = pd.read_csv(methyl_27_file, sep="\t")
methylation_27.dropna(axis=0, inplace=True)  # delete the nan feature
indexes = np.array(list(map(str, np.array(methylation_27["Unnamed: 0"]))))
methylation_27.set_index(indexes, inplace=True)

methylation_450 = pd.read_csv(methyl_450_file, sep="\t")
methylation_450.dropna(axis=0, inplace=True)  # delete the nan feature
indexes = np.array(list(map(str, np.array(methylation_450["Unnamed: 0"]))))
methylation_450.set_index(indexes, inplace=True)

indexes_fusion = [el for el in methylation_27.index.values if el in methylation_450.index.values]
methylation_fusion_27 = methylation_27.loc[indexes_fusion]
methylation_fusion_450 = methylation_450.loc[indexes_fusion]
methylation_fusion_27.drop(['Unnamed: 0'], axis=1, inplace=True)
methylation_fusion_450.drop(['Unnamed: 0'], axis=1, inplace=True)
methylation_fusion = pd.concat([methylation_fusion_27, methylation_fusion_450], axis=1)
methylation_fusion = methylation_fusion.T
methylation_fusion.to_csv('methylation_fusion.tsv', sep="\t", encoding='utf-8')
methylation_fusion = methylation_fusion.loc[labels.index.values]
methylation_fusion = methylation_fusion.apply(pd.to_numeric, errors='coerce')


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike


In [33]:
rnaseq_genes = pd.read_csv(rnaseq_genes_file, sep="\t")
liste = []
for idx, el in enumerate(rnaseq_genes['Unnamed: 0']):
    if el.split('|')[0] == '?':
        liste.append(idx)
rnaseq_genes.drop(axis=0, index=liste, inplace=True)
indexes = np.array(list(map(str, np.array(rnaseq_genes["Unnamed: 0"]))))
rnaseq_genes.set_index(indexes, inplace=True)
rnaseq_genes.drop(['Unnamed: 0'], axis=1, inplace=True)
# rnaseq_genes = rnaseq_genes.T.loc[labels.index.values]
# rnaseq_genes = rnaseq_genes.loc[:, rnaseq_genes.count() > 0]


In [34]:
rnaseq_isoforms = pd.read_csv(rnaseq_isoforms_file, sep="\t")
indexes = np.array(list(map(str, np.array(rnaseq_isoforms["Unnamed: 0"]))))
rnaseq_isoforms.set_index(indexes, inplace=True)
rnaseq_isoforms.drop(['Unnamed: 0'], axis=1, inplace=True)
# rnaseq_isoforms = rnaseq_isoforms.T.loc[labels.index.values]
# rnaseq_isoforms = rnaseq_isoforms.loc[:, rnaseq_isoforms.count() > 0]


In [27]:
snps = pd.read_csv(snp_file, sep="\t")
indexes = np.array(list(map(str, np.array(snps["Unnamed: 0"]))))
snps.set_index(indexes, inplace=True)
# snps = snps.T.loc[labels.index.values]
# snps = snps.loc[:, snps.count() > 0]
# snps = snps.apply(pd.to_numeric, errors='coerce')


In [35]:
mirnas = pd.read_csv(mirna_file, sep="\t")
indexes = np.array(list(map(str, np.array(mirnas["Unnamed: 0"]))))
mirnas.set_index(indexes, inplace=True)
mirnas.drop(['Unnamed: 0'], axis=1, inplace=True)
# mirnas = mirnas.T.loc[labels.index.values]
# mirnas = mirnas.loc[:, mirnas.count() > 0]


In [11]:
clinical_data = pd.read_csv(new_clinical_file, sep='\t')
indexes = np.array(list(map(str, clinical_data['bcr_patient_barcode'].values)))
clinical_data.set_index(indexes, inplace=True)
clinical_data.drop(['Unnamed: 0', 'bcr_patient_barcode'], axis=1, inplace=True)
clinical_data = clinical_data.loc[labels.index.values]
clinical_data = clinical_data.apply(pd.to_numeric, errors='coerce')


In [12]:
# Check the examples
assert np.all(labels.index.values == rnaseq_isoforms.index.values)
assert np.all(labels.index.values == rnaseq_genes.index.values)
assert np.all(labels.index.values == snps.index.values)
assert np.all(labels.index.values == mirnas.index.values)
assert np.all(labels.index.values == clinical_data.index.values)

In [14]:
np.where(np.isnan(methylation_fusion) == True)

(array([316, 316, 316, ..., 837, 837, 837]),
 array([    0,     1,     2, ..., 19981, 19982, 19983]))

In [15]:
temp_concat = pd.concat([methylation_fusion_27, methylation_fusion_450], axis=1)

In [16]:
np.where(np.isnan(temp_concat) == True)

(array([], dtype=int64), array([], dtype=int64))

In [19]:
temp_concat.columns

Index(['TCGA-A2-A0CX', 'TCGA-A2-A0D0', 'TCGA-A2-A0D4', 'TCGA-A7-A0CD',
       'TCGA-A7-A0CE', 'TCGA-A7-A0CG', 'TCGA-A7-A0CH', 'TCGA-A7-A0CJ',
       'TCGA-A7-A0DB', 'TCGA-A7-A0DC',
       ...
       'TCGA-BH-A1EW', 'TCGA-BH-A1F0', 'TCGA-C8-A1HF', 'TCGA-C8-A1HG',
       'TCGA-C8-A1HI', 'TCGA-C8-A1HL', 'TCGA-C8-A1HM', 'TCGA-C8-A1HN',
       'TCGA-E2-A14N', 'TCGA-E2-A15I'],
      dtype='object', length=1087)

In [20]:
labels.index.values

array(['TCGA-3C-AAAU', 'TCGA-3C-AALI', 'TCGA-3C-AALJ', 'TCGA-3C-AALK',
       'TCGA-4H-AAAK', 'TCGA-5L-AAT0', 'TCGA-5L-AAT1', 'TCGA-5T-A9QA',
       'TCGA-A1-A0SB', 'TCGA-A1-A0SD', 'TCGA-A1-A0SE', 'TCGA-A1-A0SF',
       'TCGA-A1-A0SG', 'TCGA-A1-A0SH', 'TCGA-A1-A0SI', 'TCGA-A1-A0SJ',
       'TCGA-A1-A0SK', 'TCGA-A1-A0SM', 'TCGA-A1-A0SN', 'TCGA-A1-A0SO',
       'TCGA-A1-A0SP', 'TCGA-A1-A0SQ', 'TCGA-A2-A04N', 'TCGA-A2-A04P',
       'TCGA-A2-A04Q', 'TCGA-A2-A04R', 'TCGA-A2-A04T', 'TCGA-A2-A04U',
       'TCGA-A2-A04V', 'TCGA-A2-A04W', 'TCGA-A2-A04X', 'TCGA-A2-A04Y',
       'TCGA-A2-A0CK', 'TCGA-A2-A0CL', 'TCGA-A2-A0CM', 'TCGA-A2-A0CO',
       'TCGA-A2-A0CP', 'TCGA-A2-A0CQ', 'TCGA-A2-A0CR', 'TCGA-A2-A0CS',
       'TCGA-A2-A0CT', 'TCGA-A2-A0CU', 'TCGA-A2-A0CV', 'TCGA-A2-A0CW',
       'TCGA-A2-A0CX', 'TCGA-A2-A0CY', 'TCGA-A2-A0CZ', 'TCGA-A2-A0D0',
       'TCGA-A2-A0D1', 'TCGA-A2-A0D2', 'TCGA-A2-A0D3', 'TCGA-A2-A0D4',
       'TCGA-A2-A0EM', 'TCGA-A2-A0EN', 'TCGA-A2-A0EO', 'TCGA-A2-A0EP',
      

In [44]:
liste_1 = [el for el in temp_concat.columns if el in labels.index.values]
print(len(liste_1))

liste_2 = [el for el in rnaseq_genes.columns if el in labels.index.values]
print(len(liste_2))

liste_3 = [el for el in rnaseq_isoforms.columns if el in labels.index.values]
print(len(liste_3))

liste_4 = [el for el in mirnas.columns if el in labels.index.values]
print(len(liste_4))

liste_5 = [el for el in snps.columns if el in labels.index.values]
print(len(liste_5))

921


In [53]:
liste_cpg_rna = []
liste_cpg_rna.extend(liste_1)
liste_cpg_rna.extend(liste_2)
print(np.unique(np.asarray(liste_cpg_rna)).shape)
921 + 919

(922,)


1840

In [56]:
liste_cpg_rna = [el for el in liste_2 if el in liste_1]
print(len(liste_cpg_rna))

918


In [57]:
liste_mirna_rna = [el for el in liste_4 if el in liste_2]
print(len(liste_mirna_rna))

903


In [58]:
liste_mirna_rna_cpg_rna = [el for el in liste_mirna_rna if el in liste_cpg_rna]
print(len(liste_mirna_rna_cpg_rna))

902


In [67]:
patients_informations_available_for_all_view = list(set(liste_1).intersection(liste_2, liste_4))
print(len(new_list))

902


In [69]:
temp_concat.T.loc[patients_informations_available_for_all_view].shape

(902, 19984)

In [72]:
labels.loc[labels.index.isin(patients_informations_available_for_all_view)]

Unnamed: 0_level_0,Unnamed: 0,phenotype,phenotype_TN,phenotype_DIABLO,phenotype_DIABLO_TN_AND_BASAL,er_status_ihc_Percent_Positive,pr_status_ihc_percent_Positive,her2_ihc_percent_Positive
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TCGA-3C-AAAU,0,1,-1,1000.0,1000,3,3,-1
TCGA-3C-AALI,1,2,-1,1000.0,1000,1,1,-1
TCGA-3C-AALJ,2,1,-1,1000.0,1000,4,2,-1
TCGA-3C-AALK,3,2,-1,1000.0,1000,4,4,-1
TCGA-4H-AAAK,4,1,-1,1000.0,1000,3,4,1
...,...,...,...,...,...,...,...,...
TCGA-WT-AB44,1093,1,-1,1000.0,1000,4,4,-1
TCGA-XX-A899,1094,1,-1,-1.0,1,4,4,-1
TCGA-XX-A89A,1095,1,-1,-1.0,1,4,4,-1
TCGA-Z7-A8R5,1096,1,-1,1000.0,1000,-1,-1,-1


In [73]:
for bool in [True, False]:
    print(bool)

True
False


In [74]:
labels = labels.loc[labels.index.isin(patients_informations_available_for_all_view)]

In [81]:
labels.index.values.shape


(902,)

In [78]:
rnaseq_isoforms = rnaseq_isoforms.T.loc[patients_informations_available_for_all_view]
rnaseq_isoforms = rnaseq_isoforms.loc[:, rnaseq_isoforms.count() > 0]

In [80]:
rnaseq_isoforms.index.shape

(902,)

In [82]:
assert np.all(labels.index.values == rnaseq_isoforms.index.values)

AssertionError: 