# Gene sorting by Chromosome location (TreeMap)
The main idea of the paper is to rearrange gene expression profile data into treemap images, placing each gene according to the chromosome it belongs to and its position on that chromosome (its chromosomal locus).

In [2]:
import pandas as pd
import numpy as np

In [3]:
# TCGA pan-cancer expression tables.
# Two tables are read from the same HDF5 store: key "brca" and key "non_brca".
# Earlier data sources, kept for reference:
#pp_df_gene_exp = pd.read_hdf("data/preprocessed1_expr.h5", key='expression') # only selected features
#df_gene_exp = pd.read_hdf("data/TCGA_data.h5", key='both_gene_expression') # all
PANCAN_STORE = "/mnt/ext/almacen/comun/Francis/almacen/DeepLearning-Bioinformatics/data/PanCancer/mad_filter_pancan_all_TCGA_20.h5"
brca_ex, non_brca_ex = (
    pd.read_hdf(PANCAN_STORE, key=cohort) for cohort in ("brca", "non_brca")
)

In [4]:
# Load the gene -> chromosome annotation table (tab-separated).
# `header=0` together with `names=` discards the file's own header row and
# renames the seven columns; the first one ('gene_id') becomes the index.
# `dtype` is therefore keyed by the NEW names: keys spelled like the original
# header text ('Gene stable ID', ...) match no column and are silently ignored,
# which leaves every column (including 'chromosome') to type inference.
# NOTE(review): int32 parsing raises if a coordinate column contains missing
# values -- assumed not to happen in this export; confirm.
gene_chrom = pd.read_table('/mnt/ext/almacen/comun/Francis/almacen//GenSig/data/ensembl_gene_chromosome.tsv',
                           index_col=0,
                           header=0,
                           names=['gene_id', 'gene_id_version', 'chromosome', 'gene_start', 'gene_end', 'transcript_start', 'transcript_end'],
                           dtype={'gene_id': str,
                                  'gene_id_version': str,
                                  'chromosome': str,  # names such as '1', 'X', 'MT' must all stay strings
                                  'gene_start': np.int32,
                                  'gene_end': np.int32,
                                  'transcript_start': np.int32,
                                  'transcript_end': np.int32})

  # This is added back by InteractiveShellApp.init_path()


In [5]:
# Keep only the first row for each index label; later repeats are dropped.
is_repeat = gene_chrom.index.duplicated(keep='first')
gene_chrom = gene_chrom.loc[~is_repeat]

In [6]:
# Display (n_rows, n_columns) of the annotation table.
gene_chrom.shape

(64914, 6)

In [8]:
# gene_id_version values that also appear as column labels of brca_ex.
common = set(brca_ex.columns) & set(gene_chrom['gene_id_version'])

In [9]:
# Restrict the annotation table to rows whose gene_id_version is in `common`.
in_common = gene_chrom['gene_id_version'].isin(common)
gene_chrom = gene_chrom[in_common]

In [10]:
# Number of rows per chromosome name, largest count first.
gene_chrom.chromosome.value_counts()

1     1644
19    1136
2     1132
17    1069
12     966
3      896
16     837
6      833
7      832
11     825
5      818
15     657
4      655
8      641
9      629
10     629
14     619
X      518
22     457
20     423
13     329
18     280
21     253
Y       19
MT      14
Name: chromosome, dtype: int64

In [13]:
# Read the 'gene_mapping' DataFrame stored under key 'map'.
# Presumably an Ensembl -> KEGG BRITE mapping (judging by the file name) -- confirm.
kegg_map_path = 'data/KEGG/map_ens_kegg_brite.h5'
gene_mapping = pd.read_hdf(kegg_map_path, key='map')

In [17]:
# Display the column labels of the annotation table (the index is not listed).
gene_chrom.columns

Index(['gene_id_version', 'chromosome', 'gene_start', 'gene_end',
       'transcript_start', 'transcript_end'],
      dtype='object')

In [19]:
# Count the distinct ids found both in gene_mapping.ensId and in
# gene_chrom.gene_id_version.
shared_ids = set(gene_chrom.gene_id_version) & set(gene_mapping.ensId)
len(shared_ids)

5936

In [20]:
# Display (n_rows, n_columns) of the annotation table at this point.
gene_chrom.shape

(17111, 6)

In [6]:
# Number of rows per chromosome/scaffold name, largest count first.
# NOTE(review): the saved output of this cell lists patch/scaffold names
# (CHR_HSCHR6_MHC_*) and much larger counts than the earlier per-chromosome
# table, and its execution count repeats an earlier one -- it looks like it
# was produced before the table was filtered. Re-run top to bottom to refresh.
gene_chrom['chromosome'].value_counts()

1                           5317
2                           4006
11                          3285
3                           3050
17                          3024
12                          2969
19                          2959
7                           2917
6                           2907
5                           2878
16                          2513
4                           2510
8                           2386
X                           2375
9                           2275
10                          2240
14                          2230
15                          2179
20                          1397
22                          1353
13                          1335
18                          1183
21                           833
Y                            518
CHR_HSCHR6_MHC_COX_CTG1      335
CHR_HSCHR6_MHC_QBL_CTG1      317
CHR_HSCHR6_MHC_DBB_CTG1      312
CHR_HSCHR6_MHC_SSTO_CTG1     297
CHR_HSCHR6_MHC_MANN_CTG1     294
CHR_HSCHR6_MHC_MCF_CTG1      287
          

In [19]:
# Distinct chromosome/scaffold names, ordered by how many rows carry each one.
# NOTE(review): the saved output reports 414 names including patches and
# scaffolds, so it appears to predate the filtering step -- re-run to refresh.
gene_chrom.chromosome.value_counts().index

Index(['1', '2', '11', '3', '17', '12', '19', '7', '6', '5',
       ...
       'CHR_HSCHR10_1_CTG6', 'CHR_HG2239_PATCH', 'CHR_HSCHR12_2_CTG1',
       'CHR_HSCHR4_2_CTG12', 'CHR_HSCHR7_2_CTG7', 'CHR_HSCHR6_1_CTG10',
       'KI270711.1', 'CHR_HSCHR12_8_CTG2_1', 'CHR_HSCHR6_1_CTG3',
       'CHR_HSCHR5_7_CTG1'],
      dtype='object', length=414)