# Export gene information

In [1]:
import os
import collections

import pandas

## Compute local gene information

In [2]:
path = os.path.join('data', 'mutation-matrix.tsv.bz2')
mutation_df = pandas.read_table(path, index_col=0)

path = os.path.join('data', 'expression-matrix.tsv.bz2')
expr_df = pandas.read_table(path, index_col=0)

In [3]:
mutation_summary_df = pandas.DataFrame.from_items([
    ('entrez_gene_id', mutation_df.columns.astype(int)),
    ('n_mutations', mutation_df.sum(axis='rows')),
    ('mutation_frequency', mutation_df.mean(axis='rows')),
])

expression_summary_df = pandas.DataFrame.from_items([
    ('entrez_gene_id', expr_df.columns.astype(int)),
    ('mean_expression', expr_df.mean(axis='rows')),
])

summary_df = mutation_summary_df.merge(expression_summary_df, how='outer')
summary_df['mutation'] = summary_df.n_mutations.notnull().astype(int)
summary_df['expression'] = summary_df.mean_expression.notnull().astype(int)
summary_df.head(2)

Unnamed: 0,entrez_gene_id,n_mutations,mutation_frequency,mean_expression,mutation,expression
0,1.0,30.0,0.004106,6.709969,1,1
1,2.0,130.0,0.017794,13.337754,1,1


## Retrieve Entrez Gene

In [4]:
renamer = collections.OrderedDict([
    ('GeneID', 'entrez_gene_id'),
    ('Symbol', 'symbol'),
    ('description', 'description'),
    ('chromosome', 'chromosome'),
    ('type_of_gene', 'gene_type'),
    ('Synonyms', 'synonyms'),
    ('Other_designations', 'aliases'),
])

url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
entrez_df = (pandas.read_table(url, compression='gzip')
    .rename(columns=renamer)
    [list(renamer.values())]
)
entrez_df.head()

Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...
2,3,A2MP1,alpha-2-macroglobulin pseudogene 1,12,pseudo,A2MP,pregnancy-zone protein pseudogene
3,9,NAT1,N-acetyltransferase 1,8,protein-coding,AAC1|MNAT|NAT-1|NATI,arylamine N-acetyltransferase 1|N-acetyltransf...
4,10,NAT2,N-acetyltransferase 2,8,protein-coding,AAC2|NAT-2|PNAT,arylamine N-acetyltransferase 2|N-acetyltransf...


## Combine local information with Entrez Gene

In [5]:
combined_df = entrez_df.merge(summary_df, how='right').sort_values('entrez_gene_id')
len(combined_df)

22973

In [6]:
# Missing value per column
combined_df.isnull().sum()

entrez_gene_id           0
symbol                  99
description             99
chromosome              99
gene_type               99
synonyms                99
aliases                 99
n_mutations           1033
mutation_frequency    1033
mean_expression       2443
mutation                 0
expression               0
dtype: int64

In [7]:
combined_df.head()

Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases,n_mutations,mutation_frequency,mean_expression,mutation,expression
0,1.0,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,30.0,0.004106,6.709969,1,1
1,2.0,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,130.0,0.017794,13.337754,1,1
2,3.0,A2MP1,alpha-2-macroglobulin pseudogene 1,12,pseudo,A2MP,pregnancy-zone protein pseudogene,4.0,0.000547,,1,0
3,9.0,NAT1,N-acetyltransferase 1,8,protein-coding,AAC1|MNAT|NAT-1|NATI,arylamine N-acetyltransferase 1|N-acetyltransf...,17.0,0.002327,6.728763,1,1
4,10.0,NAT2,N-acetyltransferase 2,8,protein-coding,AAC2|NAT-2|PNAT,arylamine N-acetyltransferase 2|N-acetyltransf...,26.0,0.003559,2.086277,1,1


In [8]:
# Write to TSV
path = os.path.join('data', 'genes.tsv')
combined_df.to_csv(path, sep='\t', index=False, float_format='%.4g')