In [1]:
from pathlib import Path
import os

import pandas as pd
import numpy as np

from pyensembl import EnsemblRelease, genome_for_reference_name
from pyensembl.species import human, mouse

In [2]:
def fetch_genes_and_transcripts(release, species, datadir=Path('pyannotables') / Path('data')):
    """Build gene and transcript tables for one Ensembl release and pickle them.

    Creates an ``EnsemblRelease`` for ``release``/``species``, calls its
    ``download()`` and ``index()`` methods, and converts the gene and
    transcript objects it returns into two DataFrames. Both frames are written
    under ``datadir`` as ``.pkl.xz`` pickles.

    Parameters
    ----------
    release : int
        Ensembl release number passed to ``EnsemblRelease``.
    species : pyensembl species object
        Must provide ``latin_name`` and ``which_reference(release)``; both are
        used to build the output file names.
    datadir : pathlib.Path or str
        Output directory. Created (including missing parents) if absent.

    Returns
    -------
    (gdf, tdf) : tuple of pandas.DataFrame
        ``gdf`` is indexed by ``gene_id`` with columns ``gene_name``,
        ``contig``, ``start``, ``end``, ``strand`` and ``biotype``.
        ``tdf`` is indexed by ``transcript_id`` with a single ``gene_id``
        column (a transcript-to-gene map).
    """
    # Derive the reference name from the arguments instead of relying on a
    # notebook-level `ref` variable: a stale or missing global would otherwise
    # mislabel the output files or raise NameError on a fresh kernel.
    ref = species.which_reference(release)

    er = EnsemblRelease(release=release, species=species)
    er.download()
    er.index()

    gene_columns = ['gene_id', 'gene_name', 'contig', 'start', 'end', 'strand', 'biotype']
    gdf = pd.DataFrame([gene.to_dict() for gene in er.genes()])
    gdf = gdf[gene_columns].set_index('gene_id')

    tdf = pd.DataFrame([transcript.to_dict() for transcript in er.transcripts()])
    tdf = tdf[['gene_id', 'transcript_id']].set_index('transcript_id')

    # parents=True so the default nested directory can be created from scratch.
    datadir = Path(datadir)
    datadir.mkdir(parents=True, exist_ok=True)

    stem = f'datafile_{species.latin_name}-ensembl{release}-{ref}'
    gdf.to_pickle(datadir / f'{stem}.pkl.xz')
    tdf.to_pickle(datadir / f'{stem}-tx2gene.pkl.xz')

    return gdf, tdf

## Human annotations

### Download HGNC gene name table

In [3]:
# HGNC custom-download endpoint. The query string is split one parameter per
# line (adjacent literals are concatenated) so the requested columns and
# status filters are easy to read and edit.
url = (
    'https://www.genenames.org/cgi-bin/download/custom'
    '?col=gd_app_sym'
    '&col=gd_app_name'
    '&col=gd_status'
    '&col=gd_prev_sym'
    '&col=gd_aliases'
    '&col=gd_pub_chrom_map'
    '&col=gd_pub_ensembl_id'
    '&status=Approved'
    '&status=Entry%20Withdrawn'
    '&hgnc_dbtag=on'
    '&order_by=gd_app_sym_sort'
    '&format=text'
    '&submit=submit'
)

# The endpoint is read as a delimited text table straight into a DataFrame.
human_genes = pd.read_table(url)
human_genes

Unnamed: 0,Approved symbol,Approved name,Status,Previous symbols,Synonyms,Chromosome,Ensembl gene ID
0,A1BG,alpha-1-B glycoprotein,Approved,,,19q13.43,ENSG00000121410
1,A1BG-AS1,A1BG antisense RNA 1,Approved,"NCRNA00181, A1BGAS, A1BG-AS",FLJ23569,19q13.43,ENSG00000268895
2,A1CF,APOBEC1 complementation factor,Approved,,"ACF, ASP, ACF64, ACF65, APOBEC1CF",10q11.23,ENSG00000148584
3,A1S9T,"symbol withdrawn, see [HGNC:12469](/data/gene-...",Symbol Withdrawn,,,,
4,A2M,alpha-2-macroglobulin,Approved,,"FWP007, S863-7, CPAMD5",12p13.31,ENSG00000175899
...,...,...,...,...,...,...,...
46814,ZYG11B,"zyg-11 family member B, cell cycle regulator",Approved,ZYG11,FLJ13456,1p32.3,ENSG00000162378
46815,ZYX,zyxin,Approved,,,7q34,ENSG00000159840
46816,ZYXP1,zyxin pseudogene 1,Approved,,,8q24.23,ENSG00000274572
46817,ZZEF1,zinc finger ZZ-type and EF-hand domain contain...,Approved,,"KIAA0399, ZZZ4, FLJ10821",17p13.2,ENSG00000074755


### GRCh38

In [4]:
# Human, Ensembl release 97.
species, release = human, 97

# Reference name that pyensembl reports for this species/release pair.
ref = species.which_reference(release)

gdf, tdf = fetch_genes_and_transcripts(release=release, species=species)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh38/ensembl97/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


In [5]:
# Preview the first five rows of the gene table.
gdf.head(n=5)

Unnamed: 0_level_0,gene_name,contig,start,end,strand,biotype
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003,TSPAN6,X,100627109,100639991,-,protein_coding
ENSG00000000005,TNMD,X,100584936,100599885,+,protein_coding
ENSG00000000419,DPM1,20,50934867,50958555,-,protein_coding
ENSG00000000457,SCYL3,1,169849631,169894267,-,protein_coding
ENSG00000000460,C1orf112,1,169662007,169854080,+,protein_coding


In [6]:
# Index the HGNC table by Ensembl gene ID so it can be joined on gdf's index.
# Rows without an Ensembl ID cannot match anything, and duplicated IDs would
# multiply gene rows in a left join, so both are removed first (the first HGNC
# entry per Ensembl ID is kept).
hgnc_by_ensembl = (
    human_genes
    .dropna(subset=['Ensembl gene ID'])
    .drop_duplicates(subset='Ensembl gene ID')
    .set_index('Ensembl gene ID')
)

# Drop any HGNC columns left over from a previous run of this cell so that
# re-running it gives the same result instead of colliding column names.
gdf = (
    gdf
    .drop(columns=hgnc_by_ensembl.columns, errors='ignore')
    .join(hgnc_by_ensembl, how='left')
)
gdf.head()

Unnamed: 0_level_0,gene_name,contig,start,end,strand,biotype,Approved symbol,Approved name,Status,Previous symbols,Synonyms,Chromosome
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000000003,TSPAN6,X,100627109,100639991,-,protein_coding,TSPAN6,tetraspanin 6,Approved,TM4SF6,"T245, TSPAN-6",Xq22.1
ENSG00000000005,TNMD,X,100584936,100599885,+,protein_coding,TNMD,tenomodulin,Approved,,"myodulin, ChM1L, tendin, TEM, BRICD4",Xq22.1
ENSG00000000419,DPM1,20,50934867,50958555,-,protein_coding,DPM1,dolichyl-phosphate mannosyltransferase subunit...,Approved,,"MPDS, CDGIE",20q13.13
ENSG00000000457,SCYL3,1,169849631,169894267,-,protein_coding,SCYL3,SCY1 like pseudokinase 3,Approved,,"PACE-1, PACE1",1q24.2
ENSG00000000460,C1orf112,1,169662007,169854080,+,protein_coding,C1orf112,chromosome 1 open reading frame 112,Approved,,FLJ10706,1q24.2


In [7]:
# Preview the first five rows of the transcript-to-gene table.
tdf.head(n=5)

Unnamed: 0_level_0,gene_id
transcript_id,Unnamed: 1_level_1
ENST00000000233,ENSG00000004059
ENST00000000412,ENSG00000003056
ENST00000000442,ENSG00000173153
ENST00000001008,ENSG00000004478
ENST00000001146,ENSG00000003137


### GRCh37

In [8]:
# Human, Ensembl release 75.
species, release = human, 75

# Reference name that pyensembl reports for this species/release pair.
ref = species.which_reference(release)

gdf, tdf = fetch_genes_and_transcripts(release=release, species=species)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


## Mouse annotations

### GRCm38

In [9]:
# Mouse, Ensembl release 97.
species, release = mouse, 97

# Reference name that pyensembl reports for this species/release pair.
ref = species.which_reference(release)

gdf, tdf = fetch_genes_and_transcripts(release=release, species=species)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCm38/ensembl97/Mus_musculus.GRCm38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCm38/ensembl97/Mus_musculus.GRCm38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCm38/ensembl97/Mus_musculus.GRCm38.pep.all.fa.gz.pickle


In [10]:
# Preview the first five rows of the mouse gene table.
gdf.head(n=5)

Unnamed: 0_level_0,gene_name,contig,start,end,strand,biotype
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000000001,Gnai3,3,108107280,108146146,-,protein_coding
ENSMUSG00000000003,Pbsn,X,77837901,77853623,-,protein_coding
ENSMUSG00000000028,Cdc45,16,18780447,18811987,-,protein_coding
ENSMUSG00000000031,H19,7,142575529,142578143,-,lncRNA
ENSMUSG00000000037,Scml2,X,161082525,161258213,+,protein_coding


In [11]:
# Preview the first five rows of the mouse transcript-to-gene table.
tdf.head(n=5)

Unnamed: 0_level_0,gene_id
transcript_id,Unnamed: 1_level_1
ENSMUST00000000001,ENSMUSG00000000001
ENSMUST00000000003,ENSMUSG00000000003
ENSMUST00000000010,ENSMUSG00000020875
ENSMUST00000000028,ENSMUSG00000000028
ENSMUST00000000033,ENSMUSG00000048583
