In [1]:
from pathlib import Path
import os

import pandas as pd

from pyensembl import EnsemblRelease, genome_for_reference_name
from pyensembl.species import human, mouse

In [2]:
def fetch_genes_and_transcripts(release, species, datadir=Path('data'), ref=None):
    """Export gene and transcript-to-gene tables for one Ensembl release.

    Downloads and indexes the annotation through pyensembl, builds two
    data frames, and writes each one as a ``.csv.xz`` file under ``datadir``.

    Parameters
    ----------
    release : int
        Ensembl release number.
    species : pyensembl species object
        Its ``latin_name`` and ``which_reference`` are used to build the
        output file names.
    datadir : path-like, default ``Path('data')``
        Output directory; created (with parents) if it does not exist.
    ref : str, optional
        Reference name embedded in the output file names. When omitted it
        is derived as ``species.which_reference(release)`` so the function
        no longer depends on a notebook-level ``ref`` variable.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``gdf`` indexed by ``gene_id`` with columns gene_name, contig,
        start, end, strand, biotype; and ``tdf`` indexed by
        ``transcript_id`` with a single ``gene_id`` column.
    """
    er = EnsemblRelease(release=release, species=species)
    er.download()
    er.index()

    # Resolve the reference name locally instead of reading a global that
    # the calling cell may have forgotten to set (or left stale).
    if ref is None:
        ref = species.which_reference(release)

    gene_columns = ['gene_id', 'gene_name', 'contig', 'start', 'end', 'strand', 'biotype']
    gdf = (
        pd.DataFrame([gene.to_dict() for gene in er.genes()])[gene_columns]
        .set_index('gene_id')
    )

    tdf = (
        pd.DataFrame([transcript.to_dict() for transcript in er.transcripts()])
        [['gene_id', 'transcript_id']]
        .set_index('transcript_id')
    )

    # Accept plain strings as well as Path objects for the output directory.
    outdir = Path(datadir)
    outdir.mkdir(parents=True, exist_ok=True)

    prefix = f'{species.latin_name}-ensembl{release}-{ref}'
    gdf.to_csv(outdir / f'{prefix}.csv.xz')
    tdf.to_csv(outdir / f'{prefix}-tx2gene.csv.xz')

    return gdf, tdf

## Human annotations

### GRCh38

In [3]:
# Human annotations from Ensembl release 95.
species = human
release = 95
ref = species.which_reference(release)  # reference name for this release

gdf, tdf = fetch_genes_and_transcripts(release=release, species=species)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh38/ensembl95/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh38/ensembl95/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh38/ensembl95/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


In [4]:
# Preview the first five rows of the gene table.
gdf.head(n=5)

Unnamed: 0_level_0,gene_name,contig,start,end,strand,biotype
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003,TSPAN6,X,100627109,100639991,-,protein_coding
ENSG00000000005,TNMD,X,100584802,100599885,+,protein_coding
ENSG00000000419,DPM1,20,50934867,50958555,-,protein_coding
ENSG00000000457,SCYL3,1,169849631,169894267,-,protein_coding
ENSG00000000460,C1orf112,1,169662007,169854080,+,protein_coding


In [5]:
# Preview the first five rows of the transcript-to-gene table.
tdf.head(n=5)

Unnamed: 0_level_0,gene_id
transcript_id,Unnamed: 1_level_1
ENST00000000233,ENSG00000004059
ENST00000000412,ENSG00000003056
ENST00000000442,ENSG00000173153
ENST00000001008,ENSG00000004478
ENST00000001146,ENSG00000003137


### GRCh37

In [6]:
# Human annotations from Ensembl release 75.
species = human
release = 75
ref = species.which_reference(release)  # reference name for this release

gdf, tdf = fetch_genes_and_transcripts(release=release, species=species)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


## Mouse annotations

### GRCm38

In [7]:
# Mouse annotations from Ensembl release 95.
species = mouse
release = 95
ref = species.which_reference(release)  # reference name for this release

gdf, tdf = fetch_genes_and_transcripts(release=release, species=species)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCm38/ensembl95/Mus_musculus.GRCm38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCm38/ensembl95/Mus_musculus.GRCm38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/gokcen/Library/Caches/pyensembl/GRCm38/ensembl95/Mus_musculus.GRCm38.pep.all.fa.gz.pickle


In [8]:
# Preview the first five rows of the gene table.
gdf.head(n=5)

Unnamed: 0_level_0,gene_name,contig,start,end,strand,biotype
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000000001,Gnai3,3,108107280,108146146,-,protein_coding
ENSMUSG00000000003,Pbsn,X,77837901,77853623,-,protein_coding
ENSMUSG00000000028,Cdc45,16,18780447,18811987,-,protein_coding
ENSMUSG00000000031,H19,7,142575529,142578143,-,lincRNA
ENSMUSG00000000037,Scml2,X,161117193,161258213,+,protein_coding


In [9]:
# Preview the first five rows of the transcript-to-gene table.
tdf.head(n=5)

Unnamed: 0_level_0,gene_id
transcript_id,Unnamed: 1_level_1
ENSMUST00000000001,ENSMUSG00000000001
ENSMUST00000000003,ENSMUSG00000000003
ENSMUST00000000010,ENSMUSG00000020875
ENSMUST00000000028,ENSMUSG00000000028
ENSMUST00000000033,ENSMUSG00000048583
