In [1]:
from pathlib import Path
import os
import tempfile
import urllib.request

import pandas as pd
import numpy as np
import pyranges as pr

from tqdm.auto import tqdm

In [2]:
def fetch_genes_and_transcripts(release=99,
                                species='homo_sapiens',
                                build='GRCh38',
                                pub_build='',
                                release2=None,
                                datadir=Path('pyannotables') / Path('data'),
                                join_gdf=None):
    """Download an Ensembl GTF and store gene and transcript tables as pickles.

    The ``chr_patch_hapl_scaff`` GTF for the requested species/build is
    downloaded into a temporary directory, parsed with ``pr.read_gtf`` and
    reduced to two tables, which are written to ``datadir`` as xz-compressed
    pickles and also returned.

    Parameters
    ----------
    release : int
        Ensembl release used for the ``release-{release}`` directory of the
        download URL and for the output file names.
    species : str
        Species directory name on the FTP site (e.g. ``'homo_sapiens'``). The
        GTF file name uses ``species.capitalize()``.
    build : str
        Genome build as it appears in the GTF file name and output file names.
    pub_build : str
        Optional sub-directory below ``pub/`` (lower-cased before use). Leave
        empty to download from ``pub/release-{release}`` directly.
    release2 : int, optional
        Release number embedded in the GTF file name. Defaults to ``release``.
    datadir : path-like
        Output directory; created (including missing parents) if necessary.
    join_gdf : pandas.DataFrame, optional
        Extra per-gene columns, joined onto the gene table on its ``gene_id``
        index.

    Returns
    -------
    (gdf, tdf) : tuple of pandas.DataFrame
        ``gdf`` holds the de-duplicated ``gene`` features indexed by
        ``gene_id``; ``tdf`` holds the de-duplicated ``transcript`` features
        indexed by ``transcript_id``.
    """
    if release2 is None:
        release2 = release

    # Join only the non-empty path components so that an empty ``pub_build``
    # does not leave a doubled slash ("pub//release-N") in the URL.
    url_parts = ['http://ftp.ensembl.org/pub']
    if pub_build:
        url_parts.append(pub_build.lower())
    url_parts += [
        f'release-{release}',
        'gtf',
        species,
        f'{species.capitalize()}.{build}.{release2}.chr_patch_hapl_scaff.gtf.gz',
    ]
    gtf_url = '/'.join(url_parts)

    # The downloaded file is only needed while parsing; the temporary
    # directory (and the file in it) is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        full_path = Path(tmpdir) / 'file.gtf.gz'
        urllib.request.urlretrieve(gtf_url, full_path)
        # NOTE(review): some pyranges versions name this keyword ``as_df``
        # instead of ``output_df`` -- confirm against the installed version.
        df = pr.read_gtf(full_path, output_df=True)

    gene_columns = ['Chromosome', 'Source', 'Start', 'End', 'Strand',
                    'gene_id', 'gene_name', 'gene_source', 'gene_biotype']
    gdf = (df.loc[df.Feature == 'gene', gene_columns]
             .drop_duplicates()
             .set_index('gene_id'))
    if join_gdf is not None:
        gdf = gdf.join(join_gdf)

    transcript_columns = ['Chromosome', 'Start', 'End', 'Strand',
                          'gene_id', 'transcript_id']
    tdf = (df.loc[df.Feature == 'transcript', transcript_columns]
             .drop_duplicates()
             .set_index('transcript_id'))

    # Accept plain strings as well as Path objects, and create missing parent
    # directories: the default 'pyannotables/data' is nested, so a bare
    # mkdir() fails when 'pyannotables' does not exist yet.
    datadir = Path(datadir)
    datadir.mkdir(parents=True, exist_ok=True)
    gdf.to_pickle(datadir / f'datafile_{species}-{build}-ensembl{release}.pkl.xz')
    tdf.to_pickle(datadir / f'datafile_{species}-{build}-ensembl{release}-tx2gene.pkl.xz')

    return gdf, tdf

## Human annotations

### Download HGNC gene name table

In [3]:
# HGNC custom download: symbol, name, status, previous/alias symbols,
# chromosome location and Ensembl gene ID for approved and withdrawn entries.
url = (
    'https://www.genenames.org/cgi-bin/download/custom'
    '?col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym'
    '&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_ensembl_id'
    '&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on'
    '&order_by=gd_app_sym_sort&format=text&submit=submit'
)

# Keep only rows that have an Ensembl gene ID and index the table by it.
# 'Chromosome' is renamed so it does not clash with the 'Chromosome' column
# of the GTF-derived gene table when the two are joined.
human_genes = (
    pd.read_table(url)
    .dropna(subset=['Ensembl gene ID'])
    .set_index('Ensembl gene ID')
    .rename(columns={'Chromosome': 'Chromosome_region'})
)
human_genes

Unnamed: 0_level_0,Approved symbol,Approved name,Status,Previous symbols,Alias symbols,Chromosome_region
Ensembl gene ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000121410,A1BG,alpha-1-B glycoprotein,Approved,,,19q13.43
ENSG00000268895,A1BG-AS1,A1BG antisense RNA 1,Approved,"NCRNA00181, A1BGAS, A1BG-AS",FLJ23569,19q13.43
ENSG00000148584,A1CF,APOBEC1 complementation factor,Approved,,"ACF, ASP, ACF64, ACF65, APOBEC1CF",10q11.23
ENSG00000175899,A2M,alpha-2-macroglobulin,Approved,,"FWP007, S863-7, CPAMD5",12p13.31
ENSG00000245105,A2M-AS1,A2M antisense RNA 1,Approved,,,12p13.31
...,...,...,...,...,...,...
ENSG00000162378,ZYG11B,"zyg-11 family member B, cell cycle regulator",Approved,ZYG11,FLJ13456,1p32.3
ENSG00000159840,ZYX,zyxin,Approved,,,7q34
ENSG00000274572,ZYXP1,zyxin pseudogene 1,Approved,,,8q24.23
ENSG00000074755,ZZEF1,zinc finger ZZ-type and EF-hand domain contain...,Approved,,"KIAA0399, ZZZ4, FLJ10821",17p13.2


### GRCh38

In [4]:
species = 'homo_sapiens'

# Build the GRCh38 tables for each listed Ensembl release, joining the HGNC
# annotations onto the gene table, and preview both tables for every release.
for release in tqdm((84, 93, 99)):
    gdf, tdf = fetch_genes_and_transcripts(
        release=release,
        species=species,
        join_gdf=human_genes,
    )
    display(gdf.head(), tdf.head())

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

Unnamed: 0_level_0,Chromosome,Source,Start,End,Strand,gene_name,gene_source,gene_biotype,Approved symbol,Approved name,Status,Previous symbols,Alias symbols,Chromosome_region
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000223972,1,havana,11869,14409,+,DDX11L1,havana,transcribed_unprocessed_pseudogene,DDX11L1,DEAD/H-box helicase 11 like 1,Approved,,,1p36.33
ENSG00000227232,1,havana,14404,29570,-,WASH7P,havana,unprocessed_pseudogene,WASH7P,"WASP family homolog 7, pseudogene",Approved,,FAM39F,1p36.33
ENSG00000278267,1,ensembl,17369,17436,-,MIR6859-1,ensembl,miRNA,,,,,,
ENSG00000243485,1,havana,29554,31109,+,RP11-34P13.3,havana,lincRNA,MIR1302-2HG,MIR1302-2 host gene,Approved,,,1p36.33
ENSG00000274890,1,ensembl,30366,30503,+,MIR1302-2,ensembl,miRNA,,,,,,


Unnamed: 0_level_0,Chromosome,Start,End,Strand,gene_id
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENST00000456328,1,11869,14409,+,ENSG00000223972
ENST00000450305,1,12010,13670,+,ENSG00000223972
ENST00000488147,1,14404,29570,-,ENSG00000227232
ENST00000619216,1,17369,17436,-,ENSG00000278267
ENST00000473358,1,29554,31097,+,ENSG00000243485


Unnamed: 0_level_0,Chromosome,Source,Start,End,Strand,gene_name,gene_source,gene_biotype,Approved symbol,Approved name,Status,Previous symbols,Alias symbols,Chromosome_region
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000223972,1,havana,11869,14409,+,DDX11L1,havana,transcribed_unprocessed_pseudogene,DDX11L1,DEAD/H-box helicase 11 like 1,Approved,,,1p36.33
ENSG00000227232,1,havana,14404,29570,-,WASH7P,havana,unprocessed_pseudogene,WASH7P,"WASP family homolog 7, pseudogene",Approved,,FAM39F,1p36.33
ENSG00000278267,1,mirbase,17369,17436,-,MIR6859-1,mirbase,miRNA,,,,,,
ENSG00000243485,1,havana,29554,31109,+,MIR1302-2HG,havana,lincRNA,MIR1302-2HG,MIR1302-2 host gene,Approved,,,1p36.33
ENSG00000284332,1,mirbase,30366,30503,+,MIR1302-2,mirbase,miRNA,MIR1302-2,microRNA 1302-2,Approved,MIRN1302-2,hsa-mir-1302-2,1p36.33


Unnamed: 0_level_0,Chromosome,Start,End,Strand,gene_id
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENST00000456328,1,11869,14409,+,ENSG00000223972
ENST00000450305,1,12010,13670,+,ENSG00000223972
ENST00000488147,1,14404,29570,-,ENSG00000227232
ENST00000619216,1,17369,17436,-,ENSG00000278267
ENST00000473358,1,29554,31097,+,ENSG00000243485


Unnamed: 0_level_0,Chromosome,Source,Start,End,Strand,gene_name,gene_source,gene_biotype,Approved symbol,Approved name,Status,Previous symbols,Alias symbols,Chromosome_region
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000223972,1,havana,11869,14409,+,DDX11L1,havana,transcribed_unprocessed_pseudogene,DDX11L1,DEAD/H-box helicase 11 like 1,Approved,,,1p36.33
ENSG00000227232,1,havana,14404,29570,-,WASH7P,havana,unprocessed_pseudogene,WASH7P,"WASP family homolog 7, pseudogene",Approved,,FAM39F,1p36.33
ENSG00000278267,1,mirbase,17369,17436,-,MIR6859-1,mirbase,miRNA,,,,,,
ENSG00000243485,1,havana,29554,31109,+,MIR1302-2HG,havana,lncRNA,MIR1302-2HG,MIR1302-2 host gene,Approved,,,1p36.33
ENSG00000284332,1,mirbase,30366,30503,+,MIR1302-2,mirbase,miRNA,MIR1302-2,microRNA 1302-2,Approved,MIRN1302-2,hsa-mir-1302-2,1p36.33


Unnamed: 0_level_0,Chromosome,Start,End,Strand,gene_id
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENST00000456328,1,11869,14409,+,ENSG00000223972
ENST00000450305,1,12010,13670,+,ENSG00000223972
ENST00000488147,1,14404,29570,-,ENSG00000227232
ENST00000619216,1,17369,17436,-,ENSG00000278267
ENST00000473358,1,29554,31097,+,ENSG00000243485





### GRCh37

In [5]:
# For GRCh37 the download directory uses `release`, while the GTF file name
# carries its own release number (`release2`).
release = 99
release2 = 87

gdf, tdf = fetch_genes_and_transcripts(
    release=release,
    species=species,
    build='GRCh37',
    pub_build='GRCh37',
    release2=release2,
    join_gdf=human_genes,
)

In [6]:
# Preview the first rows of the GRCh37 gene table (with HGNC columns joined).
gdf.head()

Unnamed: 0_level_0,Chromosome,Source,Start,End,Strand,gene_name,gene_source,gene_biotype,Approved symbol,Approved name,Status,Previous symbols,Alias symbols,Chromosome_region
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000223972,1,ensembl_havana,11869,14412,+,DDX11L1,ensembl_havana,pseudogene,DDX11L1,DEAD/H-box helicase 11 like 1,Approved,,,1p36.33
ENSG00000227232,1,ensembl_havana,14363,29806,-,WASH7P,ensembl_havana,pseudogene,WASH7P,"WASP family homolog 7, pseudogene",Approved,,FAM39F,1p36.33
ENSG00000243485,1,ensembl_havana,29554,31109,+,MIR1302-10,ensembl_havana,lincRNA,MIR1302-2HG,MIR1302-2 host gene,Approved,,,1p36.33
ENSG00000237613,1,ensembl_havana,34554,36081,-,FAM138A,ensembl_havana,lincRNA,FAM138A,family with sequence similarity 138 member A,Approved,,F379,1p36.33
ENSG00000268020,1,ensembl_havana,52473,54936,+,OR4G4P,ensembl_havana,pseudogene,OR4G4P,olfactory receptor family 4 subfamily G member...,Approved,,,1p36.33


## Mouse annotations

### GRCm38

In [7]:
# Mouse tables: Ensembl release 99 on GRCm38, with no extra gene annotations joined.
gdf, tdf = fetch_genes_and_transcripts(
    release=99,
    species='mus_musculus',
    build='GRCm38',
)

In [8]:
# Preview the first rows of the mouse (GRCm38) gene table.
gdf.head()

Unnamed: 0_level_0,Chromosome,Source,Start,End,Strand,gene_name,gene_source,gene_biotype
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSMUSG00000102693,1,havana,3073253,3074322,+,4933401J01Rik,havana,TEC
ENSMUSG00000064842,1,ensembl,3102016,3102125,+,Gm26206,ensembl,snRNA
ENSMUSG00000051951,1,ensembl_havana,3205901,3671498,-,Xkr4,ensembl_havana,protein_coding
ENSMUSG00000102851,1,havana,3252757,3253236,+,Gm18956,havana,processed_pseudogene
ENSMUSG00000103377,1,havana,3365731,3368549,-,Gm37180,havana,TEC


In [9]:
# Preview the first rows of the mouse (GRCm38) transcript-to-gene table.
tdf.head()

Unnamed: 0_level_0,Chromosome,Start,End,Strand,gene_id
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUST00000193812,1,3073253,3074322,+,ENSMUSG00000102693
ENSMUST00000082908,1,3102016,3102125,+,ENSMUSG00000064842
ENSMUST00000162897,1,3205901,3216344,-,ENSMUSG00000051951
ENSMUST00000159265,1,3206523,3215632,-,ENSMUSG00000051951
ENSMUST00000070533,1,3214482,3671498,-,ENSMUSG00000051951
