In [None]:
"""

This script builds a single BED file describing all annotated features (introns, exons, genes).
That file is then intersected against a dataset to gather information about feature coverage, splicing, etc.

"""

In [2]:
import sys, pysam, os, numpy, re

**First, follow these steps to download the necessary BED files from the UCSC Table Browser:**

i) Go to the UCSC Table Browser (https://genome.ucsc.edu/cgi-bin/hgTables) and select the genome and assembly of interest. From the Group menu, select “Genes and Gene Predictions”, then select the track (e.g. NCBI RefSeq) and table (e.g. RefSeq All) of interest. Select BED as the output format and click “get output”.

ii) Under “create one BED record per:”, select “genes plus 0 bases at each end” and click the “get BED” button.

iii) Repeat steps i) and ii) twice more, selecting “introns plus 0 bases at each end” and then “exons plus 0 bases at each end” in step ii), so that you end up with three BED files (genes, introns, exons).

iv) Parse the BED files to (a) remove the "chr" prefix so that chromosome names are consistent with the BAM files, (b) remove non-standard chromosomes (random, Un, alt), and (c) keep only protein-coding transcripts (RefSeq accessions starting with NM_). For each BED file (genes/introns/exons), run the following command in the terminal:

```
sed -e 's/chr//g' <genes/introns/exons>.bed | grep -v random | grep -v Un | grep -v alt | grep NM_ | sort -k1,1 -k2,2n > <genes/introns/exons>_parsed.bed
```

v) Follow the steps below to remove duplicate features and combine genes, introns and exons into one file.


In [45]:
# Open the three parsed RefSeq BED files (genes, exons, introns) for reading.
# NOTE(review): no visible cell reads or closes these handles -- confirm a later
# cell still needs them; otherwise they stay open for the life of the kernel.
iRefSeq_genes, iRefSeq_exons, iRefSeq_introns = (
    open("/path/to/annotation_files/NCBI_RefSeq_hg38_genes_parsed.bed", mode='r'),
    open("/path/to/annotation_files/NCBI_RefSeq_hg38_exons_parsed.bed", mode='r'),
    open("/path/to/annotation_files/NCBI_RefSeq_hg38_introns_parsed.bed", mode='r'),
)

In [3]:
# merge RefSeq gene, exon, and intron datasets into 1 searchable file
#
# Every output line has six tab-separated columns:
#   chrom, start, end, transcript name, feature label, strand
# Gene records are written first, then exon records, then intron records.


def _format_gene_record(line):
    """Convert one line of the parsed genes BED file into a merged-file record.

    Reads columns 1-4 (chrom, start, end, name) and column 6 (strand) of the
    tab-separated input line and labels the record with the feature "gene".

    Returns the output line, including its trailing newline.
    Raises IndexError if the line has fewer than six columns or an empty
    strand column.
    """
    fields = line.split('\t')
    # Keep only the first character of the strand column so that a trailing
    # newline (when strand is the last column on the line) is dropped.
    strand = fields[5][0]
    return '\t'.join((fields[0], fields[1], fields[2], fields[3], "gene", strand)) + '\n'


def _format_subfeature_record(line):
    """Convert one line of the parsed exons/introns BED file into a merged-file record.

    The name column (column 4) is split on '_' and read positionally:
    element 1 is the accession, element 2 the feature type and element 3 the
    feature index, e.g. "NM_<accession>_<feature>_<index>_...".
    The index is incremented by one before it is written
    (presumably converting a 0-based index to a 1-based number -- TODO confirm).

    Returns the output line, including its trailing newline.
    Raises IndexError if expected columns / name elements are missing and
    ValueError if the feature index is not an integer.
    """
    fields = line.split('\t')
    strand = fields[5][0]                 # first character only, see _format_gene_record
    name_parts = fields[3].split('_')
    # NOTE(review): the "NM_" prefix is hard-coded rather than taken from
    # name_parts[0]; this is only correct because the input was filtered to NM_ rows.
    transcript = 'NM_' + name_parts[1]
    feature_label = name_parts[2] + '_' + str(int(name_parts[3]) + 1)
    return '\t'.join((fields[0], fields[1], fields[2], transcript, feature_label, strand)) + '\n'


# Open all inputs and the output in a single `with` block so that every handle
# is closed -- and the merged file is fully flushed to disk -- when the cell
# finishes, even if a line fails to parse part-way through.
with open("/path/to/annotation_files/NCBI_RefSeq_hg38_genes_parsed.bed", 'r') as iGenes, \
     open("/path/to/annotation_files/NCBI_RefSeq_hg38_exons_parsed.bed", 'r') as iExons, \
     open("/path/to/annotation_files/NCBI_RefSeq_hg38_introns_parsed.bed", 'r') as iIntrons, \
     open("/path/to/annotation_files/NCBI_RefSeq_hg38_merge_parsed.bed", 'w') as oMerge:

    # go through every line of gene file and collect information
    for line in iGenes:
        if not line.strip():
            continue                      # skip blank lines instead of crashing on them
        oMerge.write(_format_gene_record(line))

    # exon and intron files share the same layout, so they share one loop body
    for iFeatures in (iExons, iIntrons):
        for line in iFeatures:
            if not line.strip():
                continue                  # skip blank lines instead of crashing on them
            oMerge.write(_format_subfeature_record(line))
