In [29]:
import processGFF3 as pg
import processFa as pf
from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition, SimpleLocation
from Bio.SeqRecord import SeqRecord
import utils as ut
import subprocess

## Set up input and outputs

In [30]:
GenomeBuild = "GRCh38" # Human
#GenomeBuild = "GRCm39" # Mouse
#GenomeBuild = "GRCz11" # Zebrafish

spp = f"Homo_sapiens.{GenomeBuild}" # Human
# Spp = f"Mus_musculus.{spp}" # Mouse
# Spp = f"Danio_rerio.{spp}" # Zebrafish

gff3_path = f"{spp}.107.gff3.gz"
fasta_path = f"{spp}.dna_sm.primary_assembly.fa.gz"
out_EI_fa_path = f"Exon_intron_junc_seqs.{GenomeBuild}.fa"
out_IE_fa_path = f"Intron_exon_junc_seqs.{GenomeBuild}.fa"
out_EI_weblogo_path = f"Exon_intron_junc_seqs.{GenomeBuild}.weblogo.eps"
out_IE_weblogo_path = f"Intron_exon_junc_seqs.{GenomeBuild}.weblogo.eps"

## Loade and parse GFF3

In [31]:
ENST_info = pg.parse_gff3(gff3_path)

-processing ENST IDs
-processing exons
-processing cds


## Load and parse genome fasta

In [None]:
print("-loading genome sequence...")
genome_seqs = pf.parse_fasta(fasta_path)

-loading genome sequence...


## Extract exon junctions

In [None]:
counter = 0
with open(out_EI_path, "w") as ei, open(out_IE_path, "w") as ie:

    for ENST in ENST_info.keys():
        features = ENST_info[ENST].features
        exons = [i for i in features if i.type=="exon"]
        sorted_exons = ut.sort_exons(exons)

        EI_junc_coord = ut.extract_exon_intron_junc_coord(sorted_exons)
        IE_junc_coord = ut.extract_intron_exon_junc_coord(sorted_exons)
        
        EI_junc_seqs = ut.extract_junc_seq(EI_junc_coord, genome_seqs, left_padding = 20, right_padding = 20)   
        IE_junc_seqs = ut.extract_junc_seq(IE_junc_coord, genome_seqs, left_padding = 20, right_padding = 20)

        for index, seq in enumerate(EI_junc_seqs):
            ei.write(f">{ENST}_{index}\n{str(seq)}\n")

        for index, seq in enumerate(IE_junc_seqs):
            ie.write(f">{ENST}_{index}\n{str(seq)}\n")

        counter += 1
        if counter % 5000 == 0:
            print(counter, " ENSTs processed")

5000  ENSTs processed
10000  ENSTs processed
15000  ENSTs processed
20000  ENSTs processed
25000  ENSTs processed
30000  ENSTs processed
35000  ENSTs processed
40000  ENSTs processed
45000  ENSTs processed
50000  ENSTs processed
55000  ENSTs processed


In [None]:
subprocess.run(["weblogo", "-f", out_EI_fa_path, "-D", "fasta", "-F", "eps","-o", out_EI_weblogo_path])
subprocess.run(["weblogo", "-f", out_IE_fa_path, "-D", "fasta", "-F", "eps","-o", out_IE_weblogo_path])

CompletedProcess(args=['weblogo', '-f', 'Intron_exon_junc_seqs.GRCz11.fa', '-D', 'fasta', '-F', 'eps', '-o', 'Intron_exon_junc_seqs.GRCz11.weblogo.eps'], returncode=0)