In [None]:
import pandas as pd
import gzip
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import OrderedDict

In [None]:
# Genotype lookup table, indexed by genotype name (first column).
# The file is tab-separated despite its .csv extension; the "FASTA" and "GFF3"
# columns hold the file names that are read from Genome/ and GFF3/ further down.
fdf = pd.read_csv(
    "files_names.csv",
    sep="\t",
    index_col=0,
)

In [None]:
ext_len = 500  # Minimum 3' UTR length; shorter UTRs are padded with downstream genomic sequence
cds_tail_len = 500  # Number of bases from the 3' end of the CDS kept in front of the 3' UTR


def _fast3p_record(gene_id, gene_seq, chrom_seq, strand, tx_start, tx_end,
                   ext_len=500, cds_tail_len=500):
    """Build the fast3p SeqRecord (CDS tail + 3' UTR) for one transcript.

    Parameters
    ----------
    gene_id : str
        Identifier used as the FASTA record id.
    gene_seq : dict
        Accumulated "CDS" and "three_prime_UTR" sequences, already in
        transcript orientation.
    chrom_seq : Bio.Seq.Seq
        Sequence of the chromosome/contig the transcript lies on.
    strand : str
        "+" for the forward strand; anything else is treated as reverse.
    tx_start, tx_end : int
        0-based, half-open span of the transcript (taken from the mRNA row).
    ext_len : int
        If the annotated 3' UTR is shorter than this, it is padded with the
        genomic sequence immediately downstream of the transcript.
    cds_tail_len : int
        Number of trailing CDS bases to keep.

    Returns
    -------
    Bio.SeqRecord.SeqRecord
    """
    utr = gene_seq["three_prime_UTR"]
    missing = ext_len - len(utr)
    if missing > 0:
        if strand == "+":
            # Slicing past the chromosome end simply truncates.
            utr = utr + chrom_seq[tx_end:tx_end + missing]
        else:
            # Downstream on the reverse strand is towards lower coordinates.
            # Clamp at 0 so a negative index cannot wrap around or empty the slice.
            lower = max(tx_start - missing, 0)
            utr = utr + chrom_seq[lower:tx_start].reverse_complement()
    # Seq(gene_seq["five_prime_UTR"] + gene_seq["CDS"] + utr) would build the whole transcript
    return SeqRecord(seq=Seq(gene_seq["CDS"][-cds_tail_len:] + utr), id=gene_id, name='', description='')


for genotype, file_names in fdf.iterrows():
    print(genotype)
    # Load the genome into a dictionary of chromosome id -> sequence for easy access
    with gzip.open("Genome/" + file_names["FASTA"], "rt") as handle:
        fasta_dict = {record.id: record.seq for record in SeqIO.parse(handle, "fasta")}

    gff = pd.read_csv("GFF3/" + file_names["GFF3"], header=None, sep="\t", comment='#', compression='gzip', low_memory=False)
    # gff = gff.iloc[:100] # For development work with fewer lines

    gene_od = OrderedDict()  # geneID -> fast3p SeqRecord, in the order transcripts are completed
    # State of the primary transcript currently being assembled. Reset per file so
    # that nothing leaks from the previous genotype.
    geneID = None
    gene_seq = None
    tx_chrom = tx_start = tx_end = tx_strand = None
    skip_until_next_mRNA = True  # Only canonical/primary transcripts are processed

    for _, row in gff.iterrows():
        feature = row[2]  # gene, mRNA, five_prime_UTR, CDS or three_prime_UTR

        if feature == "mRNA":
            attributes = [a.strip() for a in row[8].split(";")]
            is_primary = "canonical_transcript=1" in attributes or "longest=1" in attributes
            parent = next((a[len("Parent="):] for a in attributes if a.startswith("Parent=")), None)

            # Non-primary transcripts (or ones we cannot name) are skipped entirely.
            if not is_primary or parent is None:
                skip_until_next_mRNA = True
                continue

            # A new primary transcript starts: the previous one is complete, store it.
            if gene_seq is not None:
                gene_od[geneID] = _fast3p_record(geneID, gene_seq, fasta_dict[tx_chrom], tx_strand,
                                                 tx_start, tx_end, ext_len, cds_tail_len)

            geneID = parent.replace(".RefGen_V4", "")
            gene_seq = {"CDS": "", "five_prime_UTR": "", "three_prime_UTR": ""}
            # The mRNA row spans the whole transcript, so its coordinates give the
            # 3' terminus regardless of the order in which child features are listed.
            # GFF3 coordinates are 1-based inclusive -> convert to 0-based half-open.
            tx_chrom, tx_start, tx_end, tx_strand = row[0], row[3] - 1, row[4], row[6]
            skip_until_next_mRNA = False
            continue

        # Rows that belong to a non-primary transcript
        if skip_until_next_mRNA:
            continue

        if feature in ("CDS", "five_prime_UTR", "three_prime_UTR"):
            segment = fasta_dict[row[0]][row[3] - 1:row[4]]
            if row[6] != "+":
                segment = segment.reverse_complement()
            # NOTE(review): segments are concatenated in file order, which is only
            # correct if features are listed 5'->3' along the transcript - confirm
            # this holds for reverse-strand genes in these GFF3 files.
            gene_seq[feature] += segment

    # No further mRNA row follows the last primary transcript, so store it here.
    if gene_seq is not None:
        gene_od[geneID] = _fast3p_record(geneID, gene_seq, fasta_dict[tx_chrom], tx_strand,
                                         tx_start, tx_end, ext_len, cds_tail_len)

    with open(f'fast3p/{genotype}_fast3p.fa', 'w') as handle:
        SeqIO.write(gene_od.values(), handle, 'fasta')

# Attempt (failed) to adjust overlapping UTRs and CDS sequences

In [None]:
# Print the distance between each primary gene model and the next one in the list
# and flag pairs that are closer than min_gap.
# NOTE: primary_coord_list is built in the cell below, so that cell has to be run first.
# NOTE(review): primary_coord_list holds [start, end, strand] only, so the last gene of
# one chromosome is also compared with the first gene of the next - confirm this is intended.
limit_dict = {}
min_gap = 2000  # bp
# Pair every entry with its successor; the last entry has no successor and is not
# compared (indexing rx+1 over the full range raised IndexError).
for (_, prev_end, _), (next_start, _, _) in zip(primary_coord_list, primary_coord_list[1:]):
    gap = next_start - prev_end
    print(gap)
    if gap < min_gap:
        print("Too close")

In [None]:
# To avoid extending a 3' UTR into another gene we first collect the coordinates of all
# primary gene models. The list can then be indexed to check that an extension does not
# run into the next gene.
# If primary gene models are more than 2000 bp apart we can skip them for the time being
# Uses `gff` as left behind by the fast3p loop, i.e. the last GFF3 file that was read.
primary_gene_list = []   # gene IDs of primary transcripts
primary_coord_list = []  # matching [start (0-based), end, strand] taken from the mRNA row
geneID = ""
skip_until_next_mRNA = True  # Only dealing with canonical/primary transcripts
for _, gff_line in gff.iterrows():
    # Only mRNA rows carry the information collected here; every other
    # feature type (gene, five_prime_UTR, CDS, three_prime_UTR) is ignored.
    if gff_line[2] != "mRNA":
        continue

    attrs = gff_line[8].split(";")
    # A transcript is primary when it is flagged canonical or longest
    skip_until_next_mRNA = not ("canonical_transcript=1" in attrs or "longest=1" in attrs)
    if skip_until_next_mRNA:
        continue

    for attr in attrs:
        if "Parent=" in attr:
            primary_gene_list.append(attr[7:].replace(".RefGen_V4", ""))
            primary_coord_list.append([gff_line[3] - 1, gff_line[4], gff_line[6]])