In [98]:
#!/usr/bin/env python3

import argparse
#import cairo
import re

# parser = argparse.ArgumentParser()
# parser.add_argument("-f", "--filename", required=True, help='Fasta file containing \
# single exons (capital) and flanking introns (lower-case)')
# parser.add_argument("-m", "--motifs", required=True, help='A file containing \
# query motifs (must be one sequence per line)')
# args = parser.parse_args()
        

# Lookup tables between IUPAC nucleotide codes and regex character classes.
# Every class accepts upper and lower case, and treats T and U as the same base.
regex = {
    # single bases
    'A': '[Aa]',
    'T': '[TtUu]',
    'G': '[Gg]',
    'C': '[Cc]',
    'U': '[TtUu]',
    # two-base ambiguity codes
    'W': '[AaTtUu]',
    'S': '[GgCc]',
    'M': '[AaCc]',
    'K': '[GgTtUu]',
    'R': '[AaGg]',
    'Y': '[CcTtUu]',
    # three-base ambiguity codes
    'B': '[CcGgTtUu]',
    'D': '[AaGgTtUu]',
    'H': '[AaCcTtUu]',
    'V': '[AaCcGg]',
    # any base
    'N': '[GgCcAaTtUu]',
}

# Reverse lookup: character-class body (without the brackets) -> IUPAC code.
# The shared T/U class maps back to 'T'.
rev_regex = {
    'Gg': 'G',
    'Cc': 'C',
    'Aa': 'A',
    'TtUu': 'T',
    'AaTtUu': 'W',
    'GgCc': 'S',
    'AaCc': 'M',
    'GgTtUu': 'K',
    'AaGg': 'R',
    'CcTtUu': 'Y',
    'CcGgTtUu': 'B',
    'AaGgTtUu': 'D',
    'AaCcTtUu': 'H',
    'AaCcGg': 'V',
    'GgCcAaTtUu': 'N',
}

In [99]:
# Convert input motifs (one IUPAC motif per line) to regex strings
with open("motifs.txt", 'r') as fh:
    regex_motifs = []  # converted motifs, in file order
    for line in fh:
        motif = line.strip().upper()
        # Skip blank lines (e.g. a trailing newline at the end of the file):
        # an empty pattern would match at every position of every sequence.
        if not motif:
            continue
        # Each IUPAC code becomes its case-insensitive character class;
        # a character missing from `regex` raises KeyError.
        regex_motifs.append(''.join(regex[each] for each in motif))

In [100]:
# FASTA header (without the leading '>') -> [gene length, exon start, exon length]
genes_index = dict()

# FASTA header (without the leading '>') -> list of motif start positions
motifs_index = dict()

def main_coords(sequence):
    """Locate the exon inside a single FASTA sequence.

    The exon is the upper-case region; the flanking introns are lower-case.

    Args:
        sequence: the full sequence of one record as a single string.

    Returns:
        A list [gene length, exon start (0-based), exon length].

    Raises:
        ValueError: if the sequence contains no upper-case character.
    """
    # Measure the argument itself rather than a module-level variable, so the
    # result does not depend on whatever the kernel happens to hold.
    gene_len = len(sequence)

    first_upper = re.search('[A-Z]', sequence)
    if first_upper is None:
        raise ValueError('sequence contains no upper-case exon region')
    exon_start = first_upper.start()

    # The exon ends at the first lower-case letter after its start. When there
    # is no downstream intron, it runs to the end of the sequence.
    next_lower = re.compile('[a-z]').search(sequence, exon_start)
    exon_end = next_lower.start() if next_lower is not None else gene_len

    return [gene_len, exon_start, exon_end - exon_start]

def motif_coords(sequence, motifs=None):
    """Find the start positions of every motif in a single FASTA sequence.

    Args:
        sequence: the full sequence of one record as a single string.
        motifs: iterable of regex pattern strings to search for. Defaults to
            the module-level `regex_motifs` list.

    Returns:
        A flat list of 0-based start positions, grouped by motif in the order
        the motifs are given. Hits of one motif are non-overlapping, because
        `re.finditer` resumes scanning after the end of each match.
    """
    if motifs is None:
        motifs = regex_motifs
    out = []
    for motif in motifs:
        # An empty pattern matches at every position; it is never a real motif.
        if not motif:
            continue
        out.extend(hit.start() for hit in re.finditer(motif, sequence))
    return out
        
    

In [105]:
# Main loop: read the FASTA file one record at a time and index each gene.
# Both names are reset up front so that an empty file, or a re-run of this
# cell, never picks up a header or sequence left over in the kernel.
header = None  # header line of the record being read; None until the first '>'
seq = ''       # sequence collected so far for the current record
with open('sequence.txt', 'r') as fasta:
    for NR, line in enumerate(fasta, start=1):  # NR is the 1-based line number
        line = line.strip()
        if line.startswith('>'):
            # A new header closes the previous record, so index that one first
            if header is not None:
                genes_index[header[1:]] = main_coords(seq)
                motifs_index[header[1:]] = motif_coords(seq)
            # Save next header and empty seq
            header = line
            seq = ''
        elif not line:
            # Blank lines carry no sequence data
            continue
        elif header is None:
            raise ValueError(
                'sequence.txt line {}: sequence data before the first '
                'FASTA header'.format(NR))
        else:
            # Sequences may be wrapped over several lines; join them
            seq += line

# The last record has no following header to trigger its indexing
if header is not None:
    genes_index[header[1:]] = main_coords(seq)
    motifs_index[header[1:]] = motif_coords(seq)

print(genes_index)
print(motifs_index)

{'INSR chr19:7149896-7151209 (reverse complement)': [858, 317, 36], 'MBNL chr3:152432504-152433226 (reverse complement)': [723, 306, 204], 'ADD3 chr10:111891895-111892326': [432, 168, 96]}
{'INSR chr19:7149896-7151209 (reverse complement)': [7, 130, 147, 217, 338, 379, 383, 406, 417, 470, 484, 508, 571, 595, 617, 641, 725, 825, 451, 452, 534], 'MBNL chr3:152432504-152433226 (reverse complement)': [17, 414, 465], 'ADD3 chr10:111891895-111892326': [37, 49, 117, 169, 196, 428, 135, 34, 136]}
