In [36]:
import functools
import re

from icecream import ic

In [37]:
# ic.disable()
# '''Want to read sequences and display out of memory, but can keep stats in memory'''
# def read_sequences(filename = 'genes.fa'):
#     with open(filename, 'r') as file:
#         # Read until line isn't read
#         while (line := file.readline().strip()):
#             ic(line)
#             if line.startswith('>'): # Remake this condition part to a regex
#                 title = line[1:]

#             else:
#                 display_seq_stats(title, *get_sequence_stats(line))
                
# read_sequences()

In [74]:
def splitter(fn):
    """Decorator letting ``fn`` accept a raw sequence string or pre-split segments.

    A ``str`` argument is split with ``re.split(r'([a-z]+)', seq)``. Because
    the pattern is a capturing group, the lowercase runs are kept in the
    result, so the segments alternate non-lowercase / lowercase starting at
    index 0. Any non-``str`` argument is handed to ``fn`` unchanged.

    Args:
        fn: Callable taking a single argument, the list of segments.

    Returns:
        A wrapper around ``fn`` that keeps ``fn``'s name and docstring.
    """
    @functools.wraps(fn)  # keep fn's __name__/__doc__ for help() and tracebacks
    def wrapper(seq):
        if isinstance(seq, str):
            return fn(re.split(r'([a-z]+)', seq))
        # Not a string (e.g. already split by the caller): pass through untouched.
        return fn(seq)

    return wrapper

@splitter
def get_sequence_stats(re_split):
    """Compute exon/intron length and count statistics for one sequence.

    ``re_split`` is the segment list produced by
    ``re.split(r'([a-z]+)', sequence)`` (the ``splitter`` decorator performs
    that split when a plain string is passed instead). In that layout the
    even-indexed segments are the non-lowercase runs, counted as exons, and
    the odd-indexed segments are the lowercase runs, counted as introns.

    Empty segments are ignored. ``re.split`` emits an empty string at the
    start or end of the list when the sequence begins or ends with a
    lowercase run (and ``['']`` for an empty sequence); those are split
    artifacts and must not be counted as zero-length exons.

    Args:
        re_split: Segments alternating exon, intron, exon, ... from index 0.

    Returns:
        Tuple ``(total_len, exon_len, intron_len, num_exons, num_introns)``.
    """
    exon_len = 0
    num_exons = 0
    intron_len = 0
    num_introns = 0
    for index, segment in enumerate(re_split):
        if not segment:
            # Artifact of the split, not a real exon or intron.
            continue
        if index % 2 == 0:
            exon_len += len(segment)
            num_exons += 1
        else:
            intron_len += len(segment)
            num_introns += 1

    total_len = exon_len + intron_len
    return total_len, exon_len, intron_len, num_exons, num_introns


def display_seq_stats(seq_name, total_len, exon_len, intron_len, num_exons, num_introns):
    """Print one table row with each argument centered in a 10-character column.

    Each cell is written as `` {value:^10} | `` followed by a single space,
    and the row is terminated with a newline.

    Args:
        seq_name: Value for the first column (gene name or header label).
        total_len: Value for the total-size column.
        exon_len: Value for the summed exon length column.
        intron_len: Value for the summed intron length column.
        num_exons: Value for the exon count column.
        num_introns: Value for the intron count column.
    """
    # Name the columns explicitly instead of reading locals(): with locals(),
    # any local introduced before the loop would silently become a column.
    cells = (seq_name, total_len, exon_len, intron_len, num_exons, num_introns)
    for cell in cells:
        print(f' {cell:^10} | ', end = ' ')
    print()
    
def gene_stats_header():
    """Print the column-header row of the gene statistics table.

    The labels are passed to ``display_seq_stats`` in its parameter order
    (name, total size, exon length, intron length, exon count, intron count).
    """
    # Labels are passed explicitly rather than via locals().values(), whose
    # order depends on the order in which local variables happen to be assigned.
    display_seq_stats(
        'gene',
        'size',
        'exons_len',
        'intr_len',
        'num_exons',
        'num_intrs',
    )

In [75]:
def read_sequences(filename = 'genes.fa', gene_stats = False):
    """Print every gene's exon/intron segments and optionally collect statistics.

    Lines starting with ``>`` are treated as gene titles; every other
    non-blank line is treated as a sequence belonging to the most recent
    title. A sequence line is split on runs of lowercase letters: the
    lowercase runs are printed as introns and the segments between them as
    exons, each numbered from 1 per line. Blank lines are skipped.

    Args:
        filename: Path of the sequence file to read.
        gene_stats: When true, also compute ``get_sequence_stats`` for each
            sequence line and return the results keyed by gene title.

    Returns:
        A dict mapping title -> stats tuple when ``gene_stats`` is true,
        otherwise ``None``.

    Raises:
        ValueError: If a sequence line appears before any ``>`` title line.
    """
    INTRON = 'Intron'
    EXON = 'Exon'
    # Always bind `stats` so the default gene_stats=False call no longer fails
    # on an unbound local at the assignment / return below.
    stats = dict() if gene_stats else None
    title = None
    with open(filename, 'r') as file:
        # Iterate to end of file; a blank line must not end the read early.
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith('>'): # Remake this condition part to a regex
                title = line[1:]
                print(title, '\n', '-'*20)
                continue

            if title is None:
                raise ValueError(
                    f'sequence line found before any ">" title line in {filename!r}'
                )

            re_split = re.split(r'([a-z]+)', line)
            exon_count = 1
            intron_count = 1
            for i, segment in enumerate(re_split):
                if not segment:
                    # re.split yields '' at the ends when the line starts or
                    # ends with an intron; there is nothing to show for it.
                    continue
                if i % 2 == 0:
                    print(f'{EXON:>10} {exon_count}: {segment}')
                    exon_count += 1
                else:
                    print(f'{INTRON:>10} {intron_count}: {segment}')
                    intron_count += 1

            if stats is not None:
                stats[title] = get_sequence_stats(re_split)

    return stats

In [76]:
def main():
    """Print each gene's exon/intron breakdown, then a table of per-gene statistics."""
    # Section banner framing the sequence listing.
    banner = '#' * 20
    print(banner, 'INTRON/EXON SEQUENCES PER GENE', banner)

    # read_sequences prints the segments as it reads and hands back the stats.
    stats_by_gene = read_sequences(gene_stats = True)
    print('\n'*3)

    # Statistics table: header row first, then one row per gene.
    gene_stats_header()
    for gene_name, gene_values in stats_by_gene.items():
        display_seq_stats(gene_name, *gene_values)

In [77]:
# Run the report: per-gene exon/intron segments, then the statistics table.
main()

#################### INTRON/EXON SEQUENCES PER GENE ####################
gene1 
 --------------------
      Exon 1: TAGTGGTCTTTT
    Intron 1: gagtg
      Exon 2: TAGATCTGAAGGGAAAGTATTTCCACCAGTTCGGGGTCACCCAGCAGGGCAGGGTGACTTAAT
gene2 
 --------------------
      Exon 1: CGCGACTCGGCGCTCACAGTTAT
    Intron 1: agcacgttt
      Exon 2: AGACC
    Intron 2: aaaaaa
      Exon 3: CGGAGTTGGAT
    Intron 3: cccgtgtg
      Exon 4: AATCGGAGTCCTT
gene3 
 --------------------
      Exon 1: GTTACTTGTGAGCCTGGTT
    Intron 1: agaggacgggagtggta
      Exon 2: AATATAATTGTTGGC
gene4 
 --------------------
      Exon 1: AACATC
    Intron 1: ttggttaa
      Exon 2: AGGCTTTGATTAAACAATTTAAGCACGTAAATCCGAATTGACCTGATGACAATACGGAACATGCCGGCTCCGGG
gene5 
 --------------------
      Exon 1: ACCACCGGATAGGC
    Intron 1: ttt
      Exon 2: TGCT
    Intron 2: aatgc
      Exon 3: TATTAGGTCCAAAAGGTAGTATCGTAATAATGGCTCAGCCATGTCAATGTGCGGCATTCCAC
gene6 
 --------------------
      Exon 1: TAGATTCGAATCGATCGTGTTTCTCCCTCTGTGGGTTAACGA