In [None]:
import re
from icecream import ic
import numpy as np

In [None]:
def read_alignment(filename: str = 'test1.txt') -> list:
    '''
    DESCR
    -----
    Load a sequence-alignment file into memory, one list entry per line.
    Leading and trailing whitespace (including the newline) is removed from
    every line. Lines that separate multiple alignments are kept in place
    (a blank separator line becomes an empty string).

    ARGS
    -----
    filename: path of the file to read

    RETURNS
    --------
    list of stripped lines, in file order.
    '''
    stripped_lines = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines


# Debug-print the lines read from the sample file.
ic(read_alignment('test1.txt'));
# Load the lines again, this time relying on the default filename ('test1.txt').
alignments = read_alignment()

def num_alignments(alignments: list) -> int:
    '''
    DESCR
    ------
    Count how many alignments an iterable of lines contains.
    Every line that does not start with a letter or '-' is treated as a
    separator; the result is the number of separators plus one.

    ARGS
    -----
    alignments: iterable of line strings

    RETURNS
    --------
    number of separator lines + 1 (so an empty iterable gives 1).
    '''
    separator_count = sum(
        1
        for line in alignments
        if re.match('^[A-Z]+|^-|^[a-z]+', line) is None
    )
    return separator_count + 1

# Debug-print the number of alignments found in the loaded lines.
ic(num_alignments(alignments));

def columnarize(alignments: list) -> list:
    '''
    DESCR
    -----
    Group a flat list of lines into separate alignments.
    A line that starts with a letter or '-' is a sequence and is added to the
    alignment currently being built. Any other line (for example an empty
    string) is a separator: it closes the current alignment and starts a new one.

    NOTE(review): an earlier note here said single alignments were not handled.
    With no separator present the function returns a one-element list holding
    that alignment; confirm this is the shape callers expect.

    ARGS
    -----
    alignments: iterable of line strings

    RETURNS
    --------
    list of alignments, each a list of sequence strings. The list always has
    (number of separators + 1) entries, so leading, trailing or consecutive
    separators produce empty inner lists.
    '''
    # Compile once instead of re-parsing the pattern for every line.
    is_sequence = re.compile('^[A-Z]+|^-|^[a-z]+').match

    nested_list = []
    temp_list = []
    for line in alignments:
        if is_sequence(line):
            temp_list.append(line)
        else:
            # Separator: store the finished alignment and start a fresh one.
            nested_list.append(temp_list)
            temp_list = []

    # The last alignment has no separator after it, so add it explicitly.
    nested_list.append(temp_list)
    return nested_list

# Debug-print the grouped alignments, then rebind `alignments` to the grouped
# form (a list of alignments, each a list of sequence strings).
ic(columnarize(alignments))
alignments = columnarize(alignments)


# Symbol set passed to Alignment as its score_set below: the four bases plus
# '-' (presumably the gap character -- confirm).
NUCLEOTIDES = {'G', 'C', 'A', 'T', '-'}

In [None]:
class Alignment:
    '''
    DESCR
    -----
    Holds one sequence alignment and precomputes, at construction time, its
    column view, per-column symbol counts, profile and total column entropy.

    ARGS
    -----
    alignment: sequence of equal-length strings, one per aligned sequence
    score_set: symbols to count in every column. When None, the distinct
               characters found in `alignment` are used.

    ATTRS
    -----
    alignment: the alignment exactly as passed in
    T:         transposed alignment, one string per column
    profile:   one list of counts per column, ordered by sorted(score_set)
    entropy:   sum over all columns of the base-2 entropy of the profile
               frequencies (0 when no column has more than one counted symbol)

    RAISES
    ------
    ValueError if the sequences do not all have the same length.
    '''
    def __init__(self, alignment, score_set = None):
        # TODO (original author): changing the alignment should create another
        # instance so that the derived calculations are reset.
        self.alignment = alignment                            # Storing the alignment (need to call alignment fn)
        self.__len = len(alignment)                            # Number of sequences
        self.__len_seq = self.__check_seq_len(alignment)       # Sequence length
        if score_set is None:
            # No symbol set given: fall back to the distinct characters that
            # actually occur in the alignment.
            score_set = set().union(*alignment)
        self.__score_set = score_set                           # Symbols counted per column
        self.T = self.__T()                                    # Transposing alignment for columnarization
        self.__column_counters = self.__alignment_scores()     # Counter of symbols for each column
        self.profile = self.__get_profile(order = None)        # Alignment profile
        self.entropy = self.__get_entropy()                    # Total column entropy

    # ---- Accessor functions ----
    def get_len(self):
        '''Return the number of sequences in the alignment.'''
        return self.__len

    def get_seq_len(self):
        '''Return the common length of the sequences.'''
        return self.__len_seq

    def get_score_set(self):
        '''Return the set of symbols counted in each column.'''
        assert self.__score_set is not None, 'score_set is undefined in class initialization'
        return self.__score_set

    def __get_profile(self, order = None):
        '''
        Build the profile: for every column, the count of each symbol.

        order sets the profile character order; when None the symbols are
        listed in sorted(score_set) order.
        '''
        symbols = sorted(self.__score_set) if order is None else list(order)
        return [[counter[symbol] for symbol in symbols]
                for counter in self.__column_counters]

    def __get_entropy(self):
        '''
        Sum the base-2 entropy of every profile column.

        Frequencies are taken relative to the counted symbols of the column.
        Columns with no counted symbols are skipped to avoid dividing by zero.
        '''
        entropy = 0
        for column in self.profile:
            total = sum(column)
            if not total:
                continue
            for count in column:
                # Frequencies of exactly 0 or 1 contribute nothing.
                if count in (0, total):
                    continue
                frequency = count / total
                entropy += frequency * np.log2(frequency)
        return -entropy

    # ---- Mutator functions ----
    def set_score_set(self, score_set):
        '''
        Replace the stored score set.

        NOTE: only the stored set changes; T, profile and entropy are not
        recomputed by this method.
        '''
        assert isinstance(score_set, set), f'set_score_set expected {type(set())}, got {type(score_set)}'
        self.__score_set = score_set

    # ---- Utility functions ----
    def __T(self):
        '''Transpose the alignment into one string per column; also stored on self.T.'''
        self.T = [''.join(column) for column in zip(*self.alignment)]
        return self.T

    def __alignment_scores(self):
        '''Return one Counter per column, with an explicit 0 for every absent score-set symbol.'''
        from collections import Counter
        # Column counts based on column contents
        column_counters = [Counter(column) for column in self.T]
        # Column counts including 0-values
        for counter in column_counters:
            for symbol in self.__score_set:
                counter.setdefault(symbol, 0)
        return column_counters

    # ---- Hidden Utilities ----
    def __str__(self):
        return str(self.alignment)

    def __len__(self):
        return len(self.alignment)

    def __check_seq_len(self, alignment):
        '''
        Return the common sequence length (0 for an empty alignment).

        Every sequence is compared against the first one; a mismatch raises
        ValueError (a subclass of Exception, which was raised previously).
        '''
        if not alignment:
            return 0
        expected = len(alignment[0])
        for index, seq in enumerate(alignment):
            if len(seq) != expected:
                raise ValueError(
                    f'something went wrong: sequence {index} has length '
                    f'{len(seq)}, expected {expected}'
                )
        return expected

In [None]:
# Small hand-made alignment: four sequences of length three.
b = ['AAA', 'ACC', 'ACG', 'ACT']
# Build the alignment object, counting the symbols listed in NUCLEOTIDES.
almnt = Alignment(b, NUCLEOTIDES)

In [None]:
# Inspect the raw input sequences.
b

In [None]:
# Inspect the transposed alignment (one string per column).
almnt.T

In [None]:
# Per-column symbol counts, each column ordered by the sorted score set.
a = almnt.profile
print(a)

In [None]:
# ZERO_ENTROPY = [0,1]

# entropy = 0

# for column in a:
#     ic(column)
#     for elem in column:
#         ic(elem)
#         scaled_elem = elem / sum(column)
#         ic(scaled_elem)
#         if not scaled_elem in ZERO_ENTROPY:
#             frequency = elem / sum(column)
#             ic(frequency)
#             entropy += frequency * np.log2(frequency)
#     print()
# print(-entropy)