In [1]:
import re
from icecream import ic
import numpy as np

In [2]:
def read_alignment(filename: str = 'test1.txt') -> list:
    '''
    DESCR
    -----
    Load an alignment file into memory, one list entry per line.
    Leading and trailing whitespace (including the newline) is removed from every line.
    Separator lines between multiple alignments are kept as entries, so the
    caller can still tell where one alignment ends and the next begins.

    ARGS
    -----
    filename: path of the file to read

    RETURNS
    --------
    list of stripped lines, in file order.
    '''
    stripped_lines = []
    with open(filename, 'r') as handle:
        # Iterating the handle yields the same lines as readlines()
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def columnarize(alignments: list) -> list:
    '''
    DESCR
    -----
    Group the lines of an alignment file into separate alignments.
    A line belongs to an alignment when it starts with a letter or '-'; every
    other line (blank line, digits, punctuation, ...) acts as a separator.
    Works for a single alignment (no separator at all) as well as for several.

    ARGS
    -----
    alignments: list of stripped lines, e.g. the output of read_alignment

    RETURNS
    --------
    list of alignments, each one a list of sequence strings.
    Empty groups caused by leading or consecutive separators are dropped.
    '''
    nested_list = list()
    temp_list = list()

    for line in alignments:
        # re.match anchors at the start of the line, so only the first character decides
        if re.match('[A-Za-z-]', line):
            temp_list.append(line)
        # Separator: close the current alignment, but never store an empty one
        elif temp_list:
            nested_list.append(temp_list)
            temp_list = list()
    # The last alignment is usually not followed by a separator
    if temp_list:
        nested_list.append(temp_list)
    return nested_list

In [3]:
class Alignment:
    '''
    DESCR
    -----
    A multiple sequence alignment together with its column profile and entropy.

    ARGS
    -----
    alignment: list of equal-length sequences, one string per row
    score_set: set of characters counted in every column. When omitted it is
               taken from order, or else from the distinct characters of alignment
    order:     list fixing the character order of the profile; must contain the
               same values as score_set. When omitted, sorted(score_set) is used

    ATTRIBUTES
    -----------
    alignment: the sequences as passed in
    T:         list of column strings (the transposed alignment)
    profile:   one list per column with the count of each character, in order
    entropy:   sum over all columns of -f * log2(f) for every character
               frequency f, rounded to 3 decimals

    NOTE: T, profile and entropy are computed once in __init__. They are not
    refreshed when alignment or the score set is changed afterwards.
    '''
    def __init__(self, alignment, score_set = None, order = None):
        '''Want to add where changing alignment calls another instance to reset calcs'''
        # order is optional, so only a value that was actually given is type-checked
        assert order is None or type(order) == list, f'order argument expected list object; got {type(order)}'
        if score_set is None:
            # Fall back to the characters of order, or to the distinct characters of the alignment
            score_set = set(order) if order else {char for sequence in alignment for char in sequence}
        assert not order or set(order) == score_set, 'order and score_set must contain the same values'
        self.alignment = alignment                             # Storing the alignment (need to call alignment fn)
        self.__len = len(self)                                 # Number of sequences
        self.__len_seq = self.__check_seq_len(alignment)       # Sequence length
        self.__score_set = score_set                           # Score set
        self.__order = \
        list(order) if order else sorted(score_set)            # Resolved character order of the profile
        self.T = self.__T()                                    # Transposing alignment for columnarization
        self.__column_counters = self.__alignment_scores()     # Counter of each character for each column
        self.profile = \
        self.__get_profile(order = self.__order)               # Alignment profile
        self.entropy = self.__get_entropy()                    # Entropy

    # Accessor functions
    def get_len(self):
        '''Number of sequences (rows) in the alignment.'''
        return self.__len
    def get_seq_len(self):
        '''Length shared by every sequence, i.e. the number of columns.'''
        return self.__len_seq
    def get_score_set(self):
        '''Set of characters counted in the profile.'''
        assert self.__score_set is not None, 'score_set is undefined in class initialization'
        return self.__score_set

    # Mutator functions
    def set_score_set(self, score_set):
        '''Replace the stored score set; profile and entropy are NOT recomputed.'''
        assert type(score_set) == set, f'set_score_set expected {type(set())}, got {type(score_set)}'
        self.__score_set = score_set

    # Utility functions
    def display_profile(self):
        '''Print one line per character: the character, then its count in every column.'''
        # zip(*profile) turns the per-column lists into per-character rows
        for label, counts in zip(self.__order, zip(*self.profile)):
            print(f'{label}', end = ' ')
            for count in counts:
                print(count, end = ' ')
            print()

    # Hidden utility functions
    def __get_profile(self, order = None):
        '''
        Build the profile: for every column, the count of each character.
        order sets the profile character order; sorted score set when not given.
        '''
        symbols = order if order else sorted(self.__score_set)
        return [[counter[symbol] for symbol in symbols] for counter in self.__column_counters]

    def __get_entropy(self):
        '''Sum -f * log2(f) over every character frequency f of every profile column.'''
        entropy = 0
        for column in self.profile:
            total = sum(column)
            # A column holding none of the counted characters has no frequencies to score
            if not total:
                continue
            for count in column:
                frequency = count / total
                # Frequencies of 0 and 1 contribute nothing (and log2(0) is undefined)
                if 0 < frequency < 1:
                    entropy += frequency * np.log2(frequency)
        return round(-entropy, 3)

    # Transpose
    def __T(self):
        '''Return the alignment columns as strings (rows are validated to be equal length).'''
        return [''.join(column) for column in zip(*self.alignment)]

    def __alignment_scores(self):
        '''Count the characters of every column, with explicit 0 entries for absent ones.'''
        from collections import Counter
        # Column counts based on column contents
        column_counters = [Counter(column) for column in self.T]
        # Column counts including 0-values
        for counter in column_counters:
            for symbol in self.__score_set:
                counter.setdefault(symbol, 0)
        return column_counters

    def __check_seq_len(self, alignment):
        '''
        Return the common sequence length.
        Raises IndexError for an alignment without sequences and Exception when
        the sequences differ in length.
        '''
        if len(alignment) == 0:
            raise IndexError('Alignment contains no sequences')
        expected = len(alignment[0]) # length of the first element of alignment
        for sequence in alignment:
            if len(sequence) != expected:
                raise Exception('Alignment has sequences of different lengths')
        # Only report the length once every sequence has been compared
        return expected

    def __lt__(self, alignment_object: object) -> bool:
        '''Alignments are ordered by entropy.'''
        return self.entropy < alignment_object.entropy

    def __str__(self):
        '''
        print() on alignment object returns the alignment
        '''
        return str(self.alignment)

    def __repr__(self):
        '''Same text as __str__: the stored alignment.'''
        return str(self.alignment)

    def __len__(self):
        '''
        len() on alignment object returns the length of the list object storing the alignment
        '''
        return len(self.alignment)

In [5]:
def display_alignment_comparison():
    '''Placeholder: not implemented yet. Takes no arguments and returns None.'''
    return None

def main():
    '''
    DESCR
    -----
    Demo driver: read test1.txt, test2.txt and test3.txt, split each file into
    its alignments, build an Alignment object for every one of them and print
    the profile of the first alignment of the first file.
    '''
    symbol_order = ['A', 'T', 'G', 'C', '-']
    symbols = {'G', 'C', 'A', 'T', '-'}

    # Stage 1: read every file before any further processing
    lines_per_file = []
    for number in (1, 2, 3):
        lines_per_file.append(read_alignment(f'test{number}.txt'))

    # Stage 2: split the lines of each file into its alignments
    groups_per_file = list(map(columnarize, lines_per_file))

    # Stage 3: wrap every alignment of every file in an Alignment object
    objects_per_file = []
    for groups in groups_per_file:
        objects_per_file.append([Alignment(rows, symbols, symbol_order) for rows in groups])

    first_alignment = objects_per_file[0][0]
    first_alignment.display_profile()

# Run the demo; it prints the profile of the first alignment read from test1.txt.
main()

A 4 1 1 
T 0 0 1 
G 0 0 1 
C 0 3 1 
- 0 0 0 


In [None]:
import os

In [None]:
# Scratch check: shape of a literal 5 x 3 array whose rows match the profile printed by main().
np.array([[4, 1, 1],[0, 0, 1],[0, 0, 1],[0, 3, 1],[0, 0, 0]]).shape

In [None]:
# Scratch check: len() of a 2-D array is its number of rows (5 for this literal).
len(np.array([[4, 1, 1],[0, 0, 1],[0, 0, 1],[0, 3, 1],[0, 0, 0]]))