In [12]:
import re
from icecream import ic
import numpy as np

In [55]:
def read_alignment(filename: str = 'test1.txt') -> list:
    '''
    DESCR
    -----
    Read in sequence alignment(s). 
    Entire file is line-by-line into a list object and stripped of whitespace.
    List will have representation of some row-wise separator in case of multiple alignments.
    
    ARGS
    -----
    filename: name of file
    
    RETURNS
    --------
    list of alignments separated by any separator (if applicable).
    '''
    with open(filename, 'r') as file:
        return [elem.strip() for elem in file.readlines()]


ic(read_alignment('test1.txt'));
alignments = read_alignment()

def num_alignments(alignments: list) -> int:
    '''
    DESCR
    ------
    Checks to see if how many alignments are in an iterable of alignments by counting
    the number of separators.
    '''
    return sum([1 if not re.match('^[A-Z]+|^-|^[a-z]+', seq) else 0 for seq in alignments]) + 1

ic(num_alignments(alignments));

def columnarize(alignments: list) -> list:
    '''
    This doesnt work for a single alignment; need to rework
    '''
    nested_list = list()
    temp_list = list()

    for i in range(len(alignments)):
        # If element is part of an alignment
        if re.match('^[A-Z]+|^-|^[a-z]+', alignments[i]):
            # add to a temporary alignment
            temp_list.append(alignments[i])
        # If element is a separator
        else:
            try:
                # add full alignment to a list of alignment(s)
                nested_list.append(temp_list)
                # reset temp alignment in case of more alignments
                temp_list = list()
            except:
                break
        
    nested_list.append(temp_list)
    return nested_list

ic(columnarize(alignments))
alignments = columnarize(alignments)


NUCLEOTIDES = {'G', 'C', 'A', 'T', '-'}

ic| read_alignment('test1.txt'): ['AAA', 'ACC', 'ACG', 'ACT', '#', 'AAA-', 'A-CC', 'ACG-', 'A-CT']
ic| num_alignments(alignments): 2
ic| columnarize(alignments): [['AAA', 'ACC', 'ACG', 'ACT'], ['AAA-', 'A-CC', 'ACG-', 'A-CT']]


In [350]:
class Alignment:
    def __init__(self, alignment, score_set = None):
        self.__alignment = alignment
        self.__len = len(alignment)
        self.__len_seq = self.__check_seq_len(alignment)
        self.__score_set = score_set
        self.T = self.__T()
        self.__column_counters = self.__alignment_scores()
        self.profile = self.__get_profile(order = None)
        
    
    ''' Accessor functions '''
    def get_len(self):
        return self.__len
    def get_seq_len(self):
        return self.__len_seq
    def get_score_set(self):
        assert self.__score_set is not None, 'score_set is undefined in class initialization'
        return self.__score_set
    def __get_profile(self, order = None):
        '''order sets the profile character order'''
#         for i in range(len(self.__column_counters)):
#             for elem in sorted(self.__score_set):
#                 print(self.__column_counters[i][elem], end = ' ')
#             print()
        
#         for elem in sorted(self.__score_set):
#             for i in range(len(self.__column_counters)):
#                 print(self.__column_counters[i][elem], end = ' ')
#             print()
#
#         print(\
#               [\
#                [self.__column_counters[i][elem] \
#                 for elem in sorted(self.__score_set)] for i in range(len(self.__column_counters))])
        return [\
                [self.__column_counters[i][elem]\
                 for elem in sorted(self.__score_set)] for i in range(len(self.__column_counters))]
        
    def get_entropy(self):
        pass

    ''' Mutator functions '''
    def set_score_set(self, score_set):
        assert type(score_set) == set, f'set_score_set expected {type(set())}, got {type(score_set)}'
        self.__score_set = score_set
        
    ''' Utility functions '''
    # Transpose
    def __T(self):
        self.T = list()
        temp_str = ''
        for col in range(self.__len_seq):
            for row in range(self.__len):
                temp_str += self.__alignment[row][col]
            self.T.append(temp_str)
            temp_str = ''
        return self.T
    def __alignment_scores(self):
        from collections import Counter
        # Column counts based on column contents
        column_counters = [Counter(elem) for elem in almnt.T]
        # Column counts including 0-values 
        for column in column_counters:
            for elem in list(self.__score_set):
                if not column[elem]:
                    column[elem] = 0
        return column_counters
        
    
    ''' Hidden Utilities '''
    def __str__(self):
        return str(self.__alignment)
    
    def __check_seq_len(self, alignment):
        l = len(alignment[0]) # length of the first element of alignment
        for elem in alignment:
            if len(elem) != l:
                raise Exception('something went wrong')
            else:
                return l

In [363]:
b = ['AAA', 'ACC', 'ACG', 'ACT']
almnt = Alignment(b, NUCLEOTIDES)

In [364]:
a = almnt.profile

In [365]:
a

[[0, 4, 0, 0, 0], [0, 1, 3, 0, 0], [0, 1, 1, 1, 1]]

In [341]:


def test(nested_list, num_alignments):
    temp = ''
    # Loop for each alignment(s)
    for i in range(num_alignments):
        # Loop 
        for i in range(len(nested_list[i])):
            print(nested_list[0][i][0])
            temp += nested_list[0][i][0]
        
test(alignments, num_alignments = num_alignments)

TypeError: 'function' object cannot be interpreted as an integer

In [169]:
{k:0 for k in NUCLEOTIDES}

{'G': 0, 'A': 0, 'C': 0, '-': 0, 'T': 0}

In [97]:
temp

'AAAA'

In [101]:
from collections import Counter
c = Counter('asdf')
c

Counter({'a': 1, 's': 1, 'd': 1, 'f': 1})

In [104]:
Counter('AGC')

Counter({'A': 1, 'G': 1, 'C': 1})

In [None]:
'AAA', 'ACC', 'ACG', 'ACT'

In [105]:
a = Counter('AAA')
print(a)
b = Counter('ACC')
print(b)
c = Counter('ACG')
print(c)
d = Counter('ACT')
print(d)

Counter({'A': 3})
Counter({'C': 2, 'A': 1})
Counter({'A': 1, 'C': 1, 'G': 1})
Counter({'A': 1, 'C': 1, 'T': 1})


In [81]:
np.array(nested_list, dtype = 'object').reshape(8,1)

array([list(['A', 'C', 'C'])], dtype=object)

In [58]:
np.array(nested_list, dtype = 'object').T

array([[list(['A', 'A', 'A']), list(['A', 'A', 'A', '-'])],
       [list(['A', 'C', 'C']), list(['A', '-', 'C', 'C'])],
       [list(['A', 'C', 'G']), list(['A', 'C', 'G', '-'])],
       [list(['A', 'C', 'T']), list(['A', '-', 'C', 'T'])]], dtype=object)

In [18]:
for i in range(len(seqs)):
    print(re.match('^[A-Z]+|^-|^[a-z]', seqs[i]))

<re.Match object; span=(0, 3), match='AAA'>
<re.Match object; span=(0, 3), match='ACC'>
<re.Match object; span=(0, 3), match='ACG'>
<re.Match object; span=(0, 3), match='ACT'>
None
<re.Match object; span=(0, 3), match='AAA'>
<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 3), match='ACG'>
<re.Match object; span=(0, 1), match='A'>


In [32]:
with open('test1.txt', 'r') as file:
    a = np.array([elem.strip() for elem in file.readlines()], dtype = 'object')
    
a

array(['AAA', 'ACC', 'ACG', 'ACT', '#', 'AAA-', 'A-CC', 'ACG-', 'A-CT'],
      dtype=object)

In [37]:
def array_regex(i):
    return np.array([True if re.match('[^#]', i) else False])

ar = np.vectorize(array_regex)

c = ar(a)
c

array([ True,  True,  True,  True, False,  True,  True,  True,  True])

In [35]:
def array_regex(i):
    b = np.array([True if re.match('[^#]', i) else False])
    return np.where(True, b)

ar = np.vectorize(array_regex)

c = ar(a)

ValueError: either both or neither of x and y should be given