In [1]:
from collections import defaultdict

In [16]:
def read_input(lines):
    """Parse the problem input into a threshold, an alphabet and an alignment.

    Expected layout of ``lines`` (one string per input line):
        lines[0]   column-removal threshold, parsed as a float
        lines[1]   separator line (ignored)
        lines[2]   alphabet symbols separated by whitespace (tabs or spaces)
        lines[3]   separator line (ignored)
        lines[4:]  alignment rows, one per line

    Returns:
        (threshold, alphabet, alignment) where ``alphabet`` is a set of
        symbols and ``alignment`` is a list of row strings.
    """
    threshold = float(lines[0])
    # split() without an argument splits on any run of whitespace, so both
    # tab-separated files and the space-separated sample parse correctly
    # (split("\t") turned "A B C D E" into a single bogus symbol).
    alphabet = set(lines[2].split())
    # Strip line endings and skip blank lines so a trailing empty line in the
    # file does not become an empty alignment row.
    alignment = [row.strip() for row in lines[4:] if row.strip()]
    return threshold, alphabet, alignment
      

In [3]:
# Small worked example in the same line layout that read_input() indexes:
# threshold, separator, alphabet, separator, then one alignment row per line.
sample_input = [
"0.289",        # column-removal threshold
"--------",
"A B C D E",    # alphabet symbols (space-separated here)
"--------",
# alignment rows; "-" marks a gap
"EBA",
"E-D",
"EB-",
"EED",
"EBD",
"EBE",
"E-D",
"E-D"]

In [4]:
# Parse the sample input and show the extracted alphabet and alignment rows.
threshold, alphabet, alignment = read_input(sample_input)
print(alphabet)
print(alignment)

{'C', 'A', 'E', 'D', 'B'}
['EBA', 'E-D', 'EB-', 'EED', 'EBD', 'EBE', 'E-D', 'E-D']


In [5]:
def get_seed_alignment(alignment, threshold):
    """Return the set of column indices to drop from the alignment.

    A column is dropped when the fraction of rows that show a gap ("-") in
    that column is greater than or equal to ``threshold``. The number of
    columns is taken from the first row.
    """
    width = len(alignment[0])
    gap_totals = [0] * width
    for sequence in alignment:
        for position, symbol in enumerate(sequence):
            # bool adds as 0/1, so this tallies gaps per column
            gap_totals[position] += symbol == "-"

    n_rows = len(alignment)
    return {
        position
        for position, gaps in enumerate(gap_totals)
        if gaps / n_rows >= threshold
    }

In [6]:
# Indices of the alignment columns whose gap fraction reaches the threshold.
columns2remove = get_seed_alignment(alignment, threshold)
print(columns2remove)

{1}


In [7]:
def init_transition_emission(n, alphabet):
    """Build zero-filled transition and emission tables for ``n`` columns.

    The state order is S, I0, then M<k>, D<k>, I<k> for k = 1..n, then E.
    ``transition`` maps every state to a dict over all states; ``emission``
    maps every state to a dict over the sorted alphabet symbols. All entries
    start at the integer 0.

    Returns:
        (transition, emission)
    """
    state_names = ["S", "I0"]
    for k in range(1, n + 1):
        state_names.extend((f"M{k}", f"D{k}", f"I{k}"))
    state_names.append("E")

    symbols = sorted(alphabet)
    # fromkeys builds a fresh inner dict per state, so rows are independent
    transition = {name: dict.fromkeys(state_names, 0) for name in state_names}
    emission = {name: dict.fromkeys(symbols, 0) for name in state_names}
    return transition, emission

In [8]:
# n = number of alignment columns that are kept (not flagged for removal);
# it sets how many M/D/I state triples init_transition_emission creates.
n = len(alignment[0]) - len(columns2remove)
transition, emission = init_transition_emission(n, alphabet)
print(transition)
print(emission)

{'S': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'I0': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'M1': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'D1': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'I1': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'M2': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'D2': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'I2': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}, 'E': {'S': 0, 'I0': 0, 'M1': 0, 'D1': 0, 'I1': 0, 'M2': 0, 'D2': 0, 'I2': 0, 'E': 0}}
{'S': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'E': 0}, 'I0': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'E': 0}, 'M1': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'E': 0}, 'D1': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'E': 0}, 'I1': {'A': 0, 'B': 0, 'C'

In [9]:
def compute_transition(transition, emission, alignment, columns2remove, alphabet):
    """Fill ``transition`` and ``emission`` in place from the alignment.

    Every row starts in state "S". Columns are scanned left to right:

    * kept column (index not in ``columns2remove``): a symbol from
      ``alphabet`` moves the row to M<k>, anything else moves it to D<k>,
      where k is the 1-based number of the kept column;
    * removed column: a symbol from ``alphabet`` moves the row to I<k>, where
      k is the number of kept columns seen so far; anything else is skipped.

    Each move adds one to the transition count, and any symbol other than "-"
    adds one to the emission count of the new state. After the last column
    every row moves to "E". Finally each row of both tables is divided by its
    total; rows whose total is 0 are left untouched.

    The tables are expected to be zero-initialised counts. Nothing is returned.
    """

    def _normalise_rows(table):
        # Turn counts into per-row fractions, skipping all-zero rows.
        for row in table.values():
            row_sum = sum(row.values())
            if row_sum != 0:
                for key in row:
                    row[key] /= row_sum

    last_state = ["S" for _ in alignment]
    kept_seen = 0  # number of kept columns fully processed so far
    for position in range(len(alignment[0])):
        is_kept = position not in columns2remove
        for row_index, sequence in enumerate(alignment):
            symbol = sequence[position]
            is_letter = symbol in alphabet
            if is_kept:
                state = f"M{kept_seen + 1}" if is_letter else f"D{kept_seen + 1}"
            elif is_letter:
                state = f"I{kept_seen}"
            else:
                # gap inside a removed column: the row does not change state
                continue
            transition[last_state[row_index]][state] += 1
            last_state[row_index] = state
            if symbol != "-":
                emission[state][symbol] += 1
        if is_kept:
            kept_seen += 1

    for state in last_state:
        transition[state]["E"] += 1

    _normalise_rows(transition)
    _normalise_rows(emission)

In [10]:
# Mutates `transition` and `emission` in place (counts, then row-normalised).
# Re-running this cell without re-creating the tables first would add new
# counts on top of the already-normalised values.
compute_transition(transition, emission, alignment, columns2remove, alphabet)

In [11]:
def print_output(transition, emissions, alphabet):
    """Print the transition table, a separator line, then the emission table.

    Each table is a header line (a leading space followed by the column
    labels) and one line per state: the state name followed by its values in
    stored order, all separated by single spaces. Emission columns are
    labelled with the sorted alphabet.
    """

    def _table_lines(column_labels, table):
        # Yield the header and then one formatted line per state.
        yield " " + " ".join(column_labels)
        for label, cells in table.items():
            yield label + " " + " ".join(map(str, cells.values()))

    for line in _table_lines(transition.keys(), transition):
        print(line)
    print("--------")
    for line in _table_lines(sorted(alphabet), emissions):
        print(line)

In [12]:
# Show the normalised transition and emission tables for the sample input.
print_output(transition, emission, alphabet)

 S I0 M1 D1 I1 M2 D2 I2 E
S 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
I0 0 0 0 0 0 0 0 0 0
M1 0.0 0.0 0.0 0.0 0.625 0.375 0.0 0.0 0.0
D1 0 0 0 0 0 0 0 0 0
I1 0.0 0.0 0.0 0.0 0.0 0.8 0.2 0.0 0.0
M2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
D2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
I2 0 0 0 0 0 0 0 0 0
E 0 0 0 0 0 0 0 0 0
--------
 A B C D E
S 0 0 0 0 0
I0 0 0 0 0 0
M1 0.0 0.0 0.0 0.0 1.0
D1 0 0 0 0 0
I1 0.0 0.8 0.0 0.0 0.2
M2 0.14285714285714285 0.0 0.0 0.7142857142857143 0.14285714285714285
D2 0 0 0 0 0
I2 0 0 0 0 0
E 0 0 0 0 0


In [17]:
# Run the whole pipeline on the downloaded dataset file.
with open("../data/dataset_26258_15.txt", "r") as fin:
    lines = [raw_line.strip() for raw_line in fin]

# Everything below works on the in-memory lines only.
threshold, alphabet, alignment = read_input(lines)
print(alphabet)
print(alignment)

columns2remove = get_seed_alignment(alignment, threshold)
n = len(alignment[0]) - len(columns2remove)

transition, emission = init_transition_emission(n, alphabet)
compute_transition(transition, emission, alignment, columns2remove, alphabet)
print_output(transition, emission, alphabet)

{'C', 'A', 'E', 'D', 'B'}
['-CBA-BE-E', 'BE-ABBEBB', 'BABABBEBB', 'BCB-BBE-B', 'ABBABBEBB', 'BC-ABBBBB', 'BCB-BBEBB', 'BDBAB-E-B', 'BB-ABBEDC']
 S I0 M1 D1 I1 M2 D2 I2 M3 D3 I3 M4 D4 I4 M5 D5 I5 M6 D6 I6 M7 D7 I7 E
S 0.0 0.0 0.8888888888888888 0.1111111111111111 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
M1 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
D1 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
M2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.6666666666666666 0.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
D2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.6666666666666666 0.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
M3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0