In [1]:
# The dishonest casino model
# (I do not check these, so no mistakes!)
fair2loaded_prob = 0.05
loaded2fair_prob = 0.10
fair2fair_prob = 0.95
loaded2loaded_prob = 0.90

loaded_probs = [1.0/10.0, 1.0/10.0, 1.0/10.0, 1.0/10.0, 1.0/10.0, 1.0/2.0]
fair_probs = [1.0/6.0, 1.0/6.0, 1.0/6.0, 1.0/6.0, 1.0/6.0, 1.0/6.0]

fair_start_prob = 1.0
loaded_start_prob = 0.0

In [2]:
# 1. Implement an HMM for a dishonest casino. 
# Always starts fair. Shifts with probability 0.05. Stays with probability 0.95
# 3/6 for 6 1/10 for others for loaded, 1/6 for all for fair
# Generate a random sequence of 300 characters

import random

# Determine casino model borders
bord = 0
loaded_borders = [bord]
for prob in loaded_probs:
    bord += prob
    loaded_borders.append(bord)

bord = 0
fair_borders = [bord]
for prob in fair_probs:
    bord += prob
    fair_borders.append(bord)

def casino_hmm(length):
    sequence = ""
    states = ""
    
    # Decided starting state
    rand = random.random()
    if rand <= fair_start_prob:
        current_state = 'F'
        current_probs = fair_borders
        stay_prob = fair2fair_prob
    else:
        current_state = 'L'
        current_probs = loaded_borders
        stay_prob = loaded2loaded_prob
        
    for roll in range(0, length):
        # roll
        roll = -1
        rand = random.random()
        
        for i in range(len(current_probs)-1):
            if rand >= current_probs[i] and rand < current_probs[i+1]:
                roll = i+1
        
        sequence += str(roll)
        states += current_state
        
        # Switch or stay
        rand = random.random()
        if rand >= stay_prob:
            # Switch to loaded
            if current_state == 'F':
                current_state = 'L'
                current_probs = loaded_borders
                stay_prob = loaded2loaded_prob
            # Switch to fair
            else:
                current_state = 'F'
                current_probs = fair_borders
                stay_prob = fair2fair_prob
    
    return sequence, states

length = 300

sequence, states = casino_hmm(length)

for i in range(0, len(sequence), 100):
    print(states[i:i+100])
    print(sequence[i:i+100])
    print()

FFFFFFFFFFFFFFFFLLLFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFLLLLLLLLLLLLLLFFFFFFFFFFFFFFFFFFFFFFFFLLLLLLLLLLLL
4532164644515446356646626644615622142562461342444656566556163623551461521222531326345316522362142566

LFFFFFFFFFFFFFLLLLLLLLLLLLLFFFFFFFFFFFFFFFFFFFFFFFFFFFFFLLLLLLLLLLLLLLLFFFFFFFFFFFFFFFFFFFFLLLLLLLFF
3426146252361366641662163355523135265446564246653636455565644266466636351244133135441662311644566215

FFFFFFFFFFFFFFFFFFLLLLLLLLFFFFFFFFFFFFFFFFFFFFFFLFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
2633443415614352116656662644412642566222132224316426126241421662152316666636164152244143565422163143



2\. Downloaded as file1.txt and file2.txt

In [3]:
# Read in a sequence
def read_sequence(filename):
    with open(filename, 'r') as f:
        sequence = f.read()
        sequence = sequence.rstrip()
        
    return sequence

In [4]:
# 3. Calculate the probabilities of the two given sequences appearing assuming the dishonest casino model

# Calculate the probability using the forward algorithm
def forward_probability(sequence):
    
    # Initialize with the starting probabilities
    f_previous = fair_start_prob
    l_previous = loaded_start_prob
    
    # For every roll
    for roll in sequence:
        roll = int(roll)-1
        
        # Calculate the fair probabilities
        f_stay = fair2fair_prob * fair_probs[roll] * f_previous
        f_switch = loaded2fair_prob * fair_probs[roll] * l_previous
        f_total = f_stay + f_switch
        
        # Calculate the loaded probabilities
        l_stay = loaded2loaded_prob * loaded_probs[roll] * l_previous
        l_switch = fair2loaded_prob * loaded_probs[roll] * f_previous
        l_total = l_stay + l_switch
        
        # Setup for next roll
        f_previous = f_total
        l_previous = l_total
    
    # Sum both ending probabilities
    probability = f_previous + l_previous
    return probability


# Read in the sequences from the two files
sequence1 = read_sequence("file1.txt")
sequence2 = read_sequence("file2.txt")

# Calculate the probabilities of the two sequences
probability1 = forward_probability(sequence1)
probability2 = forward_probability(sequence2)

print("The probability of sequence 1 is: {}".format(probability1))
print("The probability of sequence 2 is: {}".format(probability2))

The probability of sequence 1 is: 2.719953435694084e-232
The probability of sequence 2 is: 1.935330320229189e-232


In [5]:
def write_states(filename, states):
    with open(filename, 'w') as f:
        f.write(states)
        
    return

In [6]:
# 4. Determine the most likely state sequences for each of the two given sequences, assuming the dishonest casino model
# The results are saved in viterbi.1.txt and viterbi.2.txt

# Determine the state sequence using the viterbi algorithm
def state_sequence(sequence):
    
    fair_trace = []
    loaded_trace = []
    
    # Initialize with the starting probabilities
    f_previous = fair_start_prob
    l_previous = loaded_start_prob
    
    # For every roll
    for roll in sequence:
        roll = int(roll)-1
        
        # Calculate the fair probabilities
        f_stay = fair2fair_prob * fair_probs[roll] * f_previous
        f_switch = loaded2fair_prob * fair_probs[roll] * l_previous
        
        if f_stay > f_switch:
            fair_trace.append('F')
            f_max = f_stay
        else:
            fair_trace.append('L')
            f_max = f_switch
        
        # Calculate the loaded probabilities
        l_stay = loaded2loaded_prob * loaded_probs[roll] * l_previous
        l_switch = fair2loaded_prob * loaded_probs[roll] * f_previous
        
        if l_stay > l_switch:
            loaded_trace.append('L')
            l_max = l_stay
        else:
            loaded_trace.append('F')
            l_max = l_switch
        
        # Setup for next roll
        f_previous = f_max
        l_previous = l_max
    
    # Find last state
    if f_previous > l_previous:
        current_trace = 'F'
        probability = f_previous
    else:
        current_trace = 'L'
        probability = l_previous
        
    # Trace back states
    states = ""
    for i in range(len(sequence)-1, -1, -1):
        if current_trace == 'F':
            current_trace = fair_trace[i]
        else:
            current_trace = loaded_trace[i]
            
        states += current_trace
            
    # Reverse traceback 
    states = states[::-1]
    
    return probability, states

# Read in the sequences from the two files
sequence1 = read_sequence("file1.txt")
sequence2 = read_sequence("file2.txt")

probability1, states1 = state_sequence(sequence1)
probability2, states2 = state_sequence(sequence2)

print(probability1)
for i in range(0, len(sequence1), 100):
    print(states1[i:i+100])
    print(sequence1[i:i+100])
    print()

print(probability2)
for i in range(0, len(sequence2), 100):
    print(states2[i:i+100])
    print(sequence2[i:i+100])
    print()
    
write_states("viterbi.1.txt", states1)
write_states("viterbi.2.txt", states2)

9.772693754559207e-239
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
2465233416421225115645161622365516442441262566515625212115141211136664346115156624116334211551136452

FFFFFFFFFFFFFFFFFFFFFFFFFLLLLLLLLLLLLLLLLLLLLLLLLLFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
2511356441264615454425546563616666656565466625646451424332626446125161316243213261561522651526534355

FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
4562431334525612122614311524612666146135632512356461511621345254355126435163526542665626664435462644

1.1842929059563827e-238
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
2242446626264656356545414523316466366412452313664212622123542456443113162513551312322453344233133256

FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
6332555222333336542552366554353344516515

5\. The ames non-virulent strain is saved as ames.fasta. The ames ancestor virulent strain is saved as ames_ancestor.fasta

6\. Completed

7\. The ames results are in ames.predict. The ames_ancestor results are in ames_ancestor.predict