In [66]:
import pandas as pd

# Read the space-delimited n-gram file. Column names are supplied explicitly,
# so pandas treats the first line of the file as data rather than a header.
df = pd.read_csv(
    'gene.ngrams',
    sep=' ',
    names=['Count', 'NGram', 'One', 'Two', 'Three'],
)

# Bigrams

In [12]:
from collections import defaultdict

# bigram_count[first_tag][second_tag] -> value of the Count column for that
# 2-GRAM row. Pairs that were never stored read as 0 (int default).
bigram_count = defaultdict(lambda: defaultdict(int))

is_bigram = df['NGram'] == '2-GRAM'
for row in df[is_bigram].values.tolist():
    # Row layout follows the column order: Count, NGram, One, Two, Three.
    occurrences, first_tag, second_tag = row[0], row[2], row[3]
    bigram_count[first_tag][second_tag] = occurrences

In [14]:
# Spot-check one entry of the table: the count stored for the ('*', '*') bigram.
bigram_count['*']['*']

13796

# Trigrams

In [17]:
from collections import defaultdict

# trigram_count[first][second][third] -> value of the Count column for that
# 3-GRAM row. Triples that were never stored read as 0 (int default).
trigram_count = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

is_trigram = df['NGram'] == '3-GRAM'
for row in df[is_trigram].values.tolist():
    # Row layout follows the column order: Count, NGram, One, Two, Three.
    occurrences = row[0]
    first_tag, second_tag, third_tag = row[2:5]
    trigram_count[first_tag][second_tag][third_tag] = occurrences

# Compute $q_\text{MLE}$

In [26]:
import itertools
from collections import defaultdict

# q[next_tag][previous][current] is the maximum-likelihood estimate
#   count(previous, current, next_tag) / count(previous, current)
# i.e. the relative frequency of next_tag after the tag pair (previous, current).
# Entries that are never assigned read as 0 (int default).
q = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

# The loop variable is called next_tag rather than `next`: at notebook scope a
# variable named `next` would shadow the builtin next() for every later cell.
for next_tag, current, previous in itertools.product(['*', 'STOP', 'O', 'I-GENE'], repeat=3):
    context_count = bigram_count[previous][current]

    # Ignore situations where the premise is impossible: a tag pair with a
    # zero count has no defined conditional distribution (and would divide by 0).
    if not context_count:
        continue

    # float() keeps this a true division under Python 2 integer semantics.
    q[next_tag][previous][current] = trigram_count[previous][current][next_tag] / float(context_count)

# Viterbi Algorithm

In [22]:
import itertools
from collections import defaultdict

# Dynamic-programming table filled in by viterbi(): pi[k][u][v] is the best
# score of any tag sequence for the first k words that ends in the tags (u, v).
# It lives at module level so the table from the most recent call can be
# inspected from other cells.
pi = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))


def viterbi(xs, q, e, verbose=True):
    """Tag a sentence with the trigram Viterbi algorithm.

    Parameters
    ----------
    xs : str
        The sentence; it is split on whitespace into words x_1 .. x_n.
    q : nested mapping
        Transition scores, indexed as q[v][w][u]: the score of tag v following
        the tag pair (w, u). q['STOP'][u][v] is used for the end of sentence.
    e : nested mapping
        Per-word, per-tag scores, indexed as e[x][v] for word x and tag v.
    verbose : bool, optional
        When true (the default, matching the previous behaviour) a trace line
        is printed for every table cell that is computed.

    Returns
    -------
    list
        The highest-scoring tag for each word, in order ([] for an empty
        sentence). Ties are resolved in favour of the first candidate tried.

    Side effects
    ------------
    The module-level table ``pi`` is cleared and refilled on every call.

    Raises
    ------
    Whatever indexing ``q`` or ``e`` raises (e.g. KeyError for a word missing
    from a plain dict, TypeError if a non-mapping is passed).
    """
    words = xs.split()
    n = len(words)

    # Start from a clean table so entries from an earlier (possibly longer)
    # sentence cannot leak into this run.
    pi.clear()
    pi[0]['*']['*'] = 1

    if n == 0:
        return []

    # Allowed tags per position:
    #   S(-1) = S(0) = ['*']
    #   S(1:n) (inclusive) = ('O', 'I-GENE')
    # The STOP transition is handled explicitly after the main loop.
    start_tags = ['*']
    word_tags = ('O', 'I-GENE')

    def tags_at(k):
        return word_tags if k >= 1 else start_tags

    # bp[(k, u, v)] is the tag w at position k-2 that achieved pi[k][u][v].
    bp = {}

    for k, x in enumerate(words, start=1):
        for u, v in itertools.product(tags_at(k - 1), tags_at(k)):
            if verbose:
                print('Calculating pi[{}][{}][{}] = max w in {}...'.format(k, u, v, tags_at(k - 2)))

            emission = e[x][v]
            candidates = [(pi[k - 1][w][u] * q[v][w][u] * emission, w) for w in tags_at(k - 2)]
            # max() with a key returns the first maximal candidate.
            pi[k][u][v], bp[(k, u, v)] = max(candidates, key=lambda c: c[0])

    # Termination: choose the final tag pair, including the transition to STOP.
    endings = [
        (pi[n][u][v] * q['STOP'][u][v], u, v)
        for u, v in itertools.product(tags_at(n - 1), tags_at(n))
    ]
    _score, last_u, last_v = max(endings, key=lambda c: c[0])

    # Follow the backpointers from the end of the sentence to the start.
    # `best` is 1-based; slot 0 only ever receives the '*' start symbol.
    best = [None] * (n + 1)
    best[n - 1], best[n] = last_u, last_v
    for k in range(n - 2, 0, -1):
        best[k] = bp[(k + 2, best[k + 1], best[k + 2])]

    return best[1:]

In [23]:
# NOTE(review): 'foo' and 'bar' are placeholders for the q and e arguments.
# viterbi subscripts both with string keys (q[v][w][u] and e[x][v]), which
# plain strings do not support, so this call stops with the TypeError shown
# in the output. TODO: pass the q table computed above and a word-by-tag
# table for e (presumably emission probabilities; none is built in the cells
# visible here).
viterbi('The dog barks', 'foo', 'bar')

Calculating pi[1][*][O] = max w in ['*']...


TypeError: string indices must be integers, not str