# === Part I ===

In [2]:
!python count_freqs.py gene.train > gene.counts

# Unigram Tag Counts

In [4]:
import pandas as pd

df = pd.read_csv('gene.ngrams', sep=' ', names=['Count', 'NGram', 'One', 'Two', 'Three'])

unigram_count = {}

for tag, count in df[df['NGram'] == '1-GRAM'][['One', 'Count']].values:
    unigram_count[tag] = count

# Compile Token Counts

In [5]:
with open('gene.counts', 'r') as f:
    lines = f.readlines()

In [6]:
from collections import defaultdict

c = defaultdict(lambda: defaultdict(int))

for line in lines:
    count, _, tag, token = line.split()
    c[token][tag] = int(count)

# Find Infrequent Words

In [7]:
infrequent_words = set()

for token in c:
    if c[token]['O'] + c[token]['I-GENE'] < 5:
        infrequent_words.add(token)

In [28]:
import random

random.sample(infrequent_words, 1)[0]

'POI'

# Write Infrequent Words Back To Disk

In [34]:
with open('gene.train', 'r') as f:
    lines = f.readlines()
    
with open('rare.train', 'w') as f:
    for line in lines:
        # Empty line?
        if not line.strip():
            f.write(line)
            continue
        
        token, tag = line.split()
        
        if token in infrequent_words:
            f.write(' '.join(['_RARE_', tag]) + '\n')
        else:
            f.write(line)

# Recompute Word Counts

In [35]:
!python count_freqs.py rare.train > gene.counts

# Read All Words Back in One More Time

In [58]:
with open('gene.counts', 'r') as f:
    lines = f.readlines()
    
for line in lines:
    count, _, tag, token = line.split()
    c[token][tag] = int(count)

# Compute Emission Probabilities

In [59]:
e = defaultdict(lambda: defaultdict(int))

for token in c:
    for tag in c[token]:
        e[token][tag] = c[token][tag] / float(unigram_count[tag])

# === Part II ===

In [54]:
import pandas as pd

df = pd.read_csv('gene.ngrams', sep=' ', names=['Count', 'NGram', 'One', 'Two', 'Three'])

# Bigrams

In [55]:
from collections import defaultdict

bigram_count = defaultdict(lambda: defaultdict(int))

for count, BIGRAM, tag_one, tag_two, _ in df[df['NGram'] == '2-GRAM'].values.tolist():
    bigram_count[tag_one][tag_two] = count

# Trigrams

In [56]:
from collections import defaultdict

trigram_count = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for count, TRIGRAM, tag_one, tag_two, tag_three in df[df['NGram'] == '3-GRAM'].values.tolist():
    trigram_count[tag_one][tag_two][tag_three] = count

# Compute $q_\text{MLE}$

In [57]:
import itertools
from collections import defaultdict

q = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for next, current, previous in itertools.product(['*', 'STOP', 'O', 'I-GENE'], repeat=3):
    # Ignore situations where the premise is impossible
    if not bigram_count[previous][current]:
        continue
        
    q[next][previous][current] = trigram_count[previous][current][next] / float(bigram_count[previous][current])

# Viterbi Algorithm

In [88]:
import itertools
from collections import defaultdict

pi = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))


def viterbi(xs, q, e):
    # Initialize pi
    pi[0]['*']['*'], n = 1, len(xs)
    
    # Initialize S such that:
    #   S(-1) = S(0) = ['*']
    #   S(1:n) (inclusive) = ['O', 'I-GENE']
    #   S(n+1) = ['STOP']
    S = [['*']] + [('O', 'I-GENE') for _ in range(n) ] + [['STOP']] + [['*']]
    
    for k, x in enumerate(xs.split(), start=1):
        for u, v in itertools.product(S[k-1], S[k]):
            print 'Calculating pi[{}][{}][{}] = max w in {}...'.format(k, u, v, S[k-2])
            
            pi[k][u][v] = max([ pi[k-1][w][u]*q[v][w][u]*e[x][v] for w in S[k-2] ])
        
    return max([ pi[n][u][v]*q['STOP'][u][v] for u, v in itertools.product(S[n-1], S[n]) ])

In [89]:
viterbi('the', q, e)

Calculating pi[1][*][O] = max w in ['*']...
Calculating pi[1][*][I-GENE] = max w in ['*']...


0.0

# Debugging Viterbi

In [46]:
e['_RARE_']

defaultdict(<type 'int'>, {'I-GENE': 0.2126022594468251, 'O': 0.08339224867295612})

In [44]:
import itertools
from collections import defaultdict

highest_prob, backpointer = defaultdict(lambda: defaultdict(lambda: defaultdict(int))), defaultdict(lambda: defaultdict(lambda: defaultdict(int)))


def viterbi(xs, transition, emission):
    # Initialize highest_prob
    highest_prob[0]['*']['*'], n, = 1, len(xs.split())
    y = [''] * (n+1)
    
    # Initialize possible_tags such that:
    #   possible_tags(-1) = possible_tags(0) = ['*']
    #   possible_tags(1:n) (inclusive) = ['O', 'I-GENE']
    #   possible_tags(n+1) = ['STOP']
    possible_tags = [['*']] + [('O', 'I-GENE') for _ in range(n) ] + [['STOP']] + [['*']]
    
    print 'For k = 1...{}'.format(n)
    print
    
    for k, x in enumerate(xs.split(), start=1):
        print '** Time to compute highest_prob(k={}, u, v) for all u in possible_tags({})={} and v in possible_tags({})={}...'.format(k, k-1, possible_tags[k-1], k, possible_tags[k])
        print
        
        for u, v in itertools.product(possible_tags[k-1], possible_tags[k]):
            print '    ==========================================================================================================='
            print '    Calculating highest_prob(k={}, u={}, v={}) = max over w in possible_tags({})={} of highest_prob({}, w, u={}) * transition(v={} | w, u={}) * emission(x={} | v={})...'.format(k,u,v,k-2,possible_tags[k-2],k-1,u,v,u,x,v)
            print
            
            currents = []
            for w in possible_tags[k-2]:
                print '    Trying w={}...'.format(w)
                print '    Computing highest_prob({}, w={}, u={}) * transition(v={} | w={}, u={}) * emission(x={} | v={}) where:'.format(k-1,w,u,v,w,u,x,v)
                print
                print '        highest_prob({}, w={}, u={}) = {}'.format(k-1,w,u,highest_prob[k-1][w][u])
                print '        transition(v={} | w={}, u={}) = {}'.format(v,w,u,transition[v][w][u])
                print '        emission(x={} | v={}) = {}'.format(x,v,emission[x if x not in infrequent_words and emission[x] else '_RARE_'][v])
                print
                
                current = highest_prob[k-1][w][u] * transition[v][w][u] * emission[x if x not in infrequent_words and emission[x] else '_RARE_'][v]
                
                print '    Result = {}'.format(current)
                currents.append((current, w))
                print
               
            print
            print '    Highest probability tagging is: {}'.format(max(currents))
            print '    ==========================================================================================================='
            highest_prob[k][u][v], backpointer[k][u][v] = max(currents)
            
            print       
    
    print '** Finally compute max of highest_prob(n={}, u, v) * transition(STOP | u, v) over all u in possible_tags({})={} and v in possible_tags({})={}...'.format(n, n-1, possible_tags[n-1], n, possible_tags[n])
    print
    
    currents = []
    for u, v in itertools.product(possible_tags[n-1], possible_tags[n]):
        print '    ==========================================================================================================='
        print '    Computing highest_prob(n={}, u={}, v={}) * transition(STOP | u={}, v={}) where:'.format(n,u,v,u,v)
        print
        print '        highest_prob(n={}, u={}, v={}) = {}'.format(n,u,v,highest_prob[n][u][v])
        print '        transition(STOP | u={}, v={}) = {}'.format(u,v,transition['STOP'][u][v])
        print
        
        current = highest_prob[n][u][v] * transition['STOP'][u][v]
        print '    Result = {}'.format(current)
        currents.append((current, u, v))
       
    print
    print '    Highest probability tagging is: {}'.format(max(currents))
    print '    ==========================================================================================================='

    # Compute Backpoints
    _, y[n-1], y[n] = max(currents)
    for k in range(n-2, 0, -1):
        y[k] = backpointer[k+2][y[k+1]][y[k+2]]
        
    return y[1:]

# Clean(er) Viterbi

In [60]:
import itertools
from collections import defaultdict

highest_prob, backpointer = defaultdict(lambda: defaultdict(lambda: defaultdict(int))), defaultdict(lambda: defaultdict(lambda: defaultdict(int)))


def viterbi(xs, transition, emission):
    # Initialize highest_prob
    highest_prob[0]['*']['*'], n, = 1, len(xs.split())
    y = [''] * (n+1)
    
    # Initialize possible_tags such that:
    #   possible_tags(-1) = possible_tags(0) = ['*']
    #   possible_tags(1:n) (inclusive) = ['O', 'I-GENE']
    #   possible_tags(n+1) = ['STOP']
    possible_tags = [['*']] + [('O', 'I-GENE') for _ in range(n) ] + [['STOP']] + [['*']]
    
    for k, x in enumerate(xs.split(), start=1):
        for u, v in itertools.product(possible_tags[k-1], possible_tags[k]):
            
            currents = []
            for w in possible_tags[k-2]:
                current = highest_prob[k-1][w][u] * transition[v][w][u] * emission[x if c[x]['O'] + c[x]['I-GENE'] >= 5 else '_RARE_'][v]
                currents.append((current, w))
               
            highest_prob[k][u][v], backpointer[k][u][v] = max(currents)
    
    currents = []
    for u, v in itertools.product(possible_tags[n-1], possible_tags[n]):
        current = highest_prob[n][u][v] * transition['STOP'][u][v]
        currents.append((current, u, v))
       
    # Compute Backpoints
    _, y[n-1], y[n] = max(currents)
    for k in range(n-2, 0, -1):
        y[k] = backpointer[k+2][y[k+1]][y[k+2]]
        
    return y[1:]

In [61]:
viterbi('gene bar gene', q, e)

['I-GENE', 'I-GENE', 'O']

# Development Set

In [108]:
with open('gene.test', 'r') as f:
    tokens = [''] + [ line.strip() for line in f.readlines() ]
    
indicies = [ i for i, token in enumerate(tokens) if not token ]
sentences = [ ' '.join(tokens[beg+1:end]) for beg, end in zip(indicies, indicies[1:]) ]

In [109]:
predictions = [ (sentence.split(), viterbi(sentence, q, e)) for sentence in sentences ]

# Write Predictions Back

In [114]:
with open('gene_test.p2.out', 'w') as f:
    for sentence, tags in predictions:
        for token, tag in zip(sentence, tags):
            f.write(' '.join([token, tag]) + '\n')
        
        f.write('\n')

In [113]:
run eval_gene_tagger gene.key gene.predictions

Could not align gold standard and predictions in line 1.
Gold standard: BACKGROUND  Prediction file: Third


SystemExit: 1