# Compute Counts From Scratch

In [126]:
# Run count_freqs.py on gene.train and redirect its standard output into
# gene.counts (the shell `>` redirect replaces any existing gene.counts).
# NOTE(review): presumably the output mixes WORDTAG and n-gram count records --
# confirm against count_freqs.py, since later cells unpack four fields per line.
!python count_freqs.py gene.train > gene.counts

# Fetch NGram Counts

In [139]:
import pandas as pd

# gene.ngrams is read as a space-separated table with no header row; the
# names below label its columns (count, record type, then up to three tags).
ngram_columns = ['Count', 'NGram', 'One', 'Two', 'Three']
df = pd.read_csv('gene.ngrams', names=ngram_columns, sep=' ')

# Unigram Tag Counts

In [140]:
# Build counts[tag] -> unigram count from the rows whose record type is '1-GRAM'.
unigram_rows = df[df['NGram'] == '1-GRAM']

counts = {
    unigram_tag: unigram_count
    for unigram_tag, unigram_count in unigram_rows[['One', 'Count']].values
}

# Reference Counts

In [141]:
import csv

import pandas as pd

# gene.counts is space-delimited raw text rather than real CSV, and its last
# column holds arbitrary tokens. Two read_csv defaults would corrupt it:
#   * quoting: a token that is a double quote would open a quoted field and
#     swallow the following lines, so quoting is disabled;
#   * NA detection: tokens such as 'null', 'NA' or 'nan' would be turned into
#     NaN, so the default NA strings are switched off.
df = pd.read_csv(
    'gene.counts',
    sep=' ',
    names=['Count', 'WORDTAG', 'Tag', 'Token'],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False
)

In [142]:
# Display every count row whose token is exactly 'the'.
is_the = df['Token'] == 'the'
df[is_the]

Unnamed: 0,Count,WORDTAG,Tag,Token
10103,11,WORDTAG,I-GENE,the


# Compile Token Counts

In [143]:
# Load gene.counts into memory as a list of raw lines (trailing newlines kept).
with open('gene.counts', 'r') as counts_file:
    lines = list(counts_file)

In [144]:
from collections import defaultdict

# c[token][tag] -> count recorded for that (token, tag) pair.
c = defaultdict(lambda: defaultdict(int))

for line in lines:
    fields = line.split()

    # Only "<count> WORDTAG <tag> <token>" records are word/tag counts.
    # Anything else (blank lines, or n-gram records if the file contains them)
    # is skipped: a 3- or 5-field line would otherwise fail to unpack, and a
    # 4-field non-WORDTAG line would be silently counted as a word/tag pair.
    if len(fields) != 4 or fields[1] != 'WORDTAG':
        continue

    count, _, tag, token = fields
    c[token][tag] = int(count)

# Find Infrequent Words

In [145]:
# A token whose total count over all of its tags is below this is "infrequent".
RARE_THRESHOLD = 5

# Sum the stored per-tag counts instead of indexing c[token]['O'] and
# c[token]['I-GENE']: indexing a defaultdict inserts a zero entry for any tag
# the token never had, quietly mutating c while it is only being inspected.
# The total equals the O + I-GENE sum whenever those are the only tags present.
infrequent_words = set(
    token
    for token, tag_counts in c.items()
    if sum(tag_counts.values()) < RARE_THRESHOLD
)

# Write Infrequent Words Back To Disk

In [147]:
# Rewrite the count lines to gene.count+infrequents, replacing the token of
# every infrequent word with the pseudo-token _RARE_.
with open('gene.count+infrequents', 'w') as f:
    for line in lines:
        fields = line.split()

        # Lines that are not 4-field WORDTAG records (blank lines, or n-gram
        # records if present) are copied through untouched rather than being
        # unpacked as if they were word/tag counts.
        if len(fields) != 4 or fields[1] != 'WORDTAG':
            f.write(line)
            continue

        count, record_type, tag, token = fields

        if token in infrequent_words:
            f.write(' '.join([count, record_type, tag, '_RARE_']) + '\n')
        else:
            f.write(line)

# Recompute Counts with Infrequent Word Class

In [148]:
# Replace `lines` with the contents of the rewritten file, one string per line.
with open('gene.count+infrequents', 'r') as rewritten_file:
    lines = list(rewritten_file)

In [149]:
# Rebuild c from scratch. Counts are accumulated with += because many
# different infrequent words now share the single token _RARE_.
c = defaultdict(lambda: defaultdict(int))

for line in lines:
    fields = line.split()

    # Skip anything that is not a "<count> WORDTAG <tag> <token>" record so
    # blank or n-gram lines cannot break the unpack or pollute the counts.
    if len(fields) != 4 or fields[1] != 'WORDTAG':
        continue

    count, _, tag, token = fields
    c[token][tag] += int(count)

# Append Infrequent Word Count

In [151]:
# Add one "<count> WORDTAG <tag> _RARE_" line per tag to the end of gene.counts.
# The file is opened in append mode, so running this cell again appends the
# same lines a second time.
with open('gene.counts', 'a') as f:
    rare_tag_counts = c['_RARE_']
    f.writelines(
        ' '.join([str(rare_tag_counts[tag]), 'WORDTAG', tag, '_RARE_']) + '\n'
        for tag in rare_tag_counts
    )

# Read All Words Back in One More Time

In [152]:
# Read gene.counts again into `lines`, one string per line.
with open('gene.counts', 'r') as counts_file:
    lines = list(counts_file)

In [153]:
from collections import defaultdict

# c is deliberately NOT reset here: the counts read from gene.counts are
# written over the existing table, so c keeps its current entries and gains
# (or overwrites) one entry per WORDTAG record in the file.
for line in lines:
    fields = line.split()

    # Ignore anything that is not a "<count> WORDTAG <tag> <token>" record;
    # blank or n-gram lines would otherwise fail to unpack or be miscounted.
    if len(fields) != 4 or fields[1] != 'WORDTAG':
        continue

    count, _, tag, token = fields
    c[token][tag] = int(count)

# Compute Emission Probabilities

In [154]:
# e[token][tag] = c[token][tag] / counts[tag], i.e. the (token, tag) count
# divided by the unigram count of the tag. float() forces true division.
e = defaultdict(lambda: defaultdict(int))

for word, tag_counts in c.items():
    for label, pair_count in tag_counts.items():
        e[word][label] = pair_count / float(counts[label])

In [156]:
# Show the per-tag counts stored for the _RARE_ pseudo-token.
c['_RARE_']

defaultdict(<type 'int'>, {'I-GENE': 8732, 'O': 28781})

# Baseline Simple Tagger

In [159]:
# One entry per line of gene.dev with surrounding whitespace removed, so a
# blank line in the file becomes an empty string.
with open('gene.dev', 'r') as dev_file:
    tokens = [raw_line.strip() for raw_line in dev_file]

# Tag Each Word with $\arg \max_y e(x|y)$

In [160]:
# Pseudo-token whose emission probabilities stand in for rare/unseen words.
RARE_TOKEN = '_RARE_'

# One (token, tag) pair per entry of `tokens`, in the same order.
predictions = []

for token in tokens:

    # Empty line? Keep a placeholder pair so positions stay aligned.
    if not token:
        predictions.append(('', ''))
        continue

    # Infrequent token or OOV?
    # e is a defaultdict, so `e[token]` would insert an empty entry for every
    # unseen token; .get() asks the same question without growing the table.
    if token in infrequent_words or not e.get(token):
        identity = RARE_TOKEN
    else:
        identity = token

    # Tag word with the tag that gives it the highest emission probability.
    # Missing tags count as probability 0, and a tie goes to 'I-GENE'.
    emissions = e.get(identity, {})
    if emissions.get('O', 0) > emissions.get('I-GENE', 0):
        best_tag = 'O'
    else:
        best_tag = 'I-GENE'

    predictions.append((token, best_tag))

# Write Predictions Back to Disk

In [161]:
# Write each prediction as "<token> <tag>" on its own line. The strip() turns
# the ('', '') placeholder into an empty line.
with open('gene.predictions', 'w') as out:
    out.writelines(' '.join(pair).strip() + '\n' for pair in predictions)

In [162]:
run eval_gene_tagger gene.key gene.predictions

Found 2669 GENEs. Expected 642 GENEs; Correct: 424.

	 precision 	recall 		F1-Score
GENE:	 0.158861	0.660436	0.256116
