# Compute Counts From Scratch

In [177]:
# Run count_freqs.py on gene.train and redirect its standard output into gene.counts.
!python count_freqs.py gene.train > gene.counts

# Fetch NGram Counts

In [178]:
import pandas as pd

# Load the space-separated n-gram count rows into a frame with explicit column names.
ngram_columns = ['Count', 'NGram', 'One', 'Two', 'Three']
df = pd.read_csv('gene.ngrams', sep=' ', names=ngram_columns)

# Unigram Tag Counts

In [179]:
# Map each tag to its count, using only the rows whose NGram field is '1-GRAM'.
unigram_rows = df.loc[df['NGram'] == '1-GRAM', ['One', 'Count']]
counts = {tag: count for tag, count in unigram_rows.values}

# Reference Counts

In [180]:
import pandas as pd

import csv

# The Token column holds raw text, so it has to be read literally:
#   - quoting=csv.QUOTE_NONE stops a token that is a double quote from being
#     interpreted as the start of a quoted field;
#   - keep_default_na=False stops tokens such as "NA" or "null" from being
#     converted to NaN.
df = pd.read_csv(
    'gene.counts',
    sep=' ',
    names=['Count', 'WORDTAG', 'Tag', 'Token'],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

In [202]:
# Show the I-GENE rows ordered from the highest count to the lowest.
# DataFrame.sort is no longer available in pandas; sort_values is its replacement.
df[df['Tag'] == 'I-GENE'].sort_values('Count', ascending=False)

Unnamed: 0,Count,WORDTAG,Tag,Token
8876,4395,WORDTAG,I-GENE,-
8563,886,WORDTAG,I-GENE,gene
22184,879,WORDTAG,I-GENE,1
8418,800,WORDTAG,I-GENE,protein
1174,423,WORDTAG,I-GENE,kinase
2785,373,WORDTAG,I-GENE,2
5454,369,WORDTAG,I-GENE,factor
15330,321,WORDTAG,I-GENE,beta
11190,310,WORDTAG,I-GENE,promoter
12300,275,WORDTAG,I-GENE,binding


# Compile Token Counts

In [182]:
# Keep every raw line of gene.counts (newline included) in memory.
with open('gene.counts', 'r') as counts_file:
    lines = list(counts_file)

In [183]:
from collections import defaultdict

# c[token][tag] -> count, with missing tokens/tags defaulting to 0.
c = defaultdict(lambda: defaultdict(int))

for row in lines:
    # Each row has four whitespace-separated fields: count, marker, tag, token.
    n, _marker, label, word = row.split()
    c[word][label] = int(n)

# Find Infrequent Words

In [184]:
# A token whose total count (summed over all of its tags) is below this
# threshold is treated as infrequent.
RARE_THRESHOLD = 5

# Summing the counts that are actually stored avoids indexing the defaultdict
# with a tag the token never had, which would insert zero-count entries into
# `c` as a side effect, and it does not hard-code the tag names.
infrequent_words = set(
    token
    for token, tag_counts in c.items()
    if sum(tag_counts.values()) < RARE_THRESHOLD
)

# Write Infrequent Words Back To Disk

In [185]:
# Copy the count rows to a new file, replacing the token of every infrequent
# word with the '_RARE_' placeholder; all other rows are written unchanged.
with open('gene.count+infrequents', 'w') as out:
    for raw in lines:
        n, marker, label, word = raw.split()

        if word not in infrequent_words:
            out.write(raw)
            continue

        out.write(' '.join([n, marker, label, '_RARE_']) + '\n')

# Recompute Counts with Infrequent Word Class

In [186]:
# Load the rewritten rows (infrequent tokens replaced by '_RARE_') back into memory.
with open('gene.count+infrequents', 'r') as rare_file:
    lines = list(rare_file)

In [187]:
# Rebuild c[token][tag]. Counts are accumulated with += because several rows
# can now share the same '_RARE_' token and tag.
c = defaultdict(lambda: defaultdict(int))

for row in lines:
    n, _marker, label, word = row.split()
    c[word][label] += int(n)

# Append Infrequent Word Count

In [188]:
# Append one WORDTAG row per tag for the aggregated '_RARE_' class.
# Rows that are already in the file are skipped, so re-running this cell does
# not keep growing gene.counts with duplicate lines.
with open('gene.counts', 'r') as f:
    existing_rows = set(f.read().splitlines())

with open('gene.counts', 'a') as f:
    for tag, rare_count in c['_RARE_'].items():
        row = ' '.join([str(rare_count), 'WORDTAG', tag, '_RARE_'])
        if row not in existing_rows:
            f.write(row + '\n')

# Read All Words Back in One More Time

In [189]:
# Re-read gene.counts in full, keeping each raw line.
with open('gene.counts', 'r') as counts_file:
    lines = list(counts_file)

In [190]:
from collections import defaultdict

# Overlay the rows just read onto the existing c: each (token, tag) entry is
# set to the count on its row (assignment, not accumulation).
for row in lines:
    n, _marker, label, word = row.split()
    c[word][label] = int(n)

# Compute Emission Probabilities

In [191]:
# e[token][tag] = c[token][tag] / counts[tag]; unseen tokens/tags default to 0.
e = defaultdict(lambda: defaultdict(int))

for word, tag_counts in c.items():
    for label, n in tag_counts.items():
        # float() forces true division regardless of the integer types involved.
        e[word][label] = n / float(counts[label])

In [192]:
# Display the per-tag counts stored under the '_RARE_' key.
c['_RARE_']

defaultdict(<type 'int'>, {'I-GENE': 8732, 'O': 28781})

# Baseline Simple Tagger

In [193]:
# One entry per line of gene.test, stripped of surrounding whitespace
# (blank lines therefore become empty strings).
with open('gene.test', 'r') as test_file:
    tokens = [raw.strip() for raw in test_file]

# Tag Each Word with $\arg \max_y e(x|y)$

In [194]:
# Build one (token, tag) pair per input line, in input order.
predictions = []

for word in tokens:

    # Blank lines are carried through as an empty pair.
    if not word:
        predictions.append(('', ''))
        continue

    # Infrequent and out-of-vocabulary words share the '_RARE_' emission row.
    is_rare = word in infrequent_words or not e[word]
    key = '_RARE_' if is_rare else word

    # Pick the tag with the larger emission probability; ties go to 'I-GENE'.
    if e[key]['O'] > e[key]['I-GENE']:
        label = 'O'
    else:
        label = 'I-GENE'

    predictions.append((word, label))

# Write Predictions Back to Disk

In [199]:
# Write one "token tag" line per prediction; an empty pair becomes a blank line
# because the joined string is stripped before the newline is added.
with open('gene_test.p1.out', 'w') as out:
    out.writelines(' '.join(pair).strip() + '\n' for pair in predictions)

# Submission

In [198]:
run submit

==
== [sandbox] Submitting Solutions 
==
Login (Email address): <redacted>
One-time Password (from the assignment page. This is NOT your own account's password): <redacted>

== Connecting to Coursera ... 
Hello! These are the assignment parts that you can submit:
1) Unigram Tagger
2) Trigram Tagger
3) Extended Tagger
Please enter which part you want to submit (1-3): 1
File gene_test.p1.out not found


NameError: global name 'exit' is not defined