In [1]:
import os
import operator
from collections import Counter, defaultdict

In [2]:
# Root of the user's Dropbox folder; the corpora live below it.
dropbox = os.path.expanduser("~/Dropbox/")
# Build both corpus paths the same way with os.path.join so that path
# separators are never doubled (concatenating "/NLP Readings/..." onto a root
# that already ends in "/" produced ".../Dropbox//NLP Readings/...").
train_path = os.path.join(dropbox, "NLP Readings", "hw 1", "POS-training.txt")
test_path = os.path.join(dropbox, "NLP Readings", "hw 1", "POS-test.txt")

In [3]:
def word_tag_from_file(filename):
    """
    Lazily yield (word, tag) pairs read from a tagged file.

    Args:
        filename: the name of the file containing tab-separated words and tags, one per line:
        #   word    TAG
        Every non-blank line must have exactly three tab-separated fields; the
        first field is read and discarded. Blank lines are skipped.

    Yields:
        (word, tag) tuples of strings, in file order.

    Raises:
        ValueError: if a non-blank line does not split into exactly three
            tab-separated fields. The message names the file, the 1-based
            line number and the offending line.
    """
    with open(filename) as f:
        for line_number, line in enumerate(f, start=1):
            # rstrip() drops the newline and any other trailing whitespace,
            # trailing tabs included, so a line whose last field is empty is
            # reported as malformed below.
            line = line.rstrip()
            if not line:
                # skip blank lines
                continue
            fields = line.split("\t")
            if len(fields) != 3:
                raise ValueError(
                    "{}: line {}: expected 3 tab-separated fields, got {}: {!r}".format(
                        filename, line_number, len(fields), line
                    )
                )
            _, word, tag = fields
            yield (word, tag)

def create_dict(filename):
    """
    Create a dictionary mapping each word to its most frequent tag.

    The returned dictionary also carries the special key 'UNK', mapped to the
    tag with the highest total count over the whole file; it is meant as the
    fallback tag for unknown words. If the file itself contains the word
    'UNK', that word's own entry is replaced by this fallback.

    Args:
        filename: the name of the file containing tab-separated words and tags, one per line:
        #   word    TAG

    Returns:
        A plain dict {word: tag} in which all keys and values are strings.
        Looking up a missing word raises KeyError and does not insert anything.

    Raises:
        ValueError: if the file yields no (word, tag) pairs.
    """
    # count all (word, tag) pairs, building a mapping of the form:
    # {'word': Counter({'TAG': count, 'TAG2': count, ...}),
    #  'word2': Counter({'TAG': count}), ...}
    word_tag_counts = defaultdict(Counter)
    for word, tag in word_tag_from_file(filename):
        word_tag_counts[word][tag] += 1

    if not word_tag_counts:
        raise ValueError("no (word, tag) pairs found in {!r}".format(filename))

    # total count of every tag; accumulated word by word so that ties between
    # equally frequent tags are broken by the same first-seen order as the
    # per-word dictionaries
    tag_counts = Counter()
    for tags in word_tag_counts.values():
        tag_counts.update(tags)

    # keep only the most frequent tag for each word (first-seen tag wins ties)
    lookup = {
        word: max(tags, key=tags.get)
        for word, tags in word_tag_counts.items()
    }
    # the most frequent tag overall is used for unknown words
    lookup['UNK'] = max(tag_counts, key=tag_counts.get)
    return lookup

In [4]:
# Build the {word: most frequent tag} lookup, including its 'UNK' entry, from the training file.
look_up = create_dict(train_path)

In [6]:
# Words whose predicted tag differed from the tag in the file. predict_tag()
# appends here whenever no explicit collector is passed, so the list keeps
# growing across calls until it is re-bound to [].
wrong = []


def predict_tag(filename, lookup, wrong_words=None):
    """
    Assigns a predicted tag to every word of a file and computes accuracy

    Arguments:
        filename: the name of the file containing tab-separated words and tags, one per line:
        #   word    TAG
        lookup: dictionary of the tag associated with the word (both strings): {word: tag}
            It must contain the key 'UNK'; that tag is used for every word
            that is missing from the dictionary or mapped to an empty value.
        wrong_words: optional list that receives each mis-tagged word, in file
            order. When omitted, the words are appended to the module-level
            `wrong` list instead.

    returns:
        accuracy: the fraction of words in the file whose predicted tag equals
            the tag given in the file

    raises:
        KeyError: if `lookup` has no 'UNK' entry
        ZeroDivisionError: if the file contains no tagged words
    """
    # Resolve the collector at call time so that a re-bound global `wrong`
    # (e.g. `wrong = []` in a later cell) is the list that gets filled.
    missed = wrong if wrong_words is None else wrong_words
    total_count = 0
    correct_count = 0
    unk = lookup['UNK']
    for word, tag in word_tag_from_file(filename):
        # Every token is scored; punctuation such as '.' is not skipped.
        predicted_tag = lookup.get(word) or unk
        if predicted_tag == tag:
            correct_count += 1
        else:
            missed.append(word)
        total_count += 1
    if total_count == 0:
        # Same exception type the bare division would raise, with a clearer message.
        raise ZeroDivisionError("no tagged words found in {!r}".format(filename))
    return correct_count / total_count

In [7]:
# Accuracy on the training file itself; as a side effect, every mis-tagged word is appended to `wrong`.
predict_tag(train_path, look_up)

0.9406733982536203

In [8]:
# Re-bind `wrong` to a fresh list so that, after this cell, it holds only the
# words mis-tagged in the test file; then report the test accuracy.
wrong = []
predict_tag(test_path, look_up)

0.8544011099121319