In [97]:
#Read in all data
names = list(range(1,199))
labeled_sentences = []
sentence=[]
for name in names:
    file = open('dependency_treebank/wsj_'+str(name).zfill(4)+'.dp', 'r')
    for line in file.readlines():
        tokens = line.split("\t")
        word = ()
        if (len(tokens)>1):
            word = (tokens[0],tokens[1])
            sentence.append(word)
            if(tokens[1] == '.'):
                labeled_sentences.append(sentence)
                sentence = [] 

In [98]:
#Assign the first 90% of sentences to the training set and the remaining 10% to the test set
import math

split = math.ceil(len(labeled_sentences)*.9)
print('Split at %d' % split)
training_set = labeled_sentences[:split]
test_set = labeled_sentences[split:]

Split at 3484


In [99]:
#Create an unlabeled copy of the test set to tag
unlabeled_test_set = []
sentence = []
for item in test_set:
    for word in item:
        sentence.append(word[0])
    unlabeled_test_set.append(sentence)
    sentence = []

In [100]:
def evaluate_average(model):
    """Computes the average accuracy of a model. Evaluates accuracy sentence by sentence and
    takes the average to give long sentences the same weight as short sentences"""
    total = 0
    wrapper = []
    for sentence in test_set:
        wrapper.append(sentence)
        val = model.evaluate(wrapper)
        wrapper = []
        total += val
    avg = total/len(test_set)
    print('Average Accuracy:',avg)
    return avg

In [143]:
from nltk.util import unique_list
import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot

def create_table(name,results,tag_set):
    """Writes the results matrix for each model to a .csv file"""
    table = pd.DataFrame(results, index=tag_set, columns=tag_set)
    filename = name+'.csv'
    table.to_csv(filename, sep='\t')

def create_visualization(confusion_matrix):
    """Creates a beautiful confusion matrix visualization"""
    cmap = mpl.colors.ListedColormap(['white','yellow','orange','red'])
    bounds=[0,2,7,25,1420]
    norm = mpl.colors.BoundaryNorm(bounds,cmap.N)

    img = pyplot.imshow(confusion_matrix,interpolation='nearest',cmap=cmap,norm=norm)
    pyplot.colorbar(img,cmap=cmap,norm=norm,boundaries=bounds,ticks=bounds)
    pyplot.show()

def precision_recall(model,name):
    """Calculates the average precision and recall scores for each model by creating 
    a confusion matrix with the dimensions of the tagset. For each part of speech:
    precision = (# of correct tags)/(total tagged in testing)
    recall = (# of correct tags)/(total tagged in hand labeled set)
    The precision and recall values are summed over all of the parts of speech and
    divided by the number of parts of speech observed."""
    tag_set = unique_list(tag for sent in test_set+training_set
            for word, tag in sent)
    tag_set.append(None)
    indices = list(range(0,len(tag_set)))
    lookup = dict(zip(tag_set, indices))
    confusion_matrix = np.zeros((len(indices),len(indices)))
    for idx,sentence in enumerate(unlabeled_test_set):
        predicted = model.tag(sentence)
        print(predicted)
        for prediction,labeled in zip(predicted,test_set[idx]):
            confusion_matrix[lookup[prediction[1]]][lookup[labeled[1]]] += 1
    
    #create_table(name,confusion_matrix,tag_set)
    #create_visualization(confusion_matrix)
    
    precision=[]
    recall=[]
    tag_predictions = confusion_matrix.sum(axis=0)
    tag_occurences = confusion_matrix.sum(axis=1)
    for idx in indices[:-1]:
        num_correct = confusion_matrix[idx][idx]
        total_predicted = tag_predictions[idx]
        total_occurrences = tag_occurences[idx]
        if (total_predicted!=0):
            precision.append(num_correct/tag_predictions[idx])
        if (total_occurrences!=0):
            recall.append(num_correct/tag_occurences[idx]) 
    avg_precision = sum(precision)/len(precision)
    avg_recall = sum(recall)/len(recall)
    print('Average Precision:',avg_precision)
    print('Average Recall:',avg_recall)
    results = (avg_precision, avg_recall)
    return results

In [119]:
def compute_accuracy(model,name):
    """Computes overall accuracy, average accuracy, precision, and recall for the desired
    model"""
    pct = model.evaluate(test_set)
    print('Overall Accuracy:',pct)

    avg = evaluate_average(model)
    pr = precision_recall(model,name)
    results = [pct, avg, pr[0], pr[1]]
    return results

In [139]:
#accuracy computations for each model
import nltk
from collections import OrderedDict

results = OrderedDict()

unigram_tagger = nltk.UnigramTagger(training_set)
results['Unigram'] = compute_accuracy(unigram_tagger,'unigram')

Overall Accuracy: 0.8542409508373852
Average Accuracy: 0.8529120249815192
1220.0
Average Precision: 0.813363992136
Average Recall: 0.917322175822


In [144]:
bigram_tagger = nltk.BigramTagger(training_set)
results['Bigram'] = compute_accuracy(bigram_tagger,'bigram')

Overall Accuracy: 0.13452188006482982
Average Accuracy: 0.16265216972954502
[('New', 'NNP'), ('loans', None), ('continue', None), ('to', None), ('slow', None), (';', None), ('they', None), ('were', None), ('$', None), ('6.6', None), ('million', None), ('in', None), ('the', None), ('quarter', None), ('compared', None), ('with', None), ('$', None), ('361.8', None), ('million', None), ('a', None), ('year', None), ('ago', None), ('.', None)]
[('The', 'DT'), ('thrift', 'NN'), ('has', 'VBZ'), ('assets', None), ('of', None), ('$', None), ('3.2', None), ('billion', None), ('.', None)]
[('First', 'NNP'), ('of', 'IN'), ('America', 'NNP'), ('Bank', 'NNP'), ('Corp.', 'NNP'), ('said', 'VBD'), ('it', 'PRP'), ('completed', 'VBD'), ('its', 'PRP$'), ('acquisition', 'NN'), ('of', 'IN'), ('Midwest', None), ('Financial', None), ('Group', None), ('Inc.', None), ('for', None), ('about', None), ('$', None), ('250', None), ('million', None), ('.', None)]
[('First', 'NNP'), ('of', 'IN'), ('America', 'NNP'), ('

In [141]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(training_set, backoff=t0)
t2 = nltk.BigramTagger(training_set, backoff=t1)
t3 = nltk.TrigramTagger(training_set, backoff=t2)
results['Backoff'] = compute_accuracy(t3,'backoff')

Overall Accuracy: 0.882766072393301
Average Accuracy: 0.8825438245335168
1409.0
Average Precision: 0.848024541932
Average Recall: 0.886375514646


In [142]:
hmm_tagger = nltk.HiddenMarkovModelTagger.train(training_set)
results['HMM'] = compute_accuracy(hmm_tagger,'hmm')

Overall Accuracy: 0.8959481361426256
Average Accuracy: 0.8938869669207187
1244.0
Average Precision: 0.855096551634
Average Recall: 0.811946588796


In [109]:
#write results to a file
y_labels = ['Overall Accuracy','Average Accuracy','Average Precision','Average Recall']
table = pd.DataFrame(list(results.values()),columns=y_labels,index=list(results.keys()))
table.to_csv('results.csv', sep='\t')