# In [117]:
"""
Daniel Wu
CSCI 544
Programming Assignment 2
"""

def pause():
    """Wait for the user to press <ENTER>, then print a confirmation line."""
    # The text typed before <ENTER> is not needed, so it is not stored.
    input("Press the <ENTER> key to continue...")
    print("Paused Program")

# Ron Artstein Notes below:

### This program is a very simple lemmatizer, which learns a
### lemmatization function from an annotated corpus. The function is
### so basic I wouldn't even consider it machine learning: it's
### basically just a big lookup table, which maps every word form
### attested in the training data to the most common lemma associated
### with that form. At test time, the program checks if a form is in
### the lookup table, and if so, it gives the associated lemma; if the
### form is not in the lookup table, it gives the form itself as the
### lemma (identity mapping).

### The program performs training and testing in one run: it reads the
### training data, learns the lookup table and keeps it in memory,
### then reads the test data, runs the testing, and reports the
### results.

### The program takes two command line arguments, which are the paths
### to the training and test files. Both files are assumed to be
### already tokenized, in Universal Dependencies format, that is: each
### token on a separate line, each line consisting of fields separated
### by tab characters, with word form in the second field, and lemma
### in the third field. Tab characters are assumed to occur only in
### lines corresponding to tokens; other lines are ignored.

import os
import sys
import re
import operator

### Global variables

# Paths for data are read from the command line when two existing files are
# supplied; otherwise the local copies of the Hindi HDTB treebank are used.

def resolve_data_paths(argv, default_train, default_test):
    """Return the (training file, test file) pair to use for this run.

    argv          -- argument list in the shape of sys.argv (program name first)
    default_train -- training path used when argv does not supply usable paths
    default_test  -- test path used when argv does not supply usable paths

    The first two arguments after the program name are used only if both name
    existing files. Anything else (no arguments, or arguments that are not
    files, such as launcher flags) falls back to the defaults, so the script
    keeps working when it is not started with explicit data paths.
    """
    if len(argv) >= 3 and os.path.isfile(argv[1]) and os.path.isfile(argv[2]):
        return argv[1], argv[2]
    return default_train, default_test


train_file, test_file = resolve_data_paths(
    sys.argv,
    "/Users/user/Desktop/Fall_2020/CSCI_544/Coding_Assignments/PA2/UD_Hindi-HDTB-master/hi_hdtb-ud-train.conllu",
    "/Users/user/Desktop/Fall_2020/CSCI_544/Coding_Assignments/PA2/UD_Hindi-HDTB-master/hi_hdtb-ud-test.conllu",
)


# Per-form lemma tallies from the training data: word form -> lemma -> count
lemma_count = dict()

# Lookup table learned from the training data: word form -> lemma
lemma_max = dict()

# Labels of the training statistics; every label starts with a count of zero
training_stats = [
    'Wordform types',
    'Wordform tokens',
    'Unambiguous types',
    'Unambiguous tokens',
    'Ambiguous types',
    'Ambiguous tokens',
    'Ambiguous most common tokens',
    'Identity tokens',
]
training_counts = {label: 0 for label in training_stats}

# Labels of the test outcomes; every label starts with a count of zero
test_outcomes = [
    'Total test items',
    'Found in lookup table',
    'Lookup match',
    'Lookup mismatch',
    'Not found in lookup table',
    'Identity match',
    'Identity mismatch',
]
test_counts = {label: 0 for label in test_outcomes}

# Accuracy figures, keyed by model name
accuracies = dict()

### Training: read training data and populate lemma counters

token_count = 0   # number of token lines seen
type_count = 0    # number of distinct word forms seen

# CoNLL-U files are UTF-8; naming the encoding keeps the read independent of
# the platform default. The `with` block closes the file even if a malformed
# line raises part-way through the loop.
with open(train_file, 'r', encoding='utf-8') as train_data:
    for line in train_data:

        # Tab character identifies lines containing tokens
        if '\t' not in line:
            continue

        token_count += 1

        # Tokens represented as tab-separated fields:
        # word form in second field, lemma in third field
        field = line.strip().split('\t')
        form = field[1]
        lemma = field[2]

        # First sighting of this word form: start its lemma tally
        if form not in lemma_count:
            lemma_count[form] = {}
            type_count += 1

        # Count this (form, lemma) pairing, starting from zero if it is new
        form_lemmas = lemma_count[form]
        form_lemmas[lemma] = form_lemmas.get(lemma, 0) + 1

            
### Model building and training statistics

# Totals over word forms that were seen with more than one lemma
ambig_type_count = 0
ambig_token_count = 0
ambig_most_common_count = 0   # tokens carrying the most frequent lemma of such forms

# Tokens whose lemma is identical to the word form
identity_token_count = 0

for form, counts in lemma_count.items():

    # The lookup table maps each form to its most frequent lemma
    # (on a tie, the lemma encountered first in the tally wins)
    best_lemma = max(counts, key=counts.get)
    lemma_max[form] = best_lemma

    if len(counts) > 1:
        ambig_type_count += 1
        ambig_token_count += sum(counts.values())
        ambig_most_common_count += counts[best_lemma]

    # Adds nothing when the form never occurred as its own lemma
    identity_token_count += counts.get(form, 0)

# Whatever is not ambiguous is unambiguous
unambig_type_count = type_count - ambig_type_count
unambig_token_count = token_count - ambig_token_count

######################################################
### Insert code for populating the training counts ###
######################################################

# Record each training statistic under its label from training_stats
training_counts['Wordform types'] = type_count
training_counts['Wordform tokens'] = token_count
training_counts['Unambiguous types'] = unambig_type_count
training_counts['Unambiguous tokens'] = unambig_token_count
training_counts['Ambiguous types'] = ambig_type_count
training_counts['Ambiguous tokens'] = ambig_token_count
training_counts['Ambiguous most common tokens'] = ambig_most_common_count
training_counts['Identity tokens'] = identity_token_count

# A training file without any token lines leaves token_count at 0; report an
# accuracy of 0.0 in that case instead of failing with ZeroDivisionError.

### Calculate expected accuracy if we used lookup on all items ###
accuracies['Expected lookup'] = (
    (unambig_token_count + ambig_most_common_count) / token_count
    if token_count else 0.0
)

### Calculate expected accuracy if we used identity mapping on all items ###
accuracies['Expected identity'] = (
    identity_token_count / token_count if token_count else 0.0
)

# Cell output:
# {'Wordform types': 16879, 'Wordform tokens': 281057, 'Unambiguous types': 16465, 'Unambiguous tokens': 196204, 'Ambiguous types': 414, 'Ambiguous tokens': 84853, 'Ambiguous most common tokens': 75667, 'Identity tokens': 201485}
# 0.7168830521922599


# In [121]:
### Testing: read test data, and compare lemmatizer output to actual lemma

test_item_count = 0      # token lines in the test file
found_count = 0          # forms present in the lookup table
lookup_match_count = 0   # ... whose looked-up lemma equals the actual lemma

id_match_count = 0       # unseen forms whose lemma equals the form itself
id_mismatch_count = 0    # unseen forms whose lemma differs from the form

# CoNLL-U files are UTF-8; naming the encoding keeps the read independent of
# the platform default. The `with` block closes the file even if a malformed
# line raises part-way through the loop.
with open(test_file, 'r', encoding='utf-8') as test_data:
    for line in test_data:

        # Tab character identifies lines containing tokens
        if '\t' not in line:
            continue

        # Tokens represented as tab-separated fields:
        # word form in second field, lemma in third field
        field = line.strip().split('\t')
        form = field[1]
        lemma = field[2]

        test_item_count += 1

        if form in lemma_max:
            # Known form: the lemmatizer answers with the table entry
            found_count += 1
            if lemma == lemma_max[form]:
                lookup_match_count += 1
        elif form == lemma:
            # Unknown form: the lemmatizer answers with the form itself
            id_match_count += 1
        else:
            id_mismatch_count += 1

# The remaining outcomes follow from the totals above
not_found_count = test_item_count - found_count
lookup_mismatch_count = found_count - lookup_match_count
        
    
# Record each test outcome under its label from test_outcomes
test_counts['Total test items'] = test_item_count
test_counts['Found in lookup table'] = found_count
test_counts['Lookup match'] = lookup_match_count
test_counts['Lookup mismatch'] = lookup_mismatch_count
test_counts['Not found in lookup table'] = not_found_count
test_counts['Identity match'] = id_match_count
test_counts['Identity mismatch'] = id_mismatch_count

# Any of the three denominators can be zero: no test form may be in the table,
# every test form may be in the table (so identity mapping is never used), or
# the test file may contain no token lines. Report 0.0 for a model that was
# never exercised instead of failing with ZeroDivisionError.

### Calculate accuracy on the items that used the lookup table ###
accuracies['Lookup'] = lookup_match_count / found_count if found_count else 0.0

### Calculate accuracy on the items that used identity mapping ###
identity_item_count = id_match_count + id_mismatch_count
accuracies['Identity'] = (
    id_match_count / identity_item_count if identity_item_count else 0.0
)

### Calculate overall accuracy ###
accuracies['Overall'] = (
    (lookup_match_count + id_match_count) / test_item_count
    if test_item_count else 0.0
)


# Cell output:
# {'Total test items': 35430, 'Found in lookup table': 33849, 'Lookup match': 32628, 'Lookup mismatch': 1221, 'Not found in lookup table': 1581, 'Identity match': 1227, 'Identity mismatch': 354}
# {'Expected lookup': 0.9673162383431119, 'Expected identity': 0.7168830521922599, 'Lookup': 0.9639280333244704, 'Identity': 0.7760910815939279, 'Overall': 0.9555461473327689}


# In [128]:
### Report training statistics and test results

# The `with` block closes the report even if one of the writes raises (for
# example a KeyError on a missing statistic), so a partial file is never left
# open. All labels and numbers are written as "<label>: <value>" lines.
with open('lookup-output.txt', 'w', encoding='utf-8') as output:

    output.write('Training statistics\n')

    for stat in training_stats:
        output.write(f'{stat}: {training_counts[stat]}\n')

    for model in ['Expected lookup', 'Expected identity']:
        output.write(f'{model} accuracy: {accuracies[model]}\n')

    output.write('Test results\n')

    for outcome in test_outcomes:
        output.write(f'{outcome}: {test_counts[outcome]}\n')

    for model in ['Lookup', 'Identity', 'Overall']:
        output.write(f'{model} accuracy: {accuracies[model]}\n')

# Cell output:
# <function TextIOWrapper.close()>