In [5]:
from collections import Counter
import os

In [6]:
# training data, three fields tab-separated
train_path = os.path.expanduser("~/Dropbox/NLP Readings/hw 1/POS-training.txt")

In [197]:
tags = ['CC', 'CD',
        'DT',
        'EX',
        'FW',
        'IN', 
        'JJ', 'JJR', 'JJS',
        'LS', 
        'MD',
        'NN', 'NNS', 'NNP', 'NNPS',
        'PDT', 'POS', 'PRP', 'PRP$',
        'RB', 'RBR', 'RBS', 'RP',
        'SYM', 
        'TO', 
        'UH', 
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'WDT', 'WP', 'WP$', 'WRB', 
        '$', '#', '"', '(', ')', ',', '.', ':', ''
       ]
V = len(tags)

In [198]:
def word_tag_from_file(filename):
    """
    filename: the name of the file containing tab-separated words and tags, one per line:
        #   word    TAG
    """
    with open(filename) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                yield ('', '') # START of sentence
            else:
                _, word, tag = line.split("\t")
                yield (word, tag)

In [199]:
def count_tags(filename):
    """
    Takes the filename of a data file,
    counts the tag bigrams and unigrams, and returns those counts as a Counter dict
    returns a 2-tuple of Counter dicts: (unigrams, bigrams)
    """
    previous_tag = ''
    bigrams = Counter()
    unigrams = Counter()
    for _, tag in word_tag_from_file(filename):
        unigrams[tag] += 1
        bigrams[(previous_tag, tag)] += 1
        previous_tag = tag
    return (unigrams, bigrams)

In [200]:
uni, bi = count_tags(train_path)

In [201]:
uni

Counter({'': 14900,
         '.': 14901,
         ':': 5,
         'CC': 2008,
         'CD': 3951,
         'DT': 8063,
         'EX': 432,
         'FW': 355,
         'HYPH': 539,
         'IN': 12696,
         'JJ': 7363,
         'JJR': 1508,
         'JJS': 321,
         'LS': 9,
         'MD': 4717,
         'NN': 21147,
         'NNP': 686,
         'NNS': 5570,
         'PDT': 116,
         'POS': 578,
         'PRP': 12334,
         'PRP$': 272,
         'RB': 5629,
         'RBR': 409,
         'RBS': 22,
         'RP': 941,
         'TO': 4790,
         'UH': 3964,
         'VB': 13727,
         'VBD': 527,
         'VBG': 946,
         'VBN': 391,
         'VBP': 5522,
         'VBZ': 2305,
         'WDT': 581,
         'WP': 748,
         'WRB': 1072})

In [202]:
bi

Counter({('', '.'): 1,
         ('', 'CC'): 24,
         ('', 'CD'): 210,
         ('', 'DT'): 724,
         ('', 'EX'): 7,
         ('', 'FW'): 7,
         ('', 'IN'): 242,
         ('', 'JJ'): 245,
         ('', 'JJR'): 87,
         ('', 'JJS'): 9,
         ('', 'LS'): 9,
         ('', 'MD'): 548,
         ('', 'NN'): 615,
         ('', 'NNP'): 27,
         ('', 'NNS'): 38,
         ('', 'PDT'): 4,
         ('', 'PRP'): 5295,
         ('', 'PRP$'): 14,
         ('', 'RB'): 568,
         ('', 'RBR'): 3,
         ('', 'RP'): 14,
         ('', 'TO'): 57,
         ('', 'UH'): 1731,
         ('', 'VB'): 1819,
         ('', 'VBD'): 29,
         ('', 'VBG'): 21,
         ('', 'VBP'): 668,
         ('', 'VBZ'): 413,
         ('', 'WDT'): 148,
         ('', 'WP'): 528,
         ('', 'WRB'): 796,
         ('.', ''): 14900,
         (':', 'NN'): 5,
         ('CC', 'CD'): 212,
         ('CC', 'DT'): 185,
         ('CC', 'IN'): 100,
         ('CC', 'JJ'): 163,
         ('CC', 'JJR'): 74,
        

In [203]:
def laplace_smooth(numerator, denomenator, V, k=1):
    return (numerator+k)/(denomenator+k*V)

In [204]:
def make_trans_matrix(unigram_counts, bigram_counts):
    matrix = []
    # Start probabilities
    start_row = []
    for tag in tags:
        pair = ('', tag)
        # NOTE: this uses global V for now
        start_prob = laplace_smooth(bigram_counts[pair], unigram_counts[''], V)
        start_row.append(start_prob)
    matrix.append(start_row)
    
    # (tag, tag) cartesian transition
    for first in tags:
        row = []
        for second in tags:
            pair = (first, second)
            # NOTE: this uses global V for now
            prob = laplace_smooth(bigram_counts[pair], unigram_counts[first], V)
            row.append(prob)
        matrix.append(row)
    return matrix

In [205]:
A = make_trans_matrix(uni, bi)

In [184]:
len(A)

45

In [185]:
len(A[0])

44

In [186]:
md_ind = tags.index('MD')
A[0][md_ind] # smoothed probability of 'MD' starting  sentence; TODO: is this correct?

0.03673715203426124

In [196]:
# Verify that columns sum to about 1.0
dot_ind = tags.index('.')
nn_ind = tags.index('NN')

col = map(sum, A)
list(col)
A[len(A)-2]

[6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.691201070592172e-05,
 6.6912010705921

In [69]:
A

[[0.0016729122055674519,
  0.014119379014989294,
  0.0485144539614561,
  0.0005353319057815846,
  0.0005353319057815846,
  0.016260706638115633,
  0.016461456102783725,
  0.005888650963597431,
  0.0006691648822269807,
  0.0006691648822269807,
  0.03673715203426124,
  0.04122055674518201,
  0.002609743040685225,
  0.001873661670235546,
  6.691648822269808e-05,
  0.00033458244111349034,
  6.691648822269808e-05,
  0.354389721627409,
  0.001003747323340471,
  0.038075481798715206,
  0.0002676659528907923,
  6.691648822269808e-05,
  0.001003747323340471,
  6.691648822269808e-05,
  0.003881156316916488,
  0.11589935760171306,
  0.1217880085653105,
  0.002007494646680942,
  0.0014721627408993577,
  6.691648822269808e-05,
  0.04476713062098501,
  0.027703426124197003,
  0.009970556745182014,
  0.03539882226980728,
  6.691648822269808e-05,
  0.053332441113490364,
  6.691648822269808e-05,
  6.691648822269808e-05,
  6.691648822269808e-05,
  6.691648822269808e-05,
  6.691648822269808e-05,
  6.6916