In [2]:
import pandas as pd
import numpy as np
from collections import Counter

In [16]:
# read in training
train = pd.read_table("/Users/amyburkhardt/Dropbox/NLP Readings/hw 1/POS-training.txt",'\t', 
                      header=None, 
                      skip_blank_lines=False, 
                      keep_default_na = False,
                      names = ['word_Num', 'word', 'tag'])

In [18]:
tags = ['CC', 'CD',
        'DT',
        'EX',
        'FW',
        'IN', 
        'JJ', 'JJR', 'JJS',
        'LS', 
        'MD',
        'NN', 'NNS', 'NNP', 'NNPS',
        'PDT', 'POS', 'PRP', 'PRP$',
        'RB', 'RBR', 'RBS', 'RP',
        'SYM', 
        'TO', 
        'UH', 
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'WDT', 'WP', 'WP$', 'WRB', 
        '$', '#', '"', '(', ')', ',', '.', ':'
       ]

In [19]:
def ngram_dict(data, ngrams):
    """
    Creates dict of ngrams (key) and count (value). 
    
    Arguments: 
        DataFrame with 'tag' column 
    Returns:
        A dict where key is either a unigram or a bigram tuple, and value is the count of the ngrams
    """
    if ngrams == "bigram": 
        col_1 = data['tag']
        col_2 = col_1[1:col_1.shape[0]]
        ngram_count = list(zip(col_1, col_2))
        ngram_count = dict(Counter(ngram_count))
        
    if ngrams == 'unigram': 
            ngram_count = dict(Counter(data.tag))
    
    return ngram_count
    

In [20]:
bigram_counts = ngram_dict(train, "bigram")
unigram_counts = ngram_dict(train, "unigram")

In [27]:
unigram_counts

{'': 14900,
 '.': 14901,
 ':': 5,
 'CC': 2008,
 'CD': 3951,
 'DT': 8063,
 'EX': 432,
 'FW': 355,
 'HYPH': 539,
 'IN': 12696,
 'JJ': 7363,
 'JJR': 1508,
 'JJS': 321,
 'LS': 9,
 'MD': 4717,
 'NN': 21147,
 'NNP': 686,
 'NNS': 5570,
 'PDT': 116,
 'POS': 578,
 'PRP': 12334,
 'PRP$': 272,
 'RB': 5629,
 'RBR': 409,
 'RBS': 22,
 'RP': 941,
 'TO': 4790,
 'UH': 3964,
 'VB': 13727,
 'VBD': 527,
 'VBG': 946,
 'VBN': 391,
 'VBP': 5522,
 'VBZ': 2305,
 'WDT': 581,
 'WP': 748,
 'WRB': 1072}

In [7]:
# probabilities are computed as the following: 
# out of the times we see the first tag in a labeled corpus (denominator)
# how often is the first tag folowed by the second (numerator)

In [37]:
def compute_transition_matrix (tags, bigram_counts, unigram_counts):
    """
    Compute probabilities for the transition matrix (45 x 44)
    
    Arguments: 
        tags: POS tags (that may or may not appear in training data)
        bigram_counts: count of bigrams of POS tags in training data (used for numerator)
        unigram_counts: count of unigram POS tag in training data (used for denominator)
        
    Returns: 45 x 44 matrix of transition probabilities for all possible POS tags
    
    """

    transition = [] # list of transition probabilities 
    
    # first compute the starting probabilities 

    for x in tags: 
            pair = ('',x) # here the period denotes the start of a sentence. Not very confident about this
            print(pair)
            denominator = unigram_counts['']
            try: 
                 numerator = bigram_counts[pair] + 1 
            except:
                 numerator = 1
            transition.append(numerator / denominator)


    # then compute everything else 
    
    for x in tags:
        for y in tags:
            pair = (x,y)
            try:
                denominator = unigram_counts[x] + 1 
            except: 
                denominator = 1
            try: 
                numerator = bigram_counts[pair] + 1 
            except:
                numerator = 1 # this produces probabilities of 1, when they should be much loser to zero; not right. 
            transition.append(numerator / denominator)
   
    
    transition = np.array(transition)
    tran_matrix = transition.reshape(45,44)
    
    return tran_matrix


In [38]:
tran = compute_transition_matrix (tags, bigram_counts, unigram_counts)

('', 'CC')
('', 'CD')
('', 'DT')
('', 'EX')
('', 'FW')
('', 'IN')
('', 'JJ')
('', 'JJR')
('', 'JJS')
('', 'LS')
('', 'MD')
('', 'NN')
('', 'NNS')
('', 'NNP')
('', 'NNPS')
('', 'PDT')
('', 'POS')
('', 'PRP')
('', 'PRP$')
('', 'RB')
('', 'RBR')
('', 'RBS')
('', 'RP')
('', 'SYM')
('', 'TO')
('', 'UH')
('', 'VB')
('', 'VBD')
('', 'VBG')
('', 'VBN')
('', 'VBP')
('', 'VBZ')
('', 'WDT')
('', 'WP')
('', 'WP$')
('', 'WRB')
('', '$')
('', '#')
('', '"')
('', '(')
('', ')')
('', ',')
('', '.')
('', ':')


In [36]:
tran

array([[  1.67785235e-03,   1.41610738e-02,   4.86577181e-02, ...,
          6.71140940e-05,   1.34228188e-04,   6.71140940e-05],
       [  4.97760080e-04,   1.06022897e-01,   9.25833748e-02, ...,
          4.97760080e-04,   4.97760080e-04,   4.97760080e-04],
       [  4.88360324e-02,   7.21153846e-02,   2.53036437e-04, ...,
          2.53036437e-04,   1.08805668e-02,   2.53036437e-04],
       ..., 
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  6.71050866e-05,   6.71050866e-05,   6.71050866e-05, ...,
          6.71050866e-05,   6.71050866e-05,   6.71050866e-05],
       [  1.66666667e-01,   1.66666667e-01,   1.66666667e-01, ...,
          1.66666667e-01,   1.66666667e-01,   1.66666667e-01]])