In [19]:
import math

import numpy as np
import pandas as pd

Penn Treebank tag encodings: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [3]:
# Part-of-speech tags used throughout this example (Penn Treebank codes).
tags = ["RB", "NN", "TO"]

In [4]:
# Counts keyed by a (tag, tag) pair. When the matrix is filled later, the
# first element of the key selects the row and the second the column.
# Note: values are the same as the ones in the assignment.
transition_counts = {
    ("NN", "NN"): 16241,
    ("RB", "RB"): 2263,
    ("TO", "TO"): 2,
    ("NN", "TO"): 5256,
    ("RB", "TO"): 855,
    ("TO", "NN"): 734,
    ("NN", "RB"): 2431,
    ("RB", "NN"): 358,
    ("TO", "RB"): 200,
}

In [5]:
# Allocate a square, zero-filled matrix with one row and one column per tag.
n_tags = len(tags)
transition_matrix = np.zeros(shape=(n_tags, n_tags))
transition_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [6]:
# Confirm the matrix dimensions.
np.shape(transition_matrix)

(3, 3)

In [7]:
# Sorted copy of the tags; this ordering labels the matrix rows and columns.
# The original `tags` list is left untouched.
sorted_tags = list(tags)
sorted_tags.sort()
sorted_tags

['NN', 'RB', 'TO']

In [31]:
# Copy each (row tag, column tag) count into the matrix. Rows and columns
# follow the order of sorted_tags.
for i, row_tag in enumerate(sorted_tags):
    for j, col_tag in enumerate(sorted_tags):
        # Default to 0 so a tag pair that is absent from transition_counts is
        # stored as a zero count instead of None.
        transition_matrix[i, j] = transition_counts.get((row_tag, col_tag), 0)
transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [32]:
def print_matrix(matrix, labels=None):
    """Print a square matrix as a table with labelled rows and columns.

    Args:
        matrix: 2-D array-like; the same labels are applied to its rows
            and to its columns.
        labels: Sequence of row/column labels. When omitted, the
            notebook-level ``sorted_tags`` list is used, which matches the
            original single-argument behaviour.

    Returns:
        None. The table is written to standard output.
    """
    if labels is None:
        # Fall back to the notebook's global tag ordering.
        labels = sorted_tags
    print(pd.DataFrame(matrix, index=labels, columns=labels))

In [33]:
# Show the raw counts as a labelled table.
print_matrix(matrix=transition_matrix)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [34]:
# Scale: divide every entry by 10.
# NOTE: this rebinds transition_matrix, so re-running the cell divides again.
transition_matrix = np.divide(transition_matrix, 10)
print_matrix(transition_matrix)

        NN     RB     TO
NN  1624.1  243.1  525.6
RB    35.8  226.3   85.5
TO    73.4   20.0    0.2


In [35]:
# Per-row totals. keepdims=True keeps a trailing axis of length 1 so the
# result broadcasts against the matrix row by row.
rows_sum = np.sum(transition_matrix, axis=1, keepdims=True)
rows_sum

array([[2392.8],
       [ 347.6],
       [  93.6]])

In [36]:
# Normalize: divide each row by its own total (broadcast over the columns).
transition_matrix = np.divide(transition_matrix, rows_sum)
print_matrix(transition_matrix)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [37]:
# Recompute the per-row totals of the normalized matrix as a sanity check.
np.sum(transition_matrix, axis=1, keepdims=True)

array([[1.],
       [1.],
       [1.]])

In [38]:
# Two independent copies of the normalized matrix: one is modified with an
# explicit loop, the other with vectorized NumPy calls.
t_matrix_for = transition_matrix.copy()
t_matrix_np = transition_matrix.copy()

In [39]:
# Loop version: add log(row total) to each diagonal entry.
for i in range(n_tags):
    # rows_sum has shape (n_tags, 1). Index the scalar explicitly, because
    # handing math.log a one-element array relies on an implicit
    # array-to-scalar conversion that NumPy has deprecated.
    t_matrix_for[i, i] = t_matrix_for[i, i] + math.log(rows_sum[i, 0])
print_matrix(t_matrix_for)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


In [40]:
# Pull out the main diagonal of the matrix as a 1-D array.
d = t_matrix_np.diagonal()
d.shape

(3,)

In [41]:
# Turn the diagonal into a column vector so it lines up with rows_sum.
# -1 lets NumPy infer the row count instead of hard-coding 3, so the cell
# keeps working if the number of tags changes.
d = np.reshape(d, (-1, 1))
d.shape

(3, 1)

In [42]:
# Vectorized counterpart of the loop version: add log(row total) to the
# diagonal column vector, then write the result back onto the diagonal.
# NOTE: d is rebound here, so re-running only this cell adds the log term again.
d = np.add(d, np.vectorize(math.log)(rows_sum))
np.fill_diagonal(t_matrix_np, d)
print_matrix(t_matrix_np)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


In [43]:
# Elementwise comparison of the loop and vectorized results. np.isclose
# tolerates floating-point rounding differences that exact == would flag.
np.isclose(t_matrix_for, t_matrix_np)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])