<a href="https://colab.research.google.com/github/dsogden/NLP-Specialization/blob/main/Chap2_W2_Working_with_Tags.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Tag labels used throughout the notebook (presumably part-of-speech tags).
# They are left unsorted here; a later cell builds a sorted copy.
tags = [
    'RB',
    'NN',
    'TO',
]

In [2]:
# Counts keyed by an ordered pair of tags. When the matrix is filled in a
# later cell, the first tag of the pair selects the row and the second tag
# selects the column.
transition_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200,
}

### Using NumPy for matrix creation

In [3]:
# Start from a square, all-zero float matrix with one row and one column per tag.
num_tags = len(tags)
transition_matrix = np.zeros(shape=(num_tags, num_tags))
transition_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [4]:
# Sort the tags so that row/column positions follow a fixed alphabetical order.
sorted_tags = sorted(tags)
print(f'Unsorted: {tags}')
print(f'Sorted: {sorted_tags}')

# Entry (i, j) receives the count stored for (sorted_tags[i], sorted_tags[j]).
for i, row_tag in enumerate(sorted_tags):
    for j, col_tag in enumerate(sorted_tags):
        tag_tuple = (row_tag, col_tag)
        # Default to 0: a pair missing from the dictionary must be stored as
        # a zero count. Without the default, .get() returns None, which is
        # not a valid count for the float matrix.
        transition_matrix[i, j] = transition_counts.get(tag_tuple, 0)
transition_matrix

Unsorted: ['RB', 'NN', 'TO']
Sorted: ['NN', 'RB', 'TO']


array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [5]:
def print_matrix(matrix, indices, columns):
    """Print ``matrix`` as a labelled table.

    The data is wrapped in a ``pandas.DataFrame`` and its text
    representation is written to standard output.

    Args:
        matrix: Data passed to ``pd.DataFrame`` as the table contents.
        indices: Row labels, passed as the DataFrame ``index``.
        columns: Column labels, passed as the DataFrame ``columns``.

    Returns:
        None.
    """
    labelled = pd.DataFrame(data=matrix, index=indices, columns=columns)
    print(labelled)
# Rows and columns share the same sorted tag labels.
print_matrix(matrix=transition_matrix, indices=sorted_tags, columns=sorted_tags)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [6]:
# Rescale the matrix by dividing every entry by 10.
# NOTE: this rebinds `transition_matrix`, so re-running the cell divides again.
transition_matrix = np.divide(transition_matrix, 10)
print_matrix(matrix=transition_matrix, indices=sorted_tags, columns=sorted_tags)

        NN     RB     TO
NN  1624.1  243.1  525.6
RB    35.8  226.3   85.5
TO    73.4   20.0    0.2


In [7]:
# Per-row totals. keepdims=True keeps the result as a single column
# (shape: rows x 1) so it can later divide the matrix row by row.
rows_sum = np.sum(transition_matrix, axis=1, keepdims=True)
rows_sum

array([[2392.8],
       [ 347.6],
       [  93.6]])

In [8]:
# Divide each row by its own total; the column-shaped `rows_sum` is
# broadcast across the columns, so every row ends up summing to 1.
# NOTE: this rebinds `transition_matrix`; re-running the cell divides again.
transition_matrix = np.divide(transition_matrix, rows_sum)
print_matrix(matrix=transition_matrix, indices=sorted_tags, columns=sorted_tags)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [9]:
# Sanity check: display the row totals of the normalized matrix.
np.sum(transition_matrix, axis=1, keepdims=True)

array([[1.],
       [1.],
       [1.]])

In [10]:
import math

# Two independent copies of the normalized matrix: one is updated here with
# an explicit loop, the other is updated in later cells with array operations.
t_matrix_for = np.copy(transition_matrix)
t_matrix_np = np.copy(transition_matrix)

# Add log(row total) to each diagonal entry, one row at a time.
for i in range(num_tags):
    # `rows_sum` has one column (it was built with keepdims=True), so
    # `rows_sum[i]` is a 1-element array, not a scalar. Passing that to
    # math.log relies on an implicit array-to-scalar conversion that NumPy
    # has deprecated; index the single column explicitly instead.
    t_matrix_for[i, i] = t_matrix_for[i, i] + math.log(rows_sum[i, 0])
print_matrix(t_matrix_for, sorted_tags, sorted_tags)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


  t_matrix_for[i, i] = t_matrix_for[i, i] + math.log(rows_sum[i])


In [11]:
# Pull the main diagonal out of the matrix as a one-dimensional array.
d = np.diagonal(t_matrix_np)
d.shape

(3,)

In [12]:
# Turn the diagonal into a single column so it lines up with `rows_sum`.
# -1 lets NumPy infer the number of rows, so this keeps working if the
# number of tags changes (the previous hard-coded 3 would not).
d = np.reshape(d, (-1, 1))
d.shape

(3, 1)

In [13]:
# Add log(row total) to every diagonal entry in one array operation.
# np.log is NumPy's native element-wise logarithm; it replaces
# np.vectorize(math.log), which only wraps a Python-level loop.
# NOTE: this rebinds `d`, so re-running the cell adds the logarithm again.
d = d + np.log(rows_sum)
# fill_diagonal writes the (flattened) values of `d` along the main diagonal.
np.fill_diagonal(t_matrix_np, d)
print_matrix(t_matrix_np, sorted_tags, sorted_tags)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167
