In [1]:
# Display the value of every top-level expression in a cell, not just the last one.
# (This affects automatic result display; explicit print() calls always produce output.)
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import csv

In [3]:
# Root directory holding the corpus files.
DATA_PATH = "./data"

# Per-language corpus files. Every language follows the same naming scheme,
# "<prefix>_train_tagged.txt", "<prefix>_dev_raw.txt", "<prefix>_dev_tagged.txt",
# so the mapping is generated from the file-name prefix of each language.
DATASET_FILES = {
    language: {
        "TRAIN": f"{DATA_PATH}/{prefix}_train_tagged.txt",
        "DEV_RAW": f"{DATA_PATH}/{prefix}_dev_raw.txt",
        "DEV_TAGGED": f"{DATA_PATH}/{prefix}_dev_tagged.txt",
    }
    for language, prefix in (("ITALIAN", "it_isdt"), ("JAPANESE", "ja_gsd"))
}

# File name for the model.
MODEL_FILE = "hmmmodel.txt"


In [4]:
# Pseudo-tags for the sentence boundaries: START_TAG is the state before the
# first word of a sentence, END_TAG the state after its last word.
START_TAG, END_TAG = "<START>", "<END>"

In [5]:
# Training corpus: one entry per sentence, each a list of "word/tag" tokens.
train_document = []

In [6]:
# Read the tagged training corpus: one sentence per line, tokens separated by spaces.
# The list is rebuilt from scratch (not appended to), so re-running this cell cannot
# duplicate sentences and silently inflate every count derived from them.
# NOTE(review): the corpus files are assumed to be UTF-8 — confirm against the data source.
# An explicit encoding keeps the read independent of the platform's default codec;
# newline="" is what the csv module expects from files it is given.
with open(DATASET_FILES["ITALIAN"]["TRAIN"], mode="r", encoding="utf-8", newline="") as file:
    # QUOTE_NONE: quote characters are ordinary token text, never field delimiters.
    csv_reader = csv.reader(file, delimiter=" ", skipinitialspace=True, quoting=csv.QUOTE_NONE)
    train_document = list(csv_reader)

In [7]:
# tag -> number of occurrences; START_TAG occurs once per training sentence.
tag_count_dict = {START_TAG: len(train_document)}
# word -> {tag -> number of times the word carried that tag}
word_tag_count_dict = dict()
# previous tag -> {next tag -> number of times that transition was seen}
tag_tag_count_dict = {START_TAG: dict()}

In [8]:
count = len(train_document)
# Accumulate tag, word/tag and tag->tag counts from the first `count` training
# sentences (lower `count` to experiment on a subset of the corpus).
for sentence in train_document[:count]:
    prev_tag = START_TAG
    for word_tag_pair in sentence:
        # Split on the LAST "/" only, so a "/" inside the word itself is preserved.
        word, tag = word_tag_pair.rsplit("/", 1)

        # Count the Tag!
        tag_count_dict[tag] = tag_count_dict.get(tag, 0) + 1

        # Count the Word!
        tags_for_word = word_tag_count_dict.setdefault(word, {})
        tags_for_word[tag] = tags_for_word.get(tag, 0) + 1

        # Count tag-tag
        next_tags = tag_tag_count_dict.setdefault(prev_tag, {})
        next_tags[tag] = next_tags.get(tag, 0) + 1

        prev_tag = tag

    # Close the sentence with exactly ONE transition from its final tag into END_TAG.
    # (Creating the entry with a count of 1 and then incrementing it again would
    # count the first sentence-final occurrence of a tag twice.)
    # An empty sentence has no final tag and therefore contributes no END transition.
    if sentence:
        final_tags = tag_tag_count_dict.setdefault(prev_tag, {})
        final_tags[END_TAG] = final_tags.get(END_TAG, 0) + 1


In [9]:
# Inspect the three count tables: tag totals, tag -> tag transitions, word -> tag counts.
tag_count_dict, tag_tag_count_dict, word_tag_count_dict

({'<START>': 13121,
  'SP': 13670,
  'FS': 11925,
  'S': 54975,
  'E': 41893,
  'RD': 35237,
  'A': 17349,
  'FC': 1241,
  'RI': 4284,
  'VA': 5923,
  'VM': 1758,
  'V': 26560,
  'PR': 2879,
  'FB': 5436,
  'PI': 949,
  'FF': 12692,
  'CC': 7549,
  'N': 4795,
  'PC': 4333,
  'AP': 1701,
  'NO': 976,
  'B': 8701,
  'I': 62,
  'T': 379,
  'PE': 647,
  'CS': 2833,
  'DI': 1502,
  'BN': 1827,
  'PD': 770,
  'DD': 1014,
  'DE': 4,
  'DQ': 875,
  'PQ': 823,
  'SW': 213,
  'PP': 29,
  'DR': 39,
  'SYM': 88,
  'X': 63,
  'PART': 24,
  'Sw': 1},
 {'<START>': {'SP': 566,
   'S': 780,
   'A': 103,
   'RI': 206,
   'V': 908,
   'RD': 2720,
   'VA': 165,
   'FB': 530,
   'E': 2096,
   'PI': 97,
   'B': 1316,
   'PC': 266,
   'CS': 418,
   'N': 378,
   'BN': 164,
   'T': 28,
   'PR': 37,
   'CC': 498,
   'PE': 88,
   'I': 24,
   'DD': 145,
   'PD': 85,
   'DI': 175,
   'PQ': 634,
   'SW': 5,
   'DQ': 512,
   'VM': 84,
   'DE': 1,
   'NO': 33,
   'X': 9,
   'FF': 45,
   'AP': 5},
  'SP': {'FS': 2465,

In [10]:
# Vocabulary and tag inventory, in first-seen order (dicts keep insertion order).
# `tags` is taken from the transition table, so it includes START_TAG.
words = [*word_tag_count_dict]
tags = [*tag_tag_count_dict]

In [11]:
# Create row and column headers for access
from copy import deepcopy

# Transition matrix: rows are the "from" tags (START_TAG included) ...
tags_row = deepcopy(tags)
# ... columns are the "to" tags: every real tag in the same order, then END_TAG
# (nothing ever transitions INTO START_TAG, so it is left out).
tags_col = [tag for tag in tags_row if tag != START_TAG] + [END_TAG]

# Emission matrix: rows are words, columns are the real tags (START_TAG emits no word).
words_row = words
words_tags_col = [tag for tag in tags if tag != START_TAG]

In [12]:
# Vocabulary size and number of tags (the tag list includes START_TAG).
n_words, n_tags = len(words), len(tags)

n_words, n_tags

(28307, 40)

In [13]:
# Create empty (all-zero) transition and emission probability matrices.

# Transition: n_tags rows x n_tags columns.
transition_probability_matrix_dims = (n_tags, n_tags)
transition_probability_matrix = np.zeros(transition_probability_matrix_dims, dtype=np.float64)

# Emission: one row per word, one column per tag except START_TAG (hence n_tags - 1).
emission_probability_matrix_dims = (n_words, n_tags - 1)
emission_probability_matrix = np.zeros(emission_probability_matrix_dims, dtype=np.float64)

In [14]:
# Fill in emission probability matrix:
#   cell[word, tag] = count(word tagged as tag) / count(tag)
for row_idx, row_word in enumerate(words):
    tag_counts_for_word = word_tag_count_dict[row_word]
    for col_idx, col_tag in enumerate(words_tags_col):
        observed = tag_counts_for_word.get(col_tag)
        # A word never seen with this tag gets probability zero.
        emission_probability_matrix[row_idx, col_idx] = (
            0.0 if observed is None else observed / tag_count_dict[col_tag]
        )

In [15]:
# Inspect the filled emission matrix (rows: words, columns: tags without START_TAG).
emission_probability_matrix

array([[2.19458669e-04, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.16603774e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.81900864e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.81900864e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.31528895e-05, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.31528895e-05, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [16]:
# Inspect how often each tag opens a sentence (transitions out of START_TAG).
# Uses the START_TAG constant rather than repeating its literal value.
tag_tag_count_dict[START_TAG]

{'SP': 566,
 'S': 780,
 'A': 103,
 'RI': 206,
 'V': 908,
 'RD': 2720,
 'VA': 165,
 'FB': 530,
 'E': 2096,
 'PI': 97,
 'B': 1316,
 'PC': 266,
 'CS': 418,
 'N': 378,
 'BN': 164,
 'T': 28,
 'PR': 37,
 'CC': 498,
 'PE': 88,
 'I': 24,
 'DD': 145,
 'PD': 85,
 'DI': 175,
 'PQ': 634,
 'SW': 5,
 'DQ': 512,
 'VM': 84,
 'DE': 1,
 'NO': 33,
 'X': 9,
 'FF': 45,
 'AP': 5}

In [17]:
# Fill in transition probability matrix:
#   cell[from_tag, to_tag] = count(from_tag followed by to_tag) / count(from_tag)
for row_idx, row_tag in enumerate(tags_row):
    followers = tag_tag_count_dict[row_tag]
    for col_idx, col_tag in enumerate(tags_col):
        observed = followers.get(col_tag)
        # A transition never seen in training gets probability zero.
        transition_probability_matrix[row_idx, col_idx] = (
            0.0 if observed is None else observed / tag_count_dict[row_tag]
        )

In [18]:
# Inspect the filled transition matrix (rows: tags_row, columns: tags_col).
transition_probability_matrix

array([[4.31369560e-02, 0.00000000e+00, 5.94466885e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.28529627e-01, 1.80321873e-01, 6.80321873e-03, ...,
        1.60936357e-03, 0.00000000e+00, 9.50987564e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.97400419e-01],
       ...,
       [3.17460317e-02, 9.52380952e-02, 1.58730159e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.08333333e-01, 8.33333333e-02, 4.16666667e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [19]:
# Load development data: the raw (untagged) sentences and their tagged counterparts.
# Both files hold one sentence per line with space-separated tokens.
# NOTE(review): the corpus files are assumed to be UTF-8 — confirm against the data source.
# An explicit encoding keeps the read independent of the platform's default codec;
# newline="" is what the csv module expects from files it is given.
with open(DATASET_FILES["ITALIAN"]["DEV_RAW"], mode="r", encoding="utf-8", newline="") as raw_file, \
        open(DATASET_FILES["ITALIAN"]["DEV_TAGGED"], mode="r", encoding="utf-8", newline="") as raw_tagged_file:

    # QUOTE_NONE: quote characters are ordinary token text, never field delimiters.
    raw_csv_reader = csv.reader(raw_file, delimiter=" ", skipinitialspace=True, quoting=csv.QUOTE_NONE)
    dev_raw_document = list(raw_csv_reader)

    raw_tagged_csv_reader = csv.reader(raw_tagged_file, delimiter=" ", skipinitialspace=True, quoting=csv.QUOTE_NONE)
    dev_raw_tagged_document = list(raw_tagged_csv_reader)

In [20]:
# Take the first development sentence in both its raw and its tagged form.
sample = dev_raw_document[0]
sample_tagged = dev_raw_tagged_document[0]

sample, sample_tagged

(['Corriere', 'Sport', 'da', 'pagina', '23', 'a', 'pagina', '26'],
 ['Corriere/SP',
  'Sport/SP',
  'da/E',
  'pagina/S',
  '23/N',
  'a/E',
  'pagina/S',
  '26/N'])