In [94]:
# Allow multiple print statements in a cell in Jupyter Notebook
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [95]:
import numpy as np
import csv
import json
from copy import deepcopy

from typing import List, Dict


In [96]:
DATA_PATH = "./data"
OUTPUT_PATH = "./submission"
DATASET_FILES = {
    "ITALIAN": {
        "TRAIN": f"{DATA_PATH}/it_isdt_train_tagged.txt",
        "DEV_RAW": f"{DATA_PATH}/it_isdt_dev_raw.txt",
        "DEV_TAGGED": f"{DATA_PATH}/it_isdt_dev_tagged.txt",
    },
    "JAPANESE": {
        "TRAIN": f"{DATA_PATH}/ja_gsd_train_tagged.txt",
        "DEV_RAW": f"{DATA_PATH}/ja_gsd_dev_raw.txt",
        "DEV_TAGGED": f"{DATA_PATH}/ja_gsd_dev_tagged.txt",
    },
}
MODEL_FILE = f"{OUTPUT_PATH}/hmmmodel.txt"
OUTPUT_FILE = f"{OUTPUT_PATH}/hmmoutput.txt"

START_TAG = "<ST@RT$>"
END_TAG = "<6ND!>"

SMOOTHING_PARAMETER = 1.0
OPEN_CLASS_PRECENT = 1.0

In [97]:
# Italian Experiment
# EXPERIMENT_TRAIN_DOCUMENT = DATASET_FILES["ITALIAN"]["TRAIN"]
# EXPERIMENT_TEST_RAW_DOCUMENT = DATASET_FILES["ITALIAN"]["DEV_RAW"]
# EXPERIMENT_TEST_RAW_TAGGED_DOCUMENT = DATASET_FILES["ITALIAN"]["DEV_TAGGED"]


In [98]:
# Japanese Experiment
EXPERIMENT_TRAIN_DOCUMENT = DATASET_FILES["JAPANESE"]["TRAIN"]
EXPERIMENT_TEST_RAW_DOCUMENT = DATASET_FILES["JAPANESE"]["DEV_RAW"]
EXPERIMENT_TEST_RAW_TAGGED_DOCUMENT = DATASET_FILES["JAPANESE"]["DEV_TAGGED"]


In [99]:
def load_document(file_path: str):
    document = list()
    with open(file_path, mode="r") as file:
        csv_reader = csv.reader(file, delimiter=" ", skipinitialspace=True, quoting=csv.QUOTE_NONE)
        for sentence in csv_reader:
            document.append(sentence)
    return document


In [100]:
def write_model(
    out_file_path: str,
    words: List[str],
    tags: List[str],
    open_class_tags: List[str],
    tag_counts: Dict[str, int],
    transition_probabilities,
    transition_matrix_labels,
    emission_probabilities,
    emission_matrix_row_labels,
    emission_matrix_col_labels,
    smoothing_parameter,
):
    """Writes the model parameters to a txt file in JSON format

    Args:
        out_file_path (str): output model path
        words (List[str]): list of words
        tags (List[str]): list of tags
        open_class_tags (List[str]): list of open class tags
        tag_counts (Dict[str, int]): list of tag counts
        transition_probabilities (_type_): list of transition probabilities
        transition_matrix_labels (_type_): list of transition matric labels
        emission_probabilities (_type_): list of emission probabilities
        emission_matrix_row_labels (_type_): list of emission matrix row labels
        emission_matrix_col_labels (_type_): list of emission matrix column labels
        smoothing_parameter (_type_): smoothing parameter for laplace smoothing
    """
    with open(out_file_path, mode="w") as output_file:
        out = dict()
        out["tags"] = tags
        out["open_class_tags"] = open_class_tags
        out["words"] = words
        out["tag_counts"] = tag_counts
        out["smoothing_parameter"] = smoothing_parameter
        out["transition_probabilities"] = transition_probabilities.tolist()
        out["transition_matrix_labels"] = transition_matrix_labels
        out["emission_probabilities"] = emission_probabilities.tolist()
        out["emission_matrix_row_labels"] = emission_matrix_row_labels
        out["emission_matrix_col_labels"] = emission_matrix_col_labels
        json.dump(out, output_file, ensure_ascii=False)


In [101]:
def load_model(model_path: str):
    """Load the model file to respective objects
    """
    model_data = None
    with open(model_path, mode="r") as model_file:
        model_data = json.load(model_file)
    return (
        model_data["words"],
        model_data["tags"],
        model_data["open_class_tags"],
        model_data["tag_counts"],
        model_data["smoothing_parameter"],
        np.array(model_data["transition_probabilities"]),
        model_data["transition_matrix_labels"],
        np.array(model_data["emission_probabilities"]),
        model_data["emission_matrix_row_labels"],
        model_data["emission_matrix_col_labels"],
    )


In [102]:
# Load training document
train_document = load_document(EXPERIMENT_TRAIN_DOCUMENT)


In [103]:
def count_occurrences(train_document: List[List[str]]):
    tag_counts = {
        START_TAG: len(train_document),
    }
    word_tag_counts = {}
    tag_tag_counts = {
        START_TAG: {},
    }

    count = len(train_document)

    # Process count number of sentences from document
    for idx, sentence in enumerate(train_document):
        if idx == count:
            break

        prev_tag = START_TAG
        sentence_last_idx = len(sentence) - 1
        for idx, word_tag_pair in enumerate(sentence):
            # Extract word tag
            word, tag = word_tag_pair.rsplit("/", 1)

            # Count the Tag!
            if tag not in tag_counts:
                tag_counts[tag] = 1
            else:
                tag_counts[tag] += 1

            # Count the Word - Tag (Emission)
            if word not in word_tag_counts:
                word_tag_counts[word] = {tag: 1}
            else:
                # Check if the tag is in the dict
                if tag not in word_tag_counts[word]:
                    word_tag_counts[word][tag] = 1
                else:
                    word_tag_counts[word][tag] += 1

            # Count tag-tag (Transition)
            if prev_tag in tag_tag_counts:
                if tag not in tag_tag_counts[prev_tag]:
                    tag_tag_counts[prev_tag][tag] = 1
                else:
                    tag_tag_counts[prev_tag][tag] += 1
            else:
                tag_tag_counts[prev_tag] = {tag: 1}

            prev_tag = tag

    return (tag_counts, tag_tag_counts, word_tag_counts)


In [104]:
tag_counts, tag_tag_counts, word_tag_counts = count_occurrences(train_document)


In [105]:
words = list(word_tag_counts.keys())
tags = list(tag_counts.keys())


In [106]:
def calculate_probabilities(
    tags: List[str],
    words: List[str],
    tag_counts: Dict[str, int],
    tag_tag_counts: Dict[str, Dict[str, int]],
    word_tag_counts: Dict[str, Dict[str, int]],
    smoothing_parameter: float,
):
    # Create row and column headers for access
    # Transition Matric Labels (same for both row and column)
    transition_matrix_labels = {tag: i for i, tag in enumerate(tags)}
    transition_matrix_n_rows, transition_matrix_n_cols = len(transition_matrix_labels), len(transition_matrix_labels)

    # Emission Matrix Labels
    emission_col_labels = deepcopy(tags)
    emission_col_labels.remove(START_TAG)

    emission_matrix_n_rows, emission_matrix_n_cols = len(words), len(emission_col_labels)
    emission_matrix_row_labels = {word: i for i, word in enumerate(words)}
    emission_matrix_col_labels = {tag: i for i, tag in enumerate(emission_col_labels)}

    # Create empty transition and emission probability matrices
    transition_probabilities = np.zeros(shape=(transition_matrix_n_rows, transition_matrix_n_cols), dtype=np.float64)
    emission_probabilities = np.zeros(shape=(emission_matrix_n_rows, emission_matrix_n_cols), dtype=np.float64)

    # Fill in emission probablity matrix
    for row_word, row_idx in emission_matrix_row_labels.items():
        for col_tag, col_idx in emission_matrix_col_labels.items():
            if col_tag not in word_tag_counts[row_word]:
                emission_probabilities[row_idx][col_idx] = 0.0
            else:
                emission_probability = word_tag_counts[row_word][col_tag] / tag_counts[col_tag]

                if emission_probability > 1:
                    emission_probability = 1

                emission_probabilities[row_idx][col_idx] = emission_probability

    # Fill in transition probablity matrix
    for row_tag, row_idx in transition_matrix_labels.items():
        for col_tag, col_idx in transition_matrix_labels.items():
            if col_tag not in tag_tag_counts[row_tag]:
                transition_probabilities[row_idx][col_idx] = 0.0
            else:
                # Laplace Smoothing
                transition_probabilities[row_idx][col_idx] = (tag_tag_counts[row_tag][col_tag] + smoothing_parameter) / (
                    tag_counts[row_tag] + smoothing_parameter * len(tag_counts)
                )

    return (
        transition_probabilities,
        transition_matrix_labels,
        emission_probabilities,
        emission_matrix_row_labels,
        emission_matrix_col_labels,
    )


In [107]:
(
    transition_probabilities,
    transition_matrix_labels,
    emission_probabilities,
    emission_matrix_row_labels,
    emission_matrix_col_labels,
) = calculate_probabilities(tags, words, tag_counts, tag_tag_counts, word_tag_counts, SMOOTHING_PARAMETER)


In [109]:
def calculate_open_classes(
    emission_probabilities, tags, threshold: float = 0.2
):
    n_open_tags = int(threshold * len(tags))
    
    unqiue_counts = (emission_probabilities != 0).sum(axis=0)

    reverse_sorted_counts = unqiue_counts.argsort()[::-1]

    open_class_tags_idx = reverse_sorted_counts[:n_open_tags]

    open_class_tags = list(map(tags.__getitem__, open_class_tags_idx))

    return open_class_tags


In [110]:
open_class_tags = calculate_open_classes(emission_probabilities, tags, OPEN_CLASS_PRECENT)

In [111]:
# Save the model
write_model(
    MODEL_FILE,
    words,
    tags,
    open_class_tags,
    tag_counts,
    transition_probabilities,
    transition_matrix_labels,
    emission_probabilities,
    emission_matrix_row_labels,
    emission_matrix_col_labels,
    SMOOTHING_PARAMETER,
)


In [112]:
(
    words,
    tags,
    open_class_tags,
    tag_counts,
    SMOOTHING_PARAMETER,
    transition_probabilities,
    transition_matrix_labels,
    emission_probabilities,
    emission_matrix_row_labels,
    emission_matrix_col_labels,
) = load_model(MODEL_FILE)


In [113]:
def viterbi_decoding(
    tags,
    tag_counts,
    open_class_tags,
    emission_probabilities,
    emission_matrix_row_labels,
    emission_matrix_col_labels,
    transition_probabilities,
    transition_matrix_labels,
    sentence,
    smoothing_parameter,
):
    n_words_in_sentence = len(sentence)
    n_tags = len(tags)

    viterbi_matrix = np.zeros(shape=(n_tags, n_words_in_sentence), dtype=np.float64)
    backtrack_matrix = np.zeros(shape=(n_tags, n_words_in_sentence), dtype=np.int32)

    cumulative_probability = 0

    for idx, tag in enumerate(tags):
        # handle new word in corpus
        word = sentence[0]

        # Emission Probablity
        # approach: set emission probability = 1 i.e. use transision probability alone
        if word not in emission_matrix_row_labels:
            em_prob = 1.0

        elif word not in emission_matrix_row_labels or tag not in emission_matrix_col_labels:
            em_prob = 0.0

        else:
            em_prob = emission_probabilities[emission_matrix_row_labels[word]][emission_matrix_col_labels[tag]]

        # Transision Probability
        if START_TAG not in transition_matrix_labels or tag not in transition_matrix_labels:
            trans_prob = float(1 / (tag_counts[START_TAG] + n_tags))
        else:
            trans_prob = transition_probabilities[transition_matrix_labels[START_TAG]][transition_matrix_labels[tag]]

        viterbi_matrix[idx][0] = trans_prob * em_prob

        backtrack_matrix[idx][0] = 0

    for idx in range(1, n_words_in_sentence):

        word = sentence[idx]
        tags_to_consider = tags
        if word not in emission_matrix_row_labels:
            tags_to_consider = open_class_tags

        for end_tag in tags_to_consider:

            for start_tag in tags:

                # emission
                if word not in emission_matrix_row_labels:
                    em_prob = 1.0
                elif word not in emission_matrix_row_labels or end_tag not in emission_matrix_col_labels:
                    em_prob = 0.0
                else:
                    em_prob = emission_probabilities[emission_matrix_row_labels[word]][
                        emission_matrix_col_labels[end_tag]
                    ]
                    if em_prob == 0.0:
                        continue

                # set transition key of the beginning of sentence: tag1-tag2 (follow model format)
                if start_tag not in transition_matrix_labels or end_tag not in transition_matrix_labels:
                    trans_prob = 1 / (tag_counts[start_tag] + n_tags)
                else:
                    trans_prob = transition_probabilities[transition_matrix_labels[start_tag]][
                        transition_matrix_labels[end_tag]
                    ]
                    if trans_prob == 0:
                        continue

                cumulative_probability = (
                    viterbi_matrix[transition_matrix_labels[start_tag]][idx - 1] * trans_prob * em_prob
                )
                if cumulative_probability == 0:
                    continue

                if cumulative_probability > viterbi_matrix[transition_matrix_labels[end_tag]][idx]:
                    viterbi_matrix[transition_matrix_labels[end_tag]][idx] = cumulative_probability
                    backtrack_matrix[transition_matrix_labels[end_tag]][idx] = transition_matrix_labels[start_tag]
                else:
                    continue

    return (viterbi_matrix, backtrack_matrix)


In [114]:
def viterbi_backtrack(tags, viterbi_matrix, backtrack_matrix, sentence):
    n_tags = len(tags)
    n_words_in_sentence = len(sentence)

    # Backtracking
    best_idx = 0
    for i in range(n_tags):
        if viterbi_matrix[i][n_words_in_sentence - 1] > viterbi_matrix[best_idx][n_words_in_sentence - 1]:
            best_idx = i

    output = [f"{sentence[n_words_in_sentence - 1]}/{tags[best_idx]}"]

    for idx in range(n_words_in_sentence - 1, 0, -1):
        best_idx = backtrack_matrix[best_idx][idx]
        output.insert(0, f"{sentence[idx - 1]}/{tags[best_idx]}")

    return output


In [115]:
# Load development data
dev_raw_document = load_document(EXPERIMENT_TEST_RAW_DOCUMENT)
dev_raw_tagged_document = load_document(EXPERIMENT_TEST_RAW_TAGGED_DOCUMENT)


In [116]:
# Test Block

SAMPLE_IDX = 101
sample, sample_tagged = dev_raw_document[SAMPLE_IDX], dev_raw_tagged_document[SAMPLE_IDX]

viterbi_matrix, backtrack_matrix = viterbi_decoding(
    tags,
    tag_counts,
    open_class_tags,
    emission_probabilities,
    emission_matrix_row_labels,
    emission_matrix_col_labels,
    transition_probabilities,
    transition_matrix_labels,
    sample,
    SMOOTHING_PARAMETER,
)
output = viterbi_backtrack(tags, viterbi_matrix, backtrack_matrix, sample)
output, sample_tagged


(['われわれ/NP',
  'の/PN',
  '組織/NN',
  'が/PS',
  'つぶれれ/VV',
  'ば/PC',
  '良い/JJ',
  'と/PQ',
  '日共/NN',
  'や/PH',
  '権力/NN',
  'が/PS',
  '願望/NN',
  'を/PS',
  'もっ/VV',
  'て/PC',
  'も/PK',
  '今/NR',
  'の/PN',
  'ところ/NB',
  '手/NN',
  'を/PS',
  '下す/VV',
  '方法/NN',
  'が/PS',
  'ない/JJ',
  '。/SYM'],
 ['われわれ/NP',
  'の/PN',
  '組織/NN',
  'が/PS',
  'つぶれれ/VV',
  'ば/PC',
  '良い/JJ',
  'と/PC',
  '日共/NN',
  'や/PH',
  '権力/NN',
  'が/PS',
  '願望/NN',
  'を/PS',
  'もっ/VV',
  'て/PC',
  'も/PK',
  '今/NR',
  'の/PN',
  'ところ/NN',
  '手/XS',
  'を/PS',
  '下す/VV',
  '方法/NN',
  'が/PS',
  'ない/JJ',
  '。/SYM'])

In [117]:
predicted_tags = list()
for sentence in dev_raw_document:
    viterbi_matrix, backtrack_matrix = viterbi_decoding(
        tags,
        tag_counts,
        open_class_tags,
        emission_probabilities,
        emission_matrix_row_labels,
        emission_matrix_col_labels,
        transition_probabilities,
        transition_matrix_labels,
        sentence,
        SMOOTHING_PARAMETER,
    )
    output = viterbi_backtrack(tags, viterbi_matrix, backtrack_matrix, sentence)
    predicted_tags.append(output)


In [118]:
def accuracy(tagged_true: List[List[str]], tagged_preds: List[List[str]]):
    total_count, correct_count = 0, 0
    for sentence_true, sentence_pred in zip(tagged_true, tagged_preds):
        for word_tag_true, word_tag_pred in zip(sentence_true, sentence_pred):
            if word_tag_true == word_tag_pred:
                correct_count += 1
            total_count += 1
    return correct_count / total_count


In [119]:
accuracy(dev_raw_tagged_document, predicted_tags)


0.8932207814811591

In [120]:
def write_output(output_file_path: str, predicted_tags: str):
    with open(output_file_path, mode="w") as file:
        for predicted_row in predicted_tags:
            file.write(" ".join(predicted_row) + "\n")


In [121]:
write_output(OUTPUT_FILE, predicted_tags)


TODO: Use Open Class labels to handle unseen words
