### Celtic Mutations -- From Scratch
### Hidden Markov Model

In [14]:
train_file: str = 'data/train.tsv'
test_file: str = ''
val_split: float = 0.1

states: dict = {
    'S': 0,
    'U': 0,
    'T': 0,
    'H': 0,
    'N': 0
}

In [15]:
def load_data(file: str) -> list:
    print("Loading data from file {}...".format(file))
    file = open(file, 'r')
    data = []
    sentence = []
    for line in file:
        pieces = line.rstrip("\n").split("\t")
        if pieces[0] == '<S>':
            data.append(sentence)
            sentence = []
        else:
            sentence.append(pieces)
    print("Loaded {} sentences".format(len(data)))
    return data

In [16]:
train_data: list = load_data(train_file)
if len(test_file) > 0:
    test_data: list = load_data(test_file)
print("Splitting data...")
num_train_samples: int = int(len(train_data)*(1-val_split))
val_data: list = train_data[num_train_samples:]
print(len(val_data), " validation samples")
train_data: list = train_data[:num_train_samples]
print(len(train_data), " training samples")

Loading data from file data/train.tsv...
Loaded 395922 sentences
Loading data from file data/train.tsv...
Loaded 395922 sentences
Splitting data...
39593  validation samples
356329  training samples


In [17]:
def compute_probabilities_from_counts(counts_dict: dict) -> dict:
    counts_sum: int = sum(counts_dict.values())
    probabilities_dict: dict = {}
    for count_id in counts_dict:
        count = counts_dict[count_id]
        probabilities_dict[count_id] = count / counts_sum
    assert round(sum(probabilities_dict.values()), 2) == 1.0, "All probabilities should sum to 1 but got {}".format(round(sum(probabilities_dict.values()), 2))
    return probabilities_dict

In [18]:
def get_max_float_id_from_dict(float_dict: dict) -> str:
    max_value: float = -0.1
    max_id: float = None
    for dict_id in float_dict:
        if float_dict[dict_id] > max_value:
            max_value = float_dict[dict_id]
            max_id = dict_id
    return max_id


In [19]:
def generate_initial_state_probabilities(data: list) -> dict:
    initial_state_counts: dict = states.copy()
    for sentence in data:
        initial_state: str = sentence[0][1]
        if initial_state in initial_state_counts:
            initial_state_counts[initial_state] += 1
    initial_state_probabilities: dict = compute_probabilities_from_counts(initial_state_counts)
    return initial_state_probabilities

In [20]:
def generate_transition_state_probabilities(data: list) -> dict:
    # create a dictionary with two levels, the first being the previous state and the second being the current state
    transition_state_counts: dict = {state: states.copy() for state in states}
    for sentence in data:
        # since we enumerate over a list that excludes the first item, the enumeration index is one behind
        for prev_idx, word in enumerate(sentence[1:]):
            prev_state: str = sentence[prev_idx][1]
            current_state: str = word[1]
            if prev_state in transition_state_counts and current_state in transition_state_counts[prev_state]:
                transition_state_counts[prev_state][current_state] += 1
    transition_state_probabilities: dict = {state: {} for state in states}
    for prev_state in transition_state_counts:
        transition_state_probabilities[prev_state] = compute_probabilities_from_counts(transition_state_counts[prev_state])
    return transition_state_probabilities

In [21]:
def generate_emission_probabilities(data: list) -> dict:
    emission_counts_by_state: dict = {state: {} for state in states}
    for sentence in data:
        for word_state_pair in sentence:
            word, state = word_state_pair
            if state in emission_counts_by_state:
                # initialize word in state dict if the first occurrence of word X in state Y
                if word not in emission_counts_by_state[state]:
                    emission_counts_by_state[state][word] = 0
                emission_counts_by_state[state][word] += 1
    emission_probabilities_by_state: dict = {state: {} for state in states}
    for state in emission_counts_by_state:
        emission_probabilities_by_state[state] = compute_probabilities_from_counts(emission_counts_by_state[state])
    return emission_probabilities_by_state

In [22]:
class HMM:
    def __init__(self, data):
        self.initial_state_probabilities, self.transition_probabilities, self.emission_probabilities = self.fit(data)

    @staticmethod
    def fit(data: list) -> tuple:
        initial_state_probabilities = generate_initial_state_probabilities(data)
        transition_probabilities = generate_transition_state_probabilities(data)
        emission_probabilities = generate_emission_probabilities(data)
        return initial_state_probabilities, transition_probabilities, emission_probabilities

    def predict(self, sentence: str) -> dict:
        words: list = sentence.split(" ")
        state_sequence: list = []
        # begin each sentence using initial state probabilities, then switches to transition probabilities
        state_probabilities: dict = self.initial_state_probabilities.copy()
        for idx, word in enumerate(words):
            for state in state_probabilities:
                if word in self.emission_probabilities[state]:
                    state_probabilities[state] = state_probabilities[state]*self.emission_probabilities[state][word]
                else:
                    state_probabilities[state] = 0
            state_sequence.append(get_max_float_id_from_dict(state_probabilities))
            # for next word, initialize probabilities as transition probabilities from the previous state
            state_probabilities = self.transition_probabilities[state_sequence[idx]].copy()
        return state_sequence

    def evaluate(self, sentences: list, labels: list) -> float:
        total, correct = 0, 0
        for sentence_idx, sentence in enumerate(sentences):
            predicted_sequence: list = self.predict(sentence)
            total += len(predicted_sequence)
            correct += sum([int(predicted == labels[sentence_idx][tag_idx]) for tag_idx, predicted in enumerate(predicted_sequence)])
        return correct / total

In [23]:
model = HMM(train_data)

In [24]:
val_sentence = " ".join([word[0] for word in val_data[100]])
print("Sentence:", val_sentence)
print("Predicted Sequence:", model.predict(val_sentence))
print("Ground Truth Sequence:", [word[1] for word in val_data[100]])

Sentence: d'fhan sé ina tost ar feadh scaithimh eile sular labhair sé .
Predicted Sequence: ['N', 'N', 'N', 'S', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
Ground Truth Sequence: ['N', 'N', 'N', 'S', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']


In [25]:
val_sentence = " ".join([word[0] for word in val_data[1]])
print("Sentence:", val_sentence)
print("Predicted Sequence:", model.predict(val_sentence))
print("Ground Truth Sequence:", [word[1] for word in val_data[1]])

Sentence: cuairt ar an leabharlann
Predicted Sequence: ['N', 'N', 'N', 'N']
Ground Truth Sequence: ['N', 'N', 'N', 'N']


In [26]:
def format_data(data: list) -> tuple:
    sentences: list = []
    labels: list = []
    for sentence in val_data:
        sentences.append((" ".join(word_state_pair[0] for word_state_pair in sentence)))
        labels.append([word_state_pair[1] for word_state_pair in sentence])
    return sentences, labels

In [33]:
val_sentences, val_labels = format_data(val_data)
val_acc = model.evaluate(val_sentences, val_labels)
print("Validation Accuracy: ", str(round(val_acc*100, 2)) + "%")

[['socraíodh', 'N'], ['go', 'N'], ['raibh', 'N'], ['gá', 'N'], ['lena', 'N'], ['leithéid', 'N'], [',', 'N'], ['mar', 'N'], ['go', 'N'], ['bíonn', 'U'], ['na', 'N']]
Validation Accuracy:  89.81%


In [32]:
if "test_data" in globals():
    test_sentences, test_labels = format_data(test_data)
    test_acc = model.evaluate(test_sentences, test_labels)
    print("Testing Accuracy: ", str(round(test_acc*100, 2)) + "%")

Testing Accuracy:  89.81434392455854%


In [None]:
if "test_data" in globals():
    test_sentences, test_labels = format_data(test_data)
    test_acc = model.evaluate(test_sentences, test_labels)
    print("Testing Accuracy: ", str(round(test_acc*100, 2)) + "%")