### Celtic Mutations -- From Scratch
### Hidden Markov Model

In [32]:
train_file: str = 'data/train.tsv'
test_file: str = ''
val_split: float = 0.3

states: dict = {
    'S': 0,
    'U': 0,
    'T': 0,
    'H': 0,
    'N': 0
}

In [24]:
def load_data(file: str) -> list:
    print("Loading data from file {}...".format(file))
    file = open(file, 'r')
    data = []
    sentence = []
    for line in file:
        pieces = line.rstrip("\n").split("\t")
        if pieces[0] == '<S>':
            data.append(sentence)
            sentence = []
        else:
            sentence.append(pieces)
    print("Loaded {} sentences".format(len(data)))
    return data

In [25]:
train_data: list = load_data('data/train.tsv')
if len(test_file) <= 0:
    print("Splitting data...")
    num_train_samples: int = int(len(train_data)*(1-val_split))
    val_data: list = train_data[num_train_samples:]
    print(len(val_data), " validation samples")
    train_data: list = train_data[:num_train_samples]
    print(len(train_data), " training samples")

Loading data from file data/train.tsv...
Loaded 395922 sentences
Splitting data...
118777  validation samples
277145  training samples


In [127]:
def compute_probabilities_from_counts(counts_dict: dict) -> dict:
    counts_sum: int = sum(counts_dict.values())
    probabilities_dict: dict = {}
    for count_id in counts_dict:
        count = counts_dict[count_id]
        probabilities_dict[count_id] = count / counts_sum
    assert round(sum(probabilities_dict.values()), 2) == 1.0, "All probabilities should sum to 1 but got {}".format(round(sum(probabilities_dict.values()), 2))
    return probabilities_dict

In [138]:
def generate_initial_state_probabilities(data: list) -> dict:
    initial_state_counts: dict = states.copy()
    for sentence in data:
        initial_state: str = sentence[0][1]
        if initial_state in initial_state_counts:
            initial_state_counts[initial_state] += 1
    initial_state_probabilities: dict = compute_probabilities_from_counts(initial_state_counts)
    return initial_state_probabilities

In [139]:
def generate_transition_state_probabilities(data: list) -> dict:
    # create a dictionary with two levels, the first being the previous state and the second being the current state
    transition_state_counts: dict = {state: states.copy() for state in states}
    for sentence in data:
        # since we enumerate over a list that excludes the first item, the enumeration index is one behind
        for prev_idx, word in enumerate(sentence[1:]):
            prev_state: str = sentence[prev_idx][1]
            current_state: str = word[1]
            if prev_state in transition_state_counts and current_state in transition_state_counts[prev_state]:
                transition_state_counts[prev_state][current_state] += 1
    transition_state_probabilities: dict = {state: {} for state in states}
    for prev_state in transition_state_counts:
        transition_state_probabilities[prev_state] = compute_probabilities_from_counts(transition_state_counts[prev_state])
    return transition_state_probabilities

In [140]:
def generate_emission_probabilities(data: list) -> dict:
    emission_counts_by_state: dict = {state: {} for state in states}
    for sentence in data:
        for word_state_pair in sentence:
            word, state = word_state_pair
            if state in emission_counts_by_state:
                # initialize word in state dict if the first occurrence of word X in state Y
                if word not in emission_counts_by_state[state]:
                    emission_counts_by_state[state][word] = 0
                emission_counts_by_state[state][word] += 1
    emission_probabilities_by_state: dict = {state: {} for state in states}
    for state in emission_counts_by_state:
        emission_probabilities_by_state[state] = compute_probabilities_from_counts(emission_counts_by_state[state])
    return emission_probabilities_by_state

In [141]:
def fit(data: list) -> dict:
    initial_state_probabilities = generate_initial_state_probabilities(data)
    transition_probabilities = generate_transition_state_probabilities(data)
    emission_probabilities = generate_emission_probabilities(data)


In [142]:
fit(val_data)

{'tuas': 0.0017801452187695772, 'cártaí': 6.50437676088884e-05, 'bí': 0.04899507379044268, 'daoine': 0.004320275511706166, 'míchumas': 0.0004587297294521603, 'cáipéis': 0.00020882472758643119, 'seirbhísí': 0.0008010653484463098, 'beadh': 0.005990873332397616, 'baint': 0.005169267846811657, 'taobh': 0.0035774072184888623, 'béim': 0.00023963493329590463, 'meon': 0.00012324082283789382, 'cineálta': 1.0270068569824484e-05, 'mothaigh': 0.000338912262804208, 'comh': 0.012882089342749845, 'conaic': 0.0010988973369712197, 'cur': 0.02334044250302111, 'tarraing': 0.00046557644183204327, 'foghlaim': 0.001037276925552273, 'déanamh': 0.020834545771983938, 'toil': 0.0008661091160551982, 'faigheann': 0.0007976419922563683, 'ceart': 0.004422976197404412, 'cóir': 0.004381895923125113, 'tuiscint': 0.0007531383617871289, 'cosaint': 0.0015370869292837311, 'ceiltibéirigh': 3.423356189941495e-06, 'guí': 5.1350342849122425e-05, 'cisteáin': 1.369342475976598e-05, 'slán': 3.4233561899414946e-05, 'cun': 0.04187