## Data preparation
In this step we prepare the data so that it can be used to train the parameters of our HMM.  
As a first attempt, we use only the x and y positions as features.

In [7]:
from conversion_utils import *

def getCombinedStrokeSequencesFromDataFrames(paths, features):
    """Build one flat list of stroke sequences from several datasets.

    For every entry of ``paths`` the data is loaded with
    ``getDataFrameFromPath``, passed through
    ``normalizeDataframeColumnsMinMax`` and ``convertToNumpySequences`` for the
    given ``features``, and finally filtered with
    ``getSequencesWhereLengthBiggerThan(..., 10)``. These helpers are not
    defined in this notebook (presumably they come from ``conversion_utils``).

    Parameters
    ----------
    paths : iterable
        Dataset locations, each handed to ``getDataFrameFromPath``.
    features : list
        Column names forwarded to the normalization and conversion helpers.

    Returns
    -------
    list
        The kept sequences of all datasets, concatenated in the order of ``paths``.
    """
    combined = []

    for source in paths:
        frame = getDataFrameFromPath(source)
        normalized = normalizeDataframeColumnsMinMax(frame, features)
        sequences = convertToNumpySequences(normalized, features)

        # Only the "bigger than 10" filter is active; other length filters
        # from the same helper family are not applied.
        kept = getSequencesWhereLengthBiggerThan(sequences, 10)

        # Plain list concatenation: the filter is expected to return a list.
        combined = combined + kept

    return combined


In [8]:
import numpy as np
from pomegranate import *

def trainModel(dataset, features = ['x_position', 'y_position'], max_states = 15):
    """Train HMMs with different state counts on one dataset and return the best.

    The stroke sequences of ``dataset`` are split into train / test / validate
    parts. For every state count in ``range(2, max_states)`` a model is trained
    on the train part and scored on the test part; the model with the highest
    score is kept.

    Parameters
    ----------
    dataset : str
        Path of the dataset to load.
    features : list
        Feature columns used for the sequences. The default list is only read,
        never mutated, by this function.
    max_states : int
        Exclusive upper bound for the number of hidden states, i.e. the largest
        model trained has ``max_states - 1`` states.

    Returns
    -------
    tuple
        ``(dataset, best_model, validate)``. ``best_model`` is ``None`` when no
        state count produced a usable model (including ``max_states <= 2``).
    """
    strokes = getCombinedStrokeSequencesFromDataFrames([dataset], features)
    train, test, validate = generateTrainTestValidateSplit(strokes)

    print(f'Training best model for dataset {dataset}', end = ' - ')
    print(f'Training set size: {len(train)} | Test set size: {len(test)}')

    best_states = 0
    best_model = None
    max_prob = float('-inf')
    for states in range(2, max_states):
        _improvement, model, success = trainModelWithStates(train, states)

        if not success:
            continue

        # Score: sum over the test sequences of the length-normalized log-probability.
        log_prob = sum(model.log_probability(sequence) / len(sequence) for sequence in test)
        print(f'Trained model with {states} states | Log-probability on test set: {log_prob}')

        # A NaN score compares False here, so such a model is never selected.
        if log_prob > max_prob:
            best_states = states
            best_model = model
            max_prob = log_prob

    print(f'Finished training model for dataset {dataset} | Best performing: {best_states} states | Log-prob: {max_prob}')

    # Return the best-scoring model, not whichever model was trained last.
    return dataset, best_model, validate

def trainModelWithStates(train, states, number_of_tries = 10):
    """Fit an HMM with ``states`` components several times and keep one fit.

    ``HiddenMarkovModel.from_samples`` is called ``number_of_tries`` times with
    Baum-Welch training. Fits whose final ``total_improvement`` entry is NaN
    are discarded; among the rest, the fit with the largest final
    ``total_improvement`` is kept. Per-sequence weights are not passed to
    ``from_samples``.

    Parameters
    ----------
    train : list
        Training sequences, passed as ``X`` to ``from_samples``.
    states : int
        Number of components (hidden states) of the model.
    number_of_tries : int
        How many independent fits are attempted.

    Returns
    -------
    list
        ``[improvement, model, success]``. When every try ended in NaN the
        result is ``[float('-inf'), np.nan, False]``.
    """
    best_improvement = float('-inf')
    best_model = np.nan
    success = False

    for _ in range(number_of_tries):
        model, history = HiddenMarkovModel.from_samples(MultivariateGaussianDistribution,
                n_components = states,
                X = train,
                algorithm = 'baum-welch',
                n_init = 10,
                return_history = True,
                verbose = False)

        improvement = history.total_improvement[-1]
        if np.isnan(improvement):
            # Training did not produce a usable result in this try.
            continue

        # TODO: Improvement says nothing about quality of the model
        if improvement > best_improvement:
            best_improvement = improvement
            best_model = model
            success = True

    return [best_improvement, best_model, success]

In [9]:
# Train one model per dataset, then score every model against the validation
# sequences of every dataset (including its own).
datasets = [
    'data/train/cua-db.db',
    'data/test1/cua-db.db',
    'data/test2/cua-db.db',
    'data/test3/cua-db.db',
    'data/test4/cua-db.db',
    'data/test5/cua-db.db',
]

models = []
# NOTE: trainModel iterates range(2, max_states), so the largest model that is
# actually trained has max_states - 1 states.
max_states = 10

print('------------------------------------------------')
print(f'Training model with up to {max_states} states!')
print('------------------------------------------------')

# Each entry of `models` is [dataset, model, validation sequences].
for dataset in datasets:
    dataset, model, validate = trainModel(dataset, features =  ['x_position', 'y_position'], max_states = max_states)
    models.append([dataset, model, validate])

print('------------------------------------------------')
print('Now verifying model with validation set!')
print('------------------------------------------------')

for trained in models:
    dataset = trained[0]
    model = trained[1]

    print(f'Verifying {dataset}')

    # Score this model on the validation sequences of every dataset.
    for candidate in models:
        dataset_validate = candidate[0]
        validate = candidate[2]
        log_prob = sum(model.log_probability(sequence) / len(sequence) for sequence in validate)
        print(f'Log-Probability for {dataset_validate}\t=\t{log_prob}')

    print('----------------------------------')



------------------------------------------------
Training model with up to 10 states!
------------------------------------------------
Training best model for dataset data/dominik/cua-db.db - Training set size: 823 | Test set size: 235
Trained model with 2 states | Log-probability on test set: 1078.9897544749379
Trained model with 3 states | Log-probability on test set: 1182.2445626084066
Trained model with 4 states | Log-probability on test set: 1293.3782209118413
Trained model with 5 states | Log-probability on test set: 1316.9269449008189
Trained model with 6 states | Log-probability on test set: 1351.6059304491635
Trained model with 7 states | Log-probability on test set: 1352.7796159418083
Trained model with 8 states | Log-probability on test set: 1368.7977072949038
Trained model with 9 states | Log-probability on test set: 1419.2854120706927
Finished training model for dataset data/dominik/cua-db.db | Best performing: 9 states | Log-prob: 1419.2854120706927
Training best model fo


### Checking why some log_probability calls return NaN
e.g. when using [data/sabrina/cua-db.db]: Trained model with 7 states | Log-probability on test set: nan

Solution: This does not depend on the data that is used: with the same data, training sometimes works just fine and sometimes fails. The failures occur at random, so the cause might be a bug in the library, an unlucky initialization, or simply an implementation detail that means a solution is not found every time.

In [72]:
from pomegranate import *
import numpy as np

# Fit the same 7-state HMM ten times on one dataset and print the validation
# score of every fit, to see how often the score comes out as NaN.
strokes = getCombinedStrokeSequencesFromDataFrames(['data/test5/cua-db.db'], features =  ['x_position', 'y_position'])
train, test, validate = generateTrainTestValidateSplit(strokes)
states = 7

print(f'Train len: {len(train)} | Test len: {len(test)} | Validate len: {len(validate)}')

# One weight per training sequence: its length.
weights = [len(seq) for seq in train]

for attempt in range(10):
    model, history = HiddenMarkovModel.from_samples(
        MultivariateGaussianDistribution,
        n_components = states,
        X = train,
        weights = weights,
        algorithm = 'baum-welch',
        use_pseudocount = False,
        n_init = 10,
        return_history = True,
        verbose = False,
    )

    # Sum of the length-normalized log-probabilities over the validation set.
    per_sequence = [model.log_probability(sequence) / len(sequence) for sequence in validate]
    log_prob = sum(per_sequence)
    print(log_prob)

    # A failed fit could alternatively be detected by testing the last entry
    # of history.total_improvement with np.isnan (check currently disabled).


Train len: 77 | Test len: 22 | Validate len: 12
-117.5650259480935
-118.66991525099479
nan
-118.53140852129953
-118.16725629380666
-112.7903271858375
-118.66991523282542
-118.66991415338306
-112.70555901730881
-112.54481101739562
