In [1]:
%matplotlib inline

In [11]:
import pandas
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import sklearn
import sklearn.linear_model
from datetime import datetime, date
from sklearn_pandas import DataFrameMapper
import warnings
import timeit
from collections import defaultdict, OrderedDict
import tabulate
import time
import GPy
import glob
from functools import reduce
from IPython.display import display

timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

matplotlib.style.use('ggplot')

RANDOM_SEED = 33

## Markov Models
The code below trains a simple Markov model for given data. The transition probabilities are simply the observed transition counts normalized to valid probability simplexes on each row of the matrix. 

In (somewhat) conforming with `sklearn` API, the model implements `fit` and `score`, as well as helper methods with convenience interfaces. 

In [56]:
ALPHABET = ['A', 'o', 'e', 't', 'p', 'g', 'k']
TRAINING_FILE_PATTERN = 'data/markov/symbol/language-training-lang{lang}-*'
TEST_FILE_PATTERN = 'data/markov/symbol/language-test-*'


class MarkovModel:
    def __init__(self, alphabet=ALPHABET):
        self.map = {letter: index for (index, letter) in enumerate(ALPHABET)}
        self.transitions = np.zeros((len(ALPHABET), len(ALPHABET)))
        self.probabilities = np.ones_like(self.transitions)  # default initialization    
    
    def fit(self, sequence):
        for first, second in zip(sequence, sequence[1:]):
                self.transitions[self.map[first], self.map[second]] += 1
                
        self.probabilities = self.transitions / np.sum(self.transitions, axis=1)
    
    def fit_from_path(self, path):
        with open(path) as f:
            sequence = f.read()
            self.fit(sequence)

    def fit_from_glob(self, pattern):
        for path in glob.glob(pattern):
            self.fit_from_path(path)
    
    def score(self, sequence, log=False):
        log_prob = np.sum([np.log(self.probabilities[self.map[first], self.map[second]])
                       for first, second in zip(sequence, sequence[1:])])
        if not log:
            return np.exp(log_prob)

        return log_prob
    

## Genertive Markov Model classifier

The code below implements a simple generative Markov model classifier. For a new input sequence $v_{1:T}^*$, and a given class $c$, the model considers the posterior $P(c | v_{1:T}^*) \propto P(v_{1:T}^* | c) P(c)$, the product of the data's likelihood according to the class multiplied by the prior for that class - Bayes's rule. To normalize, we must divide by the sum of posterior probabilities over all classes: 

$$ P(c | v_{1:T}^*) = \frac{P(v_{1:T}^* | c) P(c)}{\sum_{c' \in C} P(v_{1:T}^* | c') P(c')} $$

To classify, the model selects the argmax class over all class probabilities. In conforming with the `sklearn` API, the model implements `predict_proba`, `predict`, and `score`.

In [63]:
LANGUAGES = ('A', 'B', 'C')


class GenerativeMarkovModelClassifier:
    def __init__(self, prior=None, langauges=LANGUAGES):
        self.n = len(langauges)
        self.classes = langauges
        self.models = [MarkovModel() for i in range(self.n)]
        
        if not prior:
            self.prior = np.ones((self.n,)) / self.n

        elif not type(prior) == dict:
            if not len(prior) == self.n:
                raise ValueError('Prior and languages should have the same length')
                
            if not 1.0 == sum(prior):
                raise ValueError('Prior should sum up to 1')
            
            if any(prior < 0) or any (prior > 1):
                raise ValueError('Prior should be a valid probability simplex')
                
            self.prior = prior
    
    def fit_from_pattern(self, training_file_pattern=TRAINING_FILE_PATTERN):
        for (lang, model) in zip(self.classes, self.models):
            model.fit_from_glob(training_file_pattern.format(lang=lang))

    def predict_proba(self, sequence):
        probs = np.array([model.score(sequence) for model in self.models]) * self.prior
        return probs / np.sum(probs)
            
    def predict(self, sequence):
        probs = self.predict_proba(sequence)
        return self.classes[np.argmax(probs)]
    
    def score(self, sequence):
        return np.max(self.predict_proba(sequence))
    
    def predict_from_pattern(self, test_file_pattern=TEST_FILE_PATTERN, probs=False):
        predictions = []
        
        for path in glob.glob(test_file_pattern):
            with open(path) as f:
                sequence = f.read()
                if probs:
                    predictions.append(self.predict_proba(sequence))
                else:
                    predictions.append(self.predict(sequence))
            
        return predictions
    

In [64]:
gmmc = GenerativeMarkovModelClassifier()
gmmc.fit_from_pattern()
gmmc.predict_from_pattern()



['C', 'C', 'A', 'B', 'A', 'A', 'B', 'A', 'C', 'A']

## HMM 

Given that there are three speakers in the room, we should set $H = 3$, making $A$ a $3 \times 3$ matrix, and given the same seven phonemes from the previous exercise, we should set $V = 7$, making $B$ a $3 \times 7$ matrix.

A reasonable assumption is to set $\vec{a} = [\frac{1}{3}, \frac{1}{3}, \frac{1}{3}]^T$, a uniform distribution over the initial probabilities of each speaker starting. 

Note: 

$A_{ij} = P(h_{t + 1} = j | h_t = i)$, the row-major matrix notation.

$B_{ij} = P(v_t = j | h_t = i)$, again, row-major, unlike Barber's notation

In [79]:
data = open('data/hmm/speaker').read()
transitions = list(zip(data, data[1:]))
mapping = {letter: index for (index, letter) in enumerate(ALPHABET)}
mapped_transitions = [(mapping[a], mapping[b]) for (a, b) in transitions]

In [142]:
H = 3
V = 7
A = np.array([[0.5, 0.25, 0.25], [0.25, 0.5, 0.25], [0.25, 0.25, 0.5]])
# B = np.ones((H, V)) / V
B = np.random.dirichlet(np.ones((V,)), 3)
a = np.ones((H, 1)) / H    


array([ 1.,  1.,  1.])

In [143]:
def m_step(a, A, B, data):
    mapped_data = [mapping[d] for d in data]
    mapped_transitions = list(zip(mapped_data, mapped_data[1:]))
    # to update a, we look at how likely the observed chain is to have started at i
    a_new = np.zeros_like(a)
    
    for h_i in range(H):
        log_p_h_i = 0
        h = np.zeros_like(a)
        h[h_i] = 1
        for t in range(len(mapped_data)):
            v_t = np.zeros((V, 1))
            v_t[mapped_data[t]] = 1
            p_v_t = np.matmul(B, v_t)
            log_p_h_i += np.log(np.dot(h.T, p_v_t))
            
        a_new[h_i] = log_p_h_i
      
    a_new = a_new - a_new.max()
    a_new = np.exp(a_new)
    a_new = a_new / a_new.sum()
    
    # to update A, we look at the probabilities of the transitions, I guess
    A_new = np.zeros_like(A)
    for v_t, v_t_1 in mapped_transitions:
        p_h_t = B[:,v_t].reshape((1, H))
        p_h_t_1 = B[:,v_t_1].reshape((H, 1))
        A_new += p_h_t * p_h_t_1
        
    A_new = A_new / A_new.sum(axis=1)
    
    # to update B, we do something similary?
    B_new = np.zeros_like(B)
    h = a.copy()
    for t in range(len(mapped_data)):
        v_t = np.zeros((V, 1))
        v_t[mapped_data[t]] = 1
        B_new +=  np.matmul(h, v_t.T)
        h = np.matmul(A, h)

    B_new = B_new / B_new.sum(axis=1).reshape((H, 1))
    

    return a_new, A_new, B_new
  
    
a_new, A_new, B_new = m_step(a, A, B, data)

In [146]:
B_new

array([[ 0.176,  0.188,  0.141,  0.168,  0.167,  0.121,  0.039],
       [ 0.176,  0.188,  0.141,  0.168,  0.167,  0.121,  0.039],
       [ 0.176,  0.188,  0.141,  0.168,  0.167,  0.121,  0.039]])

In [105]:
v = np.zeros((V, 1))
v[4] = 1

p = np.ones((1, H)) / H


np.matmul(v, p)

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.33333333,  0.33333333,  0.33333333],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ]])

In [104]:
B

array([[ 0.14285714,  0.14285714,  0.14285714,  0.14285714,  0.14285714,
         0.14285714,  0.14285714],
       [ 0.14285714,  0.14285714,  0.14285714,  0.14285714,  0.14285714,
         0.14285714,  0.14285714],
       [ 0.14285714,  0.14285714,  0.14285714,  0.14285714,  0.14285714,
         0.14285714,  0.14285714]])