# Hidden Markov Models (HMM)
A simple hidden markov model with NLTK.

Build the model from a given list of tokens (list of strings). The method builds a model with all state transitions and their probability.

In [1]:
import nltk

def build_model(tokens):

    fdist = nltk.FreqDist(tokens)

    bgs = nltk.bigrams(tokens)
    fdist_bgs = nltk.FreqDist(bgs)

    hmm = {}
    for k,v in fdist_bgs.items():

        if k[0] in hmm:
            hmm[k[0]].append({'prob': v / fdist[k[0]], 'state': k[1]})
        else:
            hmm[k[0]] = [{'prob': v / fdist[k[0]], 'state': k[1]}]

    return hmm




Get the next observation from a given state (word) or None when the state is unknown.

In [2]:
def get_observation(state, model):
    observation = None
    if state in model:
        max_prob = 0
        for n in model[state]:
            if n['prob'] > max_prob:
                max_prob = n['prob']
                observation = n['state']

    return observation

Load the text from the corpus

In [3]:
def load_tokens(file_path):
    from xml.etree import cElementTree as ET
    t = ET.parse(file_path)
    token = []
    for article in list(t.getroot()):
        
        for sentence in  list(article):
            for word in list(sentence):
                
                if word.text:
                    token.append(word.text)
                
    return token
        
    
blick = load_tokens('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blick.xml')
blogs = load_tokens('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blogs.xml')
schobinger = load_tokens('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/schobinger.xml')
swatch = load_tokens('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/swatch.xml')
wiki = load_tokens('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/wiki.xml')


The two hidden markov models from the corpus 'blick' and 'wiki'.

In [4]:
model_b = build_model(blick)
model_wiki = build_model(wiki)

Create random sentences from the two hiden markov models.

In [5]:
import random

def get_random(e):
    
    return random.choice(e)


for i in range(10):

    start = get_random(wiki)

    sentence = [start]


    o = start


    while(True):


        o = get_observation(o, model_b)

        if o == None or sentence[-1] or sentence[-2] == o:
            o = get_random(blick)

        sentence.append(o)

        if o in ['.', '?', '!']:
            break


        o = get_observation(o, model_wiki)

        if o == None or sentence[-1] or sentence[-2] == o:
            o = get_random(wiki)

        sentence.append(o)

        if o in ['.', '?', '!']:
            break
            
    print(' '.join(sentence))
    print('---')

e Päärli Religion hei die ou Hochdytsch Miss de elegante es Mal isch mit .
---
grad Persönlichkeitsstruktur Nägger Buech , Rüeblihose " hend 1680er .
---
me d Sprach bis vo Schrifststöuer siner und em d , Smartphones .
---
närvös Glück Burge D : , Bürger gliich isch vergrabe het erschte Gotechrieg .
---
Matur reagiert Aesch — .
---
dr dänkt , .
---
grüezi findi aus und de , isch gschriebe beherrscht no i in Achter , en de für geds erschte zwöi dr im dann .
---
dogmatische , aagschtoosse vor .
---
an Mischig der , wo Sitzheizig d , e Laura Schtile Summer är d : de Seglä Sproch vom Bueb Pragmatik Nla-vereine sind d ändere oisem i Lidschatte französische .
---
Getraide hät grüezi färbt ene mini tötet d ume druggta chunnt Türa d Zäntrum Stund Ziitig lertne und de macht Strooss Platsch het Marco ab bi erreicht nonig sy was ere Veröffentlichung am und 15 denn isch tatsächlich hops vo Europa khönnd Personalunion Bald vo Wese und .
---
