# Simple Naive Bayes Model with NLTK to predict the next word

In [1]:
import random
from random import shuffle

import nltk
from nltk.util import ngrams

Load the text from the corpus

In [2]:

def load_sentences(file_path):
    """Parse a corpus XML file into a list of tokenised sentences.

    The document is walked three levels below the root element:
    root -> article -> sentence -> word.

    Args:
        file_path: Path of the XML file (or an open file object) to parse.

    Returns:
        A list with one entry per sentence element, in document order.
        Each entry is a list holding the ``.text`` of every word element
        of that sentence; ``.text`` is ``None`` for an element without text.
    """
    # xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
    # ElementTree uses the C accelerator automatically when it is available.
    from xml.etree import ElementTree as ET

    root = ET.parse(file_path).getroot()

    return [
        [word.text for word in sentence]
        for article in root
        for sentence in article
    ]
        
    
# Parse every sub-corpus into a list of sentences (each a list of tokens).
blick = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blick.xml')
blogs = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blogs.xml')
schobinger = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/schobinger.xml')
swatch = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/swatch.xml')
wiki = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/wiki.xml')

# Only two sub-corpora are used for training here. To train on everything,
# use `blick + blogs + schobinger + swatch + wiki` instead.
all_sentences = blick + blogs

# Shuffle with a dedicated, seeded generator so that the train/test split
# (and therefore the reported numbers) is the same on every
# "Restart Kernel -> Run All", without touching the global random state.
random.Random(42).shuffle(all_sentences)


Build a feature dict from a word, and a featureset (a list of tuples, each holding the features of a word and the following word as its label) from the sentences.

In [3]:
def build_feature(word):
    """Turn a single token into a feature dict for the classifier.

    Args:
        word: The token as a string. ``None`` and the empty string are
            accepted and produce no features.

    Returns:
        A dict containing one ``(char, next_char) -> True`` entry for every
        pair of adjacent characters in ``word`` and, when ``word`` is exactly
        one character long, the entry ``'single' -> word``.
    """
    f = {}

    # A missing token (None) or an empty one carries no information; return
    # an empty feature dict instead of failing on len(None).
    if not word:
        return f

    if len(word) == 1:
        f['single'] = word

    # zip(word, word[1:]) yields the same 2-tuples of adjacent characters as
    # nltk.util.ngrams(word, 2), without building an intermediate list.
    for pair in zip(word, word[1:]):
        f[pair] = True

    return f


def build_featureset(sentences):
    """Build (features, label) pairs for next-word prediction.

    For every token that is directly followed by another token in the same
    sentence, the features are ``build_feature(token)`` and the label is the
    following token.

    Args:
        sentences: An iterable of sentences, each a list of tokens
            (strings, or ``None`` for missing tokens).

    Returns:
        A list of ``(feature_dict, next_token)`` tuples.
    """
    featureset = []

    for sentence in sentences:
        # Pair every token with its successor; the last token has none.
        for token, next_token in zip(sentence, sentence[1:]):
            # Missing tokens and tokens of at most one character are not
            # used as context.
            if token is None or len(token) <= 1:
                continue

            # A missing successor must not become a class label: the
            # classifier could otherwise predict None as the "next word".
            if next_token is None:
                continue

            # tuple -> (feature, label)
            featureset.append((build_feature(token), next_token))

    return featureset


Split the data into a training set and a test set

In [4]:
featuresets = build_featureset(all_sentences)

# The first 500 (features, label) pairs are held out for evaluation;
# everything after them is used for training.
test_set = featuresets[:500]
train_set = featuresets[500:]

print('train_set:', len(train_set))
print('test_set:', len(test_set))

train_set: 37918
test_set: 500


Train the model

In [5]:
# Fit a Naive Bayes classifier on the (features, label) pairs of the training set.
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Report how many distinct labels (candidate next words) the classifier holds.
print('labels: ', len(classifier.labels()))

labels:  7966


Accuracy is not the best metric for our multi-class problem (there are thousands of possible labels), but for this example we stick with it.

In [6]:
# Score the classifier on the held-out test set.
accuracy = nltk.classify.accuracy(classifier, test_set)
print('accuracy', accuracy)

accuracy 0.026


Create random sentences from the Naive Bayes model.

In [8]:
import random

# Pool of start tokens: every token of every wiki sentence. Missing tokens
# (None) are dropped because they can neither be turned into features nor
# joined into the printed sentence.
flattened = [val for sublist in wiki for val in sublist if val is not None]

def get_random(e):
    """Return one randomly chosen element of the sequence ``e``."""
    chosen = random.choice(e)
    return chosen

# Upper bound on the number of predicted tokens that may follow the start token.
max_tokens = 10
# Tokens that terminate a generated sentence.
sentence_end = ('.', '?', '!')

for i in range(10):

    # Start from a random corpus token, then let the classifier repeatedly
    # predict the next token from the current one.
    token = get_random(flattened)
    sentence = []

    # A None token (missing corpus token or a None prediction) ends the
    # sentence instead of crashing build_feature() or ' '.join().
    while token is not None:
        sentence.append(token)

        # Stop at sentence-final punctuation, or once the start token plus
        # max_tokens predictions have been collected.
        if token in sentence_end or len(sentence) > max_tokens:
            break

        token = classifier.classify(build_feature(token))

    print(' '.join(sentence))
    print('---')
    


Buslinie äscht gschmolze zerstört würke seriös Mord entwicklet Mirasol-technik flopped ersetze
---
Frankriich prüglet überstoht khai tuctucsit usanand setzand unterstützt ort Lauderdale vorhande
---
isch echt cool ghänged usezögere Mischig Vorfahrä damiti Profässer Von ehre
---
d .
---
Woatan besichtige Gc-fän chöi säch verbruuched vorzgah svizzera wärded übersetzt wirde
---
Sternbilder verhänkt hetmich zuechund gmachd Brüst zämekratze undernoh Regio ungerteilt games
---
Herkunft hönd wehre Frönd wehre Frönd wehre Frönd wehre Frönd wehre
---
ryychi zerschtört wirsch „ .
---
Faltejura Lift ufefahre damiti Profässer Von ehre Frönd wehre Frönd wehre
---
; .
---
