# Whatsapp text generation using Markov chains with back-off

Source: https://blog.dataiku.com/2016/10/08/machine-learning-markov-chains-generate-clinton-trump-quotes

In [4]:
# Setting up Google Colab

from google.colab import drive
drive.mount("/content/gdrive")

%cd gdrive/My Drive/Projektit/whatsapp-analysis/src

! pip install emoji

Mounted at /content/gdrive
/content/gdrive/My Drive/Projektit/whatsapp-analysis/src
Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 9.1MB/s 
[?25hInstalling collected packages: emoji
Successfully installed emoji-1.2.0


In [5]:
# Import libraries

import numpy as np
import random
import nltk
from whatsapp_analysis.config import data_path
from whatsapp_analysis.helper import import_data, preprocess_data

In [6]:
# Read and pre-process data

df = import_data(data_path)
df = preprocess_data(df)
messages = df[(df['media_count'] == 0) & (df['word_count'] > 1) & (df['link_count'] == 0)]['message']

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [35]:
# Source: https://blog.dataiku.com/2016/10/08/machine-learning-markov-chains-generate-clinton-trump-quotes

class markov(object):
    def __init__(self, corpus, n_grams, min_length):       
        """
        corpus = list of string text ["speech1", "speech2", ..., "speechn"]
        n_grams = max sequence length
        min_length = minimum number of next words required for back-off scheme       
        """
        self.grams = {}
        self.n_grams = n_grams
        self.corpus = corpus   
        self.min_length = min_length
        self.sequences()

    def tokenizer(self, speech, gram):
        """tokenize speeches in corpus, i.e. split speeches into words"""
        tokenized_speech = nltk.word_tokenize(speech)

        if len(tokenized_speech) < gram:
            pass       
        else:
            for i in range(len(tokenized_speech) - gram):
                yield (tokenized_speech[i:i + (gram + 1)])

    def sequences(self):
        """
        create all sequences of length up to n_grams
        along with the pool of next words.
        """        
        for gram in range(1, self.n_grams + 1):
            dictionary = {}            
            for speech in self.corpus:               
                for sequence in self.tokenizer(speech, gram):
                    key_id = tuple(sequence[0:-1])

                    if key_id in dictionary:
                        dictionary[key_id].append(sequence[gram])
                    else:
                        dictionary[key_id] = [sequence[gram]]                        
            self.grams[gram] = dictionary

    def next_word(self, key_id):
        """returns the next word for an input sequence
        but backs off to shorter sequence if length
        requirement is not met.         
        """
        for i in range(len(key_id)):
            try:
                if len(self.grams[len(key_id)][key_id]) >= self.min_length:
                    return random.choice(self.grams[len(key_id)][key_id])
            except KeyError:
                pass
        # if the length requirement isn't met, we shrink the key_id
            if len(key_id) > 1:
                key_id = key_id[1:]
        # when we're down to only a one-word sequence,
        #ignore the requirement
        try:
            return random.choice(self.grams[len(key_id)][key_id])
        except KeyError:
            # key does not exist: should only happen when user enters
            # a sequence whose last word was not in the corpus
            # choose next word at random
            return random.choice(' '.join(self.corpus).split())

    def next_key(self, key_id, res):
        return tuple(key_id[1:]) + tuple([res])

    def generate_markov_text(self, start, size=25):
        """"start is a sentence of at least n_grams words"""
        key_id = tuple(nltk.word_tokenize(start))[ - self.n_grams:]
        gen_words = []
        i = 0
        while i <= size:
            result = self.next_word(key_id)
            key_id = self.next_key(key_id, result)
            gen_words.append(result)
            i += 1
        print('Seed:', start)       
        print('Result:', ' '.join(gen_words).replace(' .', '.').replace(' ,', ','))

mark = markov(messages, n_grams=5, min_length=5)
mark.generate_markov_text('mitä tehdään lauantaina', size=50)

Seed: mitä tehdään lauantaina
Result: käydä kyselemässä että opin elämästä kun 2/9 * oblivion music television näky, mut ens viikon perjantaina saadaan ihmiset pelkää mikroja😃 Tää puukoriste on kai virallinen asiointikieli on semmonen autossa ja luot yhteyden ottamisest apreeseensissä eikä edes teamssii pyytäny jälkikäteen usb voi ydinvoimalais ni sitten just nyt luet siellä kermaviiliämpäreitä ?
