# Whatsapp Text Generation

Source:  https://towardsdatascience.com/generating-singlish-text-messages-with-a-lstm-network-7d0fdc4593b6

In [48]:
import sys
import os.path
import numpy as np
import pickle

sys.path.append('..\src')

from whatsapp_analysis.config import data_path
from whatsapp_analysis.helper import import_data, preprocess_data
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Activation, Bidirectional, Dense, Dropout, Embedding, LSTM
from keras.models import Sequential, load_model
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from statistics import mode

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# Load and pre-process Whatsapp data
# Media messages and one-word messages are excluded from the data

df = import_data(data_path)
df = preprocess_data(df)
messages = df[(df['media_count'] == 0) & (df['word_count'] > 1)]['message']
messages = [text_to_word_sequence(message) for message in messages]

In [32]:
lengths = [len(message) for message in messages]
sequence_length = mode(lengths)

print('Number of messages:', len(messages))
print('Sequence length:', sequence_length)

Number of messages: 29591
Sequence length: 3


In [42]:
# Creating a vocabulary

filtered_messages = [message for message in messages if len(message) >= sequence_length]
print('Number of filtered messages:', len(filtered_messages))

vocab = set()
for message in filtered_messages:
    vocab.update(message)
vocab_size = len(vocab) + 1
print('Vocab size:', vocab_size)

number_of_words = sum(lengths)
print('Total words:', number_of_words)
print('Vocab / total words ratio:', round(vocab_size / number_of_words, 3))
print('Vocab size ^ 0.25:', vocab_size ** 0.25)

Number of filtered messages: 26121
Vocab size: 51154
Total words: 257097
Vocab / total words ratio: 0.199
Vocab size ^ 0.25: 15.039032566586808


In [39]:
vocab

{'racee',
 'määrältä',
 'parkkipaikoista',
 'kiroilen',
 'ruvettii',
 'folio',
 'hồ',
 'msg',
 'ajattelemme',
 'firman',
 'epätodennäkösetä',
 'huikee',
 'kolaroi',
 'hyökänny',
 'tapit',
 "pienemm'älki",
 'märkiä',
 'ainua',
 'onnistui',
 'naapurun',
 'touhulta',
 'lähtiessään',
 'shtml',
 'hallis',
 'asunnosta',
 "aukaisu'",
 'tarkastus',
 'sivustunut',
 'kuplivista',
 'serkku',
 'alaraja',
 'lattiassa',
 'laitanpa',
 'mössöt',
 'ajo',
 'kuvanlaatu',
 'persettä',
 'salsan',
 'lähetyksesi',
 'abc',
 'huolloilla',
 'pääkipuja',
 'luojan',
 'parkkiksel',
 'tämmöistä',
 'tilastokeskuksen',
 'kohtuulliselta',
 'alakanavaan',
 'lähetättekö',
 'koirabisneksen',
 'fina',
 'lennätte',
 'tjaua',
 'lafkakohtasta',
 'sot',
 'kaduttaa',
 'työkaveril',
 'suomalaisista',
 'trollaatko',
 'yhtäö',
 'ootellaan',
 'kuunteleen',
 'ratkes',
 'sivuston',
 'entj',
 'liiketilan',
 'kattooks',
 'vedetki',
 'ratsiavalvonnasta',
 'haluutko',
 'tsiikaile',
 'pulloo',
 'skeemojen',
 'ajatki',
 'uuteen',
 'bugeja

In [45]:
# Tokenizing words

tokenizer = Tokenizer()
tokenizer.fit_on_texts(filtered_messages)
encoded_messages = tokenizer.texts_to_sequences(filtered_messages)

In [46]:
# Model definition

embedding_dims = 3 # round(vocab_size**0.25)
model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=sequence_length-1, mask_zero=True))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2, 3)              153462    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               135168    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 51154)             13146578  
_________________________________________________________________
activation (Activation)      (None, 51154)             0         
Total params: 13,435,208
Trainable params: 13,435,208
Non-trainable params: 0
_________________________________________________________________
None


In [51]:
# Prepare training data

def previous_words_to_many_words_context(encoded_messages, sequence_length):
    X_sequences = []
    Y_last_words = []
    for message in encoded_messages:
        for i in range(0, len(message)-sequence_length):
            for j in range(1, sequence_length):
                seq = message[i:i+j]
                seq = [0] * (sequence_length - len(seq) - 1) + seq
                X_sequences.append(seq)
                Y_last_words.append(message[i+j])
    X_sequences = np.array(X_sequences)
    Y_last_words = np.array(Y_last_words)
    
    print('Number of sequences:', len(X_sequences))
    return X_sequences, Y_last_words

X_sequences, Y_last_words = previous_words_to_many_words_context(encoded_messages, sequence_length)

Number of sequences: 343772


In [49]:
# Compiling the model

adam = optimizers.Adam(lr=0.001)
if os.path.exists('best_weights.hdf5'):
    model.load_weights('best_weights.hdf5')
    
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['sparse_categorical_accuracy'])
checkpoint = ModelCheckpoint("best_weights.hdf5", monitor='sparse_categorical_accuracy', verbose=1, save_best_only=True, mode='max')
earlystopping = EarlyStopping(patience=4, monitor='sparse_val_categorical_accuracy')
callbacks = [earlystopping, checkpoint]

In [None]:
# Fitting the model

model.fit(X_sequences, Y_last_words, epochs=10, callbacks=callbacks, verbose=1, validation_split=0.1)
model.save('model.h5')

with open(context + '_objects.pickle', 'wb') as f:
        pickle.dump(X_sequences, f)
        pickle.dump(tokenizer, f)
        pickle.dump(sequence_length, f)

Epoch 1/10