# WhatsApp text generation with LSTM Recurrent Neural Network

First (unsuccessful) attempt at generating text based on WhatsApp messages using deep learning.

Ideas for improvement:
- Predicting words instead of characters. However, this might not work well because the training data is in Finnish and contains a lot of colloquial language and typos.
- Using separate messages instead of one large concatenated string
- Using Unicode characters instead of ASCII only, with emojis treated as single tokens
- Removing punctuation, as it is rarely used in WhatsApp messages
- Reducing the batch size

Source: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/ 

In [1]:
# Setting up Google Colab

from google.colab import drive
drive.mount("/content/gdrive")

%cd gdrive/My Drive/Projektit/whatsapp-analysis/src

! pip install emoji

Mounted at /content/gdrive
/content/gdrive/My Drive/Projektit/whatsapp-analysis/src
Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 7.3MB/s 
[?25hInstalling collected packages: emoji
Successfully installed emoji-1.2.0


In [2]:
# Import libraries

import numpy as np
import sys
from whatsapp_analysis.config import data_path
from whatsapp_analysis.helper import import_data, preprocess_data
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [3]:
# Load the chat export and turn it into a single training corpus:
# - keep only messages with more than one word and without media or links
# - lowercase every message
# - concatenate the messages into one space-separated string
# - drop every non-ASCII character so the character vocabulary stays small

df = preprocess_data(import_data(data_path))

no_media = df['media_count'] == 0
multi_word = df['word_count'] > 1
no_links = df['link_count'] == 0

messages = [message.lower() for message in df[no_media & multi_word & no_links]['message']]
text = ' '.join(messages).encode('ascii', 'ignore').decode()

In [4]:
# Build the character vocabulary: every distinct character of the corpus in
# sorted order, plus a lookup table from each character to its integer index.

chars = sorted(set(text))
char_to_int = {char: index for index, char in enumerate(chars)}

In [5]:
# Corpus size (in characters) and number of distinct characters

n_chars, n_vocab = len(text), len(chars)

for label, value in (('Total characters:', n_chars), ('Total vocab:', n_vocab)):
    print(label, value)

Total characters: 1535411
Total vocab: 70


In [6]:
# Prepare the dataset of input to output pairs encoded as integers.
# Every window of `seq_length` consecutive characters is one input pattern and
# the character that immediately follows the window is its target.

seq_length = 100

# Encode the corpus once instead of re-encoding every overlapping window.
encoded_text = [char_to_int[char] for char in text]

# dataX is kept as a list of integer lists because the generation cell picks
# its seed pattern from it.
dataX = [encoded_text[i:i + seq_length] for i in range(n_chars - seq_length)]
# Target of window i is the character at position i + seq_length.
datay = encoded_text[seq_length:]

n_patterns = len(dataX)
print('Total patterns:', n_patterns)

# Shape the inputs as (samples, time steps, features) and scale them by the
# vocabulary size.
X = np.reshape(dataX, (n_patterns, seq_length, 1))
X = X / float(n_vocab)

# Fix the one-hot width to the vocabulary size. Without `num_classes` the width
# is inferred from the largest target index, so it would be too narrow whenever
# the highest-indexed character never occurs as a target.
y = np_utils.to_categorical(datay, num_classes=n_vocab)

Total patterns: 1535311


In [7]:
# Define the LSTM model

# Smaller alternative (single LSTM layer), kept for reference:
# model = Sequential([
#     LSTM(256, input_shape=X.shape[1:]),
#     Dropout(0.2),
#     Dense(y.shape[1], activation='softmax'),
# ])
# model.compile(loss='categorical_crossentropy', optimizer='adam')

# Larger model: two stacked LSTM layers, each followed by dropout, and a
# softmax layer with one output per column of the one-hot targets.

model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Checkpoint: write the weights to a file named after the epoch and loss
# whenever the training loss improves on the best value so far.

filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [8]:
# Train the network on the prepared patterns; the callbacks in
# `callbacks_list` are invoked during training.

model.fit(
    X,
    y,
    epochs=10,
    batch_size=128,
    callbacks=callbacks_list,
    verbose=1,
)

Epoch 1/10

Epoch 00001: loss improved from inf to 2.55548, saving model to weights-improvement-01-2.5555.hdf5
Epoch 2/10

Epoch 00002: loss improved from 2.55548 to 2.34724, saving model to weights-improvement-02-2.3472.hdf5
Epoch 3/10

Epoch 00003: loss improved from 2.34724 to 2.22933, saving model to weights-improvement-03-2.2293.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.22933 to 2.15877, saving model to weights-improvement-04-2.1588.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.15877 to 2.11106, saving model to weights-improvement-05-2.1111.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.11106 to 2.07563, saving model to weights-improvement-06-2.0756.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.07563 to 2.04816, saving model to weights-improvement-07-2.0482.hdf5
Epoch 8/10

Epoch 00008: loss improved from 2.04816 to 2.02598, saving model to weights-improvement-08-2.0260.hdf5
Epoch 9/10

Epoch 00009: loss improved from 2.02598 to 2.00941, saving model to weig

<tensorflow.python.keras.callbacks.History at 0x7f0069344908>

In [28]:
# Load the network weights from a checkpoint file and recompile the model

filename = "weights-improvement-10-1.9934.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Reverse mapping: integer index -> character

int_to_char = dict(enumerate(chars))

# Pick a random seed from messages.
# randint's upper bound is exclusive, so len(dataX) makes every pattern
# (including the last one) eligible.

start = np.random.randint(0, len(dataX))
# Work on a copy: the sliding window below must not modify the training data.
pattern = list(dataX[start])

print('Seed:')
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# Generate characters: repeatedly predict the most likely next character,
# emit it, and slide the input window forward by one position.

print('Result:')
for i in range(100):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)  # same scaling as the training inputs
    prediction = model.predict(x, verbose=0)
    index = int(np.argmax(prediction))  # greedy choice of the next character
    result = int_to_char[index]
    sys.stdout.write(result)
    # Drop the oldest character and append the new one (builds a new list).
    pattern = pattern[1:] + [index]

Seed:
" n maksaa softapivitykset? 0e huollon yhteydess no sit 5/5 iha jees toki huolto oli 169e mut ei ny ka "
Result:
ikki tarvii kaikki tiet et se on koko tietoturvaittu kaikki tiet et se on kyll tiet et se on koko si