# Language Models for text generation
* One traditional approach
* And one deep approach

----

## The objective is to create a model which estimates the likelihood of the next word given the words that precede it:

$P(w_{t+1} \mid w_{t-n+1}, \ldots, w_{t})$

---

## Traditional approach - trigram Markov Chain

* for a given dataset
* group consecutive words into unique pairs (bigrams)
* for a given pair, find the succeeding word
* build a transition probability matrix (markov chain) of these relationships
* use a sampler on this matrix to generate new text

---

### Imports

In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import re

### Download text data

In [None]:
# Fetch the 20 newsgroups dataset, asking sklearn to strip headers and footers.
data = fetch_20newsgroups(remove=['headers', 'footers'])
# The articles themselves; each entry is later lower-cased and tokenised as a string.
text = data['data']

### Calculate the trigram probabilities

In [None]:
def trigram_word_distribution(data):
    """create next-word probability distributions from single words and word pairs

        Every article in data['data'] is lower-cased and split into word
        tokens; all articles are then treated as one continuous stream of
        words, so word pairs also span article boundaries.

        params: data - a Bunch data object from sklearn (any mapping whose
                       'data' entry is an iterable of strings works)
        returns: [seed word distribution, gram word distribution]
                 seed word distribution - P(next word | word), a Series
                     indexed by (seed_word, seed_next_word)
                 gram word distribution - P(next word | previous two words), a
                     Series indexed by (gram_words, next_word), where
                     gram_words is the two words joined by a single space
        raises: ValueError if the data contains fewer than two words
    """

    # Read the articles from the argument, not from a module-level variable.
    all_data = [
        word
        for article in data['data']
        for word in re.findall(r'(?u)\b\w+\b', article.lower())
    ]
    if len(all_data) < 2:
        raise ValueError('at least two words are needed to build the distributions')

    # Every consecutive pair of words is the context for the word that follows it.
    tri_gram = [' '.join(pair) for pair in zip(all_data[:-1], all_data[1:])]
    # The final pair has no successor, so it is padded with a single space.
    next_word = all_data[2:] + [' ']
    words = pd.DataFrame({
        'seed_word': all_data[:-1],
        # Taken straight from the word list so the last bigram is counted too.
        'seed_next_word': all_data[1:],
        'gram_words': tri_gram,
        'next_word': next_word,
    })
    seed_word_distribution = words.groupby('seed_word')['seed_next_word'].value_counts(normalize=True)
    gram_word_distribution = words.groupby('gram_words')['next_word'].value_counts(normalize=True)

    return [seed_word_distribution, gram_word_distribution]

In [None]:
# Build both word distributions once; `dist` is passed to the text generator below.
dist = trigram_word_distribution(data)

### Generate a sentence

In [None]:
def trigram_text_generation(seed, length, distribution):
    """seed a distribution with a seed word, and ask it to make more words

        The seed is lower-cased, one following word is sampled from the
        single-word distribution, and then `length` further words are sampled
        one at a time, each conditioned on the last two words generated so far.

        params: seed - A seed word,
                length - Number of words to sample after the first sampled word
                distribution - [single-word distribution, word-pair distribution],
                               as returned by trigram_word_distribution

        returns: generated sentence, or None (after printing a hint) when the
                 seed word or a generated word pair is not in the distribution
    """

    seed = seed.lower()
    try:
        # The first extra word only has the single seed word as context.
        first_options = distribution[0][seed]
        seed += ' ' + np.random.choice(first_options.index, p=first_options.values)
        for _ in range(length):
            # Look the two-word context up once per step instead of twice.
            context = ' '.join(seed.split()[-2:])
            options = distribution[1][context]
            seed += ' ' + np.random.choice(options.index, p=options.values)
    except (KeyError, ValueError):
        # KeyError: unknown seed word or word pair; ValueError: the sampler
        # rejected the probabilities. Anything else is a real bug and propagates.
        print('Oops! Try another word as seed word')
        return None

    return seed

### An example

In [None]:
# Seed word; trigram_text_generation lower-cases it before the lookup.
seed = 'It'
# Number of words sampled from the word-pair distribution.
sentence_length = 20
sentence = trigram_text_generation(seed,sentence_length, dist)
# Bare expression so the notebook displays the generated sentence.
sentence

### Drawbacks:

* It is still imperfect at capturing anything above basic syntax, and has no semantics or pragmatic capability
* A quad-gram would be better, but as we increase gram size, transition matrices require increasingly more computation power to train, and space to store.
* N-grams are a sparse representation of language - any word (or word sequence) not present in the training corpus has zero probability of being generated

---

## Deep approach - Bidirectional LSTM with trainable Embeddings

### Imports

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Embedding, BatchNormalization, Flatten, Bidirectional, LSTM, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm
import numpy as np
import re

#### Create a continuous list of words

In [None]:
# Same download as in the trigram section; repeated here, presumably so that
# this section can be run on its own.
data = fetch_20newsgroups(remove=['headers', 'footers'])
# The articles themselves, one string per post.
text = data['data']

In [None]:
# Flatten every article into one continuous, lower-cased list of purely
# alphabetic words. `+` (rather than `*`) stops the pattern from also matching
# the empty string at word boundaries, so no blank tokens need filtering out.
all_data = [
    word
    for article in text
    for word in re.findall(r'(?u)\b[a-zA-Z]+\b', article.lower())
]

#### Integer encode sequences of words

In [None]:
# Build the word -> integer index vocabulary from the word list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_data)
# Every word is passed as its own text, so `sequences` presumably holds one
# short list of indices per word rather than a flat list — verify against the
# Keras Tokenizer documentation.
sequences = tokenizer.texts_to_sequences(all_data)

#### Split the data into X and y sequences
* X contains sequences of length `seq_length` of the previous words
* y is the next word
* For example:
    * if the sentence is `i saw a dog on the street`, and `seq_length` = 3, we have
    * X = 'i saw a', 'saw a dog', 'a dog on', ....
    * y = 'dog', 'on', 'the' ...

In [None]:
# Slide a window of `seq_length` words over the encoded corpus: each window is
# one input sample and the word immediately after it is that sample's target.
seq_length = 10
X_data, y_data = [], []
for start in tqdm(range(len(sequences) - seq_length)):
    stop = start + seq_length
    X_data.append(sequences[start:stop])
    y_data.append(sequences[stop])

#### Work out the vocab list and size of vocab
* Words are assigned integer indices from 1 to the total number of words (e.g. 10,000); index 0 is never assigned to a word. The Embedding layer allocates one vector per index, and because array indexing is zero-based, an array whose largest index is 10,000 must have length 10,000 + 1.

Therefore, when specifying the vocabulary size to the Embedding layer, we specify it as 1 larger than the actual vocabulary.

In [None]:
# Every word known to the tokenizer.
vocab_list = list(tokenizer.word_index.keys())
# One more than the number of distinct words; used as the Embedding layer's input_dim.
vocab_size = len(tokenizer.word_index) + 1

#### Reshape the X and y data

In [None]:
# One row per training window, one column per position inside the window.
X = np.array(X_data).reshape(len(X_data), seq_length)
# NOTE(review): a dense one-hot target array can be very large for a big
# corpus and vocabulary — confirm it fits in memory.
y = to_categorical(y_data) #onehot encode our y data

In [None]:
# Bare expression so the notebook displays the input and target shapes as a sanity check.
X.shape, y.shape

----

### Build the model

In [None]:
# Next-word classifier, listed from input to output: trainable embeddings,
# dropout, a bidirectional LSTM whose two directions are summed, a second LSTM
# that reduces the sequence to one vector, batch normalisation, a hidden dense
# layer, and a softmax with one output per one-hot column of y.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=seq_length),
    Dropout(0.2),
    Bidirectional(LSTM(128, return_sequences=True), merge_mode='sum'),
    LSTM(128),
    BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True),
    Dense(100, activation='relu'),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Save the best version of the model

In [None]:
# File that receives the weights of the best epoch seen so far.
# NOTE(review): newer Keras releases may require a `.weights.h5` suffix when
# save_weights_only=True — confirm against the installed version.
filepath = "best_weights.hdf5"
# Stop once val_loss has not improved by at least min_delta for 15 epochs in a row.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=15, min_delta=0.0001)
# A lower val_loss is better, so the checkpoint must use mode='min'; with
# mode='max' it would overwrite the file whenever the loss got worse.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')
callbacks = [early_stop, checkpoint]

### Look at the parameters of the model

In [None]:
# Print the layer-by-layer description of the model built above.
model.summary()

### Fit the model

In [None]:
# Train with 20% of the data held out for validation (validation_split=0.2);
# the callbacks monitor val_loss on that held-out part.
# NOTE(review): with epochs=1 the early-stopping patience of 15 can never be
# reached — raise `epochs` for a real training run.
history = model.fit(X, y, epochs=1, batch_size=128, validation_split=0.2, callbacks=callbacks)

### If trained, load weights from disk

In [None]:
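# Load previously saved weights instead of retraining.
# The file name must match the `filepath` given to ModelCheckpoint above.
# filename = "best_weights.hdf5"
# model.load_weights(filename)
# model.compile(loss='categorical_crossentropy', optimizer='adam')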

----

### An example
* For a given input string generate some new text
* the input string has to be pre-prepared

In [None]:
def prepare_input(seed_input):
    """prepare a string for the LSTM

        The sentence is lower-cased (the vocabulary was built from lower-cased
        text), split on whitespace, and each word is replaced by its index in
        the module-level `tokenizer.word_index`.

        params: seed_input - the seed sentence as a string
        returns: integer array of shape (1, number of words), or the string
                 'please try with a different sentence' when a word is not in
                 the vocabulary
    """

    words = seed_input.lower().split()
    try:
        encoded = [tokenizer.word_index[word] for word in words]
    except KeyError:
        # Only an out-of-vocabulary word is an expected failure here.
        return 'please try with a different sentence'
    return np.expand_dims(np.array(encoded), axis=0)

In [None]:
def generate_text(input_string, sentence_length):
    """generate some new text as a string

        Starting from the encoded seed, the model repeatedly predicts the most
        likely next word from the most recent window of words, and that word is
        appended to the text. The window is as long as the seed, so a 10-word
        seed gives the 10-word window the model was trained on.

        params: input_string - the seed sentence
                sentence_length - total number of words in the result,
                                  including the seed words
        returns: the seed followed by the predicted words, or the message from
                 prepare_input when the seed contains an unknown word
    """

    seed = prepare_input(input_string)
    if isinstance(seed, str):
        # prepare_input reports an out-of-vocabulary word by returning a
        # message; pass it on instead of failing while slicing a string.
        return seed

    window = seed.shape[1]
    for i in range(sentence_length - window):
        # seed grows by one column per step, so seed[:, i:] is always the
        # latest `window` words.
        next_word = np.argmax(model.predict(seed[:, i:])).reshape(1, -1)
        seed = np.append(seed, next_word, axis=1)

    return ' '.join([tokenizer.index_word[x] for x in seed[0, :]])

### Unlike the traditional method, our seed has to be 10 words not 1 word

In [None]:
# Seed sentence for the LSTM; it has to be exactly 10 words long.
input_string = 'it was a dark and stormy night in berlin because'
# Fail fast if the seed is not 10 words long.
assert len(input_string.split()) == 10

### Produce some text

In [None]:
# With the 10-word seed and sentence_length=30 this appends 20 predicted words.
generate_text(input_string, sentence_length=30)

---