Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.

All code is from here: https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

It is highly recommended to use this examples during homework: https://github.com/keras-team/keras/tree/master/examples

In [14]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
### Import Libraries 
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
import seaborn as sns
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Input, SimpleRNN, GRU
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\komyshev.da\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
### Reading csv data files using pandas dataframe 

train = pd.read_csv("./train.csv", encoding = "ISO-8859-1")

### Before cleaning the dataset I would like to perform EDA(Exploratory data analysis) by performing data visualization to understand
### the distribution of different classes. I will be performing EDA on training dataset

categorywise_data = train.drop(['id', 'comment_text'], axis=1)     ### Removed unnecessary columns - id and comment_text
counts_category = []                                               ### A list that contains tuple which consists of class label and number of comments for that particular class 
categories = list(categorywise_data.columns.values)
for i in categories:
    counts_category.append((i, categorywise_data[i].sum()))
    
dataframe = pd.DataFrame(counts_category, columns=['Labels', 'number_of_comments'])   ### Dataframe made up of category and total number of comments

### Data cleaning/Preparation 

def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)      ### conversion of contraction words to expanded words
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)                                                 ### removing non-word characters
    text = re.sub('[^A-Za-z\' ]+', '',text)                                        ### removing all non-alphanumeric values(Except single quotes)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    text = ' '.join([word for word in text.split() if word not in (stop_words)])    ### Stopwords removal
    return text

train["comment_text"] = train["comment_text"].apply(clean_text)
test["comment_text"] = test["comment_text"].apply(clean_text)

text = train["comment_text"]

text = text[0:600893]
text = text.to_string()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

corpus length: 9574259
total chars: 39


In [43]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

nb sequences: 3191407
Vectorization...


In [44]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [45]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [46]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
callbacks=[print_callback])

Epoch 1/60
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "le deleted longer question deta...
23869"
le deleted longer question deta...
23869  a                                                                                                                                                                                                                                                                                                                                                                                                             
----- diversity: 0.5
----- Generating with seed: "le deleted longer question deta...
23869"
le deleted longer question deta...
23869d p a mer dhon t

  preds = np.log(preds) / temperature


anl dinnin inte j wivemlli .........533...........  p rank leote  serutannt   .........werc     a tro                                              o                          w r   n                               s        uoe                           e                                                                                                 d l   s  a ama   rinda re s bdey wa
----- diversity: 1.0
----- Generating with seed: "le deleted longer question deta...
23869"
le deleted longer question deta...
23869l  ib m  .....85932c gdess rtaeosk.h.gibeintoeltwe ens bhonedian cr mixek .d0g  ki noetaistanahan le ldl fima ayehlromeing1o3 n o27ig rreere fopin  jatsis noastl a f de paw iceraricsyede apanseed eicse ogtnerteleostusdeces  paalimrl w iit co  oreite orftusnidgutoled satg c .....meppo......solp eedfa acdacuulup di2are at2f eewp0pyeseresurevpinnz voleinhpa8chmo.ogens stm to uo nbuusenmullon nan o  .
----- diversity: 1.2
----- Generating with seed: "le deleted longer question deta.

5n eauegr1igw3 t a.v e.e 5tr4 y  ia et 06p..ha3o  naafevq1nnsyrmh...e cf  u...gxi sste3llirnlu nar9a.rddseov gil yk  r  se8ye ul lo5f eryei60r tiq r1riv  fedi e gtsp dyeekd 640t nln     70adti ilithrfctdez rpiiatn  adie7 t siheu  .  ire2  alt o ef  n  et  ctr tnaoemerycaiyaogpmas t1d4 21.5l   c140gyal rrhe23lh  t2 ttotadelold ang dboeni1alolgon p t ge3allnle a.j3 uma2   xsndcrs 9um anaemi o ptaprl 
Epoch 5/60
----- Generating text after Epoch: 4
----- diversity: 0.2
----- Generating with seed: "ince cannot leave user page know put...
"
ince cannot leave user page know put...
i        ... e                                                                                                                                                                                                                                                                                                                                                                                                  
----- diversity: 

KeyboardInterrupt: 

Сохранение модели

In [48]:
model.save("Oleg.model")

INFO:tensorflow:Assets written to: Oleg.model\assets


Загрузка и дообучение модели

In [None]:
from keras.models import save_model, load_model

model = load_model("./Oleg.model")

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
callbacks=[print_callback])