## Neural Language Modeling

### Develop a Character-Based Neural Language Model

##### Data Preparation

In [1]:
from os import listdir

# load doc into memory
def load_doc(filename):
    """Read a text file and return its whole contents as a single string.

    Args:
        filename: Path of the file to read (opened in text mode, read only).

    Returns:
        The complete text of the file.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# save a list of lines to file, one entry per line
def save_doc(lines, filename):
    """Write the given strings to a file, one per line.

    Args:
        lines: Iterable of strings; they are joined with '\\n', so no trailing
            newline is written after the last entry.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # the context manager flushes and closes the file even when write() raises
    with open(filename, 'w') as file:
        file.write(data)

def process_docs(directory):
    """Load every file in a directory as one whitespace-normalised string.

    Args:
        directory: Folder whose entries (as returned by listdir) are all
            read with load_doc.

    Returns:
        A list with one string per directory entry, in listdir order.
    """
    def _normalised(path):
        # split() without arguments breaks on any run of whitespace, so
        # re-joining with a single space flattens newlines and tabs too
        return ' '.join(load_doc(path).split())

    return [_normalised(directory + '/' + entry) for entry in listdir(directory)]

# load every poem as one whitespace-normalised string
raw_text = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\poem")
print(len(raw_text))
# slide a window over each document: `length` input characters followed by
# the single character to predict, i.e. length + 1 characters per sequence
length = 10
sequences = [
    doc[end - length:end + 1]
    for doc in raw_text
    for end in range(length, len(doc))
]
print('Total Sequences: %d' % len(sequences))
# write the windows to disk, one per line
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)

100
Total Sequences: 236725


##### Train Language Model

In [2]:
from numpy import array
from pickle import dump
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# load doc into memory
def load_doc(filename):
    """Read a text file and return its whole contents as a single string.

    Args:
        filename: Path of the file to read (opened in text mode, read only).

    Returns:
        The complete text of the file.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# define the model
def define_model(X, n_classes=None):
    """Build, compile and summarise a single-layer LSTM character classifier.

    Args:
        X: Input array; X.shape[1] and X.shape[2] are used as the LSTM
            input shape.
        n_classes: Width of the softmax output layer. When omitted it is
            taken from X.shape[2], so the function no longer depends on a
            module-level ``vocab_size`` being defined before the call.

    Returns:
        The compiled Sequential model.
    """
    if n_classes is None:
        n_classes = X.shape[2]
    model = Sequential()
    model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(n_classes, activation='softmax'))
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    # writes model.png; needs pydot and graphviz to be installed
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

# load
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
lines = raw_text.split('\n')
# integer encode sequences of characters
# NOTE: the vocabulary is built from raw_text, which still contains the '\n'
# separators between lines, so newline receives an index of its own even
# though no individual line contains it
chars = sorted(list(set(raw_text)))
mapping = dict((c, i) for i, c in enumerate(chars))
sequences = [[mapping[char] for char in line] for line in lines]
# vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)
# separate into input and output: every column but the last is input,
# the last column is the character to predict
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
# one hot encode the whole input matrix in a single call instead of
# building a Python list of per-row arrays and stacking it afterwards
X = to_categorical(X, num_classes=vocab_size)
y = to_categorical(y, num_classes=vocab_size)
# define model
model = define_model(X)
# fit model
model.fit(X, y, epochs=100, verbose=2)
# save the model to file
model.save('model.h5')
# save the mapping; the with-block closes the pickle file once it is written
with open('mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)


Vocabulary Size: 95
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 75)                51300     
                                                                 
 dense (Dense)               (None, 95)                7220      
                                                                 
Total params: 58520 (228.59 KB)
Trainable params: 58520 (228.59 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
Epoch 1/100
7398/7398 - 36s - loss: 2.4191 - accuracy: 0.3293 - 36s/epoch - 5ms/step
Epoch 2/100
7398/7398 - 35s - loss: 2.0973 - accuracy: 0.3981 - 35s/epoch - 5ms/step
Epoch 3/100
7398/7398 - 35s - loss: 1.9842 - accuracy: 0.4271 - 35s/epoch -

  saving_api.save_model(


##### Generate Text

In [5]:
from pickle import load
import numpy as np
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend seed_text with n_chars characters chosen greedily by the model.

    Args:
        model: Object whose predict() output is argmax-decoded into a
            character index.
        mapping: Dict from character to integer index.
        seq_length: Number of trailing characters fed to the model per step.
        seed_text: Text to start from.
        n_chars: Number of prediction steps to run.

    Returns:
        seed_text followed by the generated characters.

    Raises:
        KeyError: If the text contains a character that is not a key of
            mapping.
    """
    # build the index -> character lookup once instead of scanning the whole
    # mapping for every generated character; setdefault keeps the first
    # character seen for an index, like the original linear search
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length, keeping the most recent part
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character
        yhat = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(yhat))
        # reverse map integer to character and append to input; an index
        # missing from the mapping contributes nothing
        in_text += index_to_char.get(predicted_index, '')
    return in_text
# load the model
model = load_model('model.h5')
# load the mapping; the with-block closes the pickle file after reading
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)
# test start of rhyme
print(generate_seq(model, mapping, 10, 'will be ve', 20))
# test mid-line
print(generate_seq(model, mapping, 10, 'he strive ', 20))
# test not in original
# (this seed has 9 characters, one fewer than the seq_length of 10 passed here)
print(generate_seq(model, mapping, 10, 'hello wor', 20))

will be venneming the rest of 
he strive of a hands the strik
hello words, and which the sa
