In [1]:
import re,os,sys

In [7]:
GLOVE_DIR = r'D:\sudeep_work\language_modelling\glove.6B'
TEXT_DATA_DIR = r"D:\sudeep_work\language_modelling\data\1.txt"
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [9]:
with open(TEXT_DATA_DIR, 'r',encoding = 'utf8') as cf:
    corpus = []
    for line in cf: # loops over all the lines in the corpus
        line = line.strip() # strips off \n \r from the ends 
        if line: # Take only non empty lines
            line = re.sub(r'\([^)]*\)', '', line) # Regular Expression to remove text in between brackets
            line = re.sub(' +',' ', line) # Removes consecutive spaces
            # add more pre-processing steps
            corpus.append(line)
print("\n".join(corpus)) # Shows the first 5 lines of the corpus

use a cascade of many layers of nonlinear processing units for feature extraction and transformation.


In [10]:
corpus

['use a cascade of many layers of nonlinear processing units for feature extraction and transformation.']

In [11]:
# Load Spacy
import spacy
import numpy as np


In [12]:
nlp = spacy.load('en_core_web_sm')

In [13]:
def preprocess_corpus(corpus):
    corpus_tokens = []
    sentence_lengths = []
    for line in corpus:
        doc = nlp(line) # Parse each line in the corpus
        for sent in doc.sents: # Loop over all the sentences in the line
            corpus_tokens.append('SEQUENCE_BEGIN')
            s_len = 1
            for tok in sent: # Loop over all the words in a sentence
                if tok.text.strip() != '' and tok.ent_type_ != '': # If the token is a Named Entity then do not lowercase it 
                    corpus_tokens.append(tok.text)
                else:
                    corpus_tokens.append(tok.text.lower())
                s_len += 1
            corpus_tokens.append('SEQUENCE_END')
            sentence_lengths.append(s_len+1)
    return corpus_tokens, sentence_lengths

In [14]:
corpus_tokens, sentence_lengths = preprocess_corpus(corpus)
print(corpus_tokens) # Prints the first 30 tokens
mean_sentence_length = np.mean(sentence_lengths)
deviation_sentence_length = np.std(sentence_lengths)
max_sentence_length = np.max(sentence_lengths)
print('Mean Sentence Length: {}\nSentence Length Standard Deviation: {}\n'
      'Max Sentence Length: {}'.format(mean_sentence_length, deviation_sentence_length, max_sentence_length))

['SEQUENCE_BEGIN', 'use', 'a', 'cascade', 'of', 'many', 'layers', 'of', 'nonlinear', 'processing', 'units', 'for', 'feature', 'extraction', 'and', 'transformation', '.', 'SEQUENCE_END']
Mean Sentence Length: 18.0
Sentence Length Standard Deviation: 0.0
Max Sentence Length: 18


In [32]:
Vocab = list(set(corpus_tokens)) # This works well for a very small corpus
print(len(Vocab))

17


In [33]:
import collections

word_counter = collections.Counter()
for term in corpus_tokens:
    word_counter.update({term: 1})
word_counter
Vocab = word_counter.most_common(500) 
print('Vocab Size: {}'.format(len(Vocab))) 
print(word_counter.most_common(100)) # just to show the top 100 terms

Vocab Size: 17
[('of', 2), ('SEQUENCE_BEGIN', 1), ('use', 1), ('a', 1), ('cascade', 1), ('many', 1), ('layers', 1), ('nonlinear', 1), ('processing', 1), ('units', 1), ('for', 1), ('feature', 1), ('extraction', 1), ('and', 1), ('transformation', 1), ('.', 1), ('SEQUENCE_END', 1)]


In [34]:

Vocab.append(('UNKNOWN', 1))
print("Vocab = ",len(Vocab))
Idx = range(1, len(Vocab)+1)
vocab = [t[0] for t in Vocab]

Word2Idx = dict(zip(vocab, Idx))
Idx2Word = dict(zip(Idx, vocab))

Word2Idx['PAD'] = 0
Idx2Word[0] = 'PAD'
VOCAB_SIZE = len(Word2Idx)
print('Word2Idx Size: {}'.format(len(Word2Idx)))
print('Idx2Word Size: {}'.format(len(Idx2Word)))

Vocab =  18
Word2Idx Size: 19
Idx2Word Size: 19


In [35]:
Idx2Word

{0: 'PAD',
 1: 'of',
 2: 'SEQUENCE_BEGIN',
 3: 'use',
 4: 'a',
 5: 'cascade',
 6: 'many',
 7: 'layers',
 8: 'nonlinear',
 9: 'processing',
 10: 'units',
 11: 'for',
 12: 'feature',
 13: 'extraction',
 14: 'and',
 15: 'transformation',
 16: '.',
 17: 'SEQUENCE_END',
 18: 'UNKNOWN'}

In [37]:
embedding_matrix = np.zeros((len(Word2Idx) , EMBEDDING_DIM))# We use 300 because Spacy provides us with vectors of size 300

In [38]:
embedding_matrix.shape

(19, 100)

In [39]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding = 'utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [42]:
for w_i, key in enumerate(Word2Idx):
    print("w_i, key = ",w_i, key)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[w_i] = embedding_vector


w_i, key =  0 of
w_i, key =  1 SEQUENCE_BEGIN
w_i, key =  2 use
w_i, key =  3 a
w_i, key =  4 cascade
w_i, key =  5 many
w_i, key =  6 layers
w_i, key =  7 nonlinear
w_i, key =  8 processing
w_i, key =  9 units
w_i, key =  10 for
w_i, key =  11 feature
w_i, key =  12 extraction
w_i, key =  13 and
w_i, key =  14 transformation
w_i, key =  15 .
w_i, key =  16 SEQUENCE_END
w_i, key =  17 UNKNOWN
w_i, key =  18 PAD


In [44]:
embedding_matrix.shape

(19, 100)

In [45]:
train_val_split = int(len(corpus_tokens) * 0.8) # We use 80% of the data for Training and 20% for validating
train = corpus_tokens[:train_val_split]
validation = corpus_tokens[train_val_split:-1]

print('Train Size: {}\nValidation Size: {}'.format(len(train), len(validation)))

Train Size: 14
Validation Size: 3


In [46]:
# A method to convert a sequence of words into a sequence of IDs given a Word2Idx dictionary
def word2idseq(data, word2idx):
    id_seq = []
    for word in data:
        if word in word2idx:
            id_seq.append(word2idx[word])
        else:
            id_seq.append(word2idx['UNKNOWN'])
    return id_seq

def find_ngrams(input_list, n):
    print("n = ",n)
    print("input_list = ",input_list)
    return zip(*[input_list[i:] for i in range(n)])

In [47]:
train_id_seqs = word2idseq(train, Word2Idx)
validation_id_seqs = word2idseq(validation, Word2Idx)

In [48]:
print('Sample Train IDs')
print(train_id_seqs)
train_id_text = [Idx2Word[i] for i in train_id_seqs]
print("train_id_text = ",train_id_text)
print('Sample Validation IDs')
print(validation_id_seqs)
validation_id_seqs_text = [Idx2Word[i] for i in validation_id_seqs]
print("validation_id_seqs_text = ",validation_id_seqs_text)

Sample Train IDs
[2, 3, 4, 5, 1, 6, 7, 1, 8, 9, 10, 11, 12, 13]
train_id_text =  ['SEQUENCE_BEGIN', 'use', 'a', 'cascade', 'of', 'many', 'layers', 'of', 'nonlinear', 'processing', 'units', 'for', 'feature', 'extraction']
Sample Validation IDs
[14, 15, 16]
validation_id_seqs_text =  ['and', 'transformation', '.']


In [49]:
import random
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [50]:
def prepare_data(data, n_grams=5, batch_size=8, n_epochs=10):
    dataset_X = []
    dataset_Y = []
    X, Y = [], []
    buff_size, start, end = 8, 0, 8
    n_buffer = 0
    epoch = 0
    print("there")
    while epoch <= n_epochs:
        print("epoch",epoch)
        print("len(X) = ",len(X))
        if len(X) >= batch_size:

            X_batch = X[:batch_size]
            Y_batch = Y[:batch_size]
            X_batch = pad_sequences(X_batch, maxlen=n_grams, value=0)
            print("X_batch = ",X_batch)
            print("Y_batch = ",Y_batch)
            
            Y_batch = to_categorical(Y_batch, VOCAB_SIZE)
#             yield (X_batch, Y_batch, epoch)
            dataset_X.extend(list(X_batch))
            dataset_Y.extend(list(Y_batch))
            
            X = X[batch_size:]
            Y = Y[batch_size:]
            continue
        n = random.randrange(2, n_grams)
        print("n = ",n)
        if len(data) < n: continue
        print("start,end = ",start,end)
        if end > len(data): end = len(data)
        grams = find_ngrams(data[start: end], n) # generates the n-grams
        print("grams = ",grams)
        splits = list(zip(*grams)) # split it
        print("splits = ", splits)
        X += list(zip(*splits[:len(splits)-1])) # from the inputs
        print("X = ",X)
        X = [list(x) for x in X] 
        print("X = ",X)
        Y += splits[-1] # form the targets
        print("Y = ",Y)
        if start + buff_size > len(data):
            print("start + buff_size")
            start = 0
            epoch += 1
            end = start + buff_size
        else:
            start = start + buff_size
            end = end + buff_size
    return dataset_X, dataset_Y 
            

In [51]:

# Hyperparameters# Hyperp 
LR = 0.0001
HIDDEN_DIMS = 256
BATCH_SIZE = 32
N_EPOCHS=10
N_GRAMS = 5
N_VALIDATE = 10000

In [52]:
train_id_seqs

[2, 3, 4, 5, 1, 6, 7, 1, 8, 9, 10, 11, 12, 13]

In [53]:
dataset_X, dataset_Y  = prepare_data(train_id_seqs, N_GRAMS, BATCH_SIZE, N_EPOCHS)

there
epoch 0
len(X) =  0
n =  3
start,end =  0 8
n =  3
input_list =  [2, 3, 4, 5, 1, 6, 7, 1]
grams =  <zip object at 0x0000000029C22FC8>
splits =  [(2, 3, 4, 5, 1, 6), (3, 4, 5, 1, 6, 7), (4, 5, 1, 6, 7, 1)]
X =  [(2, 3), (3, 4), (4, 5), (5, 1), (1, 6), (6, 7)]
X =  [[2, 3], [3, 4], [4, 5], [5, 1], [1, 6], [6, 7]]
Y =  [4, 5, 1, 6, 7, 1]
epoch 0
len(X) =  6
n =  3
start,end =  8 16
n =  3
input_list =  [8, 9, 10, 11, 12, 13]
grams =  <zip object at 0x0000000029C4C708>
splits =  [(8, 9, 10, 11), (9, 10, 11, 12), (10, 11, 12, 13)]
X =  [[2, 3], [3, 4], [4, 5], [5, 1], [1, 6], [6, 7], (8, 9), (9, 10), (10, 11), (11, 12)]
X =  [[2, 3], [3, 4], [4, 5], [5, 1], [1, 6], [6, 7], [8, 9], [9, 10], [10, 11], [11, 12]]
Y =  [4, 5, 1, 6, 7, 1, 10, 11, 12, 13]
start + buff_size
epoch 1
len(X) =  10
n =  4
start,end =  0 8
n =  4
input_list =  [2, 3, 4, 5, 1, 6, 7, 1]
grams =  <zip object at 0x0000000029C2D448>
splits =  [(2, 3, 4, 5, 1), (3, 4, 5, 1, 6), (4, 5, 1, 6, 7), (5, 1, 6, 7, 1)]
X =  [[2

grams =  <zip object at 0x0000000029C4AC08>
splits =  [(2, 3, 4, 5, 1, 6, 7), (3, 4, 5, 1, 6, 7, 1)]
X =  [[5], [1], [6], [7], [8, 9, 10], [9, 10, 11], [10, 11, 12], [2, 3, 4], [3, 4, 5], [4, 5, 1], [5, 1, 6], [1, 6, 7], [8, 9, 10], [9, 10, 11], [10, 11, 12], [2], [3], [4], [5], [1], [6], [7], [8, 9], [9, 10], [10, 11], [11, 12], (2,), (3,), (4,), (5,), (1,), (6,), (7,)]
X =  [[5], [1], [6], [7], [8, 9, 10], [9, 10, 11], [10, 11, 12], [2, 3, 4], [3, 4, 5], [4, 5, 1], [5, 1, 6], [1, 6, 7], [8, 9, 10], [9, 10, 11], [10, 11, 12], [2], [3], [4], [5], [1], [6], [7], [8, 9], [9, 10], [10, 11], [11, 12], [2], [3], [4], [5], [1], [6], [7]]
Y =  [1, 6, 7, 1, 11, 12, 13, 5, 1, 6, 7, 1, 11, 12, 13, 3, 4, 5, 1, 6, 7, 1, 10, 11, 12, 13, 3, 4, 5, 1, 6, 7, 1]
epoch 6
len(X) =  33
X_batch =  [[ 0  0  0  0  5]
 [ 0  0  0  0  1]
 [ 0  0  0  0  6]
 [ 0  0  0  0  7]
 [ 0  0  8  9 10]
 [ 0  0  9 10 11]
 [ 0  0 10 11 12]
 [ 0  0  2  3  4]
 [ 0  0  3  4  5]
 [ 0  0  4  5  1]
 [ 0  0  5  1  6]
 [ 0  0  1  6  

In [54]:
import tensorflow as tf
from keras import backend as K
from numpy.random import seed
import cv2
import matplotlib.pyplot as plt
import numpy as np
import math
import os
from IPython import display
import time
import pandas as pd
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
import pickle , gc
from fractions import Fraction
from collections import Counter, OrderedDict
# from Activation import relu
from keras.models import Sequential, Model
from keras.layers import Concatenate, Dense, LSTM, Input, concatenate,Reshape
from keras.optimizers import Adagrad, Adam, RMSprop
from keras.models import Sequential, model_from_json
from keras.layers import Dense, BatchNormalization
from keras.layers import Dropout, Activation
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model

from keras.layers.advanced_activations import LeakyReLU, ELU
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D,Conv1D
from keras.layers.convolutional import MaxPooling2D
from keras.layers import Bidirectional
from keras import activations
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D,MaxPooling1D
from keras.optimizers import SGD
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras
from keras.models import Sequential, Model
from keras.layers import Concatenate, Dense, LSTM, Input, concatenate
from keras.optimizers import Adagrad, Adam
from keras.callbacks import ReduceLROnPlateau
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.layers import Input
from keras.applications.vgg16 import preprocess_input

In [55]:
 from keras.layers import Embedding

In [56]:
embedding_matrix.shape

(19, 100)

In [57]:


embedding_layer = Embedding(len(Word2Idx),
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=N_GRAMS,
                            trainable=False)

In [58]:
sequence_input = Input(shape=(N_GRAMS,), dtype='int32')
print(sequence_input.shape)
embedded_sequences = embedding_layer(sequence_input)
print(embedded_sequences.shape)
model = LSTM(50)(embedded_sequences)
preds = Dense(len(Word2Idx), activation='softmax')(model)

model = Model(sequence_input, preds)
print(model.summary())


(?, 5)
(?, 5, 100)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 5)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5, 100)            1900      
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense_1 (Dense)              (None, 19)                969       
Total params: 33,069
Trainable params: 31,169
Non-trainable params: 1,900
_________________________________________________________________
None


In [59]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])


In [61]:
# happy learning!
model.fit(dataset_X, dataset_Y, validation_data=(dataset_X, dataset_Y),
          epochs=2, batch_size=128)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 96 arrays: [array([0, 0, 0, 2, 3]), array([0, 0, 0, 3, 4]), array([0, 0, 0, 4, 5]), array([0, 0, 0, 5, 1]), array([0, 0, 0, 1, 6]), array([0, 0, 0, 6, 7]), array([0, 0, 0, 8, 9]), array([ 0,  0,  0,  9, 10]), ar...