In [1]:
import numpy as np
import nltk
import gensim
from unidecode import unidecode
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from glove import Glove
from gensim.models import word2vec
import csv
from pickle import dump
from time import time

In [2]:
confuc_model = Glove.load('Confuciembeddings1')

### Clean Dataset using Regex
Remove special characters from the data set and split into sentences
1. Potentially need to add tokenization for names
2. Consider method of retaining capitalization? Perhaps correct grammar later add


In [3]:
word_to_index = confuc_model.dictionary
word_to_vec_map = {word: confuc_model.word_vectors[confuc_model.dictionary[word]] for word in confuc_model.dictionary.keys()}

In [4]:
from keras.models import Model, Sequential
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
from keras.initializers import glorot_uniform
from keras.optimizers import Adam, SGD
from keras.utils import to_categorical
from keras.callbacks import TensorBoard
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
with open('Outputs/out_file1.csv') as csvfile:
    reader = csv.reader(csvfile)
    sentences2 = [item for sublist in reader for item in sublist]

sentences = [nltk.word_tokenize(sent) for sent in sentences2]
words = [item for sublist in sentences for item in sublist]

In [6]:
print(words)



In [7]:
### Generate Sequences
flat_sequence = [word_to_index[i] for i in words]

length = 10 + 1
sequences = list()

for i in range(length, len(flat_sequence)):
    seq = flat_sequence[i - length: i]
    line = seq
    sequences.append(line)

In [8]:
## Define Parameters
vocab_size = len(word_to_index) + 1
vector_dim = len(word_to_vec_map['man'])
maxLen = 10


In [9]:
## Define X, y
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [10]:
print(np.shape(y))
print(np.shape(X))

(61427, 6031)
(61427, 10)


In [11]:
print(word_to_index['confused'])
print(word_to_vec_map['confused'])

1968
[-4.79211117e-03 -1.12285739e-02 -2.35476633e-02 -1.83748371e-02
  2.60924625e-02 -7.34899544e-03  2.80112521e-02 -1.75595795e-02
 -1.71328746e-02 -1.04084225e-02  2.11416688e-02 -2.27650826e-02
 -1.46129098e-02  9.79521076e-03  2.83407798e-02 -9.95634903e-03
 -1.54248258e-02  1.07564602e-02  3.10458495e-02  1.00075236e-03
 -2.68718851e-02  2.36723643e-02 -4.38218147e-02  2.10320645e-02
 -2.21350151e-02 -1.98058595e-02 -1.96821685e-02 -1.20854323e-03
  1.68174471e-02 -3.55661293e-02 -3.88498306e-02 -1.98451160e-02
 -1.50926181e-05 -1.97132027e-02 -2.50312264e-02  2.22289869e-02
  2.81416378e-02 -1.44529539e-02  3.12234518e-02 -5.45129352e-03
 -2.68914620e-02  3.07381139e-02 -2.90633746e-02 -3.49366129e-02
 -1.04225835e-02 -1.59030633e-02  1.51510501e-02  1.16555977e-02
  2.28774481e-03 -7.43527140e-03  2.40830015e-03  2.85596909e-02
 -1.27472166e-02  1.49435479e-02  9.38551069e-03 -1.19287450e-02
  4.88249646e-03 -2.95267270e-02 -1.56713047e-02  2.41291241e-03
  1.67454577e-02  1.

In [12]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1
    emb_dim = word_to_vec_map['confused'].shape[0]
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]
    
    embedding_layer = Embedding(input_dim = vocab_len, output_dim = emb_dim, trainable = False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [13]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

weights[0][1][3] = 0.15722294


In [14]:

def DeepFucius(input_shape, word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)
    
    X = LSTM(1024, return_sequences = True)(embeddings)
    
    X = Dropout(0.2)(X)
    
    X = LSTM(1024, return_sequences = True)(X)
    
    X = Dropout(0.2)(X)
    
    X = LSTM(512, return_sequences = False)(X)
    
    X = Dropout(0.2)(X)
    
    X = Dense(500, activation = 'tanh')(X)
    
    X = Dense(vocab_len, activation = 'softmax')(X)
    
    model = Model(inputs = sentence_indices, outputs = X)
    
    return model
    

In [15]:
model = DeepFucius((maxLen,), word_to_vec_map, word_to_index)

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 100)           603100    
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 1024)          4608000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 1024)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 1024)          8392704   
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 1024)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 512)               3147776   
__________

In [16]:
opt = Adam(lr = 0.0005, beta_1 = 0.9, beta_2 = 0.999, epsilon = None, decay = 0.0000001, amsgrad = False)
model.compile(loss = 'categorical_crossentropy',optimizer = opt, metrics = ['accuracy'])


In [17]:
#logs_path = 'Logs/graph2'
#tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
#tensorboard = TensorBoard(logs_path.format(time()))

### Train the model

##### modify your parameters as necessary

In [19]:
model.fit(X, y, batch_size = 128, epochs = 1)



Epoch 1/1


<keras.callbacks.History at 0x7ff5021e46d8>

In [20]:
### Save Model Parameters and Tokenizer
model.save('model.h10') 


In [21]:
dump(word_to_index, open('tokenizer2.pkl','wb'))

In [22]:
print(X)

[[   0    1    2 ...    7    8    9]
 [   1    2    3 ...    8    9   10]
 [   2    3    4 ...    9   10   11]
 ...
 [   4    6  114 ... 2387  171  114]
 [   6  114  437 ...  171  114  450]
 [ 114  437 2387 ...  114  450   96]]
