In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, GRU
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda
from tensorflow.keras.models import Model
import numpy as np
import os
import matplotlib.pyplot as plt
import joblib
import pickle

In [None]:
# Mini-batch size used when the tf.data pipeline is batched further down.
BATCH_SIZE = 100
# Units per direction of the pre-attention bidirectional LSTM.
n_a = 64
# Units of the post-attention LSTM; also the width of the s0 / c0 model inputs.
n_s = 128

In [None]:
# Vocabulary cap passed to both tokenizers and the fixed sequence length used
# for padding / truncation (input length Tx and output length Ty are the same).
num_words=30000
max_length=30
Tx = max_length
Ty = max_length

path = 'Data/'

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# corpus order determines which pairs end up next to each other downstream.
dirlist = sorted(os.listdir(path))
human_sentences=[]
machine_sentences=[]
for File in dirlist:
    with open(os.path.join(path, File), 'r') as raw_lines:
        lineList = raw_lines.readlines()

    # Lines alternate human / machine (even index = human, odd index = machine).
    # zip() drops a trailing unpaired human line, so a file with an odd number
    # of lines can no longer shift the pairing of every file read after it.
    for human_line, machine_line in zip(lineList[0::2], lineList[1::2]):
        human_sentences.append(human_line)
        machine_sentences.append(machine_line)

# Tokenizer for the human (input) side; `tokenizerE` is kept as an alias.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, oov_token='<OOV>')
tokenizer.fit_on_texts(human_sentences)
human_word_index = tokenizer.word_index
human_reverse_word_index = {a:b for (b,a) in human_word_index.items()}
tokenizerE=tokenizer

# Separate tokenizer (and therefore a separate id space) for the machine side.
tokenizer2 = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, oov_token='<OOV>')

tokenizer2.fit_on_texts(machine_sentences)
machine_word_index = tokenizer2.word_index
machine_reverse_word_index = {a:b for (b,a) in machine_word_index.items()}

# Convert text to id sequences, then pad / truncate at the end to max_length.
human_sequences = tokenizer.texts_to_sequences(human_sentences)
human_padded = tf.keras.preprocessing.sequence.pad_sequences(human_sequences, maxlen=max_length, padding='post', truncating="post")

machine_sequences = tokenizer2.texts_to_sequences(machine_sentences)
machine_padded = tf.keras.preprocessing.sequence.pad_sequences(machine_sequences, maxlen=max_length, padding="post", truncating="post")


# Short aliases used by the rest of the notebook.
X = human_padded
Y = machine_padded
human_vocab = human_word_index
reverse_human_vocab = human_reverse_word_index
machine_vocab = machine_word_index
reverse_machine_vocab = machine_reverse_word_index


In [None]:
# Keep inputs and targets row-aligned: truncate both arrays to the shorter
# length so every input row has a target row. (The previous version called the
# non-existent ndarray method `Y.delete` and trimmed at most a single row.)
n_pairs = min(len(X), len(Y))
X = X[:n_pairs]
Y = Y[:n_pairs]

In [None]:
# Build the training pipeline: slice (X, Y) row-wise, shuffle with a buffer of
# BATCH_SIZE elements, then emit full batches only (the remainder is dropped).
dataset = (
    tf.data.Dataset.from_tensor_slices((X, Y))
    .shuffle(BATCH_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Sanity check: show the padded machine-side id sequences (trailing zeros are padding).
print(Y)

[[2269   94 1250 ...    0    0    0]
 [3778 3779 1375 ...    0    0    0]
 [ 527  114  154 ...    0    0    0]
 ...
 [   2   42   15 ...    0    0    0]
 [  32    0    0 ...    0    0    0]
 [  52   46   26 ...    0    0    0]]


In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves training batches from the module-level ``dataset``.

    Every item is ``([X, s0, c0], y)``: a batch of input id sequences, zero
    initial hidden / cell states of width ``n_s`` for the decoder LSTM, and the
    target ids transposed so that the time axis comes first.
    """

    def __init__(self, batch_size=100, dim=max_length, shuffle=False):
        """Store the configuration and open the first pass over ``dataset``.

        Args:
            batch_size: Used to compute the number of batches per epoch.
            dim: Sequence length; stored on the instance.
            shuffle: Stored on the instance; shuffling itself is performed by
                the ``dataset`` pipeline.
        """
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.len_per_epoch = int(len(X)/self.batch_size)
        self.shuffle = shuffle
        self._batches = None
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch."""
        return self.len_per_epoch

    def on_epoch_end(self):
        """Start a fresh pass over ``dataset`` for the next epoch."""
        self._batches = iter(dataset)

    def __getitem__(self, index):
        """Return the next batch; ``index`` is ignored because batches are streamed."""
        return self.__data_generation()

    def __data_generation(self):
        """Pull the next ``(X, y)`` batch and attach zero initial decoder states."""
        # Keep ONE iterator alive across calls. Re-creating it per call (as
        # `next(iter(dataset))` did) always returns the head of the dataset, so
        # with a shuffle buffer of BATCH_SIZE only roughly the first
        # 2 * BATCH_SIZE rows were ever used for training.
        try:
            X, y = next(self._batches)
        except StopIteration:
            # Pass exhausted before Keras signalled the epoch end: start over.
            self._batches = iter(dataset)
            X, y = next(self._batches)

        # Size the initial states from the batch actually returned, so they
        # match even if `batch_size` differs from the dataset's batch size.
        rows = int(X.shape[0])
        s0 = np.zeros((rows, n_s))
        c0 = np.zeros((rows, n_s))
        y = tf.transpose(y)

        return [X,s0,c0], y

In [None]:
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

--2020-06-30 14:49:43--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2020-06-30 14:49:44--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2020-06-30 14:49:45--  http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [appli

In [None]:
!unzip glove.twitter.27B.zip

Archive:  glove.twitter.27B.zip
  inflating: glove.twitter.27B.25d.txt  
  inflating: glove.twitter.27B.50d.txt  
  inflating: glove.twitter.27B.100d.txt  
  inflating: glove.twitter.27B.200d.txt  


In [None]:
# Map each GloVe token to its 200-d float32 vector. Each line of the file is
# "<token> <v1> <v2> ...".
embeddings_index = {}
# `with` guarantees the handle is closed even if parsing fails; the explicit
# encoding keeps the read independent of the platform's default locale.
with open('glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [None]:
embedding_dim = 200
# One row per tokenizer id; the extra row accounts for ids starting at 1.
vocab_rows = len(human_vocab) + 1
embedding_matrix = np.zeros((vocab_rows, embedding_dim))
for token, row in human_vocab.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # Tokens missing from the embedding index keep their all-zero row.
        continue
    embedding_matrix[row] = vector

In [None]:
# Frozen embedding layer initialised from the pre-built embedding matrix.
embeddingLayer = tf.keras.layers.Embedding(
    input_dim=len(human_vocab) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
    input_length=max_length,
)

In [None]:
# Layers shared by every call to one_step_attention (created once, so their
# weights are reused at each decoding step).
# Tiles the decoder state Tx times so it can be paired with each encoder step.
repeator = RepeatVector(Tx)
# Joins encoder outputs and the tiled decoder state along the feature axis.
concatenator = Concatenate(axis=-1)
# Maps each concatenated vector to a single unnormalised attention score.
densor = Dense(1, activation = "relu")
# Contracts attention weights with encoder outputs over axis 1.
dotor = Dot(axes = 1)

In [None]:
def one_step_attention(a, s_prev):
    """Compute one attention context from the encoder outputs and a decoder state.

    Args:
        a: Sequence of encoder outputs (the Bi-LSTM output in this notebook).
        s_prev: Previous hidden state of the post-attention LSTM.

    Returns:
        The result of ``dotor`` applied to the attention weights and ``a``.
    """
    # Tile the decoder state and pair it with every encoder output.
    tiled_state = repeator(s_prev)
    scores = densor(concatenator([a, tiled_state]))

    # Normalise the scores over axis 1, then take the weighted combination of `a`.
    weights = tf.nn.softmax(scores, axis=1)
    return dotor([weights, a])

In [None]:
# Post-attention decoder LSTM, created once and reused at every output step;
# return_state=True makes it also return its hidden and cell states.
post_activation_LSTM_cell = LSTM(n_s, return_state = True)

In [None]:
def include_yhat(context,out):
    """Re-weight ``context`` using the argmax of the previous prediction ``out``.

    Args:
        context: Attention context for the current step.
        out: Previous step's output distribution; its argmax over axis 1 is used.

    Returns:
        ``Dot(axes=1)`` of the softmax-normalised scores with ``context``.

    Note:
        New ``RepeatVector`` / ``Concatenate`` / ``Dense`` / ``Dot`` layers are
        created on every call.
        NOTE(review): if ``context`` has a single step on axis 1 (as the
        Dot(axes=1) in one_step_attention suggests), the softmax over axis 1 is
        taken over one element and would be all ones - confirm this is intended.
    """
    # Previous prediction as a float column: argmax -> (1, batch) -> (batch, 1).
    predicted_ids = tf.cast(
        tf.transpose(tf.convert_to_tensor([tf.math.argmax(out,axis=1)])),
        tf.dtypes.float32,
    )
    prediction_step = RepeatVector(1)(predicted_ids)

    # Score the context joined with the prediction, then normalise over axis 1.
    joined = Concatenate(axis=-1)([context,prediction_step])
    scores = Dense(1, activation='relu')(joined)
    weights = tf.nn.softmax(scores, axis=1)
    return Dot(axes=1)([weights,context])


In [None]:
def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """Build the attention-based sequence-to-sequence network as a Keras ``Model``.

    Args:
        Tx: Length of the input id sequence.
        Ty: Number of decoding steps (outputs) to produce.
        n_a: Units per direction of the pre-attention Bi-LSTM.
        n_s: Width of the ``s0`` / ``c0`` initial-state inputs.
        human_vocab_size: Accepted for interface compatibility; not used here
            (the input embedding comes from the module-level ``embeddingLayer``).
        machine_vocab_size: Number of output classes per decoding step.

    Returns:
        A ``Model`` taking ``[X, s0, c0]`` and returning the per-step softmax
        distributions stacked along the first axis (one entry per step).
    """
    X = Input(shape=(Tx,))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0

    outputs = []

    x = embeddingLayer(X)
    print("Shape of x after embedding:")
    print(x.shape)

    # Pre-attention encoder: one output per input step.
    a = Bidirectional(LSTM(n_a, return_sequences=True))(x)

    # One output projection shared by all decoding steps. Creating the Dense
    # inside the loop gave every step its own vocabulary-sized weight matrix.
    output_layer = Dense(machine_vocab_size)

    out = None
    for t in range(Ty):
        # Attention context for this step, conditioned on the decoder state.
        context = one_step_attention(a, s)
        if t != 0:
            # From the second step on, also feed back the previous prediction.
            context = include_yhat(context, out)

        # Advance the post-attention LSTM; keep both hidden and cell state.
        s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])

        # Project the hidden state to vocabulary scores and normalise them.
        p = output_layer(s)
        out = tf.nn.softmax(p, axis=1)

        outputs.append(out)

    # Stack the per-step outputs into a single tensor (step axis first).
    # The leftover debug code that followed the loop was removed: it built
    # layers that never reached the outputs and printed a debug banner.
    outputs = tf.convert_to_tensor(outputs)

    # Model takes three inputs and returns the stacked outputs.
    model = Model([X, s0, c0], outputs)

    return model

In [None]:
# Tokenizer ids run from 1 to len(vocab) and 0 is the padding id, so the output
# layer needs len(machine_vocab) + 1 classes; with len(machine_vocab) the
# highest id has no class. The embedding matrix uses the same "+ 1".
# NOTE: this rebinds `model` from the builder function to the built network, so
# re-run the cell defining `model(...)` before running this cell a second time.
model = model(Tx, Ty, n_a, n_s, len(human_vocab) + 1, len(machine_vocab) + 1)
model.summary()

NameError: ignored

In [None]:
# Adam optimiser with a learning rate of 0.0025.
opt = tf.keras.optimizers.Adam(learning_rate=0.0025)
# Targets are integer ids rather than one-hot vectors, hence the sparse loss;
# the network already applies softmax to its outputs.
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt)

In [None]:
generator = DataGenerator()
# Model.fit accepts a keras.utils.Sequence directly; fit_generator is
# deprecated in TF 2.x. The History object is kept for later inspection.
history = model.fit(generator, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x7f05f0a7fa90>

In [None]:
Examples = ['What are you doing?','Hey','Who are you?','Hope you are having a nice day']
sequenceE = tokenizer.texts_to_sequences(Examples)
# Use a dedicated name so the training array `X` is not overwritten; cells
# above that read `X` then still work when re-run after this one.
example_inputs = tf.keras.preprocessing.sequence.pad_sequences(sequences=sequenceE, maxlen=max_length, padding='post', truncating = "post")
print(example_inputs)
print(example_inputs.shape)
s0 = np.zeros((len(Examples),n_s))
c0 = np.zeros((len(Examples),n_s))
# Run all examples as ONE batch: the model's output has the decoding step on
# axis 0, so per-batch results must not be concatenated along that axis.
prediction = model.predict([example_inputs,s0,c0], batch_size=len(Examples))
# (step, example, vocab) -> (example, step, vocab)
perm = [1,0,2]
prediction = tf.transpose(prediction, perm=perm)
prediction = np.array(prediction)

# Most likely token id at every step of every example.
responses = np.argmax(prediction, axis=-1).tolist()

print(responses)
# Map ids back to words, skipping id 0 (padding).
final_responses = [
    [reverse_machine_vocab[token_id] for token_id in row if token_id != 0]
    for row in responses
]
print("\n")
for example, words in zip(Examples, final_responses):
    print(example,"\n")
    print("-->",' '.join(words),"\n\n")


In [None]:
import gensim
import pandas

In [None]:
# Unpack the archive holding ldamallet_model, corpus.pickle and id2word.pickle.
!tar -zxvf Downloads.tar.gz

tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.user.Zone.Identifier'
ldamallet_model
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.user.Zone.Identifier'
corpus.pickle
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.user.Zone.Identifier'
id2word.pickle


In [None]:
# Load the pre-built corpus and its id -> word dictionary.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a
# trusted source.
with open('corpus.pickle','rb') as corpus_file:
    corpus = pickle.load(corpus_file)
with open('id2word.pickle','rb') as dictionary_file:
    id2word = pickle.load(dictionary_file)

print(id2word)

Dictionary(68470 unique tokens: ['day', 'get', 'third', 'also', 'blah']...)


In [None]:
# Train a 25-topic gensim LdaModel on the unpickled corpus / dictionary.
# random_state is fixed at 100; the remaining keyword arguments (update_every,
# chunksize, passes, alpha, per_word_topics) are training options - see the
# gensim LdaModel documentation for their exact meaning.
# NOTE: this cell is long-running; the recorded run ended in KeyboardInterrupt.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=25, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

KeyboardInterrupt: ignored