In [1]:
import sys
import gensim, logging
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import string
import collections
from gensim.parsing.preprocessing import remove_stopwords
from keras.models import Sequential, load_model
from keras.layers import LSTM, Bidirectional, Dropout, Dense, Activation
from keras.optimizers import RMSprop, Adadelta, Adam
from keras.callbacks import ReduceLROnPlateau
from keras import callbacks
from keras.utils import multi_gpu_model
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
from sklearn.feature_extraction import stop_words

Using TensorFlow backend.


In [2]:
logging.basicConfig(level=logging.INFO)

In [None]:
# Translation table that deletes every ASCII punctuation character except the
# full stop, which is kept because later cells use it as the sentence delimiter.
puncts = ''.join(ch for ch in string.punctuation if ch != '.')
punct = str.maketrans(dict.fromkeys(puncts))

In [None]:
# Read the whole raw corpus into memory. The context manager guarantees the
# file handle is closed again (the bare open(...).read() left it dangling).
with open('iotr.txt', 'r') as corpus_file:
    data = corpus_file.read()

In [None]:
# Load the pre-built sentence table; later cells read its `Sentences` and
# `Context` columns.
# NOTE(review): read_pickle executes whatever the pickle contains — only load
# "lor_sents.pickle" when it comes from a trusted source.
mysents = pd.read_pickle("lor_sents.pickle")

In [None]:
mysents

In [None]:
def fullstoper(s):
    """Return a copy of the token list `s` whose last token ends with a full stop.

    Parameters
    ----------
    s : list of str
        Tokens of one sentence group. The input list is not modified.

    Returns
    -------
    list of str
        New list; the final token has '.' appended. An empty input yields an
        empty list instead of raising IndexError.
    """
    if len(s) == 0:
        # Nothing to terminate; avoid indexing s[-1] on an empty sequence.
        return []
    return s[:-1] + [s[-1] + '.']

In [None]:
mysents['FullSents'] = mysents.Sentences.apply(fullstoper)

In [None]:
#contexts = set(tuple(x) for x in mysents.Context.values)
#clusters = {idx:list(x) for idx,x in enumerate(contexts)}

In [None]:
sents_width = [len(x) for x in mysents.FullSents]

In [None]:
continuous_sents = [x for y in mysents.FullSents for x in y]

In [None]:
# Build sliding-window training samples over the flattened token stream.
#   sequence_ip[i] : the seq_len consecutive tokens starting at position i
#   target_op[i]   : the token that immediately follows that window
#   context_ip[i]  : mysents.Context of the row the window is attributed to
seq_len = 8
# Index of the mysents row the current position is attributed to.
# NOTE(review): this name shadows the built-in breakpoint() for the rest of the notebook.
breakpoint = 0
# Running total of sents_width over the rows already passed.
acc = 0
sequence_ip, context_ip, target_op = [],[],[]

for position in range(len(continuous_sents) - seq_len):
    sequence_ip.append(continuous_sents[position : position + seq_len])
    target_op.append(continuous_sents[position + seq_len])
    # Switch to the next row once the window start is within seq_len/2 tokens of
    # the end of the current row. At most one row is advanced per position.
    if position > sents_width[breakpoint] + acc - seq_len/2:
        acc += sents_width[breakpoint]
        breakpoint += 1
    #print("position ", position, " : ", position +seq_len, "breakpiont ",breakpoint)
    # NOTE(review): label-based lookup — assumes mysents has a default 0..n-1 index; confirm.
    context_ip.append(mysents.Context[breakpoint])

In [None]:
# One row per training sample: input window, its context words, and the target word.
lordf = pd.DataFrame(
    {
        'Sequence_ip': sequence_ip,
        'Context_ip': context_ip,
        'Target_op': target_op,
    }
)
# Work on the first 100000 samples only.
lordf_100k = lordf.iloc[:100000]

In [None]:
# Generate word -> id and id -> word lookups for every word the model reads or emits.
# Target words are counted too: the target of the last retained row sits one
# position past every input window, so a word that occurs only there would be
# missing from the vocabulary and break the vocab2idx lookup when targets are encoded.
sentvocab = collections.Counter([x for y in lordf_100k['Sequence_ip'] for x in y])
ctxvocab = collections.Counter([x for y in lordf_100k['Context_ip'] for x in y])
tgtvocab = collections.Counter(lordf_100k['Target_op'])
# Sorting the merged counter yields its distinct keys in a deterministic order.
vocab = sorted(sentvocab + ctxvocab + tgtvocab)
vocab2idx = {v:idx for idx,v in enumerate(vocab)}
idx2vocab = dict(enumerate(vocab))

In [None]:
# Pre-allocate the boolean model tensors.
#   sequence_arr: (samples, 1 context slot + seq_len word slots, vocabulary size)
#   target_arr:   (samples, vocabulary size)
n_samples, n_vocab = len(lordf_100k), len(vocab)
sequence_arr = np.zeros(shape=(n_samples, seq_len + 1, n_vocab), dtype=bool)
target_arr = np.zeros(shape=(n_samples, n_vocab), dtype=bool)

In [None]:
# Encode every sample: slot 0 is a multi-hot of the context words, slots
# 1..seq_len are one-hot input words, and target_arr holds the one-hot target.
for row, (context_words, window_words, target_word) in enumerate(
        zip(lordf_100k['Context_ip'], lordf_100k['Sequence_ip'], lordf_100k['Target_op'])):
    context_ids = [vocab2idx[token] for token in context_words]
    sequence_arr[row, 0, context_ids] = True
    target_arr[row, vocab2idx[target_word]] = True
    for slot, token in enumerate(window_words, start=1):
        sequence_arr[row, slot, vocab2idx[token]] = True

In [None]:
sequence_arr.shape

In [None]:
'''
model = Sequential()
model.add(Bidirectional(LSTM(256, activation='relu'), input_shape=(seq_len+1, len(vocab))))
model.add(Dropout(0.2))
model.add(Dense(len(vocab)))
model.add(Activation('softmax'))
'''

In [None]:
opt_rms = RMSprop()
opt_ada = Adadelta()
opt_adam = Adam()

In [None]:
model = load_model('../../data/contextlstmv2.h5')

In [None]:
model = load_model('../../data/contextlstmv2_100k.h5')

In [None]:
parallel_model = multi_gpu_model(model, gpus=4)
parallel_model.compile(optimizer=opt_ada, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
lr_reducer = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=1, factor=0.2, min_lr=0)
batch_size = 128
num_epochs = 20

In [None]:
parallel_model.summary()

In [3]:
embeddingmodel = gensim.models.word2vec.Word2Vec.load('../w2v_wikimodel.hd5')

INFO:gensim.utils:loading Word2Vec object from ../w2v_wikimodel.hd5
INFO:gensim.utils:loading wv recursively from ../w2v_wikimodel.hd5.wv.* with mmap=None
INFO:gensim.utils:loading vectors from ../w2v_wikimodel.hd5.wv.vectors.npy with mmap=None
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loading vocabulary recursively from ../w2v_wikimodel.hd5.vocabulary.* with mmap=None
INFO:gensim.utils:loading trainables recursively from ../w2v_wikimodel.hd5.trainables.* with mmap=None
INFO:gensim.utils:loading syn1neg from ../w2v_wikimodel.hd5.trainables.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded ../w2v_wikimodel.hd5


In [None]:
def clean(xx):
    """Keep only the whitespace-separated tokens of `xx` that are purely alphabetic.

    Leading and trailing full stops are ignored for the check, so a
    sentence-final token such as 'ring.' survives (with its full stop intact).
    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for token in xx.split():
        if token.strip('.').isalpha():
            kept.append(token)
    return ' '.join(kept)

# Delete every punctuation mark except the full stop, then keep alphabetic tokens only.
cdata = clean(data.translate(punct))

# Pre-processing for the word2vec contexts: lower-case the text, pass it through
# remove_stopwords, split it on full stops and keep sentences of more than five tokens.
cdata_w2v = remove_stopwords(cdata.lower())
sents_w2v = []
for chunk in cdata_w2v.split('.'):
    tokens = chunk.lower().split()
    if len(tokens) > 5:
        sents_w2v.append(tokens)
# Sorted list of the distinct words across the kept sentences.
vcab = sorted({w for s in sents_w2v for w in s})

In [None]:
# Restrict the corpus vocabulary to words the wiki embedding model also contains,
# then collect those words and their vectors as two parallel lists.
wiki_vocab = embeddingmodel.wv.vocab
common_vocab = [word for word in vcab if word in wiki_vocab]
vecs = dict((word, embeddingmodel.wv.get_vector(word)) for word in common_vocab)
wvvocab = list(vecs)
wvvecs = list(vecs.values())

In [4]:
# evaluation.
vecs = {k:embeddingmodel.wv.get_vector(k) for k in embeddingmodel.wv.vocab}
wvvocab = list(vecs.keys())
wvvecs = list(vecs.values())

In [None]:
km = KMeans(n_clusters=100, random_state=0)
km.fit(wvvecs)

In [5]:
from tfidf_importance import get_results

In [6]:
tfidfs = get_results()

Tfidf. . .  {'woods': 0.22026683847554496, 'way': 0.15791757241453094, 'town': 0.24971313860547795, 'sword': 0.1975088672730664, 'spur': 0.2965002975303856, 'road': 0.1714632547197942, 'meantime': 0.2941718018418234, 'master': 0.18429834087460925, 'hobbits': 0.17046968232943605, 'great': 0.1427824920447543, 'goblins': 0.2166857650050281, 'goblin': 0.2556262417583239, 'elf': 0.2431604117397801, 'dwarves': 0.3716129345022729, 'deal': 0.2260195589358893, 'dark': 0.15568981456220948, 'chief': 0.23152211392340585, 'characters': 0.31538721905394335}
Tfidf. . .  {'staff': 0.3184685386863208, 'others': 0.2456691006727009, 'line': 0.2933623370134616, 'laughter': 0.6838470572398085, 'floor': 0.2944814423950979, 'dwarves': 0.24335175977273024, 'door': 0.24932210593208687, 'below': 0.2764331980101654}
Tfidf. . .  {'way': 0.4097664555307104, 'guards': 0.6322530436922931, 'end': 0.44709663973009917, 'dwarves': 0.48213290222270005}
Tfidf. . .  {'pull': 0.6975436055234616, 'jug': 0.7165423353810502}
T

In [34]:
def evaluate_closeness(context_word, sentence):
    """Tf-idf-weighted mean similarity between `context_word` and a sentence.

    Parameters
    ----------
    context_word : str
        Word compared against every word of the sentence.
    sentence : dict
        Maps each sentence word to its tf-idf weight.

    Returns
    -------
    float
        sum(weight * similarity) / sum(weight), using the similarity reported by
        `embeddingmodel.wv.similarity`. Returns 0.0 when the sentence is empty or
        its weights sum to zero, rather than raising ZeroDivisionError.
    """
    total_weight = sum(sentence.values())
    if not total_weight:
        # No weighted evidence at all: the weighted mean is undefined.
        return 0.0
    weighted_similarity = sum(
        tfidf * embeddingmodel.wv.similarity(context_word, word)
        for word, tfidf in sentence.items()
    )
    return weighted_similarity / total_weight

In [35]:
for sent in tfidfs:
    print(sent[0], "..", evaluate_closeness(sent[0], sent[1]))

gandalf .. 0.07034807089068781
ring .. 0.09351642233281396
war .. 0.0404397225474678
friends .. -0.0005761456848445369
snake .. 0.19579710321674806
book .. 0.03020669099127537
home .. 0.09126108632376154
king .. 0.046763618660544105
hobbit .. 0.0589704222870908


In [37]:
tfidfs[3]

['friends', {'pull': 0.6975436055234616, 'jug': 0.7165423353810502}]

In [None]:
kmclusters = km.predict(wvvecs)

In [None]:
def get_words_from_clustercenters(num, topn=5):
    """Return the embedding-vocabulary words nearest to one KMeans cluster centre.

    Parameters
    ----------
    num : int
        Index into `km.cluster_centers_`.
    topn : int, optional
        How many nearest words to return (default 5, the previously fixed value).

    Returns
    -------
    list of str
        The words reported by `embeddingmodel.wv.most_similar`, in the order
        it returns them; the similarity scores are dropped.
    """
    neighbours = embeddingmodel.wv.most_similar(
        positive=[km.cluster_centers_[num]], topn=topn
    )
    return [entry[0] for entry in neighbours]

# Representative words for every fitted cluster. The cluster count is taken
# from the fitted model so it cannot drift from the KMeans configuration.
clusterlookup = {k:get_words_from_clustercenters(k) for k in range(len(km.cluster_centers_))}

In [None]:
def onehot2word(arr):
    """Decode an encoded array to a word: look up the flat argmax position in `idx2vocab`."""
    position = arr.argmax()
    return idx2vocab[position]

def word2onehot(word):
    """Encode `word` as a boolean one-hot row of shape (1, len(vocab)).

    Raises KeyError when the word is not a key of `vocab2idx`.
    """
    column = vocab2idx[word]
    onehot = np.zeros((1, len(vocab)), dtype=bool)
    onehot[0, column] = True
    return onehot

def context2onehot(topic):
    """Encode a topic word as a multi-hot context row of shape (1, len(vocab)).

    The topic's embedding vector is assigned to a KMeans cluster and the words
    stored for that cluster in `clusterlookup` are switched on.

    Cluster words are drawn from the embedding model's vocabulary, which need
    not be a subset of the training vocabulary; words without an entry in
    `vocab2idx` are skipped instead of raising KeyError.

    Raises whatever `embeddingmodel.wv.get_vector` raises when `topic` itself
    is unknown to the embedding model.
    """
    topicvec = embeddingmodel.wv.get_vector(topic)
    cluster_id = km.predict(topicvec.reshape(1, -1))[0]
    known_ids = [vocab2idx[w] for w in clusterlookup[cluster_id] if w in vocab2idx]
    varr = np.zeros((1, len(vocab)), dtype=bool)
    varr[0, known_ids] = True
    return varr

def prob2onehot(prob):
    """Collapse a score array into a boolean one-hot row of shape (1, len(vocab)).

    The flat argmax of `prob` is the single position set to True.
    """
    winner = prob.argmax()
    onehot = np.zeros((1, len(vocab)), dtype=bool)
    onehot[0, winner] = True
    return onehot

def headstart(seed_text='hobbits lived in the woods and an elf'):
    """One-hot encode a seed sentence used to prime text generation.

    Parameters
    ----------
    seed_text : str, optional
        Whitespace-separated seed words. The default is the eight-word sentence
        that used to be hard-coded here. Every word must be encodable by
        `word2onehot`.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, number of seed words, vocabulary size).
    """
    hswords = [word2onehot(w) for w in seed_text.split()]
    # Each encoded word is (1, V); stacking gives (words, 1, V), so swap the
    # first two axes to get a single batch of `words` time steps.
    return np.array(hswords).transpose(1,0,2)

In [None]:
hints = 'hobbits gollum adventure king ring war friends war book home'.split()

In [None]:
def gen_text(model, word_limit, context_words):
    """Generate text word by word, switching context topic at each sentence end.

    Parameters
    ----------
    model : object with a `predict(array)` method
        Called on an array of shape (1, 1 + seed length, vocabulary size).
    word_limit : int
        Upper bound on the number of words generated after the seed.
    context_words : list of str
        Topic words. The first one conditions the opening sentence; each time
        a predicted word contains a full stop the next topic is taken.
        The list is NOT modified (the previous version consumed it with pop()).

    Returns
    -------
    str
        The seed words followed by the generated words, space separated.
        Generation ends early once every topic has been used up.

    Raises
    ------
    IndexError
        If `context_words` is empty.
    """
    # Private copy so the caller's list survives and can be reused.
    pending_contexts = list(context_words)
    seed_arr = headstart()
    context_arr = context2onehot(pending_contexts.pop(0))
    # Model input: row 0 is the context vector, the remaining rows are the words.
    window = np.concatenate((context_arr, seed_arr[0]))[np.newaxis, :]
    # Only the seed words belong in the transcript. Row 0 of the window is a
    # multi-hot context vector, and decoding it through argmax would emit an
    # arbitrary word that was never part of the text.
    generated = list(seed_arr[0])
    for _ in range(word_limit):
        if not pending_contexts:
            # No topic left to condition on; nothing more would be generated.
            break
        predicted_arr = prob2onehot(model.predict(window))
        generated.append(predicted_arr)
        predicted_word = onehot2word(predicted_arr)
        if '.' in predicted_word:
            # Sentence finished: move on to the next topic.
            context_arr = context2onehot(pending_contexts.pop(0))
        # Slide the window: keep the context in row 0, drop the oldest word
        # and append the word just predicted.
        window = np.concatenate((context_arr, window[0, 2:, :], predicted_arr)).reshape(window.shape)
        print(predicted_word)
    return ' '.join(onehot2word(w) for w in generated)

In [None]:
# Train the multi-GPU wrapper; the epoch count comes from the shared `num_epochs`
# setting instead of repeating its value as a literal.
parallel_model.fit(sequence_arr, target_arr, batch_size=batch_size, epochs=num_epochs, callbacks=[lr_reducer], validation_split=0.10)

In [None]:
# NOTE(review): 'Hobbits' is capitalised here but lower-case in the other `hints`
# definitions; confirm the embedding vocabulary contains the capitalised form.
hints = 'Hobbits gollum adventure king ring war friends war book home'.split()

# Train in bursts of 3 epochs and checkpoint after every burst.
# NOTE(review): lr_reducer is configured with patience=5 but each fit() call runs
# only 3 epochs; if the callback restarts its counters on every fit() call the
# learning rate is never reduced — confirm.
for ep in range(400):
    parallel_model.fit(sequence_arr, target_arr, batch_size=batch_size, epochs=3, callbacks=[lr_reducer], validation_split=0.10)
    # Checkpoint the template model, not the multi-GPU wrapper. Keras documents
    # that saving must go through the model passed to multi_gpu_model; this
    # file is later read with load_model and wrapped by multi_gpu_model again.
    model.save('../../data/contextlstmv2_100k.h5')
    #print(gen_text(parallel_model, 300, hints))
    print("Epoch", ep)

In [None]:
gen_text(model, 300, hints)

In [None]:
hints = 'hobbits gollum adventure king ring war friends war book home'.split()
gen_text(model, 300, hints)

In [None]:
# Train the single-device model; the epoch count comes from the shared `num_epochs`
# setting instead of repeating its value as a literal.
model.fit(sequence_arr, target_arr, batch_size=batch_size, epochs=num_epochs, callbacks=[lr_reducer], validation_split=0.10)