# CS671 - k-Match-LSTM

## Clustering

In [None]:
import os
import nltk
import numpy as np

from includes import config
from includes.utils import is_outlier

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Apply the project's matplotlibrc to the live rcParams object. Rebinding
# `mpl.rcParams` to a new object leaves every module that imported rcParams
# by name still using the old settings; rc_file updates it in place.
mpl.rc_file("includes/matplotlibrc")

### Clustering data based on tf-idf

#### Choosing the number of clusters

#### Clustering data

### Clustering based on question type

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# English Snowball stemmer.
# NOTE(review): `stemmer` is not referenced by any other visible cell.
stemmer = nltk.stem.SnowballStemmer('english')

In [None]:
def clean_text(text):
    """Tokenise `text` and return its alphabetic tokens in lower case.

    Surrounding whitespace is stripped and every character with a code point
    of 128 or above is replaced by a space before the text is split with
    nltk.wordpunct_tokenize. Tokens that are not purely alphabetic are
    dropped.
    """
    ascii_only = ''.join(
        [ch if ord(ch) < 128 else ' ' for ch in text.strip()]
    )
    words = nltk.Text(nltk.wordpunct_tokenize(ascii_only))

    cleaned = []
    for word in words:
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

In [None]:
# Tokenise every line of the training questions file.
with open("data/squad/train.questions") as f:
    data = [clean_text(line.lower()) for line in f]

# The token lists differ in length, so the rows are ragged. Ask for an object
# array explicitly: newer NumPy releases raise ValueError when a ragged
# sequence is converted without dtype=object.
data = np.array(data, dtype=object)

In [None]:
# Tokenise every line of the validation questions file.
with open("data/squad/val.questions") as f:
    val_data = [clean_text(line) for line in f]

# The token lists differ in length, so the rows are ragged. Ask for an object
# array explicitly: newer NumPy releases raise ValueError when a ragged
# sequence is converted without dtype=object.
val_data = np.array(val_data, dtype=object)

In [None]:
def get_label(line):
    """Return the question-type label of a tokenised question.

    The label is decided by the first token of `line` that is one of the
    known question words: what=0, where=1, who=2, how=3, which=4.

    Args:
        line: iterable of lower-case tokens (NumPy array or plain list).

    Returns:
        The integer label of the first question word found, or 5 when the
        question contains none of them.
    """
    words = {"what": 0, "where": 1, "who": 2, "how": 3, "which": 4}

    # A single left-to-right scan finds the earliest question word directly,
    # without using a caught exception to signal "no match".
    for token in line:
        if token in words:
            return words[token]

    # No question word present: catch-all class.
    return 5

In [None]:
# Assign a question-type label to every tokenised question of both splits.
labels = np.array([get_label(np.array(line)) for line in data])
labels_val = np.array([get_label(np.array(line)) for line in val_data])

In [None]:
# Persist both label arrays as text: one label per line, no trailing newline.
for labels_path, split_labels in (
    ("data/squad/train.labels", labels),
    ("data/squad/val.labels", labels_val),
):
    with open(labels_path, "w") as f:
        f.write("\n".join([str(label) for label in split_labels]))

In [None]:
# Label distribution of the training split (top) and validation split (bottom).
f, axes = plt.subplots(2, 1)

for axis, split_labels in zip(axes, (labels, labels_val)):
    axis.hist(split_labels, bins=config.n_clusters, rwidth=0.7)

plt.show()

## Match-LSTM for Machine Comprehension

### Train

In [None]:
import os
import sys
import numpy as np
from tqdm import tqdm

from includes import config
from includes.utils import squad_dataset, evaluate

from graph import Graph
from encoder import Encoder
from decoder import Decoder

# Set the native log level before TensorFlow is imported: the C++ logger reads
# TF_CPP_MIN_LOG_LEVEL once, on its first message, and that can already
# happen while the package is being imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

In [None]:
# Working directory at the time this cell runs; used as the dataset root below.
root_dir = os.getcwd()

In [None]:
# Array stored under the "glove" key of the file at config.embed_path; it is
# passed to Graph below as its first argument.
words_embedding = np.load(config.embed_path)["glove"]

In [None]:
# TensorFlow session shared by model initialisation, training and evaluation.
sess = tf.Session()

In [None]:
# Encoder and decoder are both built with config.encoding_size; the decoder
# additionally receives config.n_clusters.
encoder = Encoder(
    config.encoding_size
)
decoder = Decoder(
    config.encoding_size,
    config.n_clusters
)

In [None]:
# Assemble the model from the embedding array, the encoder and the decoder.
graph = Graph(
    words_embedding,
    encoder,
    decoder
)

In [None]:
# Keep init_model's result: the epoch-0 evaluation further down is only
# recorded in `scores` when it is falsy.
# NOTE(review): presumably truthy when a saved checkpoint was restored — confirm.
init = graph.init_model(sess)

In [None]:
# Training split, batched with config.batch_size.
train_data = squad_dataset(
    config.questions_train,
    config.contexts_train,
    config.answers_train,
    config.labels_train,
    root=root_dir + "/",
    batch_size=config.batch_size
)

# Validation split, batched with config.val_batch_size.
val_data = squad_dataset(
    config.questions_val,
    config.contexts_val,
    config.answers_val,
    config.labels_val,
    root=root_dir + "/",
    batch_size=config.val_batch_size
)

In [None]:
def print_score(epoch, score):
    """Print the evaluation metrics of one epoch on a single line.

    Args:
        epoch: epoch number shown in the message.
        score: nested sequence laid out as ``((em, (em@1, em@2)), f1)``.
    """
    # A single parenthesised argument prints identically on Python 2 and
    # also runs on Python 3, unlike the print statement.
    print("\nepoch: %d, f1: %.4f, em: %.4f, em@1: %.4f, em@2: %.4f\n" % (
        epoch, score[1], score[0][0], score[0][1][0], score[0][1][1]
    ))

In [None]:
# Resume the loss and score histories from disk when they exist; otherwise
# start with empty lists.
losses = list(np.load(config.loss_path)) if os.path.exists(config.loss_path) else []

scores = list(np.load(config.scores_path)) if os.path.exists(config.scores_path) else []

In [None]:
# Best exact-match (EM) score recorded so far. The training loop compares
# `scores[-1][0][0]` against this value, so the same field is read here
# (index [0][1] holds the em@1/em@2 pair that print_score prints). The
# `or [0]` fallback covers a first run with no saved scores, where taking the
# maximum of an empty list would raise ValueError.
best_em = max([score[0][0] for score in scores] or [0])

# Evaluate the current weights once as the epoch-0 baseline; the result is
# only recorded in `scores` when `init` is falsy.
score = evaluate(graph, sess, val_data, "evaluating ... epoch: 0")
if not init:
    scores.append(score)
print_score(0, score)

In [None]:
# The [:1] slice restricts each execution of this cell to a single epoch.
for epoch in range(config.num_epochs)[:1]:
    epoch_number = epoch + 1

    epoch_loss = graph.run_epoch(train_data, epoch, sess, max_batch_epochs=-1)
    losses.append(epoch_loss)

    epoch_score = evaluate(
        graph, sess, val_data, "evaluating ... epoch: %d" % epoch_number
    )
    scores.append(epoch_score)
    print_score(epoch_number, epoch_score)

    # The checkpoint and both history files are written only when the
    # exact-match score is at least as good as the best one so far.
    if epoch_score[0][0] >= best_em:
        graph.save_model(sess)
        best_em = epoch_score[0][0]

        np.save("data/plots/loss.npy", np.array(losses))
        np.save("data/plots/scores.npy", np.array(scores))

### Plots

In [None]:
import os
import numpy as np

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Apply the project's matplotlibrc to the live rcParams object. Rebinding
# `mpl.rcParams` to a new object leaves every module that imported rcParams
# by name still using the old settings; rc_file updates it in place.
mpl.rc_file("includes/matplotlibrc")

In [None]:
# (directory suffix, display name) for each model whose results are plotted.
models = [("match-lstm", "MatchLSTM"), ("k-match-lstm", "K-MatchLSTM"), ("weighted-k-match-lstm", "Weighted K-MatchLSTM")]

In [None]:
# Make sure the output directory for the figures exists.
plots_dir = "plots/"
if not os.path.exists(plots_dir):
    os.makedirs(plots_dir)

In [None]:
# NOTE(review): this value is overwritten by the first iteration of the loss
# plotting loop in the next cell, so the load appears redundant.
losses = np.load("data/plots.k-match-lstm/loss.npy")

In [None]:
# Number of consecutive mini-batch losses averaged into one plotted point.
window = 40

for model in models:
    losses = np.load("data/plots." + model[0] + "/loss.npy").reshape(-1)

    # Drop the trailing partial window so the array reshapes cleanly. Floor
    # division keeps the counts integral on Python 3 too, where `/` would
    # produce floats that cannot be used as slice bounds or shapes.
    n_windows = losses.size // window
    smoothed = losses[:n_windows * window].reshape(n_windows, window).mean(axis=1)

    plt.plot(smoothed)

    plt.title("Model: %s" % model[1])

    plt.xlabel("Number of mini-batch iterations (x40)")
    plt.savefig("plots/loss." + model[0] + ".png")
    plt.show()
    # Start the next model on an empty figure, as the accuracy plots do.
    plt.clf()

In [None]:
# One figure per model with its four validation metrics plotted per epoch.
for model in models:
    directory, display_name = model
    scores = np.load("data/plots." + directory + "/scores.npy")

    curves = (
        ("em", [e[0][0] for e in scores]),
        ("em1", [e[0][1][0] for e in scores]),
        ("em2", [e[0][1][1] for e in scores]),
        ("f1", [e[1] for e in scores]),
    )
    for curve_label, values in curves:
        plt.plot(values, label=curve_label)

    plt.title("Model: %s" % display_name)
    plt.xlabel("Number of Epochs")
    plt.legend()

    plt.savefig("plots/accuracy." + directory + ".png")
    plt.show()
    plt.clf()

In [None]:
# Overlay the exact-match curve of every model on a single figure.
for model in models:
    directory, display_name = model
    scores = np.load("data/plots." + directory + "/scores.npy")

    em_per_epoch = [e[0][0] for e in scores]
    plt.plot(em_per_epoch, label=display_name)

plt.title("EM Comparision over Epochs")
plt.xlabel("Number of Epochs")
plt.ylabel("EM Score")
plt.legend()

plt.savefig("plots/comparision.png")
plt.show()

### Interactive Demo

In [1]:
import os
import sys
import nltk
import numpy as np
from tqdm import tqdm

from includes import config
from includes.utils import squad_dataset, evaluate, initialize_vocab

from graph import Graph
from encoder import Encoder
from decoder import Decoder

# Set the native log level before TensorFlow is imported: the C++ logger reads
# TF_CPP_MIN_LOG_LEVEL once, on its first message, and that can already
# happen while the package is being imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# Working directory at the time this cell runs; used as the dataset root below.
root_dir = os.getcwd()

In [3]:
# Array stored under the "glove" key of the file at config.embed_path; it is
# passed to Graph below as its first argument.
words_embedding = np.load(config.embed_path)["glove"]

In [4]:
# TensorFlow session shared by model initialisation and prediction.
sess = tf.Session()

In [5]:
# Encoder and decoder are both built with config.encoding_size; the decoder
# additionally receives config.n_clusters.
encoder = Encoder(
    config.encoding_size
)
decoder = Decoder(
    config.encoding_size,
    config.n_clusters
)

In [6]:
# Assemble the model from the embedding array, the encoder and the decoder.
graph = Graph(
    words_embedding,
    encoder,
    decoder
)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.

Using Adam Optimizer with lr: 0.002000, decay_steps: 1000, decay_rate: 0.920000



In [7]:
# Initialise the model in `sess`; the recorded output of this cell shows the
# parameters being restored from a saved checkpoint.
init = graph.init_model(sess)


Initializing model from model/weighted-k-match-lstm/trained_model.chk ... 
INFO:tensorflow:Restoring parameters from model/weighted-k-match-lstm/trained_model.chk
Initialized model



In [8]:
def clean_text(text):
    """Tokenise `text` and return its alphabetic tokens in lower case.

    Surrounding whitespace is stripped and every character with a code point
    of 128 or above is replaced by a space before the text is split with
    nltk.wordpunct_tokenize. Tokens that are not purely alphabetic are
    dropped.
    """
    ascii_only = ''.join(
        [ch if ord(ch) < 128 else ' ' for ch in text.strip()]
    )
    words = nltk.Text(nltk.wordpunct_tokenize(ascii_only))

    cleaned = []
    for word in words:
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

In [9]:
def get_label(line):
    """Return the question-type label of a tokenised question.

    The label is decided by the first token of `line` that is one of the
    known question words: what=0, where=1, who=2, how=3, which=4.

    Args:
        line: iterable of lower-case tokens (NumPy array or plain list).

    Returns:
        The integer label of the first question word found, or 5 when the
        question contains none of them.
    """
    words = {"what": 0, "where": 1, "who": 2, "how": 3, "which": 4}

    # A single left-to-right scan finds the earliest question word directly,
    # without using a caught exception to signal "no match".
    for token in line:
        if token in words:
            return words[token]

    # No question word present: catch-all class.
    return 5

In [10]:
# English Snowball stemmer.
# NOTE(review): `stemmer` is not referenced by any other visible cell.
stemmer = nltk.stem.SnowballStemmer('english')

In [11]:
# Vocabulary built from config.vocab_path; get_answer indexes `vocab` by word
# to obtain token ids.
# NOTE(review): rev_vocab is presumably the inverse (id -> word) lookup — confirm.
vocab, rev_vocab = initialize_vocab(config.vocab_path)

In [12]:
def get_answer(context, question, graph, sess):
    """Return the answer span that `graph` predicts for `question` in `context`.

    Both texts are split on whitespace and mapped to ids through the
    module-level `vocab`; words missing from it are mapped to id 2. The
    question-type label comes from get_label(clean_text(question)).

    Args:
        context: passage text; the answer is a slice of its whitespace tokens.
        question: question text.
        graph: model object exposing ``predict(sess, batch, msg=None)``.
        sess: session handed through to ``graph.predict``.

    Returns:
        The context tokens from the predicted start index to the predicted
        end index (inclusive), joined by single spaces.
    """
    unknown_id = 2  # id substituted for words that are not in `vocab`

    # Split once: the same tokens are used for the ids and for the answer slice.
    context_words = context.split()
    context_ids = [
        vocab[word] if word in vocab else unknown_id for word in context_words
    ]
    question_ids = [
        vocab[word] if word in vocab else unknown_id
        for word in question.split()
    ]

    label = get_label(np.array(clean_text(question)))

    # One batch holding one (question, context, answer, label) sample, with
    # [0, 0] standing in for the answer. The entries differ in length, so an
    # object array is requested explicitly; newer NumPy releases raise
    # ValueError for ragged input otherwise.
    batch = np.array(
        [[[question_ids, context_ids, [0, 0], label]]], dtype=object
    )
    answer = graph.predict(sess, batch, msg=None)[0][0].astype(int)

    return " ".join(context_words[answer[0]:answer[1] + 1])

In [13]:
context = "Michael was a Norman who followed King William I to England, and became Sheriff of Worcestershire and a royal official under him and Kings William II and Henry I. He was a native of Normandy and moved to England shortly after the Norman conquest of England in 1066, and was appointed sheriff in about 1069. Little is known of his family in Normandy, who were not prominent. Although Urse's lord in Normandy was present at the Battle of Hastings, there is no evidence that Urse took part in the invasion of England in 1066. Urse built the earliest form of Worcester Castle in Worcester, which encroached on the cathedral cemetery there, earning him a curse from the Archbishop of York. Urse helped to put down a rebellion against King William I in 1075, and quarrelled with the Church in his county over the jurisdiction of the sheriffs. He continued in the service of William's sons after the king's death, and was appointed constable under William II and marshal under Henry I."
questions = ["Who was Michael?", "Who did Michael follow?", "Who became the Sheriff?", "When was he appointed Sheriff?", "What did Urse build?", "Where did Urse build the earliest form of Worcester Castle?", "Where did Michael move to?", "Who helped put down a rebellion against King William I"]

In [14]:
# Show the context once, then the model's answer to each demo question.
# Each print takes a single parenthesised argument so the output is the same
# on Python 2 and the cell also runs on Python 3.
print("Context   :  %s" % context)
print("")
for question in questions:
    print("Question  :  %s" % question)
    print("Answer    :  %s" % get_answer(context, question, graph, sess))

Context   :  Michael was a Norman who followed King William I to England, and became Sheriff of Worcestershire and a royal official under him and Kings William II and Henry I. He was a native of Normandy and moved to England shortly after the Norman conquest of England in 1066, and was appointed sheriff in about 1069. Little is known of his family in Normandy, who were not prominent. Although Urse's lord in Normandy was present at the Battle of Hastings, there is no evidence that Urse took part in the invasion of England in 1066. Urse built the earliest form of Worcester Castle in Worcester, which encroached on the cathedral cemetery there, earning him a curse from the Archbishop of York. Urse helped to put down a rebellion against King William I in 1075, and quarrelled with the Church in his county over the jurisdiction of the sheriffs. He continued in the service of William's sons after the king's death, and was appointed constable under William II and marshal under Henry I.

Questio

In [15]:
# Validation set read from explicit paths, one sample per batch.
# NOTE(review): split=False presumably keeps the raw text instead of token
# ids, since the loop below prints q, c and a directly — confirm.
val_data = squad_dataset(
    "data/squad/val.questions",
    "data/squad/val.contexts",
    "data/squad/val.answers",
    config.labels_val,
    root=root_dir + "/",
    batch_size=1,
    split=False
)

In [None]:
# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Step through the validation samples one at a time: any input other than
# "-1" shows the next sample with the predicted and ground-truth answers;
# "-1" stops the loop.
for data in val_data:
    if read_line() == "-1":
        break

    q, c, a, _ = data[0]
    print("Context   :  %s" % c)
    print("")
    print("Question  :  %s" % q)
    print("Answer    :  %s" % get_answer(c, q, graph, sess))
    print("Ground    :  %s" % a)


Context   :  From early Christian times , hunting has been forbidden to Roman Catholic Church clerics . Thus the Corpus Juris Canonici ( C. ii , X , De cleric . venat . ) says , " We forbid to all servants of God hunting and expeditions through the woods with hounds ; and we also forbid them to keep hawks or falcons . " The Fourth Council of the Lateran , held under Pope Innocent III , decreed ( canon xv ) : " We interdict hunting or hawking to all clerics . " The decree of the Council of Trent is worded more mildly : " Let clerics abstain from illicit hunting and hawking " ( Sess . XXIV , De reform. , c. xii ) , which seems to imply that not all hunting is illicit , and canonists generally make a distinction declaring noisy ( clamorosa ) hunting unlawful , but not quiet ( quieta ) hunting .


Question  :  What forbid hunting in the woods with hounds and keeping hawks or falcons ?

Answer    :  Let clerics
Ground    :  Corpus Juris Canonici


Context   :  The Bronx street grid is irre