# CS671 - k-Match-LSTM

## Clustering

In [None]:
import os
import nltk
import numpy as np

from includes import config
from includes.utils import is_outlier

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Apply the project's matplotlibrc to the live rcParams object in place.
# Rebinding ``mpl.rcParams`` to a new object only changes the module attribute;
# any module that already captured the original object keeps the old settings.
mpl.rc_file("includes/matplotlibrc")

### Clustering data based on tf-idf

#### Choosing the number of clusters

#### Clustering data

### Clustering based on question type

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# English Snowball stemmer from NLTK. None of the cells visible in this
# section reference it.
stemmer = nltk.stem.SnowballStemmer('english')

In [None]:
def clean_text(text):
    """Tokenize ``text`` into lower-cased, purely alphabetic tokens.

    Leading and trailing whitespace is stripped, and every character whose
    code point is 128 or above is replaced by a space before tokenizing.
    Tokens that are not entirely alphabetic are dropped.
    """
    ascii_only = ''.join(
        ch if ord(ch) < 128 else ' ' for ch in text.strip()
    )
    corpus = nltk.Text(nltk.wordpunct_tokenize(ascii_only))

    cleaned = []
    for token in corpus:
        if token.isalpha():
            cleaned.append(token.lower())
    return cleaned

In [None]:
# Tokenized training questions: one list of tokens per line of the file.
with open("data/squad/train.question") as f:
    data = [clean_text(line) for line in f]

# The token lists generally differ in length, so the rows are ragged. Request
# an object array explicitly: NumPy >= 1.24 raises instead of building one
# implicitly.
data = np.array(data, dtype=object)

In [None]:
# Tokenized validation questions: one list of tokens per line of the file.
with open("data/squad/val.question") as f:
    val_data = [clean_text(line) for line in f]

# The token lists generally differ in length, so the rows are ragged. Request
# an object array explicitly: NumPy >= 1.24 raises instead of building one
# implicitly.
val_data = np.array(val_data, dtype=object)

In [None]:
def get_label(line):
    """Return the question-type label of a tokenized question.

    The label is determined by the first token of ``line`` that exactly
    matches one of the question words below (matching is case-sensitive).

    Args:
        line: sequence of string tokens; a 1-D NumPy array or a plain list.

    Returns:
        int: 0-4 for "what", "where", "who", "how", "which" respectively,
        or 5 when ``line`` contains none of them (including when it is empty).
    """
    words = {"what": 0, "where": 1, "who": 2, "how": 3, "which": 4}

    # Scan in order so the earliest question word wins. No exception handling
    # is needed for the "no match" case, so genuine errors are not swallowed.
    for token in line:
        label = words.get(token)
        if label is not None:
            return label
    return 5

In [None]:
# Question-type label for every training / validation question. Each token
# list is converted to an array before being passed to get_label.
labels = np.array(list(map(get_label, map(np.array, data))))
labels_val = np.array(list(map(get_label, map(np.array, val_data))))

In [None]:
# Write each label set to its own file, one label per line with no trailing
# newline after the last one.
for label_path, label_values in (
    ("data/squad/train.labels", labels),
    ("data/squad/val.labels", labels_val),
):
    with open(label_path, "w") as f:
        f.write("\n".join(str(label) for label in label_values))

In [None]:
# Label distribution of the training set (top) and validation set (bottom),
# with one histogram bin per configured cluster.
f, axes = plt.subplots(2, 1)
ax1, ax2 = axes

for axis, values in ((ax1, labels), (ax2, labels_val)):
    axis.hist(values, bins=config.n_clusters, rwidth=0.7)

plt.show()

## Match-LSTM for Machine Comprehension

In [1]:
import os
import sys
import numpy as np
from tqdm import tqdm

from includes import config
from includes.utils import squad_dataset, evaluate

from graph import Graph
from encoder import Encoder
from decoder import Decoder

import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Apply the project's matplotlibrc to the live rcParams object in place.
# Rebinding ``mpl.rcParams`` to a new object only changes the module attribute;
# any module that already captured the original object keeps the old settings.
mpl.rc_file("includes/matplotlibrc")

  from ._conv import register_converters as _register_converters


In [2]:
# Current working directory; the dataset loaders below receive it (with a
# trailing "/") as their root.
root_dir = os.getcwd()

In [3]:
# Embedding data stored under the "glove" key of the file at config.embed_path
# (presumably an .npz archive of pretrained GloVe vectors; confirm against the
# preprocessing step that writes it).
words_embedding = np.load(config.embed_path)["glove"]

In [4]:
# TensorFlow 1.x session; passed to model initialisation, training,
# evaluation and checkpointing in the cells below.
sess = tf.Session()

In [5]:
# Build the two model components from the shared configuration. Both receive
# the encoding size and the dropout keep-probability; the decoder additionally
# receives the number of clusters.
encoder = Encoder(
    config.encoding_size,
    config.dropout_keep_prob
)
decoder = Decoder(
    config.encoding_size,
    config.n_clusters,
    config.dropout_keep_prob
)

In [6]:
# Assemble the model graph from the loaded word embeddings and the
# encoder / decoder built above.
graph = Graph(
    words_embedding,
    encoder,
    decoder
)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.


In [7]:
# Initialise the model variables in the session. The result is tested below:
# the epoch-0 evaluation is appended to the score history only when it is falsy.
# NOTE(review): presumably truthy when an existing checkpoint was restored;
# confirm in Graph.init_model.
init = graph.init_model(sess)


Initializing model from model/k-match-lstm/trained_model.chk ... 
INFO:tensorflow:Restoring parameters from model/k-match-lstm/trained_model.chk
Initialized model



In [8]:
# Both datasets are resolved relative to the same root directory.
data_root = root_dir + "/"

# Training split, batched with the training batch size.
train_data = squad_dataset(
    config.questions_train, config.contexts_train,
    config.answers_train, config.labels_train,
    root=data_root, batch_size=config.batch_size
)

# Validation split, batched with its own batch size.
val_data = squad_dataset(
    config.questions_val, config.contexts_val,
    config.answers_val, config.labels_val,
    root=data_root, batch_size=config.val_batch_size
)

In [9]:
def print_score(epoch, score):
    """Print the evaluation metrics of one epoch, framed by blank lines.

    Args:
        epoch: epoch number shown in the message.
        score: nested sequence indexed as ``score[1]`` (f1), ``score[0][0]``
            (em), ``score[0][1][0]`` (em@1) and ``score[0][1][1]`` (em@2).
    """
    message = "\nepoch: %d, f1: %.4f, em: %.4f, em@1: %.4f, em@2: %.4f\n" % (
        epoch, score[1], score[0][0], score[0][1][0], score[0][1][1]
    )
    # A single parenthesised argument produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(message)

In [10]:
# Resume the loss and score histories from disk when earlier runs saved them;
# otherwise start with empty histories.
losses = list(np.load(config.loss_path)) if os.path.exists(config.loss_path) else []
scores = list(np.load(config.scores_path)) if os.path.exists(config.scores_path) else []

In [None]:
# Best exact-match score recorded so far. ``score[0][0]`` is the entry the
# training loop compares against and stores back into ``best_em``
# (``score[0][1]`` is the em@1 / em@2 pair). Fall back to 0 when no history
# was loaded, since taking the maximum of an empty list raises.
best_em = max([score[0][0] for score in scores] or [0])

# Evaluate the model as initialised, before any training in this session.
score = evaluate(graph, sess, val_data, "evaluating ... epoch: 0")
if not init:
    # Only when ``init`` is falsy does the baseline enter the score history.
    scores.append(score)
print_score(0, score)

In [None]:
# The slice limits each execution of this cell to the first epoch of the
# configured schedule.
for epoch in range(config.num_epochs)[:1]:
    epoch_number = epoch + 1

    # Train for one epoch and record what run_epoch returns.
    epoch_losses = graph.run_epoch(
        train_data, epoch, sess, max_batch_epochs=-1
    )
    losses.append(epoch_losses)

    # Evaluate on the validation set and report the metrics.
    epoch_score = evaluate(
        graph, sess, val_data, "evaluating ... epoch: %d" % epoch_number
    )
    scores.append(epoch_score)
    print_score(epoch_number, epoch_score)

    # Checkpoint only when exact match is at least as good as the best so far.
    # The histories are written in the same branch, so they are only saved
    # together with a model checkpoint.
    # NOTE(review): the histories are loaded from config.loss_path and
    # config.scores_path but saved to these literal paths; confirm they match.
    current_em = epoch_score[0][0]
    if current_em >= best_em:
        graph.save_model(sess)
        best_em = current_em

        np.save("data/plots/loss.npy", np.array(losses))
        np.save("data/plots/scores.npy", np.array(scores))

In [None]:
# Smooth the recorded losses by averaging consecutive windows of 40 values.
# ``-1`` lets NumPy derive the number of windows from the data, rather than
# assuming exactly four recorded epochs. It also avoids the float dimension
# that ``/`` produces on Python 3.
plt.plot(np.array(losses).reshape((-1, 40)).mean(axis=1))

plt.show()

In [None]:
# One curve per validation metric across the recorded evaluations.
metric_series = (
    ("em", [e[0][0] for e in scores]),
    ("em1", [e[0][1][0] for e in scores]),
    ("em2", [e[0][1][1] for e in scores]),
    ("f1", [e[1] for e in scores]),
)
for metric_name, metric_values in metric_series:
    plt.plot(metric_values, label=metric_name)

plt.legend()
plt.show()

## Predicting Answers