# CS671 - k-Match-LSTM

## Clustering

In [None]:
import os
import nltk
import numpy as np

from includes import config
from includes.utils import is_outlier

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Load the project's matplotlibrc into the existing rcParams object.
# Rebinding `mpl.rcParams` to a new object (as was done before) does not reach
# modules that already hold a reference to the original one; pyplot is
# imported above, so the update has to happen in place.
mpl.rc_file("includes/matplotlibrc")

### Clustering data based on tf-idf

#### Choosing the number of clusters

#### Clustering data

### Clustering based on question type

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# English Snowball stemmer from NLTK.
# NOTE(review): not referenced by any other cell visible here — confirm it is
# still needed.
stemmer = nltk.stem.SnowballStemmer('english')

In [None]:
def clean_text(text):
    """Tokenize a line of text into lowercase alphabetic words.

    Leading and trailing whitespace is stripped, every character whose code
    point is 128 or above is replaced by a space, and the result is split
    with ``nltk.wordpunct_tokenize``. Tokens that are not purely alphabetic
    (punctuation, numbers, mixed tokens) are dropped.

    Args:
        text: The raw line to clean.

    Returns:
        A list of lowercase word strings in their original order.
    """
    # Blank out non-ASCII characters instead of deleting them so that the
    # words on either side stay separated.
    ascii_text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text.strip())

    # The token list is consumed directly: wrapping it in nltk.Text just to
    # iterate over it built an extra object on every call.
    return [
        token.lower()
        for token in nltk.wordpunct_tokenize(ascii_text)
        if token.isalpha()
    ]

In [None]:
# Tokenize the training questions (one question per line) with clean_text,
# then turn the list of token lists into a NumPy array.
with open("data/squad/train.question") as f:
    data = np.array([clean_text(line) for line in f])

In [None]:
# Tokenize the validation questions (one question per line) with clean_text,
# then turn the list of token lists into a NumPy array.
with open("data/squad/val.question") as f:
    val_data = np.array([clean_text(line) for line in f])

In [None]:
def get_label(line):
    """Return the question-type label of a tokenized question.

    The label is decided by the first token in ``line`` that is one of the
    question words below; later question words are ignored.

    Args:
        line: Sequence of word tokens (a list or a 1-D NumPy array of
            strings), expected to be lowercase already.

    Returns:
        0 for "what", 1 for "where", 2 for "who", 3 for "how", 4 for
        "which", or 5 when the line contains none of them (including an
        empty line).
    """
    words = {"what": 0, "where": 1, "who": 2, "how": 3, "which": 4}

    # Scan in order and stop at the first question word. Iterating the tokens
    # works for lists and arrays alike, and no exception handler is needed to
    # cover the "no match" case.
    for token in line:
        if token in words:
            return words[token]

    return 5

In [None]:
# Question-type label for every training and validation question; each token
# list is converted to an ndarray before being passed to get_label.
labels = np.array([get_label(np.array(line)) for line in data])
labels_val = np.array([get_label(np.array(line)) for line in val_data])

In [None]:
# Persist the labels as text: one label per line, no trailing newline.
with open("data/squad/train.labels", "w") as f:
    f.write("\n".join(map(str, labels)))

with open("data/squad/val.labels", "w") as f:
    f.write("\n".join(map(str, labels_val)))

In [None]:
# Two stacked histograms of the label distribution: training labels on top,
# validation labels below, both drawn with the same bin count and bar width.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)

hist_kwargs = dict(bins=config.n_clusters, rwidth=0.7)
ax1.hist(labels, **hist_kwargs)
ax2.hist(labels_val, **hist_kwargs)

plt.show()

## Match-LSTM for Machine Comprehension

In [None]:
import os
import sys
import numpy as np
from tqdm import tqdm

from includes import config
from includes.utils import squad_dataset
from includes.evaluate import evaluate_model, test, get_answers

from graph import Graph
from encoder import Encoder
from decoder import Decoder

import tensorflow as tf

In [None]:
# Current working directory of the process; used below as the root that the
# dataset paths are resolved against.
root_dir = os.getcwd()

In [None]:
# Read the "glove" array from the embeddings file.
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open; the context manager closes it once the array has been read.
# NOTE(review): assumes config.embed_path points at an .npz archive — confirm.
with np.load(config.embed_path) as embedding_archive:
    words_embedding = embedding_archive["glove"]

In [None]:
# Single TensorFlow session shared by the cells below (passed to
# graph.init_model, evaluate_model, graph.run_epoch and graph.save_model).
# NOTE(review): tf.Session is the TensorFlow 1.x API; it is never closed in
# the cells visible here.
sess = tf.Session()

In [None]:
# Encoder and decoder are both built with config.encoding_size; the decoder
# additionally receives config.n_clusters.
encoder = Encoder(config.encoding_size)
decoder = Decoder(config.encoding_size, config.n_clusters)

In [None]:
# Assemble the model graph from the loaded word embeddings and the
# encoder/decoder pair.
graph = Graph(words_embedding, encoder, decoder)

In [None]:
# Initialise the model inside the session. The result is kept because a later
# cell evaluates the model before any training when `init` is truthy.
# NOTE(review): presumably init_model reports whether saved weights were
# restored — confirm in graph.py.
init = graph.init_model(sess)

In [None]:
# Both splits are loaded relative to the same root directory (with a trailing
# slash); they differ only in their file settings and batch size.
dataset_root = root_dir + "/"

train_data = squad_dataset(
    config.questions_train, config.contexts_train,
    config.answers_train, config.labels_train,
    root=dataset_root, batch_size=config.batch_size
)

# Note: this rebinds `val_data`, replacing the tokenized validation questions
# produced in the clustering section.
val_data = squad_dataset(
    config.questions_val, config.contexts_val,
    config.answers_val, config.labels_val,
    root=dataset_root, batch_size=config.val_batch_size
)

In [None]:
# Validation results, one entry appended per call to evaluate_model.
scores = []

# Best value of the first score (reported as "em") seen so far.
best_em = 0

# When init_model returned a truthy value, score the model once before
# training so that the starting point becomes the baseline to beat.
if init:
    scores.append(evaluate_model(graph, sess, val_data))
    # A single parenthesised argument prints the same text under Python 2's
    # print statement and Python 3's print function.
    print("\nepoch: %d, em: %.4f, em@1: %.4f, em@2: %.4f\n" % (
        0, scores[-1][0], scores[-1][1], scores[-1][2]
    ))

    best_em = scores[-1][0]

In [None]:
# Train for config.num_epochs epochs, evaluating on the validation set after
# each one and saving the model whenever the "em" score matches or beats the
# best seen so far.
for epoch in range(config.num_epochs):

    graph.run_epoch(train_data, epoch, sess, max_batch_epochs=-1)

    scores.append(evaluate_model(graph, sess, val_data))
    # A single parenthesised argument prints the same text under Python 2's
    # print statement and Python 3's print function.
    print("\nepoch: %d, em: %.4f, em@1: %.4f, em@2: %.4f\n" % (
        epoch + 1, scores[-1][0], scores[-1][1], scores[-1][2]
    ))

    # `>=` means a tie with the current best also overwrites the saved model.
    if scores[-1][0] >= best_em:
        graph.save_model(sess)
        best_em = scores[-1][0]