In [2]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
import nltk
from nltk import word_tokenize

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# reading in the data
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")

Extract the unique questions from the train and test data frames, including the documentID and the DocumentTitle


In [4]:
def get_questions_documenttag(data):
    qd = data[
        ["Question", "QuestionID", "DocumentID", "DocumentTitle"]
    ].drop_duplicates()
    return qd


train_question_doctag = get_questions_documenttag(train_data)
test_question_doctag = get_questions_documenttag(test_data)

In [5]:
# get unique questions
train_questions = train_question_doctag["Question"]
test_questions = test_question_doctag["Question"]

In [6]:
# get the unique document ids
train_docid = train_question_doctag["DocumentID"]
test_docid = test_question_doctag["DocumentID"]

Extract the answers to those questions.


In [7]:
def get_answers(data, questions, documentids):
    answers = []  # list of answers
    for q in range(len(questions)):
        question = questions.iloc[q]
        doc_id = documentids.iloc[q]  # add the document id
        df = data[data["Question"] == question]
        index = df.loc[df["Label"] == 1]["Sentence"].index.values
        if len(index) == 0:  # if no answer found
            answers.append([question, doc_id, "No answer"])
        else:  # if 1 answer found
            answers.append([question, doc_id, df.loc[index[0], "Sentence"]])
    return answers


train_answers = pd.DataFrame(get_answers(train_data, train_questions, train_docid))
test_answers = pd.DataFrame(get_answers(test_data, test_questions, test_docid))

The above get_answers returns train_answers and test_answers which, gives us in the following columns

-   Question
-   Related Document ID
-   Answer (if no answer to that question, return no answer)


In [8]:
def get_documents(data, questions, documentids):  # (done by Finn, tweaked by Dan)
    documents = []
    for q in range(len(questions)):
        question = questions.iloc[q]
        doc_id = documentids.iloc[q]  # add the document id
        df = data[data["Question"] == question]
        sentences = df["Sentence"].tolist()
        for i in range(0, len(sentences) - 1):
            sentences[i] = sentences[i] + " "
        documents.append([doc_id, "".join(sentences)])
    return documents


train_documents = pd.DataFrame(
    get_documents(train_data, train_questions, train_docid)
)  # return the individual document in list
test_documents = pd.DataFrame(
    get_documents(test_data, test_questions, test_docid)
)  # return the individual document in list

The above train_documents and test_documents called from the get_documents gives us in the following columns

-   Document ID
-   Full Document


In [9]:
# renaming all the columns for more standardised access
train_answers.columns = ["Question", "DocumentID", "Answer"]
test_answers.columns = ["Question", "DocumentID", "Answer"]
train_documents.columns = ["DocumentID", "Document"]
test_documents.columns = ["DocumentID", "Document"]

In [10]:
# result is 2117, 2117, 630, 630

len(train_answers), len(train_documents), len(test_answers), len(test_documents)

(2117, 2117, 630, 630)

**Prior to tagging, we should maybe clean the document and answers first:** (stopped here)

Maybe?

-   lowercase (might lose context, but we can use on questions)
-   removing any punctuation or weird symbols (do)
-   removal of stop words? (probably not)

Make sure that the pre-processing is standardised to be the same throughout doc and ans.


In [11]:
# These are just common English contractions. We used it in Lab 5 before!
contraction_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [12]:
def preprocess_lower(text):
    # Lowercase the text for question, answer and documents
    text = text.lower()
    for word, new_word in contraction_dict.items():
        text = text.replace(word, new_word)  # dealing with contractions
    pattern = r"[^a-zA-Z0-9\s]"
    cleaned_text = re.sub(pattern, " ", text)
    return cleaned_text


train_answers[["Question", "Answer"]] = train_answers[["Question", "Answer"]].applymap(
    preprocess_lower
)
train_documents["Document"] = train_documents["Document"].apply(preprocess_lower)
test_answers[["Question", "Answer"]] = test_answers[["Question", "Answer"]].applymap(
    preprocess_lower
)
test_documents["Document"] = test_documents["Document"].apply(preprocess_lower)

In [13]:
train_documents

Unnamed: 0,DocumentID,Document
0,D1,a partly submerged glacier cave on perito more...
1,D2,in physics circular motion is a movement of ...
2,D5,apollo creed is a fictional character from the...
3,D6,in the united states the title of federal jud...
4,D7,the beretta 21a bobcat is a small pocket sized...
...,...,...
2112,D2805,blue mountain state is an american comedy seri...
2113,D2806,apple inc formerly apple computer inc is ...
2114,D2807,section 8 housing in the south bronx section 8...
2115,D2808,restaurants categorized by type and informatio...


In [14]:
def labelling(documents, answers):
    tagged_documents = []
    for q in range(len(answers)):
        tagged_document = []
        qn = answers["Question"].loc[q]
        doc_id = answers["DocumentID"].loc[q]
        content = documents.loc[documents["DocumentID"] == doc_id, "Document"].values[0]
        answer = answers["Answer"].loc[q]

        if answer == "no answer":
            tokens = word_tokenize(content)
            for j in range(len(tokens)):
                tagged_document.append("N")  # none
        else:
            parts = content.partition(answer)
            for j in range(len(parts)):
                tokens = word_tokenize(parts[j])
                if j == 1:
                    tagged_document.append("S")  # start of answer
                    for k in range(len(tokens) - 2):
                        tagged_document.append("I")  # inside of answer
                    tagged_document.append("E")  # end of answer
                else:
                    for k in range(len(tokens)):
                        tagged_document.append("N")  # outside answer
        tagged_documents.append(tagged_document)
    return tagged_documents


train_doc_ans_labels = labelling(train_documents, train_answers)
test_doc_ans_labels = labelling(test_documents, test_answers)

In [15]:
# check if tags are good
def testing_tokens(ind, labels, documents, answers):
    for i, j in zip(labels[ind], word_tokenize(documents["Document"][ind])):
        print([i, j])
    print(answers["Answer"][ind])


testing_tokens(1000, train_doc_ans_labels, train_documents, train_answers)

['N', 'egg']
['N', 'roll']
['N', 'is']
['N', 'a']
['N', 'term']
['N', 'used']
['N', 'for']
['N', 'many']
['N', 'different']
['N', 'foods']
['N', 'around']
['N', 'the']
['N', 'world']
['S', '2']
['I', 'egg']
['I', 'roll']
['I', 'varieties']
['I', 'of']
['I', 'egg']
['I', 'rolls']
['I', 'are']
['I', 'found']
['I', 'in']
['I', 'mainland']
['I', 'china']
['I', 'many']
['I', 'chinese']
['I', 'speaking']
['I', 'regions']
['I', 'of']
['I', 'asia']
['I', 'and']
['I', 'chinese']
['I', 'immigrant']
['I', 'communities']
['I', 'around']
['I', 'the']
['E', 'world']
['N', 'egg']
['N', 'rolls']
['N', 'as']
['N', 'referred']
['N', 'to']
['N', 'in']
['N', 'china']
['N', 'in']
['N', 'guangdong']
['N', 'and']
['N', 'hong']
['N', 'kong']
['N', 'egg']
['N', 'roll']
['N', 'usually']
['N', 'refers']
['N', 'to']
['N', 'biscuit']
['N', 'roll']
['N', 'this']
['N', 'is']
['N', 'a']
['N', 'type']
['N', 'of']
['N', 'biscuit']
['N', 'the']
['N', 'ingredient']
['N', 'included']
['N', 'egg']
['N', 'flour']
['N', 'and

Cleaned Documents: train and test

train_answers - contains the ['Question','DocumentID','Answer']

train_documents - contains the ['DocumentID','Document']

train_doc_ans_labels - contains a list of list of answer tags for each document,


In [16]:
# To prepare the document for word embeddings:
train_doc_ques = pd.DataFrame(
    {"Document": train_documents["Document"], "Question": train_answers["Question"]}
)
test_doc_ques = pd.DataFrame(
    {"Document": test_documents["Document"], "Question": test_answers["Question"]}
)

### Word Embeddings

To use the CBOW model, we need the data in sentences. Extract this from the original dataset, don't use sent_tokenise, will mess with some of the fullstops, we want to maintain structure from above


In [17]:
def word_tokens(data):
    sentence_list = []
    for i in range(len(data)):
        sentence_list.append(word_tokenize(data[i]))
    return sentence_list


train_doc_list = word_tokens(train_doc_ques["Document"])
train_ques_list = word_tokens(train_doc_ques["Question"])
test_doc_list = word_tokens(test_doc_ques["Document"])
test_ques_list = word_tokens(test_doc_ques["Question"])

In [18]:
combined_text = train_doc_list + train_ques_list + test_doc_list + test_ques_list

In [19]:
# model trained, don't have to run this multiple times
wc_cbow_model = Word2Vec(
    sentences=combined_text,
    vector_size=100,
    window=5,
    min_count=1,
    workers=2,
    epochs=30,
)
# wc_cbow_model.save("cbow.model")

To implement QA

1. Word Embeddings, using CBOW
2. Feature Extraction 1 - POS tags
3. Feature Extraction 2 - TF-IDF
4. Feature Extraction 3 - NER


In [20]:
# run this if model in directory
wc_cbow_model = Word2Vec.load("./cbow.model")

In [21]:
def get_word_embeddings(doc):
    tokenized_doc = word_tokenize(doc)
    embeddings = [wc_cbow_model.wv[word] for word in tokenized_doc]
    return embeddings


train_doc_ques["Doc_Embeddings"] = train_doc_ques["Document"].apply(get_word_embeddings)
train_doc_ques["Q_Embeddings"] = train_doc_ques["Question"].apply(get_word_embeddings)
test_doc_ques["Doc_Embeddings"] = test_doc_ques["Document"].apply(get_word_embeddings)
test_doc_ques["Q_Embeddings"] = test_doc_ques["Question"].apply(get_word_embeddings)

In [22]:
train_doc_ques["Doc_Tokens"] = train_doc_ques["Document"].apply(word_tokenize)
train_doc_ques["Q_Tokens"] = train_doc_ques["Question"].apply(word_tokenize)
test_doc_ques["Doc_Tokens"] = test_doc_ques["Document"].apply(word_tokenize)
test_doc_ques["Q_Tokens"] = test_doc_ques["Question"].apply(word_tokenize)

In [23]:
def find_max_length(column):
    max_length = 0
    lns = []
    for i in range(len(column)):
        lns.append(len(column[i]))
        if len(column[i]) > max_length:
            max_length = len(column[i])
    return "Max Seq Length is {}, Median is {}, Number of lines is {})".format(
        max_length, np.median(lns), len(lns)
    )


find_max_length(test_doc_ques["Doc_Embeddings"])
find_max_length(train_doc_ques["Doc_Embeddings"])

'Max Seq Length is 924, Median is 182.0, Number of lines is 2117)'

In [24]:
def check_count(doc):
    count = 0
    for i in range(len(doc)):
        if len(doc["Doc_Embeddings"][i]) != len(doc["Doc_Tokens"][i]):
            count += 1
        elif len(doc["Q_Embeddings"][i]) != len(doc["Q_Tokens"][i]):
            count += 1
        else:
            continue
    return count


check_count(train_doc_ques)  # looks good

0

Note, need to convert the POS tags, NER tags into embeddings. After this, pad the questions and answers to the max question/document length in the combined training and test set.

### PoS Tagging


In [25]:
# Apply the pos tags to the tokens
from nltk.tag import pos_tag

# download the dependency and resource as required
nltk.download("averaged_perceptron_tagger")

train_doc_ques["Doc_POS"] = train_doc_ques["Doc_Tokens"].apply(pos_tag)
train_doc_ques["Q_POS"] = train_doc_ques["Q_Tokens"].apply(pos_tag)
test_doc_ques["Doc_POS"] = test_doc_ques["Doc_Tokens"].apply(pos_tag)
test_doc_ques["Q_POS"] = test_doc_ques["Q_Tokens"].apply(pos_tag)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [26]:
# checking the POS tags: # looks ok
train_doc_ques["Q_POS"][100]

[('how', 'WRB'),
 ('many', 'JJ'),
 ('schools', 'NNS'),
 ('are', 'VBP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('big', 'JJ'),
 ('ten', 'NN')]

In [27]:
# Extract all unique POS Tags
all_pos_tags = (
    train_doc_ques["Doc_POS"].tolist()
    + test_doc_ques["Doc_POS"].tolist()
    + train_doc_ques["Q_POS"].tolist()
    + test_doc_ques["Q_POS"].tolist()
)


def get_unique_pos(data):
    pos_tags = set()
    for item in data:
        for _, pos_tag in item:
            pos_tags.add(pos_tag)

    pos_tag_index = {tag: i for i, tag in enumerate(sorted(pos_tags))}
    return pos_tag_index


pos_iden = get_unique_pos(all_pos_tags)  # list of tags
pos_iden

{'$': 0,
 'CC': 1,
 'CD': 2,
 'DT': 3,
 'EX': 4,
 'FW': 5,
 'IN': 6,
 'JJ': 7,
 'JJR': 8,
 'JJS': 9,
 'MD': 10,
 'NN': 11,
 'NNP': 12,
 'NNPS': 13,
 'NNS': 14,
 'PDT': 15,
 'POS': 16,
 'PRP': 17,
 'PRP$': 18,
 'RB': 19,
 'RBR': 20,
 'RBS': 21,
 'RP': 22,
 'SYM': 23,
 'TO': 24,
 'UH': 25,
 'VB': 26,
 'VBD': 27,
 'VBG': 28,
 'VBN': 29,
 'VBP': 30,
 'VBZ': 31,
 'WDT': 32,
 'WP': 33,
 'WP$': 34,
 'WRB': 35}

### NER Tagging


### Steps to run this:

-   pip install spacy
-   python -m spacy download en_core_web_sm

If loaded for the first time, restart kernel


In [28]:
# nltk using Spacy
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

# loading pre-trained model of NER
nlp = en_core_web_sm.load()
nlp.to_disk("en_core_web_sm")

In [29]:
nlp = spacy.load("./en_core_web_sm/")

In [30]:
def ner_tagging(texts):
    tagged_texts = []
    for text in texts:
        doc = spacy.tokens.Doc(nlp.vocab, words=text)
        nlp.get_pipe("ner")(doc)
        tagged_texts.append([(token.text, token.ent_type_) for token in doc])
    return tagged_texts

In [31]:
# Will take a while...
train_doc_ques["Doc_NER"] = ner_tagging(train_doc_ques["Doc_Tokens"])
train_doc_ques["Q_NER"] = ner_tagging(train_doc_ques["Q_Tokens"])
test_doc_ques["Doc_NER"] = ner_tagging(test_doc_ques["Doc_Tokens"])
test_doc_ques["Q_NER"] = ner_tagging(test_doc_ques["Q_Tokens"])

In [32]:
train_doc_ques["Q_NER"][12]

[('how', ''),
 ('much', ''),
 ('are', ''),
 ('the', ''),
 ('harry', 'WORK_OF_ART'),
 ('potter', 'WORK_OF_ART'),
 ('movies', 'WORK_OF_ART'),
 ('worth', 'WORK_OF_ART')]

In [33]:
# Similar approach to the POS

# Extract all unique POS Tags
all_ner_tags = (
    train_doc_ques["Doc_NER"].tolist()
    + test_doc_ques["Doc_NER"].tolist()
    + train_doc_ques["Q_NER"].tolist()
    + test_doc_ques["Q_NER"].tolist()
)


def get_unique_ner(data):
    ner_tags = set()
    for item in data:
        for _, ner_tag in item:
            ner_tags.add(ner_tag)

    ner_tag_index = {tag: i for i, tag in enumerate(sorted(ner_tags))}
    return ner_tag_index


ner_iden = get_unique_pos(all_ner_tags)  # list of tags
ner_iden

{'': 0,
 'CARDINAL': 1,
 'DATE': 2,
 'EVENT': 3,
 'FAC': 4,
 'GPE': 5,
 'LANGUAGE': 6,
 'LAW': 7,
 'LOC': 8,
 'MONEY': 9,
 'NORP': 10,
 'ORDINAL': 11,
 'ORG': 12,
 'PERCENT': 13,
 'PERSON': 14,
 'PRODUCT': 15,
 'QUANTITY': 16,
 'TIME': 17,
 'WORK_OF_ART': 18}

In [34]:
# check ohv dims
ner_idx = ner_iden.values()
aa = np.eye(max(ner_idx) + 1)
# aa

### TF-IDF

First, calculate the document frequency of each token in the entire corpus (training documents + testing documents). The result is a dictionary where each token is a key and its value is the document frequency.


In [35]:
def document_frequency(corpus):
    """
    Computes the document frequency for every token in the corpus.
    Returns a dictionary {token: doc_freq, ...}
    """
    document_frequency = {}
    for document in corpus:
        for token in np.unique(document):
            try:
                document_frequency[token] += 1
            except:
                document_frequency[token] = 1
    return document_frequency


train_corpus = (
    train_doc_ques["Doc_Tokens"].tolist() + train_doc_ques["Q_Tokens"].tolist()
)
test_corpus = test_doc_ques["Doc_Tokens"].tolist() + test_doc_ques["Q_Tokens"].tolist()
train_doc_freq = document_frequency(train_corpus)
test_doc_freq = document_frequency(test_corpus)

Now calculate TF-IDF using the document frequency from above.


In [36]:
from collections import Counter
import math


def compute_tf_idf(corpus, doc_frequency):
    """
    Computes the term frequency inverse document frequency for every token in every document in the corpus.
    Returns a list the same shape as the list of tokenized documents except every token is replaced with the tf-idf
    for that token.
    """
    tf_idf = {}
    tf_idf_list = []
    N = len(doc_frequency)
    doc_id = 0
    for document in corpus:
        tf_idf_doc = []
        counter = Counter(document)
        total_num_words = len(document)
        for token in np.unique(document):
            tf = counter[token] / total_num_words
            df = doc_frequency[token]
            idf = math.log(N / (df + 1)) + 1
            tf_idf[doc_id, token] = tf * idf
        for token in document:
            tf_idf_doc.append(tf_idf[doc_id, token])
        tf_idf_list.append(tf_idf_doc)
        doc_id += 1
    return tf_idf_list


train_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    train_doc_ques["Doc_Tokens"].tolist(), train_doc_freq
)
train_doc_ques["Q_TFIDF"] = compute_tf_idf(
    train_doc_ques["Q_Tokens"].tolist(), train_doc_freq
)
test_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    test_doc_ques["Doc_Tokens"].tolist(), test_doc_freq
)
test_doc_ques["Q_TFIDF"] = compute_tf_idf(
    test_doc_ques["Q_Tokens"].tolist(), test_doc_freq
)

In [37]:
test_doc_ques

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER,Doc_TFIDF,Q_TFIDF
0,african immigration to the united states refer...,how african americans were immigrated to the us,"[[1.2371346, 0.026763553, -0.3803194, 1.221458...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[african, immigration, to, the, united, states...","[how, african, americans, were, immigrated, to...","[(african, JJ), (immigration, NN), (to, TO), (...","[(how, WRB), (african, JJ), (americans, NNS), ...","[(african, ORG), (immigration, ORG), (to, ), (...","[(how, ), (african, NORP), (americans, NORP), ...","[0.2444338691504048, 0.1690021701802108, 0.283...","[0.708514287434769, 0.9349595495002982, 0.9907..."
1,a prison from old french prisoun also known...,how large were early jails,"[[0.5997837, -0.49879548, -3.0764463, 0.808406...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[a, prison, from, old, french, prisoun, also, ...","[how, large, were, early, jails]","[(a, DT), (prison, NN), (from, IN), (old, JJ),...","[(how, WRB), (large, JJ), (were, VBD), (early,...","[(a, ), (prison, ), (from, ), (old, ), (french...","[(how, ), (large, ), (were, ), (early, ), (jai...","[0.23734258656531904, 0.15055215160158517, 0.0...","[1.1336228598956304, 1.2987719202959243, 1.145..."
2,a small electrically powered pump a large el...,how a water pump works,"[[0.5997837, -0.49879548, -3.0764463, 0.808406...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[a, small, electrically, powered, pump, a, lar...","[how, a, water, pump, works]","[(a, DT), (small, JJ), (electrically, RB), (po...","[(how, WRB), (a, DT), (water, NN), (pump, NN),...","[(a, ), (small, ), (electrically, ), (powered,...","[(how, ), (a, ), (water, ), (pump, ), (works, )]","[0.1817774325199266, 0.07355137595811154, 0.19...","[1.1336228598956304, 0.8361761895916624, 1.412..."
3,lolita is a 1962 comedy drama film by stanley ...,how old was sue lyon when she made lolita,"[[-0.14881791, 0.08810432, 0.20327696, 0.15059...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[lolita, is, a, 1962, comedy, drama, film, by,...","[how, old, was, sue, lyon, when, she, made, lo...","[(lolita, NN), (is, VBZ), (a, DT), (1962, CD),...","[(how, WRB), (old, JJ), (was, VBD), (sue, NN),...","[(lolita, ), (is, ), (a, ), (1962, DATE), (com...","[(how, ), (old, ), (was, ), (sue, PERSON), (ly...","[0.22677023461895618, 0.03157635832223484, 0.0...","[0.6297904777197946, 0.7657655256778074, 0.528..."
4,each antibody binds to a specific antigen an...,how are antibodies used in,"[[0.6742135, 2.104281, -5.2121115, 2.685211, -...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[each, antibody, binds, to, a, specific, antig...","[how, are, antibodies, used, in]","[(each, DT), (antibody, NN), (binds, VBZ), (to...","[(how, WRB), (are, VBP), (antibodies, NNS), (u...","[(each, ), (antibody, ), (binds, ), (to, ), (a...","[(how, ), (are, ), (antibodies, ), (used, ), (...","[0.05405354788321593, 0.24067984633049655, 0.0...","[1.1336228598956304, 0.9669842090544979, 1.862..."
...,...,...,...,...,...,...,...,...,...,...,...,...
625,american cuts of beef including the brisket br...,where is the brisket from,"[[-2.0547018, 1.0713737, -0.8421732, -0.101983...","[[-0.95963866, -1.4266869, 0.45872673, 0.91664...","[american, cuts, of, beef, including, the, bri...","[where, is, the, brisket, from]","[(american, JJ), (cuts, NNS), (of, IN), (beef,...","[(where, WRB), (is, VBZ), (the, DT), (brisket,...","[(american, NORP), (cuts, ), (of, ), (beef, ),...","[(where, ), (is, ), (the, ), (brisket, ), (fro...","[0.04467110190076254, 0.15609658426345493, 0.2...","[1.1855735192809844, 0.8020395013847651, 0.792..."
626,the arm architecture describes a family of ris...,what is arm chipset,"[[-0.23652898, -0.92592084, -0.6663827, -0.485...","[[-1.2585803, 0.6122668, -1.9161738, -1.78394,...","[the, arm, architecture, describes, a, family,...","[what, is, arm, chipset]","[(the, DT), (arm, NN), (architecture, NN), (de...","[(what, WP), (is, VBZ), (arm, JJ), (chipset, NN)]","[(the, ), (arm, ), (architecture, ), (describe...","[(what, ), (is, ), (arm, ), (chipset, )]","[0.13239230955602443, 0.2925047479012816, 0.06...","[1.2003495883084616, 1.0025493767309563, 2.188..."
627,june bug or junebug may refer to beetles phy...,what is the life span of june bugs,"[[-3.9346182, 0.74120855, 0.60106486, 1.894009...","[[-1.2585803, 0.6122668, -1.9161738, -1.78394,...","[june, bug, or, junebug, may, refer, to, beetl...","[what, is, the, life, span, of, june, bugs]","[(june, NN), (bug, NN), (or, CC), (junebug, NN...","[(what, WP), (is, VBZ), (the, DT), (life, NN),...","[(june, DATE), (bug, ), (or, ), (junebug, ), (...","[(what, ), (is, ), (the, ), (life, ), (span, )...","[0.3625692810391371, 0.3450139669072635, 0.056...","[0.6001747941542308, 0.5012746883654782, 0.495..."
628,this is a list of known biological mothers und...,who is the youngest female to give birth world...,"[[-1.5558925, 1.413975, -9.826337, 1.7899787, ...","[[-1.8808663, 1.8944086, 0.1218802, -0.4170562...","[this, is, a, list, of, known, biological, mot...","[who, is, the, youngest, female, to, give, bir...","[(this, DT), (is, VBZ), (a, DT), (list, NN), (...","[(who, WP), (is, VBZ), (the, DT), (youngest, J...","[(this, ), (is, ), (a, ), (list, ), (of, ), (k...","[(who, ), (is, ), (the, ), (youngest, ), (fema...","[0.4279041372169481, 0.30847673130183273, 0.32...","[0.5327914192742925, 0.40101975069238255, 0.39..."


In [38]:
train_doc_ques

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER,Doc_TFIDF,Q_TFIDF
0,a partly submerged glacier cave on perito more...,how are glacier caves formed,"[[0.5997837, -0.49879548, -3.0764463, 0.808406...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[a, partly, submerged, glacier, cave, on, peri...","[how, are, glacier, caves, formed]","[(a, DT), (partly, RB), (submerged, VBN), (gla...","[(how, WRB), (are, VBP), (glacier, JJ), (caves...","[(a, ), (partly, ), (submerged, ), (glacier, )...","[(how, ), (are, ), (glacier, ), (caves, ), (fo...","[0.24677746860673927, 0.1451147752938821, 0.16...","[1.034209132108837, 0.8495245060136498, 1.9896..."
1,in physics circular motion is a movement of ...,how are the directions of the velocity and for...,"[[-1.5489185, 0.5201235, -0.99664, 0.22544254,...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[in, physics, circular, motion, is, a, movemen...","[how, are, the, directions, of, the, velocity,...","[(in, IN), (physics, NNS), (circular, JJ), (mo...","[(how, WRB), (are, VBP), (the, DT), (direction...","[(in, ), (physics, ), (circular, ), (motion, )...","[(how, ), (are, ), (the, ), (directions, ), (o...","[0.10157954319378303, 0.047977208560696934, 0....","[0.34473637736961227, 0.28317483533788324, 0.4..."
2,apollo creed is a fictional character from the...,how did apollo creed die,"[[-0.34742618, 0.57979244, 0.6444635, 0.087400...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[apollo, creed, is, a, fictional, character, f...","[how, did, apollo, creed, die]","[(apollo, NNS), (creed, VBP), (is, VBZ), (a, D...","[(how, WRB), (did, VBD), (apollo, VB), (creed,...","[(apollo, ORG), (creed, ), (is, ), (a, ), (fic...","[(how, ), (did, ), (apollo, ORG), (creed, ), (...","[0.15061458188394025, 0.28752101617726733, 0.0...","[1.034209132108837, 1.1532031921635457, 1.7872..."
3,in the united states the title of federal jud...,how long is the term for federal judges,"[[-1.5489185, 0.5201235, -0.99664, 0.22544254,...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[in, the, united, states, the, title, of, fede...","[how, long, is, the, term, for, federal, judges]","[(in, IN), (the, DT), (united, JJ), (states, V...","[(how, WRB), (long, JJ), (is, VBZ), (the, DT),...","[(in, ), (the, GPE), (united, GPE), (states, G...","[(how, ), (long, ), (is, ), (the, ), (term, ),...","[0.08828641778817685, 0.32044981985432075, 0.1...","[0.646380707568023, 0.7777965845277852, 0.4296..."
4,the beretta 21a bobcat is a small pocket sized...,how a beretta model 21 pistols magazines works,"[[-0.23652898, -0.92592084, -0.6663827, -0.485...","[[-1.4015731, 0.38238972, -2.0838397, 1.811023...","[the, beretta, 21a, bobcat, is, a, small, pock...","[how, a, beretta, model, 21, pistols, magazine...","[(the, DT), (beretta, NN), (21a, CD), (bobcat,...","[(how, WRB), (a, DT), (beretta, NN), (model, N...","[(the, ), (beretta, ), (21a, ), (bobcat, ), (i...","[(how, ), (a, ), (beretta, PRODUCT), (model, )...","[0.21160137561032594, 0.8290055086215778, 0.22...","[0.646380707568023, 0.4549959577436755, 1.2435..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2112,blue mountain state is an american comedy seri...,where was blue mountain state filmed at,"[[-0.6384207, 0.010117575, 1.2289604, -0.94139...","[[-0.95963866, -1.4266869, 0.45872673, 0.91664...","[blue, mountain, state, is, an, american, come...","[where, was, blue, mountain, state, filmed, at]","[(blue, JJ), (mountain, NN), (state, NN), (is,...","[(where, WRB), (was, VBD), (blue, JJ), (mounta...","[(blue, LOC), (mountain, LOC), (state, ), (is,...","[(where, ), (was, ), (blue, ), (mountain, ), (...","[0.29475406599874054, 0.3823486411848812, 0.22...","[0.7503059363146255, 0.5906688741268076, 1.063..."
2113,apple inc formerly apple computer inc is ...,when was apple computer founded,"[[0.06645721, 0.45839298, 0.9816723, -0.780045...","[[-4.1827083, -2.6780283, -0.7016728, 1.746488...","[apple, inc, formerly, apple, computer, inc, i...","[when, was, apple, computer, founded]","[(apple, NN), (inc, VBP), (formerly, RB), (app...","[(when, WRB), (was, VBD), (apple, NN), (comput...","[(apple, ORG), (inc, ORG), (formerly, ORG), (a...","[(when, ), (was, ), (apple, ), (computer, ), (...","[0.2615737575545419, 0.08943278683987749, 0.02...","[0.9915267033360398, 0.8269364237775307, 1.700..."
2114,section 8 housing in the south bronx section 8...,what is section eight housing,"[[1.0071282, -0.101174586, -0.69412017, 1.5527...","[[-1.2585803, 0.6122668, -1.9161738, -1.78394,...","[section, 8, housing, in, the, south, bronx, s...","[what, is, section, eight, housing]","[(section, NN), (8, CD), (housing, NN), (in, I...","[(what, WP), (is, VBZ), (section, NN), (eight,...","[(section, LAW), (8, LAW), (housing, ), (in, )...","[(what, ), (is, ), (section, ), (eight, CARDIN...","[0.19207290739926391, 0.16089672153840806, 0.2...","[0.8426216022668568, 0.6874871616619922, 1.514..."
2115,restaurants categorized by type and informatio...,what is the main type of restaurant,"[[0.39172053, 0.8604569, 0.24641779, 0.5786414...","[[-1.2585803, 0.6122668, -1.9161738, -1.78394,...","[restaurants, categorized, by, type, and, info...","[what, is, the, main, type, of, restaurant]","[(restaurants, NNS), (categorized, VBN), (by, ...","[(what, WP), (is, VBZ), (the, DT), (main, JJ),...","[(restaurants, ), (categorized, ), (by, ), (ty...","[(what, ), (is, ), (the, ), (main, ), (type, )...","[0.876941110711729, 0.9254918922898989, 0.4042...","[0.6018725730477548, 0.49106225832999434, 0.48..."


In [39]:
def one_hot_vectorize(
    pos_tagger, ner_tagger, data
):  # pass in the unique dict for ner or pos
    pos_idx = pos_tagger.values()
    pos_ohv = np.eye(max(pos_idx) + 1)  # create the ohv
    ner_idx = ner_tagger.values()
    ner_ohv = np.eye(max(ner_idx) + 1)

    dpos_full_ohv, dner_full_ohv = [], []  # lists to append to
    qpos_full_ohv, qner_full_ohv = [], []  # lists to append to

    for item in data["Doc_POS"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            pos_index_iden = pos_tagger[tag]
            sent_ohv.append(pos_ohv[pos_index_iden])
        dpos_full_ohv.append(sent_ohv)

    for item in data["Q_POS"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            pos_index_iden = pos_tagger[tag]
            sent_ohv.append(pos_ohv[pos_index_iden])
        qpos_full_ohv.append(sent_ohv)

    for item in data["Doc_NER"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            ner_index_iden = ner_tagger[tag]
            sent_ohv.append(ner_ohv[ner_index_iden])
        dner_full_ohv.append(sent_ohv)

    for item in data["Q_NER"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            ner_index_iden = ner_tagger[tag]
            sent_ohv.append(ner_ohv[ner_index_iden])
        qner_full_ohv.append(sent_ohv)

    return (dpos_full_ohv, qpos_full_ohv, dner_full_ohv, qner_full_ohv)

In [40]:
# get the ohv for doc
(
    train_doc_pos_ohv,
    train_q_pos_ohv,
    train_doc_ner_ohv,
    train_q_ner_ohv,
) = one_hot_vectorize(pos_iden, ner_iden, train_doc_ques)
test_doc_pos_ohv, test_q_pos_ohv, test_doc_ner_ohv, test_q_ner_ohv = one_hot_vectorize(
    pos_iden, ner_iden, test_doc_ques
)

In [41]:
# reduce the dataframe to just tokens and embeddings:
doc_emb_train = train_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": train_doc_pos_ohv, "Doc_NER": train_doc_ner_ohv})
doc_emb_train = pd.concat([doc_emb_train, doc_pos_ner], axis=1)

q_emb_train = train_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": train_q_pos_ohv, "Q_NER": train_q_ner_ohv})
q_emb_train = pd.concat([q_emb_train, q_pos_ner], axis=1)

In [42]:
doc_emb_test = test_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": test_doc_pos_ohv, "Doc_NER": test_doc_ner_ohv})
doc_emb_test = pd.concat([doc_emb_test, doc_pos_ner], axis=1)

q_emb_test = test_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": test_q_pos_ohv, "Q_NER": test_q_ner_ohv})
q_emb_test = pd.concat([q_emb_test, q_pos_ner], axis=1)

### Word Embeddings (Doc and Qn)

The embeddings of the questions and answers of the train and test set can be found here:

-   Train Document - doc_emb_train
-   Train Q - q_emb_train
-   Test Document - doc_emb_test
-   Test Q - q_emb_test

The max_document size is 1675 and max_question size is 23.


In [43]:
doc_emb_train

Unnamed: 0,Doc_Tokens,Doc_Embeddings,Doc_TFIDF,Doc_POS,Doc_NER
0,"[a, partly, submerged, glacier, cave, on, peri...","[[0.5997837, -0.49879548, -3.0764463, 0.808406...","[0.24677746860673927, 0.1451147752938821, 0.16...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,"[in, physics, circular, motion, is, a, movemen...","[[-1.5489185, 0.5201235, -0.99664, 0.22544254,...","[0.10157954319378303, 0.047977208560696934, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"[apollo, creed, is, a, fictional, character, f...","[[-0.34742618, 0.57979244, 0.6444635, 0.087400...","[0.15061458188394025, 0.28752101617726733, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"[in, the, united, states, the, title, of, fede...","[[-1.5489185, 0.5201235, -0.99664, 0.22544254,...","[0.08828641778817685, 0.32044981985432075, 0.1...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[the, beretta, 21a, bobcat, is, a, small, pock...","[[-0.23652898, -0.92592084, -0.6663827, -0.485...","[0.21160137561032594, 0.8290055086215778, 0.22...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...
2112,"[blue, mountain, state, is, an, american, come...","[[-0.6384207, 0.010117575, 1.2289604, -0.94139...","[0.29475406599874054, 0.3823486411848812, 0.22...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
2113,"[apple, inc, formerly, apple, computer, inc, i...","[[0.06645721, 0.45839298, 0.9816723, -0.780045...","[0.2615737575545419, 0.08943278683987749, 0.02...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2114,"[section, 8, housing, in, the, south, bronx, s...","[[1.0071282, -0.101174586, -0.69412017, 1.5527...","[0.19207290739926391, 0.16089672153840806, 0.2...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,..."
2115,"[restaurants, categorized, by, type, and, info...","[[0.39172053, 0.8604569, 0.24641779, 0.5786414...","[0.876941110711729, 0.9254918922898989, 0.4042...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [44]:
doc_emb_train["Doc_NER"][0]

[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [45]:
def full_array(data, data_type="Document"):
    num_vec_length = 156
    max_doc = 1675
    max_qn = 23
    zero_vec = np.zeros(156)

    if data_type == "Document":
        full_vec = []  # create a list for list of list for document
        for dat in range(len(data)):  # go through each line
            doc_ques = data.loc[dat]  # document data
            v = []  # create list to each word
            for j in range(len(doc_ques.iloc[0])):
                vn = []  # list of concat word embeddings
                vn.append(doc_ques.iloc[1][j].tolist())  # Word2Vec
                vn.append(doc_ques.iloc[2][j])  # TF-IDF
                vn.append(doc_ques.iloc[3][j].tolist())  # POS
                vn.append(doc_ques.iloc[4][j].tolist())  # NER
                flatten = [
                    item
                    for sublist in vn
                    for item in (sublist if isinstance(sublist, list) else [sublist])
                ]
                v.append(flatten)
            while len(v) < max_doc:
                v.append(zero_vec)
            full_vec.append(v)

    if data_type == "Question":
        full_vec = []  # create a list for list of list for document
        for dat in range(len(data)):  # go through each line
            doc_ques = data.loc[dat]  # document data
            v = []  # create list to each word
            for j in range(len(doc_ques.iloc[0])):
                vn = []  # list of concat word embeddings
                vn.append(doc_ques.iloc[1][j].tolist())  # Word2Vec
                vn.append(doc_ques.iloc[2][j])  # TF-IDF
                vn.append(doc_ques.iloc[3][j].tolist())  # POS
                vn.append(doc_ques.iloc[4][j].tolist())  # NER
                flatten = [
                    item
                    for sublist in vn
                    for item in (sublist if isinstance(sublist, list) else [sublist])
                ]
                v.append(flatten)
            while len(v) < max_qn:
                v.append(zero_vec)
            full_vec.append(v)
    return full_vec

In [46]:
# Training/Test Documents to pass in, takes about a min
final_doc_train = np.stack(full_array(doc_emb_train, data_type="Document"))
final_doc_test = np.stack(full_array(doc_emb_test, data_type="Document"))

In [47]:
# Training/Test Questions to pass in, takes about a few seconds
final_qn_train = np.stack(full_array(q_emb_train, data_type="Question"))
final_qn_test = np.stack(full_array(q_emb_test, data_type="Question"))

In [48]:
def convert_labels(labels):
    check = []
    for i in labels:
        if len(i) < 1675:
            while len(i) < 1675:
                i.append("N")
            check.append(i)
        else:
            check.append(i)
    return check


tr_labels = np.array(convert_labels(train_doc_ans_labels))
ts_labels = np.array(convert_labels(test_doc_ans_labels))

Earlier, we found that the sequence length of the documents can get quite large, which might introduce a lot of noise into the model with paddings. One alternative to reduce this noise is to perhaps truncate the sequences down to just over the median for the documents. We also need to do this for the outputs.


In [49]:
find_max_length(test_doc_ques["Doc_Embeddings"]), find_max_length(
    train_doc_ques["Doc_Embeddings"]
), find_max_length(test_doc_ques["Q_Embeddings"]), find_max_length(
    train_doc_ques["Q_Embeddings"]
)

('Max Seq Length is 1675, Median is 181.0, Number of lines is 630)',
 'Max Seq Length is 924, Median is 182.0, Number of lines is 2117)',
 'Max Seq Length is 19, Median is 7.0, Number of lines is 630)',
 'Max Seq Length is 23, Median is 7.0, Number of lines is 2117)')

In [50]:
# truncate the embeddings and the labels to 200


def truncate(data, labels, max_seq=200):
    data = data[:, :max_seq, :]
    labels = labels[:, :max_seq]
    return data, labels


final_doc_train, tr_labels = truncate(final_doc_train, tr_labels)
final_doc_test, ts_labels = truncate(final_doc_test, ts_labels)

In [51]:
# final prepared documents are found here:

# final_doc_train, tr_labels - doc embeddings and output labels for training
# final_doc_test, ts_labels - doc embeddings and output labels for testing
# final_qn_train - question embeddings for training
# final_qn_test - question embeddings for testing

### Converting into Tensors:


In [52]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [53]:
# takes a min
tf_final_doc_train = torch.tensor(final_doc_train, device=device, dtype=torch.float32)
tf_final_doc_test = torch.tensor(final_doc_test, device=device, dtype=torch.float32)
tf_final_qn_train = torch.tensor(final_qn_train, device=device, dtype=torch.float32)
tf_final_qn_test = torch.tensor(final_qn_test, device=device, dtype=torch.float32)

In [54]:
# check dimensions
print(tf_final_doc_train.shape)
print(tf_final_doc_test.shape)
print(tf_final_qn_train.shape)
print(tf_final_qn_test.shape)

torch.Size([2117, 200, 156])
torch.Size([630, 200, 156])
torch.Size([2117, 23, 156])
torch.Size([630, 23, 156])


In [55]:
# save the tensors into wd
torch.save(tf_final_doc_train, "tensor.doc_train")
torch.save(tf_final_doc_test, "tensor.doc_test")
torch.save(tf_final_qn_train, "tensor.qn_train")
torch.save(tf_final_qn_test, "tensor.qn_test")

**Input Embedding Ablation Study**

In the model input embedding Ablation study, we are given 3 variations of input embeddings to test. We will test 3 options:

1. Word2Vec only # 100 dims
2. Word2Vec + Tf-IDF # 101 dims
3. Word2Vec + all features (TF-IDF, POS, NER) # 156 dims

Since we are using tensors, we can use tensor slicing to take out the relevant features.
Our tensor of embeddings are built as follows (w2v, TF-IDF, POS, NER)


In [56]:
def convert_tensors(tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test, option=3):
    if option == 3:
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 1:
        tf_doc_train = tf_doc_train[:, :, :100]
        tf_doc_test = tf_doc_test[:, :, :100]
        tf_qn_train = tf_qn_train[:, :, :100]
        tf_qn_test = tf_qn_test[:, :, :100]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test
    elif option == 2:
        tf_doc_train = tf_doc_train[:, :, :101]
        tf_doc_test = tf_doc_test[:, :, :101]
        tf_qn_train = tf_qn_train[:, :, :101]
        tf_qn_test = tf_qn_test[:, :, :101]
        return tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test

In [57]:
# change option to see size
(
    convert_tensors(
        tf_final_doc_train, tf_final_doc_test, tf_final_qn_train, tf_final_qn_test, 2
    )[0]
).shape

torch.Size([2117, 200, 101])

Our answer should perhaps also be in the form of (1x1675) list containing ['N','S','I','E']


Additionally, the labels should be one hot vectorised, as they are cateogorical.


In [58]:
np.save("tr_labels.npy", tr_labels)
np.save("ts_labels.npy", ts_labels)

In [59]:
def one_hot_encode_labels(labels):
    # Create a dictionary that maps each label to a unique integer
    label_to_int = {"N": 0, "S": 1, "I": 2, "E": 3}

    # Map the labels to integers
    int_labels = [[label_to_int[label] for label in sequence] for sequence in labels]

    # Create an identity matrix of size 4 (since there are 4 labels)
    identity = np.eye(4)

    # Use the integer labels as indices to select rows from the identity matrix
    one_hot_labels = [identity[sequence] for sequence in int_labels]
    return one_hot_labels


tr_encoded = one_hot_encode_labels(tr_labels)
tst_encoded = one_hot_encode_labels(ts_labels)

In [60]:
# Create a mapping from label to index
label2index = {"N": 0, "S": 1, "I": 2, "E": 3}

# Find the maximum length of the label lists
max_len = tf_final_doc_train.shape[1]

# Create a tensor to hold the one-hot encoded labels
train_labels = torch.zeros(
    len(tr_labels), max_len, len(label2index), device=device, dtype=torch.float32
)
test_labels = torch.zeros(
    len(ts_labels),
    max_len,
    len(label2index),
    device=device,
    dtype=torch.float32,
)

# Iterate over the label lists and one-hot encode the labels
for i, label_list in enumerate(tr_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        train_labels[i, j] = 0
        train_labels[i, j, index] = 1

for i, label_list in enumerate(ts_labels):
    for j, label in enumerate(label_list):
        index = label2index[label]
        # Sets all elements of the target_labels tensor at position (i,j) to 0
        test_labels[i, j] = 0
        test_labels[i, j, index] = 1

In [81]:
from sklearn.utils.class_weight import compute_class_weight

# Reshape the target labels tensor
reshaped_target_labels = (
    train_labels.view(-1, 4).cpu().numpy()
)  # Assuming it's on the GPU

# Flatten the reshaped target labels
flattened_target_labels = reshaped_target_labels.argmax(axis=1)

# Calculate the class weights
class_weights = compute_class_weight(
    class_weight="balanced", classes=[0, 1, 2, 3], y=flattened_target_labels
)

# Convert the class weights to a PyTorch tensor
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [74]:
ts_labels

array([['N', 'N', 'N', ..., 'N', 'N', 'N'],
       ['N', 'N', 'N', ..., 'N', 'N', 'N'],
       ['N', 'N', 'N', ..., 'N', 'N', 'N'],
       ...,
       ['N', 'N', 'N', ..., 'N', 'N', 'N'],
       ['N', 'N', 'N', ..., 'N', 'N', 'N'],
       ['N', 'N', 'N', ..., 'N', 'N', 'N']], dtype='<U1')

In [76]:
print(train_labels.shape)
print(train_labels[0][:50])

torch.Size([2117, 200, 4])
tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [1.

### Model


In [62]:
import torch
from torch import Tensor
import torch.nn as nn


class DocumentBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(DocumentBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output: Tensor
        output, _ = self.lstm(input)
        # print("document output shape: ", output.shape)
        return output

In [63]:
import torch
from torch import Tensor
import torch.nn as nn

import torch.nn.functional as F


class QuestionBiRNN(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers=1,
    ):
        super(QuestionBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor):
        input = input.unsqueeze(1)
        output, (hn, cn) = self.lstm(input)
        forward_hn = hn[-2, :, :]
        backward_hn = hn[-1, :, :]
        hidden = torch.cat((forward_hn, backward_hn), dim=-1).unsqueeze(0)
        # print("question hidden shape: ", hidden.shape)
        return hidden

In [64]:
import torch.nn as nn

# the document has shape (doc_len, batch_size, hidden_size) and
# the question has shape (ques_len, batch_size, hidden_size),
# the resulting attention tensor would have shape (batch_size, ques_len, doc_len).
# Each element in the attention tensor represents the dot product
# between the corresponding hidden state of the question and document.


# In your case, since batch_size is 1,
# you can remove the singleton dimension to get
# an attention tensor of shape (ques_len, doc_len), or (23, 1675) in this case
class Attention(nn.Module):
    def __init__(self, ques_len, hidden_size):
        super(Attention, self).__init__()
        self.out = nn.Linear(ques_len, hidden_size)

    def forward(self, document_output, question_summary):
        document_output = document_output.permute(1, 0, 2)  # torch.Size([200, 1, 16])
        question_summary = question_summary.permute(1, 2, 0)  # torch.Size([23, 1, 16])

        # [1, 200, 16], [1, 16, 23] -> [1, 200, 23]
        attention_scores = torch.bmm(document_output, question_summary).squeeze(0)

        # print("attention scores: ", attention_scores)

        # print("attention scores shape: ", attention_scores.shape)

        return attention_scores

        # Apply the softmax function to the attention scores to obtain the attention weights
        # attention_weights = nn.functional.softmax(attention_scores, dim=2).squeeze(0)

        # # print("attention weights shape: ", attention_weights.shape)
        # print("attention weights: ", attention_weights)
        # return attention_weights
        # out = self.out(attention_weights)

        # # print("attention output shape: ", out.shape)

        # return out

In [65]:
class ReadingComprehensionModel(nn.Module):
    def __init__(self, document_rnn, question_rnn, attention, hidden_size, output_size):
        super(ReadingComprehensionModel, self).__init__()
        self.document_rnn = document_rnn
        self.question_rnn = question_rnn
        self.attention = attention
        self.linear = nn.Linear(1, output_size)

    def predict_label(self, attention_output):
        pred = self.linear(attention_output)
        # print("prediction shape: ", pred.shape)
        pred_weights = nn.functional.softmax(pred, dim=1)
        # print("prediction weights shape: ", pred_weights.shape)
        return pred_weights

In [66]:
from torch.utils.data import Dataset


class BatchDataset(Dataset):
    def __init__(self, document_inputs, question_inputs, target_labels):
        self.document_inputs = document_inputs
        self.question_inputs = question_inputs
        self.target_labels = target_labels

    def __len__(self):
        return len(self.document_inputs)

    def __getitem__(self, idx):
        return (
            self.document_inputs[idx],
            self.question_inputs[idx],
            self.target_labels[idx],
        )

In [67]:
from torch.utils.data import TensorDataset, DataLoader


def trainIter(
    model,
    document_inputs,
    question_inputs,
    target_labels,
    num_epochs,
    criterion,
    optimizer,
):
    model.train()
    for epoch in range(num_epochs):
        loss = 0
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            optimizer.zero_grad()

            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)

            attention_output = model.attention(document_output, question_summary)

            token_label_logits = model.predict_label(attention_output).to(device)

            # print("token label logits shape: ", token_label_logits.shape)
            # print("target label shape: ", target_label.shape)
            # print("token label logits: ", token_label_logits)

            # print(token_label_logits[0])
            # print(target_label[0])
            # raise TypeError("stop")

            loss += criterion(token_label_logits, target_label)
            # print(loss)

        loss.backward()
        optimizer.step()

        avg_loss = loss.item() / len(document_inputs)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [96]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

START_LABEL = 1
END_LABEL = 3


def evaluate(model, document_inputs, question_inputs, target_labels, criterion):
    model.eval()
    with torch.no_grad():
        loss = 0
        all_predictions = []
        all_targets = []
        for document_input, question_input, target_label in zip(
            document_inputs, question_inputs, target_labels
        ):
            document_output = model.document_rnn(document_input)
            question_summary = model.question_rnn(question_input)
            attention_output = model.attention(document_output, question_summary)
            token_label_logits = model.predict_label(attention_output).to(device)
            loss += criterion(token_label_logits, target_label)

            # print(token_label_logits)

            predictions = token_label_logits.argmax(dim=-1).cpu().numpy()
            targets = target_label.argmax(dim=-1).cpu().numpy()
            # print(predictions == 1)

            if any(targets == START_LABEL) and any(targets == END_LABEL):
                # Find indices of start and end tokens
                start_token_idx = np.where(targets == START_LABEL)[0]
                end_token_idx = np.where(targets == END_LABEL)[0]

                print("target: ", targets[start_token_idx[0] : end_token_idx[0] + 1])
                print(
                    "prediction: ",
                    predictions[start_token_idx[0] : end_token_idx[0] + 1],
                )
                print()

                # Take slice of predictions and target_labels for sentence tokens
                sentence_prediction = predictions[
                    start_token_idx[0] : end_token_idx[0] + 1
                ]
                sentence_target = targets[start_token_idx[0] : end_token_idx[0] + 1]

                all_predictions.extend(sentence_prediction)
                all_targets.extend(sentence_target)
            else:
                # Use the whole document since there is no answer
                all_predictions.extend(predictions)
                all_targets.extend(targets)

        # print(all_predictions)
        # print(all_targets)

        avg_loss = loss.item() / len(document_inputs)
        accuracy = accuracy_score(all_targets, all_predictions)
        precision = precision_score(all_targets, all_predictions, average="macro")
        recall = recall_score(all_targets, all_predictions, average="macro")
        f1 = f1_score(all_targets, all_predictions, average="macro")

        print(
            f"Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}"
        )

        return accuracy, precision, recall, f1

In [97]:
from torch import optim


# def train():
hidden_size = 32
epochs = 20
learning_rate = 0.1
token_labels = 4


# tf_final_doc_train = torch.rand
# tf_final_qn_train
# (
#     tf_final_doc_train,
#     tf_final_doc_test,
#     tf_final_qn_train,
#     tf_final_qn_test,
# ) = convert_tensors(
#     tf_final_doc_train, tf_final_doc_test, tf_final_qn_train, tf_final_qn_test, 1
# )
document_num_embeddings = tf_final_doc_train.shape[2]
question_num_embeddings = tf_final_qn_train.shape[2]
ques_len = tf_final_qn_train.shape[1]

document_rnn = DocumentBiRNN(
    hidden_size=hidden_size, input_size=document_num_embeddings
).to(device)
question_rnn = QuestionBiRNN(
    input_size=question_num_embeddings,
    hidden_size=hidden_size,
).to(device)
attention = Attention(ques_len=ques_len, hidden_size=hidden_size).to(device)
reading_comp = ReadingComprehensionModel(
    document_rnn,
    question_rnn,
    attention,
    hidden_size=hidden_size,
    output_size=token_labels,
).to(device)
reading_comp_optimizer = optim.AdamW(reading_comp.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# # Create TensorDatasets from your data tensors
# dataset = TensorDataset(tf_final_doc_train, tf_final_qn_train)

# # Define batch size and create DataLoader
# batch_size = 32
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

trainIter(
    reading_comp,
    tf_final_doc_train,
    tf_final_qn_train,
    train_labels,
    epochs,
    criterion,
    reading_comp_optimizer,
)


# train()

Epoch 1/20, Loss: 1.3963
Epoch 2/20, Loss: 1.4468
Epoch 3/20, Loss: 1.4039
Epoch 4/20, Loss: 1.3523
Epoch 5/20, Loss: 1.3215
Epoch 6/20, Loss: 1.2950
Epoch 7/20, Loss: 1.2665
Epoch 8/20, Loss: 1.2405
Epoch 9/20, Loss: 1.2165
Epoch 10/20, Loss: 1.1957
Epoch 11/20, Loss: 1.1777
Epoch 12/20, Loss: 1.1673
Epoch 13/20, Loss: 1.1537
Epoch 14/20, Loss: 1.1437
Epoch 15/20, Loss: 1.1330
Epoch 16/20, Loss: 1.1230
Epoch 17/20, Loss: 1.1133
Epoch 18/20, Loss: 1.1041
Epoch 19/20, Loss: 1.0954
Epoch 20/20, Loss: 1.0876


In [98]:
evaluate(reading_comp, tf_final_doc_train, tf_final_qn_train, train_labels, criterion)

target:  [1 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 1 0 1 0 0 0 1 3 0 0 3]

target:  [1 2 2 2 2 2 2 2 3]
prediction:  [1 1 0 1 3 0 0 0 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 3 0 0 0 0 1 1 3 3 0 3 0 0 0 1 0 3 1 1 3 0 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 1 1 0 0 3 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 3 0 0 0 1 0 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 1 0 1 0 0 0 1 0 0 1 3 0 0 1 1 0 0 0 0 1 0 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 0 0 1 3 0 0 0 3 0 1 0 1 0 0 0 0 1 1 3 0 1 0 0 0 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 3 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 1 0 0 0 0 0 0 0 3 0 1 3 3 0 1 1 0 0 0 0 1 0 3 0 1 0 0 1 0 0 0 0 0 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 

  _warn_prf(average, modifier, msg_start, len(result))


Loss: 1.0808, Accuracy: 0.6756, Precision: 0.2443, Recall: 0.6671, F1: 0.2242


(0.6756366109764911,
 0.24428252529845976,
 0.6670915737557239,
 0.22418909066906473)

In [99]:
evaluate(reading_comp, tf_final_doc_test, tf_final_qn_test, test_labels, criterion)

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 3]
prediction:  [1 0 1 0 0 0 0 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 3 0 0 1 0 1 1 0 0 0 1 0 1 1
 0 3 0]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [0 0 0 1 0 0 0 0 0 0 3 0 0 0 0 0 0 3 1 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 1 0 0 1 1 0 0 0 1 0 0 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 3]

target:  [1 2 2 2 2 2 2 2 3]
prediction:  [1 1 0 0 0 0 0 0 0]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 3]
prediction:  [1 0 1 0 0 0 3 0 0 3 0 3 0 0 1 3 0 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 0]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 3 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 3]

target:  [1 2 2 2 2 2 2 2 2

  _warn_prf(average, modifier, msg_start, len(result))


(0.6711198519742584,
 0.24195519917316188,
 0.5327898562663447,
 0.2160725645227667)

In [100]:
from torch import optim


# def train():
hidden_size = 32
epochs = 50
learning_rate = 0.1
token_labels = 4


# tf_final_doc_train = torch.rand
# tf_final_qn_train
# (
#     tf_final_doc_train,
#     tf_final_doc_test,
#     tf_final_qn_train,
#     tf_final_qn_test,
# ) = convert_tensors(
#     tf_final_doc_train, tf_final_doc_test, tf_final_qn_train, tf_final_qn_test, 1
# )
document_num_embeddings = tf_final_doc_train.shape[2]
question_num_embeddings = tf_final_qn_train.shape[2]
ques_len = tf_final_qn_train.shape[1]

document_rnn = DocumentBiRNN(
    hidden_size=hidden_size, input_size=document_num_embeddings
).to(device)
question_rnn = QuestionBiRNN(
    input_size=question_num_embeddings,
    hidden_size=hidden_size,
).to(device)
attention = Attention(ques_len=ques_len, hidden_size=hidden_size).to(device)
reading_comp = ReadingComprehensionModel(
    document_rnn,
    question_rnn,
    attention,
    hidden_size=hidden_size,
    output_size=token_labels,
).to(device)
reading_comp_optimizer = optim.AdamW(reading_comp.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# # Create TensorDatasets from your data tensors
# dataset = TensorDataset(tf_final_doc_train, tf_final_qn_train)

# # Define batch size and create DataLoader
# batch_size = 32
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

trainIter(
    reading_comp,
    tf_final_doc_train,
    tf_final_qn_train,
    train_labels,
    epochs,
    criterion,
    reading_comp_optimizer,
)


# train()

Epoch 1/50, Loss: 1.3957
Epoch 2/50, Loss: 1.3354
Epoch 3/50, Loss: 1.3125
Epoch 4/50, Loss: 1.2579
Epoch 5/50, Loss: 1.2185
Epoch 6/50, Loss: 1.2016
Epoch 7/50, Loss: 1.1820
Epoch 8/50, Loss: 1.1674
Epoch 9/50, Loss: 1.1516
Epoch 10/50, Loss: 1.1370
Epoch 11/50, Loss: 1.1239
Epoch 12/50, Loss: 1.1126
Epoch 13/50, Loss: 1.1021
Epoch 14/50, Loss: 1.0914
Epoch 15/50, Loss: 1.0820
Epoch 16/50, Loss: 1.0728
Epoch 17/50, Loss: 1.0644
Epoch 18/50, Loss: 1.0565
Epoch 19/50, Loss: 1.0495
Epoch 20/50, Loss: 1.0425
Epoch 21/50, Loss: 1.0359
Epoch 22/50, Loss: 1.0306
Epoch 23/50, Loss: 1.0244
Epoch 24/50, Loss: 1.0185
Epoch 25/50, Loss: 1.0126
Epoch 26/50, Loss: 1.0073
Epoch 27/50, Loss: 1.0017
Epoch 28/50, Loss: 0.9963
Epoch 29/50, Loss: 0.9919
Epoch 30/50, Loss: 0.9862
Epoch 31/50, Loss: 0.9953
Epoch 32/50, Loss: 0.9853
Epoch 33/50, Loss: 0.9818
Epoch 34/50, Loss: 0.9738
Epoch 35/50, Loss: 0.9683
Epoch 36/50, Loss: 0.9645
Epoch 37/50, Loss: 0.9602
Epoch 38/50, Loss: 0.9570
Epoch 39/50, Loss: 0.

In [101]:
evaluate(reading_comp, tf_final_doc_train, tf_final_qn_train, train_labels, criterion)

target:  [1 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 2 2 2 2 2 0 2 2 2 2 2 3]

target:  [1 2 2 2 2 2 2 2 3]
prediction:  [1 0 0 0 0 2 2 2 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 2 0 2 2 2 2 2 2 2 2 2 0 0 2 0 0 2 2 0 2 2 0 2 0 2 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 2 2 2 2 2 0 0 2 0 2 2 2 2 2 2 0 0 2 2 2 2 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 1 0 0 0 2 2 2 2 0 2 2 2 2 2 0 2 2 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 2 2 2 0 2 0 0 2 2 2 0 2 2 2 2 0 2 2 2 2 2 2 0 2 0 0 0 2 0 0 0 2 2 2 3]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 

(0.5856820983718063,
 0.33157808673757283,
 0.8523251678828888,
 0.3299440726587368)

In [102]:
evaluate(reading_comp, tf_final_doc_test, tf_final_qn_test, test_labels, criterion)

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 3]
prediction:  [0 2 2 2 2 2 0 2 2 2 2 2 0 0 2 0 2 2 2 2 2 2 2 3 0 0 2 2 0 2 2 2 0 2 2 2 2
 2 2 2]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [2 0 2 0 0 0 0 2 2 2 2 0 0 0 0 0 2 3 0 2]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 0 0 2 0 0 0 2 2 2 2 0 0]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 0 2 2 3 2 2 2 2 2 0 2 2 2 2 2 0 0 0 2 0 0 2]

target:  [1 2 2 2 2 2 2 2 3]
prediction:  [0 2 0 0 2 0 0 2 2]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 3]
prediction:  [2 2 0 2 1 0 0 0 0 2 2 0 2 2 2 0 2 0 0 2 2 0 0 0 2 0 2 2 2 2 2 2 0 0 2 2 0
 0 0]

target:  [1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3]
prediction:  [1 0 2 2 2 0 0 2 2 2 2 2 0 0 2 2 2 1 0 0 2 2 2 2 0 0 2 2 0 2 2 0 0 0 2 2]

target:  [1 2 2 2 2 2 2 2 2

(0.5289709185276559,
 0.2859811592039112,
 0.5502330587594537,
 0.2510332048418604)