In [1]:
import nltk

nltk.download("punkt")
from nltk import word_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# reading in the data
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")

Extract the unique questions from the train and test data frames, together with each question's QuestionID, DocumentID and DocumentTitle.


In [3]:
def get_questions_documenttag(data):
    """Return the distinct question/document rows of ``data``.

    Only the ``Question``, ``QuestionID``, ``DocumentID`` and
    ``DocumentTitle`` columns are kept; rows that repeat the same combination
    of those four values are dropped (the first occurrence is retained).
    """
    wanted_columns = ["Question", "QuestionID", "DocumentID", "DocumentTitle"]
    subset = data.loc[:, wanted_columns]
    return subset.drop_duplicates()


train_question_doctag = get_questions_documenttag(train_data)
test_question_doctag = get_questions_documenttag(test_data)

In [4]:
# get unique questions
train_questions = train_question_doctag["Question"]
test_questions = test_question_doctag["Question"]

In [5]:
# get the unique document ids
train_docid = train_question_doctag["DocumentID"]
test_docid = test_question_doctag["DocumentID"]

Extract the answers to those questions.


In [6]:
def get_answers(data, questions, documentids):
    """Pair every question with its document id and its first correct sentence.

    Args:
        data: frame with ``Question``, ``Sentence`` and ``Label`` columns.
        questions: Series of questions, read positionally.
        documentids: Series of document ids, positionally aligned with
            ``questions``.

    Returns:
        A list of ``[question, doc_id, answer]`` lists.  ``answer`` is the
        ``Sentence`` of the first row for that question whose ``Label`` is 1,
        or the string ``"No answer"`` when the question has no such row.
    """
    answers = []  # one [question, doc_id, answer] entry per question
    for position in range(len(questions)):
        question = questions.iloc[position]
        doc_id = documentids.iloc[position]
        candidates = data[data["Question"] == question]
        correct_sentences = candidates.loc[candidates["Label"] == 1, "Sentence"]
        if len(correct_sentences) == 0:
            answer = "No answer"
        else:
            # Several sentences may be labelled correct; only the first is used.
            answer = correct_sentences.iloc[0]
        answers.append([question, doc_id, answer])
    return answers


train_answers = pd.DataFrame(get_answers(train_data, train_questions, train_docid))
test_answers = pd.DataFrame(get_answers(test_data, test_questions, test_docid))

`get_answers` above produces `train_answers` and `test_answers`, which have the following columns:

-   Question
-   Related Document ID
-   Answer (the first sentence labelled as correct for the question; the string "No answer" if the question has none)


In [7]:
def get_documents(data, questions, documentids):  # (done by Finn, tweaked by Dan)
    """Rebuild one document string per question from its candidate sentences.

    Args:
        data: frame with ``Question`` and ``Sentence`` columns.
        questions: Series of questions, read positionally.
        documentids: Series of document ids, positionally aligned with
            ``questions``.

    Returns:
        A list of ``[doc_id, document]`` lists, where ``document`` is every
        ``Sentence`` of the rows matching the question, in row order, joined
        by single spaces.
    """
    documents = []
    for position in range(len(questions)):
        is_match = data["Question"] == questions.iloc[position]
        doc_id = documentids.iloc[position]
        sentences = data.loc[is_match, "Sentence"].tolist()
        documents.append([doc_id, " ".join(sentences)])
    return documents


train_documents = pd.DataFrame(
    get_documents(train_data, train_questions, train_docid)
)  # return the individual document in list
test_documents = pd.DataFrame(
    get_documents(test_data, test_questions, test_docid)
)  # return the individual document in list

`get_documents` above produces `train_documents` and `test_documents`, which have the following columns:

-   Document ID
-   Full Document


In [8]:
# renaming all the columns for more standardised access
train_answers.columns = ["Question", "DocumentID", "Answer"]
test_answers.columns = ["Question", "DocumentID", "Answer"]
train_documents.columns = ["DocumentID", "Document"]
test_documents.columns = ["DocumentID", "Document"]

In [9]:
# result is 2117, 2117, 630, 630

len(train_answers), len(train_documents), len(test_answers), len(test_documents)

(2117, 2117, 630, 630)

**Prior to tagging, we should maybe clean the document and answers first:** (stopped here)

Maybe?

-   lowercase (might lose context, but we can use on questions)
-   removing any punctuation or weird symbols (do)
-   removal of stop words? (probably not)

Make sure that the pre-processing is standardised to be the same throughout doc and ans.


In [10]:
def preprocess_lower(text):
    """Lowercase ``text`` and blank out everything except letters, digits and whitespace.

    Every character that is not ``a-z``, ``A-Z``, ``0-9`` or whitespace is
    replaced by a single space, so the length of the string is unchanged.
    Used for questions, answers and documents alike.
    """
    return re.sub(r"[^a-zA-Z0-9\s]", " ", text.lower())


# Clean questions, answers and documents with the same function so the text is
# normalised identically everywhere.  Series.apply is used per column instead
# of DataFrame.applymap, which is deprecated in newer pandas releases.
train_answers["Question"] = train_answers["Question"].apply(preprocess_lower)
train_answers["Answer"] = train_answers["Answer"].apply(preprocess_lower)
train_documents["Document"] = train_documents["Document"].apply(preprocess_lower)
test_answers["Question"] = test_answers["Question"].apply(preprocess_lower)
test_answers["Answer"] = test_answers["Answer"].apply(preprocess_lower)
test_documents["Document"] = test_documents["Document"].apply(preprocess_lower)

In [11]:
train_documents

Unnamed: 0,DocumentID,Document
0,D1,a partly submerged glacier cave on perito more...
1,D2,in physics circular motion is a movement of ...
2,D5,apollo creed is a fictional character from the...
3,D6,in the united states the title of federal jud...
4,D7,the beretta 21a bobcat is a small pocket sized...
...,...,...
2112,D2805,blue mountain state is an american comedy seri...
2113,D2806,apple inc formerly apple computer inc is ...
2114,D2807,section 8 housing in the south bronx section 8...
2115,D2808,restaurants categorized by type and informatio...


In [12]:
def _answer_span_tags(num_tokens):
    """Return the tags for an answer span of ``num_tokens`` tokens.

    Two or more tokens give ``S``, zero or more ``I`` and a closing ``E``.
    A single token gets just ``S`` and an empty span gets no tags, so the
    result always has exactly ``num_tokens`` entries.
    """
    if num_tokens <= 0:
        return []
    if num_tokens == 1:
        return ["S"]
    return ["S"] + ["I"] * (num_tokens - 2) + ["E"]


def labelling(documents, answers, tokenize=None):
    """Tag every document token as outside (N) or part of (S/I/E) the answer.

    Args:
        documents: frame with ``DocumentID`` and ``Document`` columns.
        answers: frame with ``DocumentID`` and ``Answer`` columns; rows are
            read positionally.
        tokenize: callable turning a string into a list of tokens.  Defaults
            to ``word_tokenize``.

    Returns:
        One list of tags per row of ``answers``.  Tokens before and after the
        first occurrence of the answer text are tagged ``N``; the answer
        itself is tagged ``S`` (start), ``I`` (inside) and ``E`` (end), or
        just ``S`` when it is a single token.  When the answer is the
        ``"no answer"`` placeholder, is blank, or does not occur in the
        document, every token is tagged ``N``.

    Raises:
        IndexError: if no document has the row's ``DocumentID``.
    """
    if tokenize is None:
        tokenize = word_tokenize

    tagged_documents = []
    for position in range(len(answers)):
        doc_id = answers["DocumentID"].iloc[position]
        content = documents.loc[documents["DocumentID"] == doc_id, "Document"].values[0]
        answer = answers["Answer"].iloc[position]

        # A blank answer would make str.partition raise, and a missing one
        # would yield an empty match; both are treated like "no answer" so
        # that no S/E tags are emitted for tokens that do not exist.
        has_span = (
            answer != "no answer" and answer.strip() != "" and answer in content
        )
        if not has_span:
            tagged_documents.append(["N"] * len(tokenize(content)))
            continue

        before, match, after = content.partition(answer)
        tagged_document = ["N"] * len(tokenize(before))
        tagged_document += _answer_span_tags(len(tokenize(match)))
        tagged_document += ["N"] * len(tokenize(after))
        tagged_documents.append(tagged_document)
    return tagged_documents


train_doc_ans_labels = labelling(train_documents, train_answers)
test_doc_ans_labels = labelling(test_documents, test_answers)

In [13]:
# check if tags are good
def testing_tokens(ind, labels, documents, answers):
    """Print the ``[tag, token]`` pairs of document ``ind``, then its answer.

    Tags come from ``labels[ind]`` and tokens from tokenising
    ``documents["Document"][ind]``; pairing stops at the shorter of the two.
    """
    doc_tags = labels[ind]
    doc_tokens = word_tokenize(documents["Document"][ind])
    for tag_token_pair in zip(doc_tags, doc_tokens):
        print(list(tag_token_pair))
    print(answers["Answer"][ind])



Cleaned Documents: train and test

train_answers - contains the ['Question','DocumentID','Answer']

train_documents - contains the ['DocumentID','Document']

train_doc_ans_labels - contains a list of lists: one list of answer tags per document.


In [14]:
# To prepare the document for word embeddings:
train_doc_ques = pd.DataFrame(
    {"Document": train_documents["Document"], "Question": train_answers["Question"]}
)
test_doc_ques = pd.DataFrame(
    {"Document": test_documents["Document"], "Question": test_answers["Question"]}
)

### Word Embeddings

To use the CBOW model, we need the data as tokenised sentences. Extract these from the original dataset rather than using `sent_tokenize`, which would mishandle some of the full stops; we want to maintain the structure built above.


In [15]:
def word_tokens(data):
    """Tokenise ``data[0]`` through ``data[len(data) - 1]`` with ``word_tokenize``.

    Returns a list holding one token list per entry, in index order.
    """
    return [word_tokenize(data[position]) for position in range(len(data))]


train_doc_list = word_tokens(train_doc_ques["Document"])
train_ques_list = word_tokens(train_doc_ques["Question"])
test_doc_list = word_tokens(test_doc_ques["Document"])
test_ques_list = word_tokens(test_doc_ques["Question"])

In [16]:
combined_text = train_doc_list + train_ques_list + test_doc_list + test_ques_list

In [17]:
# The model does not have to be retrained on every run: reuse the saved copy
# when one exists and only train (and save) from scratch otherwise.
# Delete the file to force retraining, e.g. after changing the preprocessing.
import os

CBOW_MODEL_PATH = "cbow.model"

if os.path.exists(CBOW_MODEL_PATH):
    wc_cbow_model = Word2Vec.load(CBOW_MODEL_PATH)
else:
    wc_cbow_model = Word2Vec(
        sentences=combined_text,
        vector_size=100,
        window=5,
        min_count=1,
        workers=2,
        epochs=30,
    )
    wc_cbow_model.save(CBOW_MODEL_PATH)

KeyboardInterrupt: 

To implement QA

1. Word Embeddings, using CBOW
2. Feature Extraction 1 - POS tags
3. Feature Extraction 2 - TF-IDF
4. Feature Extraction 3 - NER


In [18]:
# run this if model in directory 
wc_cbow_model = Word2Vec.load("./cbow.model")

In [19]:
def get_word_embeddings(doc):
    """Return the CBOW vector of every token of ``doc``, in token order.

    ``doc`` is tokenised with ``word_tokenize`` and each token is looked up in
    ``wc_cbow_model.wv``.
    """
    tokens = word_tokenize(doc)
    return [wc_cbow_model.wv[token] for token in tokens]


train_doc_ques["Doc_Embeddings"] = train_doc_ques["Document"].apply(get_word_embeddings)
train_doc_ques["Q_Embeddings"] = train_doc_ques["Question"].apply(get_word_embeddings)
test_doc_ques["Doc_Embeddings"] = test_doc_ques["Document"].apply(get_word_embeddings)
test_doc_ques["Q_Embeddings"] = test_doc_ques["Question"].apply(get_word_embeddings)

In [20]:
train_doc_ques["Doc_Tokens"] = train_doc_ques["Document"].apply(word_tokenize)
train_doc_ques["Q_Tokens"] = train_doc_ques["Question"].apply(word_tokenize)
test_doc_ques["Doc_Tokens"] = test_doc_ques["Document"].apply(word_tokenize)
test_doc_ques["Q_Tokens"] = test_doc_ques["Question"].apply(word_tokenize)

In [21]:
def check_count(doc):
    """Count the rows whose embedding and token lists differ in length.

    A row is counted once if its ``Doc_Embeddings``/``Doc_Tokens`` lengths
    differ or, failing that, if its ``Q_Embeddings``/``Q_Tokens`` lengths
    differ.  Rows are addressed by the labels ``0 .. len(doc) - 1``.
    """

    def is_mismatched(row):
        return len(doc["Doc_Embeddings"][row]) != len(doc["Doc_Tokens"][row]) or len(
            doc["Q_Embeddings"][row]
        ) != len(doc["Q_Tokens"][row])

    return sum(1 for row in range(len(doc)) if is_mismatched(row))


check_count(train_doc_ques)  # looks good

0

Note: the POS tags and NER tags still need to be converted into embeddings. After this, pad the questions and documents to the max question/document length in the combined training and test set.

### PoS Tagging


In [22]:
# Apply the pos tags to the tokens
from nltk.tag import pos_tag

# download the dependency and resource as required
nltk.download("averaged_perceptron_tagger")

train_doc_ques["Doc_POS"] = train_doc_ques["Doc_Tokens"].apply(pos_tag)
train_doc_ques["Q_POS"] = train_doc_ques["Q_Tokens"].apply(pos_tag)
test_doc_ques["Doc_POS"] = test_doc_ques["Doc_Tokens"].apply(pos_tag)
test_doc_ques["Q_POS"] = test_doc_ques["Q_Tokens"].apply(pos_tag)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [23]:
# checking the POS tags: # looks ok
test_doc_ques["Q_POS"][0]

[('how', 'WRB'),
 ('african', 'JJ'),
 ('americans', 'NNS'),
 ('were', 'VBD'),
 ('immigrated', 'VBN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('us', 'PRP')]

In [24]:
# Extract all unique POS Tags
all_pos_tags = (
    train_doc_ques["Doc_POS"].tolist()
    + test_doc_ques["Doc_POS"].tolist()
    + train_doc_ques["Q_POS"].tolist()
    + test_doc_ques["Q_POS"].tolist()
)


def get_unique_pos(data):
    """Map every distinct tag found in ``data`` to an integer index.

    ``data`` is an iterable of tagged items, each an iterable of
    ``(token, tag)`` pairs.  Tags are numbered from 0 in sorted order.
    """
    distinct_tags = {tag for tagged_tokens in data for _, tag in tagged_tokens}
    return {tag: position for position, tag in enumerate(sorted(distinct_tags))}


pos_iden = get_unique_pos(all_pos_tags)  # list of tags
pos_iden

{'$': 0,
 'CC': 1,
 'CD': 2,
 'DT': 3,
 'EX': 4,
 'FW': 5,
 'IN': 6,
 'JJ': 7,
 'JJR': 8,
 'JJS': 9,
 'MD': 10,
 'NN': 11,
 'NNP': 12,
 'NNPS': 13,
 'NNS': 14,
 'PDT': 15,
 'POS': 16,
 'PRP': 17,
 'PRP$': 18,
 'RB': 19,
 'RBR': 20,
 'RBS': 21,
 'RP': 22,
 'SYM': 23,
 'TO': 24,
 'UH': 25,
 'VB': 26,
 'VBD': 27,
 'VBG': 28,
 'VBN': 29,
 'VBP': 30,
 'VBZ': 31,
 'WDT': 32,
 'WP': 33,
 'WP$': 34,
 'WRB': 35}

### NER Tagging


### Steps to run this:

-   pip install spacy
-   python -m spacy download en_core_web_sm

If loaded for the first time, restart kernel


In [25]:
# nltk using Spacy
# pip install -U spacy
!pip install -U spacy
!python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

# loading pre-trained model of NER
nlp = en_core_web_sm.load()

Collecting spacy
  Downloading spacy-3.5.3-cp39-cp39-macosx_10_9_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 11.7 MB/s eta 0:00:01
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.5.2
    Uninstalling spacy-3.5.2:
      Successfully uninstalled spacy-3.5.2
Successfully installed spacy-3.5.3
2023-05-16 18:48:15.992279: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 229 kB/s eta 0:00:01    |████████████████▋               | 6.6 MB 3.5 MB/s eta 0:00:

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


2023-05-16 18:48:53.847284: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [28]:
def ner_tagging(texts):
    """Attach a spaCy entity type to every token of every pre-tokenised text.

    Each element of ``texts`` is a list of words.  A ``Doc`` is built directly
    from those words with ``nlp.vocab`` and only the pipeline's ``"ner"``
    component is applied to it.

    Returns:
        One list per text of ``(token_text, entity_type)`` tuples, where
        ``entity_type`` is the token's ``ent_type_``.
    """

    def tag_words(words):
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        nlp.get_pipe("ner")(doc)
        return [(token.text, token.ent_type_) for token in doc]

    return [tag_words(words) for words in texts]

In [29]:
# Will take a while...
train_doc_ques["Doc_NER"] = ner_tagging(train_doc_ques["Doc_Tokens"])
train_doc_ques["Q_NER"] = ner_tagging(train_doc_ques["Q_Tokens"])
test_doc_ques["Doc_NER"] = ner_tagging(test_doc_ques["Doc_Tokens"])
test_doc_ques["Q_NER"] = ner_tagging(test_doc_ques["Q_Tokens"])

In [30]:
# Similar approach to the POS

# Gather every NER-tagged token list (documents and questions, train and test)
# into one list; the unique NER tags are extracted from it below.
all_ner_tags = (
    train_doc_ques["Doc_NER"].tolist()
    + test_doc_ques["Doc_NER"].tolist()
    + train_doc_ques["Q_NER"].tolist()
    + test_doc_ques["Q_NER"].tolist()
)


def get_unique_ner(data):
    """Map every distinct NER tag found in ``data`` to an integer index.

    ``data`` is an iterable of tagged items, each an iterable of
    ``(token, ner_tag)`` pairs.  Tags are numbered from 0 in sorted order.
    """
    seen_tags = set()
    for tagged_tokens in data:
        seen_tags.update(label for _, label in tagged_tokens)
    ordered_tags = sorted(seen_tags)
    return dict(zip(ordered_tags, range(len(ordered_tags))))


# Use the NER-specific helper defined above (the POS helper was called here
# before, which left get_unique_ner unused).
ner_iden = get_unique_ner(all_ner_tags)  # dict mapping each NER tag to its index
ner_iden

{'': 0,
 'CARDINAL': 1,
 'DATE': 2,
 'EVENT': 3,
 'FAC': 4,
 'GPE': 5,
 'LANGUAGE': 6,
 'LAW': 7,
 'LOC': 8,
 'MONEY': 9,
 'NORP': 10,
 'ORDINAL': 11,
 'ORG': 12,
 'PERCENT': 13,
 'PERSON': 14,
 'PRODUCT': 15,
 'QUANTITY': 16,
 'TIME': 17,
 'WORK_OF_ART': 18}

In [31]:
# check ohv dims
ner_idx = ner_iden.values()
aa = np.eye(max(ner_idx) + 1)
#aa

### TF-IDF

First, calculate the document frequency of each token. This is done separately for the training corpus and the test corpus, where each corpus consists of that split's documents and questions. The result is a dictionary where each token is a key and its value is the document frequency.


In [32]:
def document_frequency(corpus):
    """
    Computes the document frequency for every token in the corpus.

    ``corpus`` is an iterable of tokenised documents (lists of string tokens).
    A token is counted once per document it appears in, however many times it
    occurs there.

    Returns a dictionary {token: doc_freq, ...} keyed by plain ``str`` tokens.
    """
    doc_freq = {}
    for document in corpus:
        # sorted(set(...)) visits each distinct token once, in a deterministic
        # order, so the dictionary's insertion order is reproducible.
        for token in sorted(set(document)):
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq


train_corpus = (
    train_doc_ques["Doc_Tokens"].tolist() + train_doc_ques["Q_Tokens"].tolist()
)
test_corpus = test_doc_ques["Doc_Tokens"].tolist() + test_doc_ques["Q_Tokens"].tolist()
train_doc_freq = document_frequency(train_corpus)
test_doc_freq = document_frequency(test_corpus)

Now calculate TF-IDF using the document frequency from above.


In [33]:
from collections import Counter
import math


def compute_tf_idf(corpus, doc_frequency, num_documents=None):
    """
    Computes the term frequency inverse document frequency for every token in every document in the corpus.

    For a token occurring ``count`` times in a document of ``total`` tokens:

        tf  = count / total
        idf = log(N / (df + 1)) + 1
        tf-idf = tf * idf

    where ``df`` is the token's entry in ``doc_frequency`` (0 if the token is
    absent, which the ``+ 1`` smoothing accommodates).

    Args:
        corpus: list of tokenised documents (lists of tokens).
        doc_frequency: dictionary {token: doc_freq, ...}.
        num_documents: value to use for ``N``.  When omitted, ``N`` is
            ``len(doc_frequency)``, i.e. the number of distinct tokens, which
            is the value this function has always used.
            NOTE(review): IDF is normally defined with ``N`` equal to the
            number of documents the frequencies were counted over; pass that
            count here to get the conventional weighting.

    Returns a list the same shape as the list of tokenized documents except every token is replaced with the tf-idf
    for that token.
    """
    N = len(doc_frequency) if num_documents is None else num_documents
    tf_idf_list = []
    for document in corpus:
        total_num_words = len(document)
        # Weights are kept per document only; nothing from earlier documents
        # is needed once their list has been emitted.
        weights = {}
        for token, count in Counter(document).items():
            tf = count / total_num_words
            df = doc_frequency.get(token, 0)
            idf = math.log(N / (df + 1)) + 1
            weights[token] = tf * idf
        tf_idf_list.append([weights[token] for token in document])
    return tf_idf_list


train_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    train_doc_ques["Doc_Tokens"].tolist(), train_doc_freq
)
train_doc_ques["Q_TFIDF"] = compute_tf_idf(
    train_doc_ques["Q_Tokens"].tolist(), train_doc_freq
)
test_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    test_doc_ques["Doc_Tokens"].tolist(), test_doc_freq
)
test_doc_ques["Q_TFIDF"] = compute_tf_idf(
    test_doc_ques["Q_Tokens"].tolist(), test_doc_freq
)

In [34]:
test_doc_ques

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER,Doc_TFIDF,Q_TFIDF
0,african immigration to the united states refer...,how african americans were immigrated to the us,"[[0.88913774, -0.053637512, -0.34699965, 0.234...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[african, immigration, to, the, united, states...","[how, african, americans, were, immigrated, to...","[(african, JJ), (immigration, NN), (to, TO), (...","[(how, WRB), (african, JJ), (americans, NNS), ...","[(african, ORG), (immigration, ORG), (to, ), (...","[(how, ), (african, NORP), (americans, NORP), ...","[0.2444438957631187, 0.16900818614783913, 0.28...","[0.7085526392283996, 0.934997901293929, 0.9907..."
1,a prison from old french prisoun also known...,how large were early jails,"[[0.18600275, -0.2672038, -2.469172, 1.7943542...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[a, prison, from, old, french, prisoun, also, ...","[how, large, were, early, jails]","[(a, DT), (prison, NN), (from, IN), (old, JJ),...","[(how, WRB), (large, JJ), (were, VBD), (early,...","[(a, ), (prison, ), (from, ), (old, ), (french...","[(how, ), (large, ), (were, ), (early, ), (jai...","[0.23736000397378015, 0.15055751080418858, 0.0...","[1.1336842227654393, 1.2988332831657334, 1.145..."
2,a small electrically powered pump a large el...,how a water pump works,"[[0.18600275, -0.2672038, -2.469172, 1.7943542...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[a, small, electrically, powered, pump, a, lar...","[how, a, water, pump, works]","[(a, DT), (small, JJ), (electrically, RB), (po...","[(how, WRB), (a, DT), (water, NN), (pump, NN),...","[(a, ), (small, ), (electrically, ), (powered,...","[(how, ), (a, ), (water, ), (pump, ), (works, )]","[0.18179077227423296, 0.07355471089668812, 0.1...","[1.1336842227654393, 0.8362375524614717, 1.412..."
3,lolita is a 1962 comedy drama film by stanley ...,how old was sue lyon when she made lolita,"[[-0.24463412, 0.06878885, 0.363463, 0.1364551...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[lolita, is, a, 1962, comedy, drama, film, by,...","[how, old, was, sue, lyon, when, she, made, lo...","[(lolita, NN), (is, VBZ), (a, DT), (1962, CD),...","[(how, WRB), (old, JJ), (was, VBD), (sue, NN),...","[(lolita, ), (is, ), (a, ), (1962, DATE), (com...","[(how, ), (old, ), (was, ), (sue, PERSON), (ly...","[0.22677748220200447, 0.031618095192546124, 0....","[0.6298245682030218, 0.7657996161610346, 0.528..."
4,each antibody binds to a specific antigen an...,how are antibodies used in,"[[1.2020115, 2.6943376, -5.0276637, 3.097347, ...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[each, antibody, binds, to, a, specific, antig...","[how, are, antibodies, used, in]","[(each, DT), (antibody, NN), (binds, VBZ), (to...","[(how, WRB), (are, VBP), (antibodies, NNS), (u...","[(each, ), (antibody, ), (binds, ), (to, ), (a...","[(how, ), (are, ), (antibodies, ), (used, ), (...","[0.05405618374532114, 0.2406872267443911, 0.01...","[1.1336842227654393, 0.9670455719243073, 1.862..."
...,...,...,...,...,...,...,...,...,...,...,...,...
625,american cuts of beef including the brisket br...,where is the brisket from,"[[-1.6133411, 0.7031535, 0.6441129, -0.7412261...","[[0.20251973, -2.1292784, -0.22558218, 2.18635...","[american, cuts, of, beef, including, the, bri...","[where, is, the, brisket, from]","[(american, JJ), (cuts, NNS), (of, IN), (beef,...","[(where, WRB), (is, VBZ), (the, DT), (brisket,...","[(american, NORP), (cuts, ), (of, ), (beef, ),...","[(where, ), (is, ), (the, ), (brisket, ), (fro...","[0.044673596326364536, 0.15610157311465894, 0....","[1.1856348821507936, 0.8030996178906715, 0.792..."
626,the arm architecture describes a family of ris...,what is arm chipset,"[[-0.024290355, -0.34677127, -0.9102898, 0.579...","[[1.6417806, -0.42128003, -2.0681963, -3.08729...","[the, arm, architecture, describes, a, family,...","[what, is, arm, chipset]","[(the, DT), (arm, NN), (architecture, NN), (de...","[(what, WP), (is, VBZ), (arm, JJ), (chipset, NN)]","[(the, ), (arm, ), (architecture, ), (describe...","[(what, ), (is, ), (arm, ), (chipset, )]","[0.13240256299185374, 0.2925150013371109, 0.06...","[1.200426291895723, 1.0038745223633394, 2.1882..."
627,june bug or junebug may refer to beetles phy...,what is the life span of june bugs,"[[-3.2918558, 0.07746171, 2.024428, 1.5457352,...","[[1.6417806, -0.42128003, -2.0681963, -3.08729...","[june, bug, or, junebug, may, refer, to, beetl...","[what, is, the, life, span, of, june, bugs]","[(june, NN), (bug, NN), (or, CC), (junebug, NN...","[(what, WP), (is, VBZ), (the, DT), (life, NN),...","[(june, DATE), (bug, ), (or, ), (junebug, ), (...","[(what, ), (is, ), (the, ), (life, ), (span, )...","[0.3625851507468464, 0.345024546712403, 0.0560...","[0.6002131459478615, 0.5019372611816697, 0.495..."
628,this is a list of known biological mothers und...,who is the youngest female to give birth world...,"[[-1.4733256, 1.7044597, -5.0576773, 2.08202, ...","[[-0.20219268, 0.45227763, -1.5316664, -0.1189...","[this, is, a, list, of, known, biological, mot...","[who, is, the, youngest, female, to, give, bir...","[(this, DT), (is, VBZ), (a, DT), (list, NN), (...","[(who, WP), (is, VBZ), (the, DT), (youngest, J...","[(this, ), (is, ), (a, ), (list, ), (of, ), (k...","[(who, ), (is, ), (the, ), (youngest, ), (fema...","[0.4279277383207209, 0.3088844684194891, 0.321...","[0.532822100709197, 0.40154980894533576, 0.396..."


In [35]:
train_doc_ques

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER,Doc_TFIDF,Q_TFIDF
0,a partly submerged glacier cave on perito more...,how are glacier caves formed,"[[0.18600275, -0.2672038, -2.469172, 1.7943542...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[a, partly, submerged, glacier, cave, on, peri...","[how, are, glacier, caves, formed]","[(a, DT), (partly, RB), (submerged, VBN), (gla...","[(how, WRB), (are, VBP), (glacier, JJ), (caves...","[(a, ), (partly, ), (submerged, ), (glacier, )...","[(how, ), (are, ), (glacier, ), (caves, ), (fo...","[0.24679288919367473, 0.145118630440616, 0.161...","[1.0342546228402965, 0.8500722992276416, 1.989..."
1,in physics circular motion is a movement of ...,how are the directions of the velocity and for...,"[[-1.5330905, 0.4225784, -0.2228741, 0.4760204...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[in, physics, circular, motion, is, a, movemen...","[how, are, the, directions, of, the, velocity,...","[(in, IN), (physics, NNS), (circular, JJ), (mo...","[(how, WRB), (are, VBP), (the, DT), (direction...","[(in, ), (physics, ), (circular, ), (motion, )...","[(how, ), (are, ), (the, ), (directions, ), (o...","[0.10158600494541081, 0.047978500911022494, 0....","[0.34475154094676547, 0.2833574330758805, 0.45..."
2,apollo creed is a fictional character from the...,how did apollo creed die,"[[-0.27247757, 0.75547856, 0.87444097, -0.0673...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[apollo, creed, is, a, fictional, character, f...","[how, did, apollo, creed, die]","[(apollo, NNS), (creed, VBP), (is, VBZ), (a, D...","[(how, WRB), (did, VBD), (apollo, VB), (creed,...","[(apollo, ORG), (creed, ), (is, ), (a, ), (fic...","[(how, ), (did, ), (apollo, ORG), (creed, ), (...","[0.1506184153725464, 0.2875274053249442, 0.057...","[1.0342546228402965, 1.1571023666681808, 1.787..."
3,in the united states the title of federal jud...,how long is the term for federal judges,"[[-1.5330905, 0.4225784, -0.2228741, 0.4760204...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[in, the, united, states, the, title, of, fede...","[how, long, is, the, term, for, federal, judges]","[(in, IN), (the, DT), (united, JJ), (states, V...","[(how, WRB), (long, JJ), (is, VBZ), (the, DT),...","[(in, ), (the, GPE), (united, GPE), (states, G...","[(how, ), (long, ), (is, ), (the, ), (term, ),...","[0.08829203392786322, 0.3204713483897852, 0.16...","[0.6464091392751853, 0.7778250162349475, 0.429..."
4,the beretta 21a bobcat is a small pocket sized...,how a beretta model 21 pistols magazines works,"[[-0.024290355, -0.34677127, -0.9102898, 0.579...","[[0.25111866, -0.7270332, -0.56952626, 1.05881...","[the, beretta, 21a, bobcat, is, a, small, pock...","[how, a, beretta, model, 21, pistols, magazine...","[(the, DT), (beretta, NN), (21a, CD), (bobcat,...","[(how, WRB), (a, DT), (beretta, NN), (model, N...","[(the, ), (beretta, ), (21a, ), (bobcat, ), (i...","[(how, ), (a, ), (beretta, PRODUCT), (model, )...","[0.21161559146390707, 0.8290244630930195, 0.22...","[0.6464091392751853, 0.4550243894508378, 1.243..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2112,blue mountain state is an american comedy seri...,where was blue mountain state filmed at,"[[-0.14039661, -0.07358733, 0.9209801, -0.3817...","[[0.20251973, -2.1292784, -0.22558218, 2.18635...","[blue, mountain, state, is, an, american, come...","[where, was, blue, mountain, state, filmed, at]","[(blue, JJ), (mountain, NN), (state, NN), (is,...","[(where, WRB), (was, VBD), (blue, JJ), (mounta...","[(blue, LOC), (mountain, LOC), (state, ), (is,...","[(where, ), (was, ), (blue, ), (mountain, ), (...","[0.29476307406437613, 0.3823599012669257, 0.22...","[0.7503384296942396, 0.5907013675064214, 1.063..."
2113,apple inc formerly apple computer inc is ...,when was apple computer founded,"[[0.071747534, 1.0473902, 1.9560602, 0.4480185...","[[-4.2278485, -2.5841281, -0.2910819, 0.805927...","[apple, inc, formerly, apple, computer, inc, i...","[when, was, apple, computer, founded]","[(apple, NN), (inc, VBP), (formerly, RB), (app...","[(when, WRB), (was, VBD), (apple, NN), (comput...","[(apple, ORG), (inc, ORG), (formerly, ORG), (a...","[(when, ), (was, ), (apple, ), (computer, ), (...","[0.26158075612861253, 0.089435411305154, 0.029...","[0.9915721940674992, 0.8269819145089902, 1.700..."
2114,section 8 housing in the south bronx section 8...,what is section eight housing,"[[1.0145441, 0.23937027, -1.1242148, 0.8756493...","[[1.6417806, -0.42128003, -2.0681963, -3.08729...","[section, 8, housing, in, the, south, bronx, s...","[what, is, section, eight, housing]","[(section, NN), (8, CD), (housing, NN), (in, I...","[(what, WP), (is, VBZ), (section, NN), (eight,...","[(section, LAW), (8, LAW), (housing, ), (in, )...","[(what, ), (is, ), (section, ), (eight, CARDIN...","[0.19207867615144178, 0.16090249029058593, 0.2...","[0.8426670929983164, 0.6879048840289103, 1.514..."
2115,restaurants categorized by type and informatio...,what is the main type of restaurant,"[[0.005273128, 1.09162, 0.88325566, 1.41998, 0...","[[1.6417806, -0.42128003, -2.0681963, -3.08729...","[restaurants, categorized, by, type, and, info...","[what, is, the, main, type, of, restaurant]","[(restaurants, NNS), (categorized, VBN), (by, ...","[(what, WP), (is, VBZ), (the, DT), (main, JJ),...","[(restaurants, ), (categorized, ), (by, ), (ty...","[(what, ), (is, ), (the, ), (main, ), (type, )...","[0.8769638560774586, 0.9255146376556289, 0.404...","[0.6019050664273689, 0.4913606314492216, 0.483..."


In [36]:
def _one_hot_rows(tagged_items, tag_to_index, one_hot_matrix):
    """Replace every ``(token, tag)`` pair with the one-hot row of its tag."""
    return [
        [one_hot_matrix[tag_to_index[pair[1]]] for pair in item]
        for item in tagged_items
    ]


def one_hot_vectorize(
    pos_tagger, ner_tagger, data
):  # pass in the unique dict for ner or pos
    """One-hot encode the POS and NER tags of the documents and questions.

    Args:
        pos_tagger: dict mapping each POS tag to its integer index.
        ner_tagger: dict mapping each NER tag to its integer index.
        data: mapping/frame with ``Doc_POS``, ``Q_POS``, ``Doc_NER`` and
            ``Q_NER`` entries, each a sequence of lists of ``(token, tag)``
            pairs.

    Returns:
        A tuple ``(doc_pos, q_pos, doc_ner, q_ner)``.  Each element mirrors
        the corresponding input column, with every pair replaced by a row of
        an identity matrix whose size is the largest tag index plus one.

    Raises:
        KeyError: if a tag is missing from the relevant dictionary.
    """
    pos_ohv = np.eye(max(pos_tagger.values()) + 1)  # one row per POS index
    ner_ohv = np.eye(max(ner_tagger.values()) + 1)  # one row per NER index

    dpos_full_ohv = _one_hot_rows(data["Doc_POS"], pos_tagger, pos_ohv)
    qpos_full_ohv = _one_hot_rows(data["Q_POS"], pos_tagger, pos_ohv)
    dner_full_ohv = _one_hot_rows(data["Doc_NER"], ner_tagger, ner_ohv)
    qner_full_ohv = _one_hot_rows(data["Q_NER"], ner_tagger, ner_ohv)

    return (dpos_full_ohv, qpos_full_ohv, dner_full_ohv, qner_full_ohv)

In [37]:
# get the ohv for doc
(
    train_doc_pos_ohv,
    train_q_pos_ohv,
    train_doc_ner_ohv,
    train_q_ner_ohv,
) = one_hot_vectorize(pos_iden, ner_iden, train_doc_ques)
test_doc_pos_ohv, test_q_pos_ohv, test_doc_ner_ohv, test_q_ner_ohv = one_hot_vectorize(
    pos_iden, ner_iden, test_doc_ques
)

In [39]:
# reduce the dataframe to just tokens and embeddings:
doc_emb_train = train_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": train_doc_pos_ohv, "Doc_NER": train_doc_ner_ohv})
doc_emb_train = pd.concat([doc_emb_train, doc_pos_ner], axis=1)

q_emb_train = train_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": train_q_pos_ohv, "Q_NER": train_q_ner_ohv})
q_emb_train = pd.concat([q_emb_train, q_pos_ner], axis=1)

In [40]:
doc_emb_test = test_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": test_doc_pos_ohv, "Doc_NER": test_doc_ner_ohv})
doc_emb_test = pd.concat([doc_emb_test, doc_pos_ner], axis=1)

q_emb_test = test_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": test_q_pos_ohv, "Q_NER": test_q_ner_ohv})
q_emb_test = pd.concat([q_emb_test, q_pos_ner], axis=1)

### Word Embeddings (Doc and Qn)

The embeddings of the questions and documents of the train and test set can be found here:

-   Train Document - doc_emb_train
-   Train Q - q_emb_train
-   Test Document - doc_emb_test
-   Test Q - q_emb_test

The max_document size is 1675 and max_question size is 23.


In [41]:
doc_emb_train.loc[0]

Doc_Tokens        [a, partly, submerged, glacier, cave, on, peri...
Doc_Embeddings    [[0.18600275, -0.2672038, -2.469172, 1.7943542...
Doc_TFIDF         [0.24679288919367473, 0.145118630440616, 0.161...
Doc_POS           [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Doc_NER           [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: 0, dtype: object

In [42]:
len(doc_emb_train)

2117

In [43]:
def full_array(data, data_type="Document", max_length=None, num_vec_length=156):
    """Concatenate the per-token features of every row and pad to a fixed length.

    The columns of ``data`` are read by position: 0 = tokens, 1 = Word2Vec
    vectors, 2 = TF-IDF scores, 3 = POS one-hot vectors, 4 = NER one-hot
    vectors. For each token the four feature groups are flattened into one
    list; each row is then padded with zero vectors up to the target length.

    Args:
        data: DataFrame with the five columns above. Rows are looked up with
            ``data.loc[i]`` for ``i in range(len(data))``, so the index must
            carry the labels 0..n-1.
        data_type: ``"Document"`` (pad to 1675) or ``"Question"`` (pad to 23).
        max_length: Optional override for the padded length.
        num_vec_length: Width of the zero vectors used as padding. Should equal
            the width of the concatenated feature vector (156 by default).

    Returns:
        A list with one entry per row; each entry is a list of per-token
        feature lists followed by zero-vector padding. Rows already longer
        than the target length are returned unpadded and untruncated.

    Raises:
        ValueError: If ``data_type`` is neither ``"Document"`` nor ``"Question"``.
    """
    default_lengths = {"Document": 1675, "Question": 23}
    if data_type not in default_lengths:
        raise ValueError(
            "data_type must be 'Document' or 'Question', got %r" % (data_type,)
        )
    target_length = default_lengths[data_type] if max_length is None else max_length
    # One shared ndarray is appended for every padding slot, as before.
    zero_vec = np.zeros(num_vec_length)

    full_vec = []
    for dat in range(len(data)):  # go through each line
        row = data.loc[dat]
        tokens = row.iloc[0]
        embeddings, tfidf, pos, ner = row.iloc[1], row.iloc[2], row.iloc[3], row.iloc[4]

        token_vectors = []
        for j in range(len(tokens)):
            parts = [
                embeddings[j].tolist(),  # Word2Vec
                tfidf[j],  # TF-IDF (a scalar, wrapped below)
                pos[j].tolist(),  # POS
                ner[j].tolist(),  # NER
            ]
            token_vectors.append(
                [
                    item
                    for part in parts
                    for item in (part if isinstance(part, list) else [part])
                ]
            )

        # A negative difference multiplies to an empty list, i.e. no padding.
        token_vectors.extend([zero_vec] * (target_length - len(token_vectors)))
        full_vec.append(token_vectors)
    return full_vec

In [44]:
# Training/Test Documents to pass in, takes about a min
# Each result is a nested list: one entry per row, padded to the document length.
final_doc_train = full_array(doc_emb_train, data_type="Document")
final_doc_test = full_array(doc_emb_test, data_type="Document")

In [45]:
# Training/Test Questions to pass in, takes about a few seconds
# Same construction as for the documents, padded to the question length.
final_qn_train = full_array(q_emb_train, data_type="Question")
final_qn_test = full_array(q_emb_test, data_type="Question")

### Converting into Tensors:


In [46]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# This module-level `device` is read by the model and training cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# takes a min
# Convert the padded nested lists produced by full_array into tensors that
# live on `device`. This cell needs the full_array cells to have run first.
tf_final_doc_train = torch.tensor(final_doc_train, device=device)
tf_final_doc_test = torch.tensor(final_doc_test, device=device)
tf_final_qn_train = torch.tensor(final_qn_train, device=device)
tf_final_qn_test = torch.tensor(final_qn_test, device=device)

NameError: name 'final_doc_train' is not defined

In [12]:
# check dimensions
# Presumably (rows, padded length, feature width) for each tensor - confirm
# against the printed output after a fresh run.
print(tf_final_doc_train.shape)
print(tf_final_doc_test.shape)
print(tf_final_qn_train.shape)
print(tf_final_qn_test.shape)

NameError: name 'tf_final_doc_train' is not defined

**Input Embedding Ablation Study**

In the model input embedding Ablation study, we are given 3 variations of input embeddings to test. We will test 3 options:

1. Word2Vec only # 100 dims
2. Word2Vec + Tf-IDF # 101 dims
3. Word2Vec + all features (TF-IDF, POS, NER) # 156 dims

Since we are using tensors, we can use tensor slicing to take out the relevant features.
Our embedding tensors are laid out along the last dimension as (w2v, TF-IDF, POS, NER), so a prefix slice of that dimension selects each option.

In [50]:
def convert_tensors(tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test, option=3):
    """Select the input-feature subset used by one ablation option.

    The last dimension of every tensor is assumed to be laid out as
    (Word2Vec, TF-IDF, POS, NER); each option keeps a prefix of it.

    Args:
        tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test: 3-D tensors whose
            last dimension holds the concatenated features.
        option: 1 keeps the first 100 features (Word2Vec only), 2 keeps the
            first 101 (Word2Vec + TF-IDF), 3 keeps everything.

    Returns:
        A 4-tuple of tensors in the same order as the arguments. For option 3
        the input tensors are returned unchanged; otherwise sliced views.

    Raises:
        ValueError: If ``option`` is not 1, 2 or 3 (previously this silently
            returned ``None``).
    """
    # Number of leading features to keep per option; None means "no slicing".
    kept_features = {1: 100, 2: 101, 3: None}
    if option not in kept_features:
        raise ValueError("option must be 1, 2 or 3, got %r" % (option,))

    tensors = (tf_doc_train, tf_doc_test, tf_qn_train, tf_qn_test)
    width = kept_features[option]
    if width is None:
        return tensors
    return tuple(tensor[:, :, :width] for tensor in tensors)

In [51]:
# change option to see size
# Shows the shape of the first returned tensor (training documents) for option 1.
(convert_tensors(tf_final_doc_train, tf_final_doc_test, tf_final_qn_train, tf_final_qn_test, 1)[0]).shape

torch.Size([2117, 1675, 100])

Our answer should perhaps also be in the form of (1x1675) list containing ['N','S','I','E']

Additionally, the labels should be one-hot vectorised, as they are categorical.


In [65]:
def convert_labels(labels, max_len=1675, pad_label='N'):
    """Pad every label sequence to a common length.

    Args:
        labels: Iterable of label sequences (lists of tag strings).
        max_len: Target length; shorter sequences are right-padded up to it.
            Defaults to 1675, the value this cell used originally.
        pad_label: Tag appended as padding.

    Returns:
        A new list of new lists. Sequences already at or beyond ``max_len``
        are copied as they are (not truncated). The input sequences are left
        untouched; the earlier implementation padded them in place, which
        silently modified the caller's data.
    """
    return [
        list(sequence) + [pad_label] * (max_len - len(sequence))
        for sequence in labels
    ]
# Pad the train / test answer-label sequences to the fixed document length.
tr_labels = convert_labels(train_doc_ans_labels)
ts_labels = convert_labels(test_doc_ans_labels)

In [101]:
def one_hot_encode_labels(labels):
    """One-hot encode sequences of 'N' / 'S' / 'I' / 'E' tags.

    Args:
        labels: Iterable of tag sequences.

    Returns:
        A list with one float array of shape (len(sequence), 4) per input
        sequence; column order is N, S, I, E.

    Raises:
        KeyError: If a tag outside the four known labels is encountered.
    """
    label_to_int = {'N': 0, 'S': 1, 'I': 2, 'E': 3}

    # Translate every sequence first so an unknown tag fails before any
    # array is built.
    index_sequences = []
    for sequence in labels:
        index_sequences.append([label_to_int[label] for label in sequence])

    # Row k of the identity matrix is the one-hot vector for class k, so
    # indexing with a list of class ids yields the encoded sequence.
    identity = np.eye(4)
    one_hot_labels = []
    for indices in index_sequences:
        one_hot_labels.append(identity[indices])
    return one_hot_labels

# One-hot encode the padded train / test label sequences.
tr_encoded = one_hot_encode_labels(tr_labels)
tst_encoded = one_hot_encode_labels(ts_labels)

### Model

In [None]:
import torch
import torch.nn as nn

# Bi-LSTM for Document Portion


class BiLSTMModel(nn.Module):
    """Two independent bidirectional LSTMs, one for documents and one for questions.

    Args:
        embedding_dim: Size of each input feature vector.
        hidden_dim: Hidden size per direction.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers):
        super(BiLSTMModel, self).__init__()

        # Both encoders share the same hyper-parameters but not their weights.
        self.document_lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True
        )
        self.question_lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True
        )

    def forward(self, document, question):
        """Encode a batch of documents and a batch of questions.

        Args:
            document: Tensor of shape (batch, doc_len, embedding_dim).
            question: Tensor of shape (batch, qn_len, embedding_dim).

        Returns:
            ``(document_output, question_output)``: the per-token outputs of
            the two LSTMs, each with last dimension ``2 * hidden_dim``.
            The previous code returned an undefined name ``output`` and
            therefore raised ``NameError`` on every call.
        """
        document_output, _ = self.document_lstm(document)
        question_output, _ = self.question_lstm(question)
        return document_output, question_output





In [None]:
from torch import Tensor
import torch.nn as nn
from typing import Literal, Dict, Type, Union
from enum import Enum


class NNType(Enum):
    """Recurrent layer families selectable by their lowercase string name."""

    RNN = "rnn"
    LSTM = "lstm"
    GRU = "gru"

    def __str__(self):
        # Print as the bare configuration string rather than "NNType.X".
        return f"{self.value}"


# Lookup table from an NNType member to the torch.nn recurrent layer class
# that the encoder / decoder instantiate for it.
NN_MAP: Dict[NNType, Type[Union[nn.RNN, nn.LSTM, nn.GRU]]] = {
    NNType.RNN: nn.RNN,
    NNType.LSTM: nn.LSTM,
    NNType.GRU: nn.GRU,
}


class EncoderBiRNN(nn.Module):
    """Bidirectional recurrent encoder that consumes one token per call.

    Args:
        hidden_size: Hidden size per direction; also used as the input size of
            the recurrent layer, so the embedding must produce vectors of
            this width.
        embedding: Module applied to the input before the recurrent layer.
        nn_type: ``"rnn"``, ``"lstm"`` or ``"gru"``.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(
        self,
        hidden_size: int,
        embedding: nn.Embedding,
        nn_type: Literal["rnn", "lstm", "gru"] = "rnn",
        num_layers=1,
    ):
        super(EncoderBiRNN, self).__init__()
        self.hidden_size = hidden_size
        # Remembered so initHidden can size the state for stacked layers.
        self.num_layers = num_layers
        self.embedding = embedding
        self.nn_type = NNType(nn_type)
        self.nn = NN_MAP[self.nn_type](
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: Tensor, hidden: Union[Tensor, tuple]):
        """Run a single time step.

        Args:
            input: Input for one token; it is embedded and reshaped to
                (seq_len=1, batch=1, features).
            hidden: Previous state - a tensor for RNN/GRU, an ``(h, c)`` tuple
                for LSTM.

        Returns:
            ``(output, hidden)`` exactly as produced by the recurrent layer.
        """
        embedded: Tensor = self.embedding(input).view(1, 1, -1)
        output: Tensor
        output, hidden = self.nn(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zero initial state on the module-level ``device``.

        The leading dimension is ``2 * num_layers`` (two directions per
        layer). It used to be hard-coded to 2, which only matched
        ``num_layers=1``. LSTM gets an ``(h_0, c_0)`` tuple.
        """
        state_shape = (2 * self.num_layers, 1, self.hidden_size)
        if self.nn_type == NNType.LSTM:
            return (
                torch.zeros(*state_shape, device=device),
                torch.zeros(*state_shape, device=device),
            )
        return torch.zeros(*state_shape, device=device)

In [None]:
import torch.nn.functional as F


class AttentionMethod(Enum):
    """Scoring functions the decoder can use to compute attention weights."""

    DOT_PRODUCT = "dot_product"
    SCALE_DOT_PRODUCT = "scale_dot_product"
    COSINE_SIMILARITY = "cosine_similarity"

    def __str__(self):
        # Print as the bare configuration string rather than "AttentionMethod.X".
        return f"{self.value}"


class DecoderBiRNN(nn.Module):
    """Bidirectional recurrent decoder with attention over encoder states.

    Each call to ``forward`` processes one token, attends over
    ``encoder_hiddens`` with the chosen scoring function, and emits
    log-probabilities over ``output_size`` classes.

    Args:
        hidden_size: Hidden size per direction (also the recurrent input size).
        output_size: Number of output classes.
        embedding: Module applied to the input before the recurrent layer.
        max_length: Stored on the instance; not used by the methods here.
        nn_type: ``"rnn"``, ``"lstm"`` or ``"gru"``.
        num_layers: Number of stacked recurrent layers. The two dot-product
            attention variants tile the encoder states exactly twice, so they
            only line up with ``num_layers=1``.
        dropout_p: Dropout probability applied to the embedded input.
        attention_method: ``"dot_product"``, ``"scale_dot_product"`` or
            ``"cosine_similarity"``.
    """

    def __init__(
        self,
        hidden_size: int,
        output_size: int,
        embedding: nn.Embedding,
        max_length: int,
        nn_type: Literal["rnn", "lstm", "gru"] = "rnn",
        num_layers=1,
        dropout_p=0.1,
        attention_method: Literal[
            "dot_product",
            "scale_dot_product",
            "cosine_similarity",
        ] = "dot_product",
    ):
        super(DecoderBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        # Remembered so initHidden can size the state for stacked layers.
        self.num_layers = num_layers
        self.embedding = embedding
        self.dropout = nn.Dropout(self.dropout_p)
        self.nn_type = NNType(nn_type)
        self.attention_method = AttentionMethod(attention_method)
        self.nn = NN_MAP[self.nn_type](
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Input is [context_fwd, h_fwd, context_bwd, h_bwd], each hidden_size wide.
        self.out = nn.Linear(self.hidden_size * 4, self.output_size)

    def cal_attention(self, hidden: Union[Tensor, tuple], encoder_hiddens: Tensor):
        """Attend over the encoder states and build the classifier input.

        Args:
            hidden: Decoder state after the current step - a tensor for
                RNN/GRU, an ``(h_n, c_n)`` tuple for LSTM (only ``h_n`` is used).
            encoder_hiddens: 2-D tensor of encoder states, one row per
                position, ``hidden_size`` wide.

        Returns:
            Tensor of shape (1, 4 * hidden_size): for each of the first two
            state slices, the attention context followed by the state itself.
        """
        # The LSTM and non-LSTM paths differ only in where h_n lives.
        h_n = hidden[0] if self.nn_type == NNType.LSTM else hidden

        if self.attention_method == AttentionMethod.COSINE_SIMILARITY:
            cosine_similarity = nn.CosineSimilarity(dim=-1)
            contexts = []
            for direction in (0, 1):
                # (1, 1, H) against (L, H) broadcasts to one score per position.
                attn_weights = F.softmax(
                    cosine_similarity(h_n[direction].unsqueeze(0), encoder_hiddens),
                    dim=-1,
                )
                attn_output = torch.bmm(
                    attn_weights.unsqueeze(0), encoder_hiddens.unsqueeze(0)
                )
                contexts.append(attn_output[0])
        else:
            # Dot-product scores for both directions in a single batched matmul.
            energy = torch.bmm(h_n, encoder_hiddens.T.repeat(2, 1, 1))
            if self.attention_method != AttentionMethod.DOT_PRODUCT:
                # Scaled dot-product: divide by sqrt(hidden_size).
                energy = energy / np.sqrt(self.hidden_size)
            attn_weights = F.softmax(energy, dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.repeat(2, 1, 1))
            contexts = [attn_output[0], attn_output[1]]

        concat_output = torch.cat((contexts[0], h_n[0], contexts[1], h_n[1]), 1)
        return concat_output

    def forward(
        self, input: Tensor, hidden: Union[Tensor, tuple], encoder_hiddens: Tensor
    ):
        """Run one decoding step.

        Args:
            input: Input for one token; it is embedded and reshaped to
                (seq_len=1, batch=1, features).
            hidden: Previous decoder state.
            encoder_hiddens: Encoder states to attend over.

        Returns:
            ``(output, hidden)`` where ``output`` holds log-probabilities of
            shape (1, output_size) and ``hidden`` is the updated state.
        """
        embedded: Tensor = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        _, hidden = self.nn(embedded, hidden)
        concat_output = self.cal_attention(hidden, encoder_hiddens)
        output = F.log_softmax(self.out(concat_output), dim=1)
        return output, hidden

    def initHidden(self):
        """Return a zero initial state on the module-level ``device``.

        The leading dimension is ``2 * num_layers`` (two directions per
        layer). It used to be hard-coded to 2, which only matched
        ``num_layers=1``. LSTM gets an ``(h_0, c_0)`` tuple.
        """
        state_shape = (2 * self.num_layers, 1, self.hidden_size)
        if self.nn_type == NNType.LSTM:
            return (
                torch.zeros(*state_shape, device=device),
                torch.zeros(*state_shape, device=device),
            )
        return torch.zeros(*state_shape, device=device)

### Training


In [53]:
# Padded sequence lengths, used as default arguments by train() below.
MAX_DOC_LENGTH = 1675  # Max doc length
MAX_QN_LENGTH = 23  # Max question length

In [None]:
def train_test(
    input_tensor,
    target_tensor,
    encoder,
    decoder,
    encoder_optimizer,
    decoder_optimizer,
    criterion,
):
    """Run one teacher-forced optimisation step without attention.

    Args:
        input_tensor: Sequence fed to the encoder one element at a time.
        target_tensor: Sequence used both as decoder input and as target.
        encoder: Callable ``(x, hidden) -> (output, hidden)`` with ``initHidden()``.
        decoder: Callable ``(x, hidden) -> (output, hidden)``.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimizer: Optimizer for the decoder parameters.
        criterion: Loss applied to ``(decoder_output, target_tensor[i])``.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    state = encoder.initHidden()

    # Clear gradients left over from the previous step.
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    n_inputs = input_tensor.size(0)
    n_targets = target_tensor.size(0)

    # Encode: only the final state is carried forward.
    for step in range(n_inputs):
        _, state = encoder(input_tensor[step], state)

    # Decode: step i sees the target of step i-1 (step 0 sees target 0), and
    # starts from the final encoder state.
    previous_target = target_tensor[0]
    decoder_state = state
    total_loss = 0
    for step in range(n_targets):
        prediction, decoder_state = decoder(previous_target, decoder_state)
        total_loss = total_loss + criterion(prediction, target_tensor[step])
        previous_target = target_tensor[step]

    total_loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return total_loss.item() / n_targets

In [None]:
def train(
    input_tensor,
    target_tensor,
    documentRNN,
    questionRNN,
    documentRNN_optimizer,
    questionRNN_optimizer,
    criterion,
    max_doc_length=MAX_DOC_LENGTH,
    max_qn_length=MAX_QN_LENGTH,
    nn_type="rnn",
):
    """Run one teacher-forced optimisation step with attention.

    Args:
        input_tensor: Sequence fed to ``documentRNN`` one element at a time.
        target_tensor: Sequence used both as ``questionRNN`` input and as target.
        documentRNN: Encoder exposing ``hidden_size``, ``initHidden()`` and
            ``(x, hidden) -> (output, hidden)``.
        questionRNN: Decoder exposing ``initHidden()`` and
            ``(x, hidden, encoder_hiddens) -> (output, hidden)``.
        documentRNN_optimizer: Optimizer for the encoder parameters.
        questionRNN_optimizer: Optimizer for the decoder parameters.
        criterion: Loss applied to ``(decoder_output, target_tensor[i])``.
        max_doc_length: Number of rows in the encoder-state buffer; the input
            must not be longer than this.
        max_qn_length: Accepted for interface compatibility; not used here.
        nn_type: Accepted for interface compatibility. Whether the state is an
            LSTM ``(h, c)`` tuple is now detected from the state itself.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Buffer of encoder states for the attention step. Each row receives one
    # hidden_size-wide state vector (see the loop below) and the decoder
    # multiplies it against hidden_size-wide decoder states, so the width must
    # be hidden_size; the earlier hidden_size * 2 could not be assigned into.
    documentRNN_hiddens = torch.zeros(
        max_doc_length, documentRNN.hidden_size, device=device
    )

    # zero-initialize the initial hidden states
    documentRNN_hidden = documentRNN.initHidden()
    questionRNN_hidden = questionRNN.initHidden()
    loss = 0
    documentRNN_optimizer.zero_grad()
    questionRNN_optimizer.zero_grad()

    # Feed the input through the encoder, recording the state after each token.
    for i in range(input_length):
        _, documentRNN_hidden = documentRNN(input_tensor[i], documentRNN_hidden)
        # LSTM state is (h_n, c_n); other cell types return h_n directly.
        h_n = (
            documentRNN_hidden[0]
            if isinstance(documentRNN_hidden, tuple)
            else documentRNN_hidden
        )
        # Slice 0 of the state, batch element 0.
        documentRNN_hiddens[i] = h_n[0, 0]

    # Set the initial input to the decoder as the first element of the target tensor
    questionRNN_input = target_tensor[0]

    # Teacher forcing: feed the target as the next input
    for i in range(target_length):
        questionRNN_output, questionRNN_hidden = questionRNN(
            questionRNN_input, questionRNN_hidden, documentRNN_hiddens
        )
        loss += criterion(questionRNN_output, target_tensor[i])
        questionRNN_input = target_tensor[i]

    loss.backward()

    documentRNN_optimizer.step()
    questionRNN_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


# Helper functions for training
def asMinutes(s):
    """Format a duration given in seconds as ``"<minutes>m <seconds>s"``.

    Both parts are rendered with ``%d``, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero.

    Returns:
        A string of the form ``"<elapsed> (- <remaining>)"``, both parts
        formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction completed so far.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (- %s)" % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
import random
from torch import optim


def trainIters(
    documentRNN,
    questionRNN,
    n_iters,
    print_every=1000,
    plot_every=100,
    learning_rate=0.01,
    nn_type="rnn",
):
    """Train on randomly drawn examples for ``n_iters`` steps.

    Reads ``n_data``, ``input_index`` and ``target_index`` from module scope.
    NOTE(review): these names are not defined in the visible part of this
    notebook - confirm they exist before running.

    Args:
        documentRNN: Encoder passed through to ``train``.
        questionRNN: Decoder passed through to ``train``.
        n_iters: Number of single-example training steps.
        print_every: Print the running average loss every this many steps.
        plot_every: Record the running average loss every this many steps.
        learning_rate: Learning rate for both AdamW optimizers.
        nn_type: Passed through to ``train``.

    Returns:
        The list of averaged losses recorded every ``plot_every`` steps. The
        list used to be built and then discarded; returning it lets the
        caller plot the curve.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    documentRNN_optimizer = optim.AdamW(documentRNN.parameters(), lr=learning_rate)
    questionRNN_optimizer = optim.AdamW(questionRNN.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    # `iteration` rather than `iter`, to avoid shadowing the builtin.
    for iteration in range(1, n_iters + 1):
        # Get a random index within the scope of input data
        random_choice_ix = random.choice(range(n_data))
        # Wrap every index in its own list so each step sees a length-1 row.
        input_index_r = [[ind] for ind in input_index[random_choice_ix]]
        target_index_r = [[ind] for ind in target_index[random_choice_ix]]

        input_tensor = torch.LongTensor(input_index_r).to(device)
        target_tensor = torch.LongTensor(target_index_r).to(device)

        loss = train(
            input_tensor,
            target_tensor,
            documentRNN,
            questionRNN,
            documentRNN_optimizer,
            questionRNN_optimizer,
            criterion,
            nn_type=nn_type,
        )
        print_loss_total += loss
        plot_loss_total += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(
                "%s (%d %d%%) %.4f"
                % (
                    timeSince(start, iteration / n_iters),
                    iteration,
                    iteration / n_iters * 100,
                    print_loss_avg,
                )
            )

        if iteration % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    return plot_losses

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode an output sequence for ``sentence``.

    Relies on module-level ``pre_process``, ``word_to_ix``, ``word_list`` and
    ``device``, and on ``MAX_LENGTH`` for the default argument.
    NOTE(review): none of ``pre_process``, ``word_to_ix``, ``word_list`` or
    ``MAX_LENGTH`` is defined in the visible part of this notebook - confirm
    they exist before running.

    Args:
        encoder: Encoder exposing ``hidden_size``, ``initHidden()`` and
            ``(x, hidden) -> (output, hidden)``.
        decoder: Decoder called as ``(x, hidden, encoder_hiddens)``.
        sentence: Raw input passed to ``pre_process``.
        max_length: Size of the encoder-state buffer and the maximum number
            of decoding steps.

    Returns:
        The list of decoded entries from ``word_list``; it ends with
        ``"<EOS>"`` when that token was produced before ``max_length`` steps.
    """
    with torch.no_grad():
        input_sent = pre_process([sentence])[0]
        sentence_indices = [word_to_ix[word] for word in input_sent]
        input_tensor = torch.LongTensor([[ind] for ind in sentence_indices]).to(device)

        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_hiddens = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            # LSTM state is (h_n, c_n); RNN/GRU return h_n directly. The old
            # indexing assumed LSTM and stored a broadcast scalar otherwise.
            h_n = (
                encoder_hidden[0]
                if isinstance(encoder_hidden, tuple)
                else encoder_hidden
            )
            encoder_hiddens[ei] = h_n[0, 0]

        decoder_input = torch.tensor([[word_to_ix["<BOS>"]]], device=device)  # SOS

        # The decoder starts from the final encoder state.
        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_hiddens
            )
            # simply adopt the prediction with the highest probability
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == word_to_ix["<EOS>"]:
                # if <EOS> is generated, stop the generation
                decoded_words.append("<EOS>")
                break
            else:
                # get the predicted word based on the index
                decoded_words.append(word_list[topi.item()])
            # use the predicted output as the input for the next time step generation
            decoder_input = topi.squeeze().detach()

        return decoded_words