In [59]:
import nltk

nltk.download("punkt")
from nltk import word_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
# reading in the data
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")

Extract the unique questions from the train and test data frames, including the documentID and the DocumentTitle


In [61]:
def get_questions_documenttag(data):
    qd = data[
        ["Question", "QuestionID", "DocumentID", "DocumentTitle"]
    ].drop_duplicates()
    return qd


train_question_doctag = get_questions_documenttag(train_data)
test_question_doctag = get_questions_documenttag(test_data)

In [62]:
# get unique questions
train_questions = train_question_doctag["Question"]
test_questions = test_question_doctag["Question"]

In [63]:
# get the unique document ids
train_docid = train_question_doctag["DocumentID"]
test_docid = test_question_doctag["DocumentID"]

Extract the answers to those questions.


In [64]:
def get_answers(data, questions, documentids):
    answers = []  # list of answers
    for q in range(len(questions)):
        question = questions.iloc[q]
        doc_id = documentids.iloc[q]  # add the document id
        df = data[data["Question"] == question]
        index = df.loc[df["Label"] == 1]["Sentence"].index.values
        if len(index) == 0:  # if no answer found
            answers.append([question, doc_id, "No answer"])
        else:  # if 1 answer found
            answers.append([question, doc_id, df.loc[index[0], "Sentence"]])
    return answers


train_answers = pd.DataFrame(get_answers(train_data, train_questions, train_docid))
test_answers = pd.DataFrame(get_answers(test_data, test_questions, test_docid))

The above get_answers returns train_answers and test_answers which, gives us in the following columns

-   Question
-   Related Document ID
-   Answer (if no answer to that question, return no answer)


In [65]:
def get_documents(data, questions, documentids):  # (done by Finn, tweaked by Dan)
    documents = []
    for q in range(len(questions)):
        question = questions.iloc[q]
        doc_id = documentids.iloc[q]  # add the document id
        df = data[data["Question"] == question]
        sentences = df["Sentence"].tolist()
        for i in range(0, len(sentences) - 1):
            sentences[i] = sentences[i] + " "
        documents.append([doc_id, "".join(sentences)])
    return documents


train_documents = pd.DataFrame(
    get_documents(train_data, train_questions, train_docid)
)  # return the individual document in list
test_documents = pd.DataFrame(
    get_documents(test_data, test_questions, test_docid)
)  # return the individual document in list

The above train_documents and test_documents called from the get_documents gives us in the following columns

-   Document ID
-   Full Document


In [66]:
# renaming all the columns for more standardised access
train_answers.columns = ["Question", "DocumentID", "Answer"]
test_answers.columns = ["Question", "DocumentID", "Answer"]
train_documents.columns = ["DocumentID", "Document"]
test_documents.columns = ["DocumentID", "Document"]

In [67]:
# result is 2117, 2117, 630, 630

len(train_answers), len(train_documents), len(test_answers), len(test_documents)

(2117, 2117, 630, 630)

**Prior to tagging, we should maybe clean the document and answers first:** (stopped here)

Maybe?

-   lowercase (might lose context, but we can use on questions)
-   removing any punctuation or weird symbols (do)
-   removal of stop words? (probably not)

Make sure that the pre-processing is standardised to be the same throughout doc and ans.


In [68]:
def preprocess_lower(text):
    # Lowercase the text for question, answer and documents
    text = text.lower()
    pattern = r"[^a-zA-Z0-9\s]"
    cleaned_text = re.sub(pattern, " ", text)
    return cleaned_text


train_answers[["Question", "Answer"]] = train_answers[["Question", "Answer"]].applymap(
    preprocess_lower
)
train_documents["Document"] = train_documents["Document"].apply(preprocess_lower)
test_answers[["Question", "Answer"]] = test_answers[["Question", "Answer"]].applymap(
    preprocess_lower
)
test_documents["Document"] = test_documents["Document"].apply(preprocess_lower)

In [69]:
train_documents

Unnamed: 0,DocumentID,Document
0,D1,a partly submerged glacier cave on perito more...
1,D2,in physics circular motion is a movement of ...
2,D5,apollo creed is a fictional character from the...
3,D6,in the united states the title of federal jud...
4,D7,the beretta 21a bobcat is a small pocket sized...
...,...,...
2112,D2805,blue mountain state is an american comedy seri...
2113,D2806,apple inc formerly apple computer inc is ...
2114,D2807,section 8 housing in the south bronx section 8...
2115,D2808,restaurants categorized by type and informatio...


In [70]:
def labelling(documents, answers):
    tagged_documents = []
    for q in range(len(answers)):
        tagged_document = []
        qn = answers["Question"].loc[q]
        doc_id = answers["DocumentID"].loc[q]
        content = documents.loc[documents["DocumentID"] == doc_id, "Document"].values[0]
        answer = answers["Answer"].loc[q]

        if answer == "no answer":
            tokens = word_tokenize(content)
            for j in range(len(tokens)):
                tagged_document.append("N")  # none
        else:
            parts = content.partition(answer)
            for j in range(len(parts)):
                tokens = word_tokenize(parts[j])
                if j == 1:
                    tagged_document.append("S")  # start of answer
                    for k in range(len(tokens) - 2):
                        tagged_document.append("I")  # inside of answer
                    tagged_document.append("E")  # end of answer
                else:
                    for k in range(len(tokens)):
                        tagged_document.append("N")  # outside answer
        tagged_documents.append(tagged_document)
    return tagged_documents


train_doc_ans_labels = labelling(train_documents, train_answers)
test_doc_ans_labels = labelling(test_documents, test_answers)

In [71]:
# check if tags are good
def testing_tokens(ind, labels, documents, answers):
    for i, j in zip(labels[ind], word_tokenize(documents["Document"][ind])):
        print([i, j])
    print(answers["Answer"][ind])


testing_tokens(100, train_doc_ans_labels, train_documents, train_answers)

['N', 'the']
['N', 'big']
['N', 'ten']
['N', 'conference']
['N', 'formerly']
['N', 'western']
['N', 'conference']
['N', 'and']
['N', 'big']
['N', 'nine']
['N', 'conference']
['N', 'is']
['N', 'the']
['N', 'oldest']
['N', 'division']
['N', 'i']
['N', 'college']
['N', 'athletic']
['N', 'conference']
['N', 'in']
['N', 'the']
['N', 'united']
['N', 'states']
['S', 'its']
['I', 'twelve']
['I', 'member']
['I', 'institutions']
['I', 'which']
['I', 'are']
['I', 'primarily']
['I', 'flagship']
['I', 'research']
['I', 'universities']
['I', 'in']
['I', 'their']
['I', 'respective']
['I', 'states']
['I', 'well']
['I', 'regarded']
['I', 'academically']
['I', 'and']
['I', 'with']
['I', 'relatively']
['I', 'large']
['I', 'student']
['I', 'enrollment']
['I', 'are']
['I', 'located']
['I', 'primarily']
['I', 'in']
['I', 'the']
['I', 'midwest']
['I', 'stretching']
['I', 'from']
['I', 'nebraska']
['I', 'in']
['I', 'the']
['I', 'west']
['I', 'to']
['I', 'penn']
['I', 'state']
['I', 'in']
['I', 'the']
['E', 'e

Cleaned Documents: train and test

train_answers - contains the ['Question','DocumentID','Answer']

train_documents - contains the ['DocumentID','Document']

train_doc_ans_labels - contains a list of list of answer tags for each document,


In [72]:
# To prepare the document for word embeddings:
train_doc_ques = pd.DataFrame(
    {"Document": train_documents["Document"], "Question": train_answers["Question"]}
)
test_doc_ques = pd.DataFrame(
    {"Document": test_documents["Document"], "Question": test_answers["Question"]}
)

### Word Embeddings

To use the CBOW model, we need the data in sentences. Extract this from the original dataset, don't use sent_tokenise, will mess with some of the fullstops, we want to maintain structure from above


In [73]:
def word_tokens(data):
    sentence_list = []
    for i in range(len(data)):
        sentence_list.append(word_tokenize(data[i]))
    return sentence_list


train_doc_list = word_tokens(train_doc_ques["Document"])
train_ques_list = word_tokens(train_doc_ques["Question"])
test_doc_list = word_tokens(test_doc_ques["Document"])
test_ques_list = word_tokens(test_doc_ques["Question"])

In [74]:
combined_text = train_doc_list + train_ques_list + test_doc_list + test_ques_list

In [75]:
# model trained, don't have to run this multiple times
wc_cbow_model = Word2Vec(
    sentences=combined_text,
    vector_size=100,
    window=5,
    min_count=1,
    workers=2,
    epochs=30,
)
wc_cbow_model.save("cbow.model")

To implement QA

1. Word Embeddings, using CBOW
2. Feature Extraction 1 - POS tags
3. Feature Extraction 2 - TF-IDF
4. Feature Extraction 3 - NER


In [76]:
def get_word_embeddings(doc):
    tokenized_doc = word_tokenize(doc)
    embeddings = [wc_cbow_model.wv[word] for word in tokenized_doc]
    return embeddings


train_doc_ques["Doc_Embeddings"] = train_doc_ques["Document"].apply(get_word_embeddings)
train_doc_ques["Q_Embeddings"] = train_doc_ques["Question"].apply(get_word_embeddings)
test_doc_ques["Doc_Embeddings"] = test_doc_ques["Document"].apply(get_word_embeddings)
test_doc_ques["Q_Embeddings"] = test_doc_ques["Question"].apply(get_word_embeddings)

In [77]:
train_doc_ques["Doc_Tokens"] = train_doc_ques["Document"].apply(word_tokenize)
train_doc_ques["Q_Tokens"] = train_doc_ques["Question"].apply(word_tokenize)
test_doc_ques["Doc_Tokens"] = test_doc_ques["Document"].apply(word_tokenize)
test_doc_ques["Q_Tokens"] = test_doc_ques["Question"].apply(word_tokenize)

In [78]:
def check_count(doc):
    count = 0
    for i in range(len(doc)):
        if len(doc["Doc_Embeddings"][i]) != len(doc["Doc_Tokens"][i]):
            count += 1
        elif len(doc["Q_Embeddings"][i]) != len(doc["Q_Tokens"][i]):
            count += 1
        else:
            continue
    return count


check_count(train_doc_ques)  # looks good

0

Note, need to convert the POS tags, NER tags into embeddings. After this, pad the questions and answers to the max question/document length in the combined training and test set.

### PoS Tagging


In [79]:
# Apply the pos tags to the tokens
from nltk.tag import pos_tag

# download the dependency and resource as required
nltk.download("averaged_perceptron_tagger")

train_doc_ques["Doc_POS"] = train_doc_ques["Doc_Tokens"].apply(pos_tag)
train_doc_ques["Q_POS"] = train_doc_ques["Q_Tokens"].apply(pos_tag)
test_doc_ques["Doc_POS"] = test_doc_ques["Doc_Tokens"].apply(pos_tag)
test_doc_ques["Q_POS"] = test_doc_ques["Q_Tokens"].apply(pos_tag)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [80]:
# checking the POS tags: # looks ok
test_doc_ques["Q_POS"][0]

[('how', 'WRB'),
 ('african', 'JJ'),
 ('americans', 'NNS'),
 ('were', 'VBD'),
 ('immigrated', 'VBN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('us', 'PRP')]

In [81]:
# Extract all unique POS Tags
all_pos_tags = (
    train_doc_ques["Doc_POS"].tolist()
    + test_doc_ques["Doc_POS"].tolist()
    + train_doc_ques["Q_POS"].tolist()
    + test_doc_ques["Q_POS"].tolist()
)


def get_unique_pos(data):
    pos_tags = set()
    for item in data:
        for _, pos_tag in item:
            pos_tags.add(pos_tag)

    pos_tag_index = {tag: i for i, tag in enumerate(sorted(pos_tags))}
    return pos_tag_index


pos_iden = get_unique_pos(all_pos_tags)  # list of tags
pos_iden

{'$': 0,
 'CC': 1,
 'CD': 2,
 'DT': 3,
 'EX': 4,
 'FW': 5,
 'IN': 6,
 'JJ': 7,
 'JJR': 8,
 'JJS': 9,
 'MD': 10,
 'NN': 11,
 'NNP': 12,
 'NNPS': 13,
 'NNS': 14,
 'PDT': 15,
 'POS': 16,
 'PRP': 17,
 'PRP$': 18,
 'RB': 19,
 'RBR': 20,
 'RBS': 21,
 'RP': 22,
 'SYM': 23,
 'TO': 24,
 'UH': 25,
 'VB': 26,
 'VBD': 27,
 'VBG': 28,
 'VBN': 29,
 'VBP': 30,
 'VBZ': 31,
 'WDT': 32,
 'WP': 33,
 'WP$': 34,
 'WRB': 35}

### NER Tagging


### Steps to run this:

-   pip install spacy
-   python -m spacy download en_core_web_sm

If loaded for the first time, restart kernel


In [82]:
# nltk using Spacy
# pip install -U spacy
!pip install -U spacy
!python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

# loading pre-trained model of NER
nlp = en_core_web_sm.load()

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 5.3 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [83]:
def ner_tagging(texts):
    tagged_texts = []
    for text in texts:
        doc = spacy.tokens.Doc(nlp.vocab, words=text)
        nlp.get_pipe("ner")(doc)
        tagged_texts.append([(token.text, token.ent_type_) for token in doc])
    return tagged_texts

In [84]:
# Will take a while...
train_doc_ques["Doc_NER"] = ner_tagging(train_doc_ques["Doc_Tokens"])
train_doc_ques["Q_NER"] = ner_tagging(train_doc_ques["Q_Tokens"])
test_doc_ques["Doc_NER"] = ner_tagging(test_doc_ques["Doc_Tokens"])
test_doc_ques["Q_NER"] = ner_tagging(test_doc_ques["Q_Tokens"])

In [85]:
# Similar approach to the POS

# Extract all unique POS Tags
all_ner_tags = (
    train_doc_ques["Doc_NER"].tolist()
    + test_doc_ques["Doc_NER"].tolist()
    + train_doc_ques["Q_NER"].tolist()
    + test_doc_ques["Q_NER"].tolist()
)


def get_unique_ner(data):
    ner_tags = set()
    for item in data:
        for _, ner_tag in item:
            ner_tags.add(ner_tag)

    ner_tag_index = {tag: i for i, tag in enumerate(sorted(ner_tags))}
    return ner_tag_index


ner_iden = get_unique_pos(all_ner_tags)  # list of tags
ner_iden

{'': 0,
 'CARDINAL': 1,
 'DATE': 2,
 'EVENT': 3,
 'FAC': 4,
 'GPE': 5,
 'LANGUAGE': 6,
 'LAW': 7,
 'LOC': 8,
 'MONEY': 9,
 'NORP': 10,
 'ORDINAL': 11,
 'ORG': 12,
 'PERCENT': 13,
 'PERSON': 14,
 'PRODUCT': 15,
 'QUANTITY': 16,
 'TIME': 17,
 'WORK_OF_ART': 18}

In [86]:
# check ohv dims
ner_idx = ner_iden.values()
aa = np.eye(max(ner_idx) + 1)
# aa

### TF-IDF

First, calculate the document frequency of each token in the entire corpus (training documents + testing documents). The result is a dictionary where each token is a key and its value is the document frequency.


In [87]:
def document_frequency(corpus):
    """
    Computes the document frequency for every token in the corpus.
    Returns a dictionary {token: doc_freq, ...}
    """
    document_frequency = {}
    for document in corpus:
        for token in np.unique(document):
            try:
                document_frequency[token] += 1
            except:
                document_frequency[token] = 1
    return document_frequency


train_corpus = (
    train_doc_ques["Doc_Tokens"].tolist() + train_doc_ques["Q_Tokens"].tolist()
)
test_corpus = test_doc_ques["Doc_Tokens"].tolist() + test_doc_ques["Q_Tokens"].tolist()
train_doc_freq = document_frequency(train_corpus)
test_doc_freq = document_frequency(test_corpus)

Now calculate TF-IDF using the document frequency from above.


In [88]:
from collections import Counter
import math


def compute_tf_idf(corpus, doc_frequency):
    """
    Computes the term frequency inverse document frequency for every token in every document in the corpus.
    Returns a list the same shape as the list of tokenized documents except every token is replaced with the tf-idf
    for that token.
    """
    tf_idf = {}
    tf_idf_list = []
    N = len(doc_frequency)
    doc_id = 0
    for document in corpus:
        tf_idf_doc = []
        counter = Counter(document)
        total_num_words = len(document)
        for token in np.unique(document):
            tf = counter[token] / total_num_words
            df = doc_frequency[token]
            idf = math.log(N / (df + 1)) + 1
            tf_idf[doc_id, token] = tf * idf
        for token in document:
            tf_idf_doc.append(tf_idf[doc_id, token])
        tf_idf_list.append(tf_idf_doc)
        doc_id += 1
    return tf_idf_list


train_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    train_doc_ques["Doc_Tokens"].tolist(), train_doc_freq
)
train_doc_ques["Q_TFIDF"] = compute_tf_idf(
    train_doc_ques["Q_Tokens"].tolist(), train_doc_freq
)
test_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    test_doc_ques["Doc_Tokens"].tolist(), test_doc_freq
)
test_doc_ques["Q_TFIDF"] = compute_tf_idf(
    test_doc_ques["Q_Tokens"].tolist(), test_doc_freq
)

In [89]:
test_doc_ques

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER,Doc_TFIDF,Q_TFIDF
0,african immigration to the united states refer...,how african americans were immigrated to the us,"[[1.3303403, 0.22515363, -0.3840666, 0.4854631...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[african, immigration, to, the, united, states...","[how, african, americans, were, immigrated, to...","[(african, JJ), (immigration, NN), (to, TO), (...","[(how, WRB), (african, JJ), (americans, NNS), ...","[(african, ORG), (immigration, ORG), (to, ), (...","[(how, ), (african, NORP), (americans, NORP), ...","[0.2444438957631187, 0.16900818614783913, 0.28...","[0.7085526392283996, 0.934997901293929, 0.9907..."
1,a prison from old french prisoun also known...,how large were early jails,"[[1.2888881, -0.7444254, -2.579834, 0.12156842...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[a, prison, from, old, french, prisoun, also, ...","[how, large, were, early, jails]","[(a, DT), (prison, NN), (from, IN), (old, JJ),...","[(how, WRB), (large, JJ), (were, VBD), (early,...","[(a, ), (prison, ), (from, ), (old, ), (french...","[(how, ), (large, ), (were, ), (early, ), (jai...","[0.23736000397378015, 0.15055751080418858, 0.0...","[1.1336842227654393, 1.2988332831657334, 1.145..."
2,a small electrically powered pump a large el...,how a water pump works,"[[1.2888881, -0.7444254, -2.579834, 0.12156842...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[a, small, electrically, powered, pump, a, lar...","[how, a, water, pump, works]","[(a, DT), (small, JJ), (electrically, RB), (po...","[(how, WRB), (a, DT), (water, NN), (pump, NN),...","[(a, ), (small, ), (electrically, ), (powered,...","[(how, ), (a, ), (water, ), (pump, ), (works, )]","[0.18179077227423296, 0.07355471089668812, 0.1...","[1.1336842227654393, 0.8362375524614717, 1.412..."
3,lolita is a 1962 comedy drama film by stanley ...,how old was sue lyon when she made lolita,"[[-0.15959537, 0.0957754, 0.20302504, 0.224813...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[lolita, is, a, 1962, comedy, drama, film, by,...","[how, old, was, sue, lyon, when, she, made, lo...","[(lolita, NN), (is, VBZ), (a, DT), (1962, CD),...","[(how, WRB), (old, JJ), (was, VBD), (sue, NN),...","[(lolita, ), (is, ), (a, ), (1962, DATE), (com...","[(how, ), (old, ), (was, ), (sue, PERSON), (ly...","[0.22677748220200447, 0.031618095192546124, 0....","[0.6298245682030218, 0.7657996161610346, 0.528..."
4,each antibody binds to a specific antigen an...,how are antibodies used in,"[[1.2710228, 1.8379426, -3.4153419, 2.465204, ...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[each, antibody, binds, to, a, specific, antig...","[how, are, antibodies, used, in]","[(each, DT), (antibody, NN), (binds, VBZ), (to...","[(how, WRB), (are, VBP), (antibodies, NNS), (u...","[(each, ), (antibody, ), (binds, ), (to, ), (a...","[(how, ), (are, ), (antibodies, ), (used, ), (...","[0.05405618374532114, 0.2406872267443911, 0.01...","[1.1336842227654393, 0.9670455719243073, 1.862..."
...,...,...,...,...,...,...,...,...,...,...,...,...
625,american cuts of beef including the brisket br...,where is the brisket from,"[[-2.5060985, 0.39937317, -0.9950006, -0.47873...","[[-1.30143, -0.89548033, 0.50666285, 0.6961141...","[american, cuts, of, beef, including, the, bri...","[where, is, the, brisket, from]","[(american, JJ), (cuts, NNS), (of, IN), (beef,...","[(where, WRB), (is, VBZ), (the, DT), (brisket,...","[(american, NORP), (cuts, ), (of, ), (beef, ),...","[(where, ), (is, ), (the, ), (brisket, ), (fro...","[0.044673596326364536, 0.15610157311465894, 0....","[1.1856348821507936, 0.8030996178906715, 0.792..."
626,the arm architecture describes a family of ris...,what is arm chipset,"[[0.31549832, -1.1028156, 0.07097474, -0.33913...","[[-0.9207317, 0.04587417, -1.906337, -1.993331...","[the, arm, architecture, describes, a, family,...","[what, is, arm, chipset]","[(the, DT), (arm, NN), (architecture, NN), (de...","[(what, WP), (is, VBZ), (arm, JJ), (chipset, NN)]","[(the, ), (arm, ), (architecture, ), (describe...","[(what, ), (is, ), (arm, ), (chipset, )]","[0.13240256299185374, 0.2925150013371109, 0.06...","[1.200426291895723, 1.0038745223633394, 2.1882..."
627,june bug or junebug may refer to beetles phy...,what is the life span of june bugs,"[[-3.6015184, 0.8126563, 0.32476482, 1.9092256...","[[-0.9207317, 0.04587417, -1.906337, -1.993331...","[june, bug, or, junebug, may, refer, to, beetl...","[what, is, the, life, span, of, june, bugs]","[(june, NN), (bug, NN), (or, CC), (junebug, NN...","[(what, WP), (is, VBZ), (the, DT), (life, NN),...","[(june, DATE), (bug, ), (or, ), (junebug, ), (...","[(what, ), (is, ), (the, ), (life, ), (span, )...","[0.3625851507468464, 0.345024546712403, 0.0560...","[0.6002131459478615, 0.5019372611816697, 0.495..."
628,this is a list of known biological mothers und...,who is the youngest female to give birth world...,"[[-1.3481425, 0.89886665, -9.846613, 1.9082732...","[[-2.9414575, 2.8251827, 0.50508225, 0.0651849...","[this, is, a, list, of, known, biological, mot...","[who, is, the, youngest, female, to, give, bir...","[(this, DT), (is, VBZ), (a, DT), (list, NN), (...","[(who, WP), (is, VBZ), (the, DT), (youngest, J...","[(this, ), (is, ), (a, ), (list, ), (of, ), (k...","[(who, ), (is, ), (the, ), (youngest, ), (fema...","[0.4279277383207209, 0.3088844684194891, 0.321...","[0.532822100709197, 0.40154980894533576, 0.396..."


In [90]:
train_doc_ques

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER,Doc_TFIDF,Q_TFIDF
0,a partly submerged glacier cave on perito more...,how are glacier caves formed,"[[1.2888881, -0.7444254, -2.579834, 0.12156842...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[a, partly, submerged, glacier, cave, on, peri...","[how, are, glacier, caves, formed]","[(a, DT), (partly, RB), (submerged, VBN), (gla...","[(how, WRB), (are, VBP), (glacier, JJ), (caves...","[(a, ), (partly, ), (submerged, ), (glacier, )...","[(how, ), (are, ), (glacier, ), (caves, ), (fo...","[0.24679288919367473, 0.145118630440616, 0.161...","[1.0342546228402965, 0.8500722992276416, 1.989..."
1,in physics circular motion is a movement of ...,how are the directions of the velocity and for...,"[[-1.0684596, 0.33509094, -2.0596101, -0.38476...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[in, physics, circular, motion, is, a, movemen...","[how, are, the, directions, of, the, velocity,...","[(in, IN), (physics, NNS), (circular, JJ), (mo...","[(how, WRB), (are, VBP), (the, DT), (direction...","[(in, ), (physics, ), (circular, ), (motion, )...","[(how, ), (are, ), (the, ), (directions, ), (o...","[0.10158600494541081, 0.047978500911022494, 0....","[0.34475154094676547, 0.2833574330758805, 0.45..."
2,apollo creed is a fictional character from the...,how did apollo creed die,"[[-0.2523032, 0.25388047, 0.37638307, 0.035796...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[apollo, creed, is, a, fictional, character, f...","[how, did, apollo, creed, die]","[(apollo, NNS), (creed, VBP), (is, VBZ), (a, D...","[(how, WRB), (did, VBD), (apollo, VB), (creed,...","[(apollo, ORG), (creed, ), (is, ), (a, ), (fic...","[(how, ), (did, ), (apollo, ORG), (creed, ), (...","[0.1506184153725464, 0.2875274053249442, 0.057...","[1.0342546228402965, 1.1571023666681808, 1.787..."
3,in the united states the title of federal jud...,how long is the term for federal judges,"[[-1.0684596, 0.33509094, -2.0596101, -0.38476...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[in, the, united, states, the, title, of, fede...","[how, long, is, the, term, for, federal, judges]","[(in, IN), (the, DT), (united, JJ), (states, V...","[(how, WRB), (long, JJ), (is, VBZ), (the, DT),...","[(in, ), (the, GPE), (united, GPE), (states, G...","[(how, ), (long, ), (is, ), (the, ), (term, ),...","[0.08829203392786322, 0.3204713483897852, 0.16...","[0.6464091392751853, 0.7778250162349475, 0.429..."
4,the beretta 21a bobcat is a small pocket sized...,how a beretta model 21 pistols magazines works,"[[0.31549832, -1.1028156, 0.07097474, -0.33913...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[the, beretta, 21a, bobcat, is, a, small, pock...","[how, a, beretta, model, 21, pistols, magazine...","[(the, DT), (beretta, NN), (21a, CD), (bobcat,...","[(how, WRB), (a, DT), (beretta, NN), (model, N...","[(the, ), (beretta, ), (21a, ), (bobcat, ), (i...","[(how, ), (a, ), (beretta, PRODUCT), (model, )...","[0.21161559146390707, 0.8290244630930195, 0.22...","[0.6464091392751853, 0.4550243894508378, 1.243..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2112,blue mountain state is an american comedy seri...,where was blue mountain state filmed at,"[[-0.85282344, 0.19758348, 0.6617862, -0.82276...","[[-1.30143, -0.89548033, 0.50666285, 0.6961141...","[blue, mountain, state, is, an, american, come...","[where, was, blue, mountain, state, filmed, at]","[(blue, JJ), (mountain, NN), (state, NN), (is,...","[(where, WRB), (was, VBD), (blue, JJ), (mounta...","[(blue, LOC), (mountain, LOC), (state, ), (is,...","[(where, ), (was, ), (blue, ), (mountain, ), (...","[0.29476307406437613, 0.3823599012669257, 0.22...","[0.7503384296942396, 0.5907013675064214, 1.063..."
2113,apple inc formerly apple computer inc is ...,when was apple computer founded,"[[0.22615717, 0.085470006, 0.5799774, -0.70327...","[[-3.690818, -2.3945823, -0.99455154, 0.935833...","[apple, inc, formerly, apple, computer, inc, i...","[when, was, apple, computer, founded]","[(apple, NN), (inc, VBP), (formerly, RB), (app...","[(when, WRB), (was, VBD), (apple, NN), (comput...","[(apple, ORG), (inc, ORG), (formerly, ORG), (a...","[(when, ), (was, ), (apple, ), (computer, ), (...","[0.26158075612861253, 0.089435411305154, 0.029...","[0.9915721940674992, 0.8269819145089902, 1.700..."
2114,section 8 housing in the south bronx section 8...,what is section eight housing,"[[1.1065178, 0.39792594, -0.44520676, 1.643015...","[[-0.9207317, 0.04587417, -1.906337, -1.993331...","[section, 8, housing, in, the, south, bronx, s...","[what, is, section, eight, housing]","[(section, NN), (8, CD), (housing, NN), (in, I...","[(what, WP), (is, VBZ), (section, NN), (eight,...","[(section, LAW), (8, LAW), (housing, ), (in, )...","[(what, ), (is, ), (section, ), (eight, CARDIN...","[0.19207867615144178, 0.16090249029058593, 0.2...","[0.8426670929983164, 0.6879048840289103, 1.514..."
2115,restaurants categorized by type and informatio...,what is the main type of restaurant,"[[0.27395007, 0.8356683, 0.27677092, 0.7877626...","[[-0.9207317, 0.04587417, -1.906337, -1.993331...","[restaurants, categorized, by, type, and, info...","[what, is, the, main, type, of, restaurant]","[(restaurants, NNS), (categorized, VBN), (by, ...","[(what, WP), (is, VBZ), (the, DT), (main, JJ),...","[(restaurants, ), (categorized, ), (by, ), (ty...","[(what, ), (is, ), (the, ), (main, ), (type, )...","[0.8769638560774586, 0.9255146376556289, 0.404...","[0.6019050664273689, 0.4913606314492216, 0.483..."


In [91]:
def one_hot_vectorize(
    pos_tagger, ner_tagger, data
):  # pass in the unique dict for ner or pos
    pos_idx = pos_tagger.values()
    pos_ohv = np.eye(max(pos_idx) + 1)  # create the ohv
    ner_idx = ner_tagger.values()
    ner_ohv = np.eye(max(ner_idx) + 1)

    dpos_full_ohv, dner_full_ohv = [], []  # lists to append to
    qpos_full_ohv, qner_full_ohv = [], []  # lists to append to

    for item in data["Doc_POS"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            pos_index_iden = pos_tagger[tag]
            sent_ohv.append(pos_ohv[pos_index_iden])
        dpos_full_ohv.append(sent_ohv)

    for item in data["Q_POS"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            pos_index_iden = pos_tagger[tag]
            sent_ohv.append(pos_ohv[pos_index_iden])
        qpos_full_ohv.append(sent_ohv)

    for item in data["Doc_NER"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            ner_index_iden = ner_tagger[tag]
            sent_ohv.append(ner_ohv[ner_index_iden])
        dner_full_ohv.append(sent_ohv)

    for item in data["Q_NER"]:
        sent_ohv = []
        for word in item:
            tag = word[1]
            ner_index_iden = ner_tagger[tag]
            sent_ohv.append(ner_ohv[ner_index_iden])
        qner_full_ohv.append(sent_ohv)

    return (dpos_full_ohv, qpos_full_ohv, dner_full_ohv, qner_full_ohv)

In [92]:
# get the ohv for doc
(
    train_doc_pos_ohv,
    train_q_pos_ohv,
    train_doc_ner_ohv,
    train_q_ner_ohv,
) = one_hot_vectorize(pos_iden, ner_iden, train_doc_ques)
test_doc_pos_ohv, test_q_pos_ohv, test_doc_ner_ohv, test_q_ner_ohv = one_hot_vectorize(
    pos_iden, ner_iden, test_doc_ques
)

In [93]:
train_doc_ques[:5]

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER,Doc_TFIDF,Q_TFIDF
0,a partly submerged glacier cave on perito more...,how are glacier caves formed,"[[1.2888881, -0.7444254, -2.579834, 0.12156842...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[a, partly, submerged, glacier, cave, on, peri...","[how, are, glacier, caves, formed]","[(a, DT), (partly, RB), (submerged, VBN), (gla...","[(how, WRB), (are, VBP), (glacier, JJ), (caves...","[(a, ), (partly, ), (submerged, ), (glacier, )...","[(how, ), (are, ), (glacier, ), (caves, ), (fo...","[0.24679288919367473, 0.145118630440616, 0.161...","[1.0342546228402965, 0.8500722992276416, 1.989..."
1,in physics circular motion is a movement of ...,how are the directions of the velocity and for...,"[[-1.0684596, 0.33509094, -2.0596101, -0.38476...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[in, physics, circular, motion, is, a, movemen...","[how, are, the, directions, of, the, velocity,...","[(in, IN), (physics, NNS), (circular, JJ), (mo...","[(how, WRB), (are, VBP), (the, DT), (direction...","[(in, ), (physics, ), (circular, ), (motion, )...","[(how, ), (are, ), (the, ), (directions, ), (o...","[0.10158600494541081, 0.047978500911022494, 0....","[0.34475154094676547, 0.2833574330758805, 0.45..."
2,apollo creed is a fictional character from the...,how did apollo creed die,"[[-0.2523032, 0.25388047, 0.37638307, 0.035796...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[apollo, creed, is, a, fictional, character, f...","[how, did, apollo, creed, die]","[(apollo, NNS), (creed, VBP), (is, VBZ), (a, D...","[(how, WRB), (did, VBD), (apollo, VB), (creed,...","[(apollo, ORG), (creed, ), (is, ), (a, ), (fic...","[(how, ), (did, ), (apollo, ORG), (creed, ), (...","[0.1506184153725464, 0.2875274053249442, 0.057...","[1.0342546228402965, 1.1571023666681808, 1.787..."
3,in the united states the title of federal jud...,how long is the term for federal judges,"[[-1.0684596, 0.33509094, -2.0596101, -0.38476...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[in, the, united, states, the, title, of, fede...","[how, long, is, the, term, for, federal, judges]","[(in, IN), (the, DT), (united, JJ), (states, V...","[(how, WRB), (long, JJ), (is, VBZ), (the, DT),...","[(in, ), (the, GPE), (united, GPE), (states, G...","[(how, ), (long, ), (is, ), (the, ), (term, ),...","[0.08829203392786322, 0.3204713483897852, 0.16...","[0.6464091392751853, 0.7778250162349475, 0.429..."
4,the beretta 21a bobcat is a small pocket sized...,how a beretta model 21 pistols magazines works,"[[0.31549832, -1.1028156, 0.07097474, -0.33913...","[[-1.7915696, 0.65887636, -1.6675124, 2.976735...","[the, beretta, 21a, bobcat, is, a, small, pock...","[how, a, beretta, model, 21, pistols, magazine...","[(the, DT), (beretta, NN), (21a, CD), (bobcat,...","[(how, WRB), (a, DT), (beretta, NN), (model, N...","[(the, ), (beretta, ), (21a, ), (bobcat, ), (i...","[(how, ), (a, ), (beretta, PRODUCT), (model, )...","[0.21161559146390707, 0.8290244630930195, 0.22...","[0.6464091392751853, 0.4550243894508378, 1.243..."


In [94]:
print(train_doc_ques["Document"].head(1))

0    a partly submerged glacier cave on perito more...
Name: Document, dtype: object


In [95]:
# reduce the dataframe to just tokens and embeddings:
doc_emb_train = train_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": train_doc_pos_ohv, "Doc_NER": train_doc_ner_ohv})
doc_emb_train = pd.concat([doc_emb_train, doc_pos_ner], axis=1)

q_emb_train = train_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": train_q_pos_ohv, "Q_NER": train_q_ner_ohv})
q_emb_train = pd.concat([q_emb_train, q_pos_ner], axis=1)

In [96]:
doc_emb_test = test_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": test_doc_pos_ohv, "Doc_NER": test_doc_ner_ohv})
doc_emb_test = pd.concat([doc_emb_test, doc_pos_ner], axis=1)

q_emb_test = test_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": test_q_pos_ohv, "Q_NER": test_q_ner_ohv})
q_emb_test = pd.concat([q_emb_test, q_pos_ner], axis=1)

### Word Embeddings (Doc and Qn)

The embeddings of the questions and answers of the train and test set can be found here:

-   Train Document - doc_emb_train
-   Train Q - q_emb_train
-   Test Document - doc_emb_test
-   Test Q - q_emb_test

The max_document size is 1675 and max_question size is 23.


In [97]:
doc_emb_train.loc[0]

Doc_Tokens        [a, partly, submerged, glacier, cave, on, peri...
Doc_Embeddings    [[1.2888881, -0.7444254, -2.579834, 0.12156842...
Doc_TFIDF         [0.24679288919367473, 0.145118630440616, 0.161...
Doc_POS           [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Doc_NER           [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: 0, dtype: object

In [98]:
len(mini.iloc[2])

NameError: name 'mini' is not defined

In [99]:
len(doc_emb_train)

2117

In [100]:
def full_array(data, data_type="Document"):
    num_vec_length = 156
    max_doc = 1675
    max_qn = 23
    zero_vec = np.zeros(156)

    if data_type == "Document":
        full_vec = []  # create a list for list of list for document
        for dat in range(len(data)):  # go through each line
            doc_ques = data.loc[dat]  # document data
            v = []  # create list to each word
            for j in range(len(doc_ques.iloc[0])):
                vn = []  # list of concat word embeddings
                vn.append(doc_ques.iloc[1][j].tolist())
                vn.append(doc_ques.iloc[2][j])
                vn.append(doc_ques.iloc[3][j].tolist())
                vn.append(doc_ques.iloc[4][j].tolist())
                flatten = [
                    item
                    for sublist in vn
                    for item in (sublist if isinstance(sublist, list) else [sublist])
                ]
                v.append(flatten)
            while len(v) < max_doc:
                v.append(zero_vec)
            full_vec.append(v)

    if data_type == "Question":
        full_vec = []  # create a list for list of list for document
        for dat in range(len(data)):  # go through each line
            doc_ques = data.loc[dat]  # document data
            v = []  # create list to each word
            for j in range(len(doc_ques.iloc[0])):
                vn = []  # list of concat word embeddings
                vn.append(doc_ques.iloc[1][j].tolist())
                vn.append(doc_ques.iloc[2][j])
                vn.append(doc_ques.iloc[3][j].tolist())
                vn.append(doc_ques.iloc[4][j].tolist())
                flatten = [
                    item
                    for sublist in vn
                    for item in (sublist if isinstance(sublist, list) else [sublist])
                ]
                v.append(flatten)
            while len(v) < max_qn:
                v.append(zero_vec)
            full_vec.append(v)
    return full_vec

In [101]:
# Training/Test Documents to pass in, takes about a min
final_doc_train = full_array(doc_emb_train, data_type="Document")
final_doc_test = full_array(doc_emb_test, data_type="Document")

In [102]:
# Training/Test Questions to pass in, takes about a few seconds
final_qn_train = full_array(q_emb_train, data_type="Question")
final_qn_test = full_array(q_emb_test, data_type="Question")

### Converting into Tensors:


In [103]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [104]:
# takes a min
tf_final_doc_train = torch.tensor(final_doc_train, device=device)
tf_final_doc_test = torch.tensor(final_doc_test, device=device)
tf_final_qn_train = torch.tensor(final_qn_train, device=device)
tf_final_qn_test = torch.tensor(final_qn_test, device=device)

In [105]:
# check dimensions
print(tf_final_doc_train.shape)
print(tf_final_doc_test.shape)
print(tf_final_qn_train.shape)
print(tf_final_qn_test.shape)

torch.Size([2117, 1675, 156])
torch.Size([630, 1675, 156])
torch.Size([2117, 23, 156])
torch.Size([630, 23, 156])


In [119]:
print(len(torch.unique(tf_final_qn_train)))

405806


In [106]:
print(final_doc_test[0])

[[1.3303402662277222, 0.2251536250114441, -0.3840666115283966, 0.4854631721973419, 0.5468978881835938, -1.7246865034103394, 2.2640626430511475, 0.7793048024177551, 0.2848823666572571, -1.7108184099197388, -1.9333291053771973, -2.212942123413086, 0.7297947406768799, -0.0034556263126432896, -0.027706703171133995, -1.490108609199524, -0.015787087380886078, -1.7721045017242432, -0.28272780776023865, 1.3624590635299683, -0.7380390763282776, 3.216907024383545, -0.3150143325328827, -2.203427314758301, 0.9360759854316711, -0.8896309733390808, 1.2138572931289673, 0.8713009357452393, 0.801731526851654, -1.2492905855178833, -0.9358506798744202, -1.3816291093826294, 1.1662856340408325, -0.8822349905967712, -0.5917984843254089, 0.7044068574905396, -1.0778135061264038, 1.0111157894134521, -0.5077717304229736, 0.8377698063850403, -1.9014887809753418, 0.5850471258163452, 0.5284187197685242, 0.1089165136218071, -0.039318591356277466, 3.2666895389556885, -1.2216646671295166, -0.10724107176065445, -0.772

In [107]:
print(1 in tf_final_doc_train)

True


### Model


### Training


In [108]:
MAX_DOC_LENGTH = 1675  # Max doc length
MAX_QN_LENGTH = 23  # Max question length

In [109]:
import time
import math


# Helper functions for training
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))

In [110]:
import torch
from torch import Tensor
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class DocumentBiRNN(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        embedding: nn.Embedding,
        num_layers=1,
    ):
        super(DocumentBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, input: nn.Embedding, hidden: Tensor):
        embedded: Tensor = self.embedding(input).view(1, 1, -1)
        output: Tensor
        output, _ = self.lstm(embedded, hidden)
        return output

    def initHidden(self):
        return (
            torch.zeros(2, 1, self.hidden_size, device=device),
            torch.zeros(2, 1, self.hidden_size, device=device),
        )

In [111]:
import torch
from torch import Tensor
import torch.nn as nn
from typing import Literal
from enum import Enum
import torch.nn.functional as F
import numpy as np
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class AttentionMethod(Enum):
    DOT_PRODUCT = "dot_product"
    SCALE_DOT_PRODUCT = "scale_dot_product"
    COSINE_SIMILARITY = "cosine_similarity"

    def __str__(self):
        return self.value


class QuestionBiRNN(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        output_size: int,
        embedding: nn.Embedding,
        max_length: int,
        num_layers=1,
        attention_method: Literal[
            "dot_product",
            "scale_dot_product",
            "cosine_similarity",
        ] = "dot_product",
    ):
        super(QuestionBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.embedding = embedding
        self.attention_method = AttentionMethod(attention_method)
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )
        self.out = nn.Linear(self.hidden_size * 4, self.output_size)

    def forward(self, input: nn.Embedding, hidden: Tensor, document_outputs: Tensor):
        embedded: Tensor = self.embedding(input).view(1, 1, -1)
        output: Tensor
        # use the last hidden state
        _, (hidden, _) = self.lstm(embedded, hidden)
        # Concatenate the last hidden state from the forward and backward LSTMs for the question
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        # Pass the concatenated hidden state through the fully connected layer to obtain the summary of the question
        summary = self.out(hidden)
        return summary
        # attn_scores = self.cal_attention(summary, document_outputs)
        # output = F.softmax(attn_scores, dim=-1)
        # return output

    def initHidden(self):
        return (
            torch.zeros(2, 1, self.hidden_size, device=device),
            torch.zeros(2, 1, self.hidden_size, device=device),
        )

    # def cal_attention(self, hidden: Tensor, document_hiddens: Tensor):
    #     if self.attention_method == AttentionMethod.DOT_PRODUCT:
    #         # Calculate the dot product between all hidden states of the first model and the last hidden state of the second model
    #         # attn_score = torch.bmm(
    #         #     document_hiddens.transpose(0, 1), hidden.unsqueeze(2)
    #         # ).squeeze()
    #         # return attn_score
    #         energy = torch.bmm(hidden, document_hiddens.T.repeat(2, 1, 1))
    #         attn_weights = F.softmax(energy, dim=-1)
    #         attn_output = torch.bmm(attn_weights, document_hiddens.repeat(2, 1, 1))

    #     else:
    #         energy = torch.bmm(hidden[0], document_hiddens.T.repeat(2, 1, 1)) / np.sqrt(
    #             self.hidden_size
    #         )
    #         attn_weights = F.softmax(energy, dim=-1)
    #         attn_output = torch.bmm(attn_weights, document_hiddens.repeat(2, 1, 1))
    #     return attn_output

In [112]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size

    def forward(self, document_output, question_summary):
        # Transpose the question output for dot product computation
        question_summary_transposed = question_summary.transpose(0, 1)

        # Compute attention scores using dot product
        attention_scores = torch.matmul(document_output, question_summary_transposed)

        # Compute attention weights using softmax
        attention_weights = F.softmax(attention_scores, dim=0)

        # Compute weighted sum of document output using attention weights
        weighted_doc_output = torch.sum(attention_weights * document_output, dim=0)

        return weighted_doc_output

In [113]:
class ReadingComprehensionModel(nn.Module):
    def __init__(self, document_rnn, question_rnn, attention, hidden_size, output_size):
        super(ReadingComprehensionModel, self).__init__()
        self.document_rnn = document_rnn
        self.question_rnn = question_rnn
        self.attention = attention
        self.start_pred = nn.Linear(hidden_size * 2, output_size)
        self.end_pred = nn.Linear(hidden_size * 2, output_size)

In [115]:
def trainIter(model, train_data, num_epochs, criterion, optimizer):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for document_input, question_input, start_target, end_target in train_data:
            optimizer.zero_grad()

            document_hidden = model.document_rnn.initHidden()
            question_hidden = model.question_rnn.initHidden()

            document_output, _ = model.document_rnn(document_input, document_hidden)
            _, question_hidden = model.question_rnn(question_input, question_hidden)

            weighted_doc_output = model.attention(document_output, question_hidden)

            start_logits = model.start_pred(weighted_doc_output)
            end_logits = model.end_pred(weighted_doc_output)

            loss = criterion(start_logits, start_target) + criterion(
                end_logits, end_target
            )
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_data)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [None]:
def evaluate(model, eval_data):
    model.eval()
    total_examples = 0
    total_correct = 0

    with torch.no_grad():
        for document_input, question_input, start_target, end_target in eval_data:
            document_hidden = model.document_rnn.initHidden()
            question_hidden = model.question_rnn.initHidden()

            document_output, _ = model.document_rnn(document_input, document_hidden)
            _, question_hidden = model.question_rnn(question_input, question_hidden)

            weighted_doc_output = model.attention(document_output, question_hidden)

            start_logits = model.start_pred(weighted_doc_output)
            end_logits = model.end_pred(weighted_doc_output)

            start_pred = torch.argmax(start_logits, dim=0)
            end_pred = torch.argmax(end_logits, dim=0)

            total_examples += 1
            if start_pred == start_target and end_pred == end_target:
                total_correct += 1

    accuracy = total_correct / total_examples
    return accuracy

In [116]:
from torch import optim

hidden_size = 50
learning_rate = 0.1

document_num_embeddings = len(word_to_ix)
question_num_embeddings = len(word_to_ix)
document_embedding = nn.Embedding(document_num_embeddings, hidden_size)
question_embedding = nn.Embedding(question_num_embeddings, hidden_size)
document_rnn = DocumentBiRNN(hidden_size=hidden_size, embedding=document_embedding)
question_rnn = QuestionBiRNN(
    hidden_size=hidden_size,
    output_size=question_num_embeddings,
    embedding=question_embedding,
    max_length=MAX_QN_LENGTH,
)
attention = Attention(hidden_size=hidden_size)
reading_comp = ReadingComprehensionModel(
    document_rnn, question_rnn, attention, hidden_size=..., output_size=...
)
optimizer = optim.Adam(reading_comp.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

trainIter(reading_comp, train_data, 10, criterion, optimizer)

TypeError: ReadingComprehensionModel.__init__() missing 5 required positional arguments: 'document_rnn', 'question_rnn', 'attention', 'hidden_size', and 'output_size'