In [None]:
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import word_tokenize

# Fetch the "punkt" resource up front (presumably what word_tokenize needs — confirm for the installed NLTK version).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Data Wrangling

In [None]:
# reading in the data
# Both files are tab-separated. Later cells read the Question, QuestionID, DocumentID,
# DocumentTitle, Sentence and Label columns from these frames.
train_data = pd.read_csv("WikiQA-train.tsv", sep="\t")
test_data = pd.read_csv("WikiQA-test.tsv", sep="\t")

Extract the unique questions from the train and test data frames, including the documentID and the DocumentTitle


In [None]:
def get_questions_documenttag(data):
    """Return the distinct question/document identifier rows of ``data``.

    Args:
        data: DataFrame with at least the columns Question, QuestionID,
            DocumentID and DocumentTitle.

    Returns:
        DataFrame restricted to those four columns with duplicate rows removed
        (the original index labels of the first occurrences are kept).
    """
    key_columns = ["Question", "QuestionID", "DocumentID", "DocumentTitle"]
    return data.loc[:, key_columns].drop_duplicates()


# One row per distinct (Question, QuestionID, DocumentID, DocumentTitle) combination in each split.
train_question_doctag = get_questions_documenttag(train_data)
test_question_doctag = get_questions_documenttag(test_data)

In [None]:
# get unique questions
# (one entry per row of the de-duplicated frames, so a question text repeats here if it
# occurs under more than one QuestionID/DocumentID/DocumentTitle combination)
train_questions = train_question_doctag["Question"]
test_questions = test_question_doctag["Question"]

In [None]:
# get the unique document ids
# (row-aligned with train_questions / test_questions; an id repeats when several rows share it)
train_docid = train_question_doctag["DocumentID"]
test_docid = test_question_doctag["DocumentID"]

Extract the answers to those questions.


In [None]:
def get_answers(data, questions, documentids):
    """Pick the first correct answer sentence for every (question, document) pair.

    Args:
        data: DataFrame with Question, DocumentID, Sentence and Label columns.
        questions: Series of question texts.
        documentids: Series of document ids, position-aligned with ``questions``.

    Returns:
        A list of ``[question, doc_id, answer]`` lists in input order. ``answer``
        is the first sentence labelled 1 for that pair, or the sentinel
        ``"No answer"`` when no sentence is labelled 1.
    """
    answers = []  # list of answers
    for question, doc_id in zip(questions, documentids):
        # Match on the document id as well as the question text: the text alone is
        # not a unique key, so the answer could otherwise come from another document.
        rows = data[(data["Question"] == question) & (data["DocumentID"] == doc_id)]
        correct = rows.loc[rows["Label"] == 1, "Sentence"]
        answer = correct.iloc[0] if len(correct) else "No answer"
        answers.append([question, doc_id, answer])
    return answers


# Wrap the [question, document id, answer] rows in DataFrames (columns are still unnamed here).
train_answers = pd.DataFrame(get_answers(train_data, train_questions, train_docid))
test_answers = pd.DataFrame(get_answers(test_data, test_questions, test_docid))

`get_answers` produces `train_answers` and `test_answers`, which have the following columns:

-   Question
-   Related Document ID
-   Answer (if no answer to that question, return no answer)


In [None]:
def get_documents(data, questions, documentids):  # (done by Finn, tweaked by Dan)
    """Rebuild the full document text for every (question, document) pair.

    Args:
        data: DataFrame with Question, DocumentID and Sentence columns.
        questions: Series of question texts.
        documentids: Series of document ids, position-aligned with ``questions``.

    Returns:
        A list of ``[doc_id, document]`` lists in input order, where ``document``
        is the pair's sentences joined with single spaces.
    """
    documents = []
    for question, doc_id in zip(questions, documentids):
        # Match on the document id as well as the question text so that sentences
        # from another document with the same question text are not mixed in.
        rows = data[(data["Question"] == question) & (data["DocumentID"] == doc_id)]
        documents.append([doc_id, " ".join(rows["Sentence"].tolist())])
    return documents


# One row per question: [document id, full document text]; columns are still unnamed here.
train_documents = pd.DataFrame(
    get_documents(train_data, train_questions, train_docid)
)  # return the individual document in list
test_documents = pd.DataFrame(
    get_documents(test_data, test_questions, test_docid)
)  # return the individual document in list

`get_documents` produces `train_documents` and `test_documents`, which have the following columns:

-   Document ID
-   Full Document


In [None]:
# renaming all the columns for more standardised access
# (the frames were built from plain lists, so until now their columns were 0, 1, 2)
train_answers.columns = ["Question", "DocumentID", "Answer"]
test_answers.columns = ["Question", "DocumentID", "Answer"]
train_documents.columns = ["DocumentID", "Document"]
test_documents.columns = ["DocumentID", "Document"]

In [None]:
# result is 2117, 2117, 630, 630
# Sanity check: within each split the answers frame and the documents frame have the same number of rows.

len(train_answers), len(train_documents), len(test_answers), len(test_documents)

(2117, 2117, 630, 630)

**Prior to tagging, we should maybe clean the document and answers first:** (stopped here)

Maybe?

-   lowercase (might lose context, but we can use on questions)
-   removing any punctuation or weird symbols (do)
-   removal of stop words? (probably not)

Make sure that the pre-processing is standardised to be the same throughout doc and ans.


In [None]:
# These are just common English contractions. We used it in Lab 5 before!
# Maps a contraction to its expanded form. preprocess_lower applies the entries in
# insertion order as plain substring replacements on already-lowercased text, so the
# capitalised keys ("I'd", "I'm", ...) never match there.
contraction_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [None]:
def preprocess_lower(text):
    """Normalise one piece of text (question, answer or document).

    Steps, in order: lowercase, expand every contraction in ``contraction_dict``
    by plain substring replacement, then replace each character that is not an
    ASCII letter, digit or whitespace with a single space.

    Args:
        text: the raw string.

    Returns:
        The cleaned string.
    """
    # Lowercase the text for question, answer and documents
    text = text.lower()
    for word, new_word in contraction_dict.items():
        text = text.replace(word, new_word)  # dealing with contractions
    return re.sub(r"[^a-zA-Z0-9\s]", " ", text)


# DataFrame.applymap is deprecated in recent pandas releases. Mapping each selected
# column with Series.map is element-wise too and works on old and new versions alike.
train_answers[["Question", "Answer"]] = train_answers[["Question", "Answer"]].apply(
    lambda column: column.map(preprocess_lower)
)
train_documents["Document"] = train_documents["Document"].apply(preprocess_lower)
test_answers[["Question", "Answer"]] = test_answers[["Question", "Answer"]].apply(
    lambda column: column.map(preprocess_lower)
)
test_documents["Document"] = test_documents["Document"].apply(preprocess_lower)

In [None]:
# Display the cleaned training documents to eyeball the preprocessing result.
train_documents

Unnamed: 0,DocumentID,Document
0,D1,a partly submerged glacier cave on perito more...
1,D2,in physics circular motion is a movement of ...
2,D5,apollo creed is a fictional character from the...
3,D6,in the united states the title of federal jud...
4,D7,the beretta 21a bobcat is a small pocket sized...
...,...,...
2112,D2805,blue mountain state is an american comedy seri...
2113,D2806,apple inc formerly apple computer inc is ...
2114,D2807,section 8 housing in the south bronx section 8...
2115,D2808,restaurants categorized by type and informatio...


In [None]:
def labelling(documents, answers):
    """Build one tag per document token for every row of ``answers``.

    Tags: "S" first token of the answer, "I" inside the answer, "E" last token
    of the answer, "N" any token outside the answer.

    Args:
        documents: DataFrame with DocumentID and Document (cleaned text) columns.
        answers: DataFrame with DocumentID and Answer columns, indexed 0..n-1.
            An Answer equal to "no answer" marks a question without an answer.

    Returns:
        A list with one tag list per row of ``answers``. Each tag list has one
        entry per token of the text before, inside and after the answer span.
        A one-token answer is tagged with a single "S". An answer that cannot
        be found in the document yields only "N" tags.
    """
    tagged_documents = []
    for q in range(len(answers)):
        doc_id = answers["DocumentID"].loc[q]
        answer = answers["Answer"].loc[q]
        # Several rows may share a document id; the first matching document is used.
        content = documents.loc[documents["DocumentID"] == doc_id, "Document"].values[0]

        if answer == "no answer":
            tagged_documents.append(["N"] * len(word_tokenize(content)))
            continue

        # partition returns (content, "", "") when the answer is not a substring.
        before, matched, after = content.partition(answer)
        answer_len = len(word_tokenize(matched))
        if answer_len == 0:
            span_tags = []
        elif answer_len == 1:
            # Emitting both "S" and "E" here would give two tags for one token
            # and shift every following tag by one.
            span_tags = ["S"]
        else:
            span_tags = ["S"] + ["I"] * (answer_len - 2) + ["E"]

        tagged_documents.append(
            ["N"] * len(word_tokenize(before))
            + span_tags
            + ["N"] * len(word_tokenize(after))
        )
    return tagged_documents


train_doc_ans_labels = labelling(train_documents, train_answers)
test_doc_ans_labels = labelling(test_documents, test_answers)

In [None]:
# check if tags are good
def testing_tokens(ind, labels, documents, answers):
    """Print every [tag, token] pair of document ``ind``, then its answer text.

    Pairs are produced with zip, so printing stops at the shorter of the tag
    list and the token list.
    """
    doc_tokens = word_tokenize(documents["Document"][ind])
    for pair in zip(labels[ind], doc_tokens):
        print(list(pair))
    print(answers["Answer"][ind])


testing_tokens(1000, train_doc_ans_labels, train_documents, train_answers)

['N', 'egg']
['N', 'roll']
['N', 'is']
['N', 'a']
['N', 'term']
['N', 'used']
['N', 'for']
['N', 'many']
['N', 'different']
['N', 'foods']
['N', 'around']
['N', 'the']
['N', 'world']
['S', '2']
['I', 'egg']
['I', 'roll']
['I', 'varieties']
['I', 'of']
['I', 'egg']
['I', 'rolls']
['I', 'are']
['I', 'found']
['I', 'in']
['I', 'mainland']
['I', 'china']
['I', 'many']
['I', 'chinese']
['I', 'speaking']
['I', 'regions']
['I', 'of']
['I', 'asia']
['I', 'and']
['I', 'chinese']
['I', 'immigrant']
['I', 'communities']
['I', 'around']
['I', 'the']
['E', 'world']
['N', 'egg']
['N', 'rolls']
['N', 'as']
['N', 'referred']
['N', 'to']
['N', 'in']
['N', 'china']
['N', 'in']
['N', 'guangdong']
['N', 'and']
['N', 'hong']
['N', 'kong']
['N', 'egg']
['N', 'roll']
['N', 'usually']
['N', 'refers']
['N', 'to']
['N', 'biscuit']
['N', 'roll']
['N', 'this']
['N', 'is']
['N', 'a']
['N', 'type']
['N', 'of']
['N', 'biscuit']
['N', 'the']
['N', 'ingredient']
['N', 'included']
['N', 'egg']
['N', 'flour']
['N', 'and

Cleaned Documents: train and test

train_answers - contains the ['Question','DocumentID','Answer']

train_documents - contains the ['DocumentID','Document']

train_doc_ans_labels - contains a list of lists of answer tags, one list per document


In [None]:
# To prepare the document for word embeddings:
# Pair each cleaned document with its cleaned question by row index.
train_doc_ques = pd.DataFrame(
    {"Document": train_documents["Document"], "Question": train_answers["Question"]}
)
test_doc_ques = pd.DataFrame(
    {"Document": test_documents["Document"], "Question": test_answers["Question"]}
)

### Word Embeddings

To use the CBOW model, we need the data in sentences. Extract this from the original dataset; don't use sent_tokenize, as it will mess with some of the full stops, and we want to maintain the structure from above.


In [None]:
def word_tokens(data):
    """Tokenise every entry of ``data`` with ``word_tokenize``.

    Args:
        data: a sequence of strings that can be indexed with 0..len(data)-1.

    Returns:
        A list of token lists, one per entry, in index order.
    """
    return [word_tokenize(data[position]) for position in range(len(data))]


train_doc_list = word_tokens(train_doc_ques["Document"])
train_ques_list = word_tokens(train_doc_ques["Question"])
test_doc_list = word_tokens(test_doc_ques["Document"])
test_ques_list = word_tokens(test_doc_ques["Question"])

In [None]:
# Word2Vec training corpus: every document and question of both splits, each as a token list.
combined_text = train_doc_list + train_ques_list + test_doc_list + test_ques_list

In [None]:
# model trained, don't have to run this multiple times
# NOTE(review): no seed is passed and workers=2, so the vectors are presumably not
# reproducible from run to run — confirm against the gensim documentation.
wc_cbow_model = Word2Vec(
    sentences=combined_text,
    vector_size=100,  # length of each word vector
    window=5,
    min_count=1,  # presumably keeps every token in the vocabulary — confirm in gensim docs
    workers=2,
    epochs=30,
)
# wc_cbow_model.save("cbow.model")

To implement QA

1. Word Embeddings, using CBOW
2. Feature Extraction 1 - POS tags
3. Feature Extraction 2 - TF-IDF
4. Feature Extraction 3 - NER


In [None]:
# run this if model in directory
# Only load when the saved model is actually present. Otherwise the model held in
# wc_cbow_model stays in use instead of the cell failing on a missing file.
if Path("./cbow.model").exists():
    wc_cbow_model = Word2Vec.load("./cbow.model")

In [None]:
def get_word_embeddings(doc):
    """Return the Word2Vec vector of every token in ``doc``.

    Args:
        doc: a cleaned text string.

    Returns:
        A list with one vector per token produced by ``word_tokenize(doc)``,
        looked up in ``wc_cbow_model.wv``.
    """
    vectors = []
    for token in word_tokenize(doc):
        vectors.append(wc_cbow_model.wv[token])
    return vectors


train_doc_ques["Doc_Embeddings"] = train_doc_ques["Document"].apply(get_word_embeddings)
train_doc_ques["Q_Embeddings"] = train_doc_ques["Question"].apply(get_word_embeddings)
test_doc_ques["Doc_Embeddings"] = test_doc_ques["Document"].apply(get_word_embeddings)
test_doc_ques["Q_Embeddings"] = test_doc_ques["Question"].apply(get_word_embeddings)

In [None]:
# Store the token lists themselves, produced by the same tokenizer call that
# get_word_embeddings uses, so tokens and embeddings can be compared per row.
train_doc_ques["Doc_Tokens"] = train_doc_ques["Document"].apply(word_tokenize)
train_doc_ques["Q_Tokens"] = train_doc_ques["Question"].apply(word_tokenize)
test_doc_ques["Doc_Tokens"] = test_doc_ques["Document"].apply(word_tokenize)
test_doc_ques["Q_Tokens"] = test_doc_ques["Question"].apply(word_tokenize)

In [None]:
def find_max_length(column):
    """Summarise the sequence lengths in ``column`` as a one-line string.

    Args:
        column: a sequence of sized items that can be indexed with 0..len(column)-1.

    Returns:
        A string reporting the longest item length, the median length and the
        number of items.
    """
    lengths = [len(column[position]) for position in range(len(column))]
    longest = max(lengths, default=0)
    return "Max Seq Length is {}, Median is {}, Number of lines is {})".format(
        longest, np.median(lengths), len(lengths)
    )


# Only the last expression of a cell is displayed, so just the training summary shows up below.
find_max_length(test_doc_ques["Doc_Embeddings"])
find_max_length(train_doc_ques["Doc_Embeddings"])

'Max Seq Length is 924, Median is 182.0, Number of lines is 2117)'

In [None]:
def check_count(doc):
    """Count rows whose embeddings and tokens disagree in length.

    A row is counted once when either its document or its question has a
    different number of embeddings than tokens.

    Args:
        doc: frame with Doc_Embeddings, Doc_Tokens, Q_Embeddings and Q_Tokens
            columns, indexed 0..len(doc)-1.

    Returns:
        The number of mismatching rows.
    """
    return sum(
        1
        for row in range(len(doc))
        if len(doc["Doc_Embeddings"][row]) != len(doc["Doc_Tokens"][row])
        or len(doc["Q_Embeddings"][row]) != len(doc["Q_Tokens"][row])
    )


# 0 means every training row has exactly one embedding per token.
check_count(train_doc_ques)  # looks good

0

Note, need to convert the POS tags, NER tags into embeddings. After this, pad the questions and answers to the max question/document length in the combined training and test set.

### PoS Tagging


In [None]:
# Apply the pos tags to the tokens
from nltk.tag import pos_tag

# download the dependency and resource as required
nltk.download("averaged_perceptron_tagger")

# Each *_POS cell holds a list of (token, tag) tuples, one per token.
train_doc_ques["Doc_POS"] = train_doc_ques["Doc_Tokens"].apply(pos_tag)
train_doc_ques["Q_POS"] = train_doc_ques["Q_Tokens"].apply(pos_tag)
test_doc_ques["Doc_POS"] = test_doc_ques["Doc_Tokens"].apply(pos_tag)
test_doc_ques["Q_POS"] = test_doc_ques["Q_Tokens"].apply(pos_tag)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# checking the POS tags: # looks ok
# (spot check of the training question at index 100)
train_doc_ques["Q_POS"][100]

[('how', 'WRB'),
 ('many', 'JJ'),
 ('schools', 'NNS'),
 ('are', 'VBP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('big', 'JJ'),
 ('ten', 'NN')]

In [None]:
# Extract all unique POS Tags
# Concatenate the per-row (token, tag) lists of both splits, documents and questions alike.
all_pos_tags = (
    train_doc_ques["Doc_POS"].tolist()
    + test_doc_ques["Doc_POS"].tolist()
    + train_doc_ques["Q_POS"].tolist()
    + test_doc_ques["Q_POS"].tolist()
)


def get_unique_pos(data):
    """Map every distinct tag in ``data`` to an integer index.

    Args:
        data: iterable of sequences of ``(token, tag)`` pairs.

    Returns:
        Dict from tag to its position in the sorted list of distinct tags.
    """
    distinct_tags = {tag for tagged_tokens in data for _, tag in tagged_tokens}
    return dict(zip(sorted(distinct_tags), range(len(distinct_tags))))


pos_iden = get_unique_pos(all_pos_tags)  # dict mapping each POS tag to its integer index
pos_iden

{'$': 0,
 'CC': 1,
 'CD': 2,
 'DT': 3,
 'EX': 4,
 'FW': 5,
 'IN': 6,
 'JJ': 7,
 'JJR': 8,
 'JJS': 9,
 'MD': 10,
 'NN': 11,
 'NNP': 12,
 'NNPS': 13,
 'NNS': 14,
 'PDT': 15,
 'POS': 16,
 'PRP': 17,
 'PRP$': 18,
 'RB': 19,
 'RBR': 20,
 'RBS': 21,
 'RP': 22,
 'SYM': 23,
 'TO': 24,
 'UH': 25,
 'VB': 26,
 'VBD': 27,
 'VBG': 28,
 'VBN': 29,
 'VBP': 30,
 'VBZ': 31,
 'WDT': 32,
 'WP': 33,
 'WP$': 34,
 'WRB': 35}

### NER Tagging


### Steps to run this:

-   pip install spacy
-   python -m spacy download en_core_web_sm

If loaded for the first time, restart kernel


In [None]:
# nltk using Spacy
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

# loading pre-trained model of NER
#nlp = en_core_web_sm.load()
#nlp.to_disk("en_core_web_sm")

2023-05-19 10:24:54.108226: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Prefer the copy of the pipeline saved next to the notebook. When that directory is
# missing (e.g. on a fresh checkout), fall back to the installed en_core_web_sm package.
if Path("./en_core_web_sm/").exists():
    nlp = spacy.load("./en_core_web_sm/")
else:
    nlp = spacy.load("en_core_web_sm")

In [None]:
def ner_tagging(texts):
    """Tag already-tokenised texts with the entity type of every token.

    Each token list is wrapped in a spaCy ``Doc`` built from ``nlp.vocab`` and
    passed through the pipeline's "ner" component only.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with, per input text, a list of ``(token text, entity type)``
        tuples. The entity type is an empty string for untagged tokens.
    """

    def tag_one(words):
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        nlp.get_pipe("ner")(doc)
        return [(token.text, token.ent_type_) for token in doc]

    return [tag_one(words) for words in texts]

In [None]:
# Will take a while...
# Each *_NER cell holds a list of (token, entity type) tuples, one per token.
train_doc_ques["Doc_NER"] = ner_tagging(train_doc_ques["Doc_Tokens"])
train_doc_ques["Q_NER"] = ner_tagging(train_doc_ques["Q_Tokens"])
test_doc_ques["Doc_NER"] = ner_tagging(test_doc_ques["Doc_Tokens"])
test_doc_ques["Q_NER"] = ner_tagging(test_doc_ques["Q_Tokens"])

In [None]:
# Spot check of the NER tags for the training question at index 12.
train_doc_ques["Q_NER"][12]

[('how', ''),
 ('much', ''),
 ('are', ''),
 ('the', ''),
 ('harry', 'WORK_OF_ART'),
 ('potter', 'WORK_OF_ART'),
 ('movies', 'WORK_OF_ART'),
 ('worth', 'WORK_OF_ART')]

In [None]:
# Similar approach to the POS

# Extract all unique NER Tags
all_ner_tags = (
    train_doc_ques["Doc_NER"].tolist()
    + test_doc_ques["Doc_NER"].tolist()
    + train_doc_ques["Q_NER"].tolist()
    + test_doc_ques["Q_NER"].tolist()
)


def get_unique_ner(data):
    """Map every distinct entity label in ``data`` to an integer index.

    Args:
        data: iterable of sequences of ``(token, label)`` pairs.

    Returns:
        Dict from label to its position in the sorted list of distinct labels.
    """
    seen = set()
    for tagged_tokens in data:
        seen.update(label for _, label in tagged_tokens)
    return {label: position for position, label in enumerate(sorted(seen))}


# Call the NER helper rather than get_unique_pos: the two currently return the same
# mapping, but NER indices should not depend on the POS helper staying identical.
ner_iden = get_unique_ner(all_ner_tags)  # dict mapping each NER tag to its integer index
ner_iden

{'': 0,
 'CARDINAL': 1,
 'DATE': 2,
 'EVENT': 3,
 'FAC': 4,
 'GPE': 5,
 'LANGUAGE': 6,
 'LAW': 7,
 'LOC': 8,
 'MONEY': 9,
 'NORP': 10,
 'ORDINAL': 11,
 'ORG': 12,
 'PERCENT': 13,
 'PERSON': 14,
 'PRODUCT': 15,
 'QUANTITY': 16,
 'TIME': 17,
 'WORK_OF_ART': 18}

In [None]:
# check ohv dims
ner_idx = ner_iden.values()
aa = np.eye(max(ner_idx) + 1)  # identity matrix: row i is the one-hot vector for tag index i
# aa

### TF-IDF

First, calculate the document frequency of each token. This is done separately for the training corpus and the testing corpus, where each corpus holds that split's documents and questions. The result is a dictionary where each token is a key and its value is the document frequency.


In [None]:
def document_frequency(corpus):
    """
    Computes the document frequency for every token in the corpus.

    Args:
        corpus: iterable of token lists (one list per document).

    Returns:
        A dictionary {token: doc_freq, ...} where doc_freq is the number of
        documents that contain the token at least once.
    """
    doc_freq = {}
    for document in corpus:
        # sorted(set(...)) visits each distinct token once per document, in the same
        # sorted order np.unique gave, and keeps the keys as plain Python strings.
        for token in sorted(set(document)):
            # dict.get replaces the former bare `except:`, which hid every kind of error.
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq


# Each split gets its own corpus (documents followed by questions) and its own
# document-frequency table.
train_corpus = (
    train_doc_ques["Doc_Tokens"].tolist() + train_doc_ques["Q_Tokens"].tolist()
)
test_corpus = test_doc_ques["Doc_Tokens"].tolist() + test_doc_ques["Q_Tokens"].tolist()
train_doc_freq = document_frequency(train_corpus)
test_doc_freq = document_frequency(test_corpus)

Now calculate TF-IDF using the document frequency from above.


In [None]:
from collections import Counter
import math


def compute_tf_idf(corpus, doc_frequency, num_documents=None):
    """
    Computes the term frequency inverse document frequency for every token in every document in the corpus.

    Args:
        corpus: list of token lists.
        doc_frequency: dict {token: number of documents containing it}. It must
            contain every token of ``corpus`` (a missing token raises KeyError).
        num_documents: value of N in the idf formula. When omitted, N falls back
            to ``len(doc_frequency)`` as before.
            NOTE(review): that default is the vocabulary size, not a document
            count. Pass the size of the corpus ``doc_frequency`` was built from
            to get a conventional idf.

    Returns:
        A list the same shape as the list of tokenized documents except every
        token is replaced with the tf-idf for that token, where
        tf = count / document length and idf = log(N / (df + 1)) + 1.
    """
    N = len(doc_frequency) if num_documents is None else num_documents
    tf_idf_list = []
    for document in corpus:
        counter = Counter(document)
        total_num_words = len(document)
        # Weights are kept per document, so no corpus-wide (doc_id, token) table builds up.
        weights = {}
        for token, occurrences in counter.items():
            tf = occurrences / total_num_words
            idf = math.log(N / (doc_frequency[token] + 1)) + 1
            weights[token] = tf * idf
        tf_idf_list.append([weights[token] for token in document])
    return tf_idf_list


# Documents and questions of a split share that split's document-frequency table.
train_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    train_doc_ques["Doc_Tokens"].tolist(), train_doc_freq
)
train_doc_ques["Q_TFIDF"] = compute_tf_idf(
    train_doc_ques["Q_Tokens"].tolist(), train_doc_freq
)
test_doc_ques["Doc_TFIDF"] = compute_tf_idf(
    test_doc_ques["Doc_Tokens"].tolist(), test_doc_freq
)
test_doc_ques["Q_TFIDF"] = compute_tf_idf(
    test_doc_ques["Q_Tokens"].tolist(), test_doc_freq
)

In [None]:
def one_hot_vectorize(
    pos_tagger, ner_tagger, data
):  # pass in the unique dict for ner or pos
    """One-hot encode the POS and NER tags of documents and questions.

    Args:
        pos_tagger: dict mapping POS tag -> integer index.
        ner_tagger: dict mapping NER tag -> integer index.
        data: mapping/frame with Doc_POS, Q_POS, Doc_NER and Q_NER entries, each
            a sequence of per-text lists of ``(token, tag)`` pairs.

    Returns:
        Tuple ``(doc_pos, q_pos, doc_ner, q_ner)``. Each element is a list with,
        per text, a list of one-hot rows (one per token) taken from an identity
        matrix of size ``max(index) + 1``.
    """

    def encode(column, tag_to_index, identity):
        # Row i of the identity matrix is the one-hot vector for tag index i.
        return [
            [identity[tag_to_index[pair[1]]] for pair in tagged_tokens]
            for tagged_tokens in data[column]
        ]

    pos_identity = np.eye(max(pos_tagger.values()) + 1)
    ner_identity = np.eye(max(ner_tagger.values()) + 1)

    doc_pos = encode("Doc_POS", pos_tagger, pos_identity)
    q_pos = encode("Q_POS", pos_tagger, pos_identity)
    doc_ner = encode("Doc_NER", ner_tagger, ner_identity)
    q_ner = encode("Q_NER", ner_tagger, ner_identity)
    return (doc_pos, q_pos, doc_ner, q_ner)

In [None]:
# get the ohv for doc
# (and for the questions: each call returns doc POS, question POS, doc NER, question NER)
(
    train_doc_pos_ohv,
    train_q_pos_ohv,
    train_doc_ner_ohv,
    train_q_ner_ohv,
) = one_hot_vectorize(pos_iden, ner_iden, train_doc_ques)
test_doc_pos_ohv, test_q_pos_ohv, test_doc_ner_ohv, test_q_ner_ohv = one_hot_vectorize(
    pos_iden, ner_iden, test_doc_ques
)

In [None]:
# reduce the dataframe to just tokens and embeddings:
# Column order matters: full_array reads these frames by position
# (tokens, Word2Vec, TF-IDF, POS, NER).
doc_emb_train = train_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": train_doc_pos_ohv, "Doc_NER": train_doc_ner_ohv})
doc_emb_train = pd.concat([doc_emb_train, doc_pos_ner], axis=1)

q_emb_train = train_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": train_q_pos_ohv, "Q_NER": train_q_ner_ohv})
q_emb_train = pd.concat([q_emb_train, q_pos_ner], axis=1)

In [None]:
# Same assembly for the test split; doc_pos_ner and q_pos_ner are reused as scratch names.
doc_emb_test = test_doc_ques[["Doc_Tokens", "Doc_Embeddings", "Doc_TFIDF"]]
doc_pos_ner = pd.DataFrame({"Doc_POS": test_doc_pos_ohv, "Doc_NER": test_doc_ner_ohv})
doc_emb_test = pd.concat([doc_emb_test, doc_pos_ner], axis=1)

q_emb_test = test_doc_ques[["Q_Tokens", "Q_Embeddings", "Q_TFIDF"]]
q_pos_ner = pd.DataFrame({"Q_POS": test_q_pos_ohv, "Q_NER": test_q_ner_ohv})
q_emb_test = pd.concat([q_emb_test, q_pos_ner], axis=1)

### Word Embeddings (Doc and Qn)

The embeddings of the questions and answers of the train and test set can be found here:

-   Train Document - doc_emb_train
-   Train Q - q_emb_train
-   Test Document - doc_emb_test
-   Test Q - q_emb_test

The max_document size is 1675 and max_question size is 23.
Combine all the embeddings into a full array

In [None]:
def full_array(data, data_type="Document", max_len=None):
    """Concatenate the per-token features of every row and pad to a fixed length.

    Args:
        data: DataFrame indexed 0..len(data)-1 whose columns are, by position:
            tokens, Word2Vec vectors, TF-IDF values, POS one-hot rows and NER
            one-hot rows (each cell is a per-token list).
        data_type: "Document" (pads to 1675 rows) or "Question" (pads to 23 rows).
        max_len: optional override of the sequence length. When given, rows are
            padded to ``max_len`` and longer sequences are cut to ``max_len``.
            When omitted, the defaults above apply and nothing is cut.

    Returns:
        A list with one entry per row of ``data``. Each entry is a list of
        per-token feature lists followed by zero vectors of length 156 as padding.

    Raises:
        ValueError: if ``data_type`` is neither "Document" nor "Question".
    """
    # 156 presumably = 100 (Word2Vec) + 1 (TF-IDF) + 36 (POS tags) + 19 (NER tags) — confirm.
    num_vec_length = 156
    default_lengths = {"Document": 1675, "Question": 23}
    if data_type not in default_lengths:
        raise ValueError(
            "data_type must be 'Document' or 'Question', got {!r}".format(data_type)
        )
    target_len = default_lengths[data_type] if max_len is None else max_len
    zero_vec = np.zeros(num_vec_length)

    full_vec = []  # one list of token vectors per row
    for dat in range(len(data)):  # go through each line
        row = data.loc[dat]
        tokens, embeddings, tf_idf, pos_ohv, ner_ohv = (row.iloc[k] for k in range(5))
        n_tokens = len(tokens)
        if max_len is not None:
            n_tokens = min(n_tokens, max_len)  # skip tokens that would be cut anyway
        v = [
            [
                *embeddings[j].tolist(),  # Word2Vec
                tf_idf[j],  # TF-IDF
                *pos_ohv[j].tolist(),  # POS
                *ner_ohv[j].tolist(),  # NER
            ]
            for j in range(n_tokens)
        ]
        v.extend(zero_vec for _ in range(target_len - len(v)))
        full_vec.append(v)
    return full_vec

In [None]:
# Training/Test Documents to pass in, takes about a min
# np.stack turns the list of per-document lists into one array.
final_doc_train = np.stack(full_array(doc_emb_train, data_type="Document"))
final_doc_test = np.stack(full_array(doc_emb_test, data_type="Document"))

In [None]:
# Training/Test Questions to pass in, takes about a few seconds
# np.stack turns the list of per-question lists into one array.
final_qn_train = np.stack(full_array(q_emb_train, data_type="Question"))
final_qn_test = np.stack(full_array(q_emb_test, data_type="Question"))

In [None]:
def convert_labels(labels, max_len=1675):
    """Pad every tag list with "N" up to ``max_len`` entries.

    Args:
        labels: list of tag lists.
        max_len: target length. Defaults to 1675, the padding length used for
            the document features.

    Returns:
        A new list of new tag lists. Lists already at least ``max_len`` long are
        copied unchanged. The input lists are not modified, so re-running the
        cell or reusing the raw labels elsewhere is safe.
    """
    # ["N"] * k is empty for k <= 0, so longer lists pass through untouched.
    return [list(tags) + ["N"] * (max_len - len(tags)) for tags in labels]


# Pad the tag lists and turn them into arrays for the model outputs.
tr_labels = np.array(convert_labels(train_doc_ans_labels))
ts_labels = np.array(convert_labels(test_doc_ans_labels))

Earlier, we found that the sequence length of the documents can get quite large, which might introduce a lot of noise into the model with paddings. One alternative to reduce this noise is to perhaps truncate the sequences down to just over the median for the documents. We also need to do this for the outputs.


In [None]:
# Length summaries in the order: test documents, train documents, test questions, train questions.
find_max_length(test_doc_ques["Doc_Embeddings"]), find_max_length(
    train_doc_ques["Doc_Embeddings"]
), find_max_length(test_doc_ques["Q_Embeddings"]), find_max_length(
    train_doc_ques["Q_Embeddings"]
)

('Max Seq Length is 1675, Median is 181.0, Number of lines is 630)',
 'Max Seq Length is 924, Median is 182.0, Number of lines is 2117)',
 'Max Seq Length is 19, Median is 7.0, Number of lines is 630)',
 'Max Seq Length is 23, Median is 7.0, Number of lines is 2117)')

In [None]:
# truncate the embeddings and the labels to 200


def truncate(data, labels, max_seq=200):
    """Cut features and labels down to the first ``max_seq`` sequence positions.

    Args:
        data: array indexed as [row, position, feature].
        labels: array indexed as [row, position].
        max_seq: number of leading positions to keep.

    Returns:
        Tuple ``(data, labels)`` sliced along the position axis.
    """
    window = slice(None, max_seq)
    return data[:, window, :], labels[:, window]


# Rebinds the document arrays and their labels to the truncated versions (default max_seq=200).
final_doc_train, tr_labels = truncate(final_doc_train, tr_labels)
final_doc_test, ts_labels = truncate(final_doc_test, ts_labels)

In [None]:
# final prepared documents are found here:

# final_doc_train, tr_labels - doc embeddings and output labels for training
# final_doc_test, ts_labels - doc embeddings and output labels for testing
# final_qn_train - question embeddings for training
# final_qn_test - question embeddings for testing

Convert the arrays to a float32, then save those np arrays

In [None]:
# reduce to float32
final_doc_train = final_doc_train.astype("float32")
final_doc_test = final_doc_test.astype("float32")
final_qn_train = final_qn_train.astype("float32")
final_qn_test = final_qn_test.astype("float32")

# save the np.arrays
# np.save does not create missing directories, so make sure the target folder exists first.
Path("./cleaneddata").mkdir(parents=True, exist_ok=True)
np.save("./cleaneddata/final_doc_train.npy", final_doc_train)
np.save("./cleaneddata/final_doc_test.npy", final_doc_test)
np.save("./cleaneddata/final_qn_train.npy", final_qn_train)
np.save("./cleaneddata/final_qn_test.npy", final_qn_test)
np.save("./cleaneddata/tr_labels.npy", tr_labels)
np.save("./cleaneddata/ts_labels.npy", ts_labels)
