In [46]:
import ssl

# Swap ssl's default HTTPS context factory for the unverified one when this
# Python build provides it (presumably so the nltk.download calls below work on
# machines with certificate problems -- TODO confirm).
# NOTE(review): this affects every HTTPS context created through ssl's default
# factory in this kernel, not just the NLTK download; certificates are no
# longer verified for those connections.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # ssl._create_unverified_context is not available: keep the default factory.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

import nltk
nltk.download('punkt')
from nltk import word_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/finnmurphy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# reading in the data: the tab-separated WikiQA train and test files, looked up
# relative to the notebook's working directory
train_data = pd.read_csv('WikiQA-train.tsv', sep='\t')
test_data = pd.read_csv('WikiQA-test.tsv', sep='\t')

Extract the unique questions from the train and test data frames, including the documentID and the DocumentTitle

In [48]:
def get_questions_documenttag(data):
    """Return the distinct question/document rows of a WikiQA frame.

    Only the Question, QuestionID, DocumentID and DocumentTitle columns are
    kept; repeated rows are dropped and the surviving rows keep their original
    row labels.
    """
    key_columns = ['Question', 'QuestionID', 'DocumentID', 'DocumentTitle']
    return data.loc[:, key_columns].drop_duplicates()
# one row per distinct (Question, QuestionID, DocumentID, DocumentTitle) in each split
train_question_doctag = get_questions_documenttag(train_data)
test_question_doctag = get_questions_documenttag(test_data)

In [49]:
# get unique questions
train_questions = train_question_doctag['Question']
test_questions = test_question_doctag['Question']

In [50]:
# get the unique document ids
train_docid = train_question_doctag['DocumentID']
test_docid = test_question_doctag['DocumentID']

Extract the answers to those questions.

In [51]:
def get_answers(data, questions, documentids):
    """Pair every question with its document id and its first correct sentence.

    data        -- frame with 'Question', 'Sentence' and 'Label' columns
    questions   -- Series of question texts
    documentids -- Series of document ids, positionally aligned with questions

    Returns a list of [question, document id, answer] lists. The answer is the
    first sentence labelled 1 among the rows whose 'Question' equals the
    question text, or the string 'No answer' when there is none.
    """
    rows = []
    for position, question in enumerate(questions):
        doc_id = documentids.iloc[position]
        candidates = data[data['Question'] == question]
        # row labels of the candidate sentences marked as correct
        correct_labels = candidates.index[(candidates['Label'] == 1).values]
        if len(correct_labels) == 0:
            answer = 'No answer'
        else:
            answer = candidates.loc[correct_labels[0], "Sentence"]
        rows.append([question, doc_id, answer])
    return rows

# one [question, document id, answer] row per unique question in each split;
# the columns are still the positional 0, 1, 2 at this point
train_answers = pd.DataFrame(get_answers(train_data, train_questions, train_docid))
test_answers = pd.DataFrame(get_answers(test_data, test_questions, test_docid))

`get_answers` above produces `train_answers` and `test_answers`, each with the following columns:
- Question
- Related Document ID
- Answer (if no answer to that question, return no answer)

In [52]:
def get_documents(data, questions, documentids): # (done by Finn, tweaked by Dan)
    """Rebuild one document string per question from its candidate sentences.

    data        -- frame with 'Question' and 'Sentence' columns
    questions   -- Series of question texts
    documentids -- Series of document ids, positionally aligned with questions

    Returns a list of [document id, document text] lists, where the text is the
    sentences of all rows whose 'Question' equals the question, in row order,
    separated by single spaces.
    """
    documents = []
    for position, question in enumerate(questions):
        doc_id = documentids.iloc[position]
        sentences = data.loc[data['Question'] == question, 'Sentence'].tolist()
        documents.append([doc_id, ' '.join(sentences)])
    return documents

train_documents = pd.DataFrame(get_documents(train_data, train_questions, train_docid)) # return the individual document in list
test_documents = pd.DataFrame(get_documents(test_data, test_questions, test_docid)) # return the individual document in list

`get_documents` above produces `train_documents` and `test_documents`, each with the following columns:
- Document ID
- Full Document

In [53]:
# renaming all the columns for more standardised access
train_answers.columns = ['Question','DocumentID','Answer']
test_answers.columns = ['Question','DocumentID','Answer']
train_documents.columns = ['DocumentID','Document']
test_documents.columns = ['DocumentID','Document']

In [54]:
# result is 2117, 2117, 630, 630

len(train_answers),len(train_documents), len(test_answers),len(test_documents)

(2117, 2117, 630, 630)

**Prior to tagging, we should maybe clean the document and answers first:** (stopped here)

Maybe? 
- lowercase (might lose context, but we can use on questions)
- removing any punctuation or weird symbols (do)
- removal of stop words? (probably not)

Make sure that the pre-processing is standardised to be the same throughout doc and ans.

In [55]:
def preprocess_lower(text):
    """Return text converted to lower case (used for questions, answers and documents)."""
    return text.lower()

# Lowercase questions, answers and documents in both splits. The answer frames
# are processed column by column with Series.map rather than
# DataFrame.applymap, which pandas 2.1 deprecated; the values produced are the same.
for answers_frame in (train_answers, test_answers):
    for text_column in ('Question', 'Answer'):
        answers_frame[text_column] = answers_frame[text_column].map(preprocess_lower)
for documents_frame in (train_documents, test_documents):
    documents_frame['Document'] = documents_frame['Document'].apply(preprocess_lower)

In [56]:
train_documents

Unnamed: 0,DocumentID,Document
0,D1,a partly submerged glacier cave on perito more...
1,D2,"in physics , circular motion is a movement of ..."
2,D5,apollo creed is a fictional character from the...
3,D6,"in the united states, the title of federal jud..."
4,D7,the beretta 21a bobcat is a small pocket-sized...
...,...,...
2112,D2805,blue mountain state is an american comedy seri...
2113,D2806,"apple inc., formerly apple computer, inc., is ..."
2114,D2807,section 8 housing in the south bronx section 8...
2115,D2808,restaurants categorized by type and informatio...


In [57]:
def _answer_span_tags(num_tokens):
    """Return the tags for an answer span of num_tokens tokens.

    0 tokens -> [], 1 token -> ['S'], otherwise 'S', then 'I' for the inner
    tokens, then 'E'. The result always has exactly num_tokens entries.
    """
    if num_tokens <= 0:
        return []
    if num_tokens == 1:
        return ['S']
    return ['S'] + ['I'] * (num_tokens - 2) + ['E']


def labelling(documents, answers):
    """Tag every document token as outside (N) or part of (S/I/E) the answer.

    documents -- frame with 'DocumentID' and 'Document' columns
    answers   -- frame with 'DocumentID' and 'Answer' columns; an 'Answer'
                 equal to 'no answer' means the question has no answer

    Returns one tag list per row of answers, in row order. The document text is
    the first row of documents with a matching 'DocumentID' (IndexError if
    there is none). The text is split around the first occurrence of the
    answer; each of the three parts is tokenised on its own and tagged 'N'
    (before/after) or S/I/E (the answer itself).

    The tag count now always equals the token count of the three parts: a
    one-token answer gets a single 'S' (previously 'S' and 'E'), and an answer
    that does not occur in the document adds no span tags at all (previously a
    spurious 'S', 'E' pair).
    """
    tagged_documents = []
    # iterate positionally so the result does not depend on a 0..n-1 row index
    for doc_id, answer in zip(answers['DocumentID'], answers['Answer']):
        content = documents.loc[documents['DocumentID'] == doc_id, 'Document'].values[0]

        if answer == 'no answer':
            tagged_documents.append(['N'] * len(word_tokenize(content)))
            continue

        # partition() yields (content, '', '') when the answer is not found
        before, matched, after = content.partition(answer)
        tagged_document = ['N'] * len(word_tokenize(before))
        tagged_document.extend(_answer_span_tags(len(word_tokenize(matched))))
        tagged_document.extend(['N'] * len(word_tokenize(after)))
        tagged_documents.append(tagged_document)
    return tagged_documents

# one list of N/S/I/E tags per row of the answers frame, for each split
train_doc_ans_labels = labelling(train_documents, train_answers)
test_doc_ans_labels = labelling(test_documents, test_answers)

In [58]:
# check if tags are good
def testing_tokens(ind, labels, documents, answers):
    for i,j in zip(labels[ind],word_tokenize(documents['Document'][ind])):
        print([i,j])
    print(answers['Answer'][ind])
testing_tokens(100 , train_doc_ans_labels, train_documents, train_answers)

['N', 'the']
['N', 'big']
['N', 'ten']
['N', 'conference']
['N', ',']
['N', 'formerly']
['N', 'western']
['N', 'conference']
['N', 'and']
['N', 'big']
['N', 'nine']
['N', 'conference']
['N', ',']
['N', 'is']
['N', 'the']
['N', 'oldest']
['N', 'division']
['N', 'i']
['N', 'college']
['N', 'athletic']
['N', 'conference']
['N', 'in']
['N', 'the']
['N', 'united']
['N', 'states']
['N', '.']
['S', 'its']
['I', 'twelve']
['I', 'member']
['I', 'institutions']
['I', '(']
['I', 'which']
['I', 'are']
['I', 'primarily']
['I', 'flagship']
['I', 'research']
['I', 'universities']
['I', 'in']
['I', 'their']
['I', 'respective']
['I', 'states']
['I', ',']
['I', 'well-regarded']
['I', 'academically']
['I', ',']
['I', 'and']
['I', 'with']
['I', 'relatively']
['I', 'large']
['I', 'student']
['I', 'enrollment']
['I', ')']
['I', 'are']
['I', 'located']
['I', 'primarily']
['I', 'in']
['I', 'the']
['I', 'midwest']
['I', ',']
['I', 'stretching']
['I', 'from']
['I', 'nebraska']
['I', 'in']
['I', 'the']
['I', 'we

Cleaned Documents: train and test

train_answers - contains the ['Question','DocumentID','Answer'] 

train_documents - contains the ['DocumentID','Document']

train_doc_ans_labels - contains a list of lists of answer tags, one list per document

In [59]:
# To prepare the document for word embeddings:
# put each document next to its question. The two Series are matched on their
# row labels; both source frames were built from the same question list in the
# same order, so row i of one belongs to row i of the other.
train_doc_ques = pd.DataFrame({'Document': train_documents['Document'],
                               'Question': train_answers['Question']})
test_doc_ques = pd.DataFrame({'Document': test_documents['Document'],
                               'Question': test_answers['Question']})

### Word Embeddings

To use the CBOW model, we need the data as sentences (lists of tokens). Extract these from the original dataset rather than using sent_tokenize, which mishandles some of the full stops; we want to maintain the structure built above.

In [60]:
def word_tokens(data):
    """Tokenise data[0] .. data[len(data) - 1] and return the token lists in order."""
    return [word_tokenize(data[position]) for position in range(len(data))]
# each whole document / question becomes one token list
train_doc_list = word_tokens(train_doc_ques['Document'])
train_ques_list = word_tokens(train_doc_ques['Question'])
test_doc_list = word_tokens(test_doc_ques['Document'])
test_ques_list = word_tokens(test_doc_ques['Question'])

In [61]:
combined_text = train_doc_list + train_ques_list + test_doc_list + test_ques_list

In [62]:
# Train the embeddings on all documents and questions (train + test) and save them.
# A fixed seed plus a single worker thread makes the vectors repeatable between
# runs; gensim also asks for PYTHONHASHSEED to be set for repeatability across
# interpreter launches. min_count=1 keeps every token, so each one has a vector.
W2V_SEED = 42
wc_cbow_model = Word2Vec(sentences=combined_text, vector_size=100, window=5, min_count=1,
                         workers=1, epochs=30, seed=W2V_SEED)
wc_cbow_model.save("cbow.model")

To implement QA

1. Word Embeddings, using CBOW
2. Feature Extraction 1 - POS tags
3. Feature Extraction 2 - TF-IDF 
4. Feature Extraction 3 - NER

In [63]:
def get_word_embeddings(doc):
    """Tokenise doc and return the wc_cbow_model vector of every token, in order."""
    return [wc_cbow_model.wv[token] for token in word_tokenize(doc)]

# per-token embedding vectors for every document and question
# (get_word_embeddings tokenises the raw text itself)
train_doc_ques['Doc_Embeddings'] = train_doc_ques['Document'].apply(get_word_embeddings)
train_doc_ques['Q_Embeddings'] = train_doc_ques['Question'].apply(get_word_embeddings)
test_doc_ques['Doc_Embeddings'] = test_doc_ques['Document'].apply(get_word_embeddings)
test_doc_ques['Q_Embeddings'] = test_doc_ques['Question'].apply(get_word_embeddings)

In [64]:
train_doc_ques['Doc_Tokens'] = train_doc_ques['Document'].apply(word_tokenize)
train_doc_ques['Q_Tokens'] =  train_doc_ques['Question'].apply(word_tokenize)
test_doc_ques['Doc_Tokens'] = test_doc_ques['Document'].apply(word_tokenize)
test_doc_ques['Q_Tokens'] = test_doc_ques['Question'].apply(word_tokenize)

In [65]:
def check_count(doc):
    """Count the rows whose embedding list and token list differ in length.

    A row is counted once if its document lengths differ or, failing that, if
    its question lengths differ. Rows are addressed by the labels 0..len(doc)-1.
    """
    def _row_mismatch(row):
        return (len(doc['Doc_Embeddings'][row]) != len(doc['Doc_Tokens'][row])
                or len(doc['Q_Embeddings'][row]) != len(doc['Q_Tokens'][row]))

    return sum(1 for row in range(len(doc)) if _row_mismatch(row))
        
check_count(train_doc_ques) # looks good

0

Note, need to convert the POS tags, NER tags into embeddings. After this, pad the questions and answers to the max question/document length in the combined training and test set.

### PoS Tagging

In [66]:
# Apply the pos tags to the tokens 
from nltk.tag import pos_tag
# download the dependency and resource as required
nltk.download('averaged_perceptron_tagger')

train_doc_ques['Doc_POS'] = train_doc_ques['Doc_Tokens'].apply(pos_tag)
train_doc_ques['Q_POS'] =  train_doc_ques['Q_Tokens'].apply(pos_tag)
test_doc_ques['Doc_POS'] = test_doc_ques['Doc_Tokens'].apply(pos_tag)
test_doc_ques['Q_POS'] = test_doc_ques['Q_Tokens'].apply(pos_tag)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/finnmurphy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [67]:
# checking the POS tags: # looks ok
test_doc_ques['Q_POS'][0]

[('how', 'WRB'),
 ('african', 'JJ'),
 ('americans', 'NNS'),
 ('were', 'VBD'),
 ('immigrated', 'VBN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('us', 'PRP')]

In [68]:
# Extract all unique POS Tags
all_pos_tags = train_doc_ques['Doc_POS'].tolist() + test_doc_ques['Doc_POS'].tolist() + train_doc_ques['Q_POS'].tolist() + test_doc_ques['Q_POS'].tolist()

def get_unique_pos(data):
    """Map every distinct tag in data to its rank in sorted order.

    data is an iterable of tagged sequences, each a sequence of (token, tag)
    pairs. Returns {tag: index}, with indices 0..k-1 following sorted tag order.
    """
    distinct_tags = {tag for tagged_tokens in data for _, tag in tagged_tokens}
    return {tag: position for position, tag in enumerate(sorted(distinct_tags))}

pos_iden = get_unique_pos(all_pos_tags) # list of tags
pos_iden

{'#': 0,
 '$': 1,
 "''": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '.': 6,
 ':': 7,
 'CC': 8,
 'CD': 9,
 'DT': 10,
 'EX': 11,
 'FW': 12,
 'IN': 13,
 'JJ': 14,
 'JJR': 15,
 'JJS': 16,
 'MD': 17,
 'NN': 18,
 'NNP': 19,
 'NNPS': 20,
 'NNS': 21,
 'PDT': 22,
 'POS': 23,
 'PRP': 24,
 'PRP$': 25,
 'RB': 26,
 'RBR': 27,
 'RBS': 28,
 'RP': 29,
 'SYM': 30,
 'TO': 31,
 'UH': 32,
 'VB': 33,
 'VBD': 34,
 'VBG': 35,
 'VBN': 36,
 'VBP': 37,
 'VBZ': 38,
 'WDT': 39,
 'WP': 40,
 'WP$': 41,
 'WRB': 42,
 '``': 43}

### NER Tagging

### Steps to run this:

- pip install spacy 
- python -m spacy download en_core_web_sm

If loaded for the first time, restart kernel

In [69]:
# nltk using Spacy
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

# loading pre-trained model of NER
nlp = en_core_web_sm.load()

In [70]:
def ner_tagging(texts):
    """Run the spaCy "ner" pipe over pre-tokenised texts.

    texts is an iterable of token lists. Each list is wrapped in a spaCy Doc
    built on nlp.vocab (so the existing tokenisation is kept) and passed through
    only the "ner" component of nlp. Returns, per text, a list of
    (token text, entity type) pairs.
    """
    def _tag(words):
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        nlp.get_pipe("ner")(doc)  # annotates doc in place
        return [(token.text, token.ent_type_) for token in doc]

    return [_tag(text) for text in texts]

In [71]:
# Will take a while...
train_doc_ques['Doc_NER'] = ner_tagging(train_doc_ques['Doc_Tokens'])
train_doc_ques['Q_NER'] = ner_tagging(train_doc_ques['Q_Tokens'])
test_doc_ques['Doc_NER'] = ner_tagging(test_doc_ques['Doc_Tokens'])
test_doc_ques['Q_NER'] = ner_tagging(test_doc_ques['Q_Tokens'])

In [72]:
# Similar approach to the POS

# Collect the NER-tagged token lists of every document and question (train and test)
all_ner_tags = train_doc_ques['Doc_NER'].tolist() + test_doc_ques['Doc_NER'].tolist() + train_doc_ques['Q_NER'].tolist() + test_doc_ques['Q_NER'].tolist()

def get_unique_ner(data):
    """Map every distinct NER tag in data to its rank in sorted order.

    data is an iterable of tagged sequences, each a sequence of
    (token, ner_tag) pairs. Returns {ner_tag: index} with indices 0..k-1
    assigned in sorted tag order.
    """
    seen = set()
    for tagged_tokens in data:
        seen.update(tag for _, tag in tagged_tokens)
    ordered = sorted(seen)
    return dict(zip(ordered, range(len(ordered))))

# Use the NER helper defined in this cell (it was previously left unused in
# favour of the POS one); the result is a dict mapping each NER tag to its index.
ner_iden = get_unique_ner(all_ner_tags)
ner_iden

{'': 0,
 'CARDINAL': 1,
 'DATE': 2,
 'EVENT': 3,
 'FAC': 4,
 'GPE': 5,
 'LANGUAGE': 6,
 'LAW': 7,
 'LOC': 8,
 'MONEY': 9,
 'NORP': 10,
 'ORDINAL': 11,
 'ORG': 12,
 'PERCENT': 13,
 'PERSON': 14,
 'PRODUCT': 15,
 'QUANTITY': 16,
 'TIME': 17,
 'WORK_OF_ART': 18}

In [73]:
ner_idx = ner_iden.values()
aa = np.eye(max(ner_idx) + 1)
len(aa)

19

### TF-IDF

First, calculate the document frequency of each token, separately for the training corpus and the testing corpus (each corpus is that split's documents plus its questions). The result for each corpus is a dictionary where each token is a key and its value is the document frequency.

In [103]:
def document_frequency(corpus):
    """
    Computes the document frequency for every token in the corpus.

    corpus is an iterable of documents, each an iterable of hashable tokens.
    Returns a dictionary {token: doc_freq, ...} where doc_freq is the number of
    documents that contain the token at least once. Keys are the tokens
    themselves (plain str for str tokens).
    """
    doc_freq = {}
    for document in corpus:
        # each distinct token counts once per document; sorting keeps the
        # insertion order of the result deterministic
        for token in sorted(set(document)):
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq

# Each split's corpus is its documents followed by its questions, so every
# question counts as a document when document frequencies are computed.
# NOTE(review): the test split gets frequencies computed from the test data
# itself rather than reusing the training ones -- confirm that is intended.
train_corpus = train_doc_ques['Doc_Tokens'].tolist() + train_doc_ques['Q_Tokens'].tolist()
test_corpus = test_doc_ques['Doc_Tokens'].tolist() + test_doc_ques['Q_Tokens'].tolist()
train_doc_freq = document_frequency(train_corpus)
test_doc_freq = document_frequency(test_corpus)

Now calculate TF-IDF using the document frequency from above.

In [111]:
from collections import Counter
import math

def compute_tf_idf(corpus, doc_frequency, num_documents=None):
    """
    Computes the term frequency inverse document frequency for every token in every document in the corpus.

    corpus        -- iterable of documents, each a list of tokens
    doc_frequency -- dictionary {token: number of documents containing it}
    num_documents -- optional N for the IDF term; when omitted,
                     len(doc_frequency) is used, as before

    Returns one list per document, aligned with the document's tokens, holding
    tf * idf with tf = count / len(document) and
    idf = log(N / (df + 1)) + 1. Tokens missing from doc_frequency are treated
    as df = 0 instead of raising KeyError. An empty document yields [].
    Raises ValueError (from math.log) if N is not positive while a document has
    tokens.
    """
    # NOTE(review): the default N is the number of distinct tokens in
    # doc_frequency, not a number of documents. Pass num_documents (the size of
    # the corpus doc_frequency was computed from) for a conventional IDF.
    N = len(doc_frequency) if num_documents is None else num_documents
    tf_idf_list = []
    for document in corpus:
        total_num_words = len(document)
        # weights are kept per document so memory does not grow with the corpus
        weights = {}
        for token, count in Counter(document).items():
            tf = count / total_num_words
            df = doc_frequency.get(token, 0)
            idf = math.log(N / (df + 1)) + 1
            weights[token] = tf * idf
        tf_idf_list.append([weights[token] for token in document])
    return tf_idf_list

# per-token TF-IDF weights aligned with Doc_Tokens / Q_Tokens; each split is
# weighted with the document frequencies computed from that same split
train_doc_tf_idf = compute_tf_idf(train_doc_ques['Doc_Tokens'].tolist(), train_doc_freq)
train_q_tf_idf = compute_tf_idf(train_doc_ques['Q_Tokens'].tolist(), train_doc_freq)
test_doc_tf_idf = compute_tf_idf(test_doc_ques['Doc_Tokens'].tolist(), test_doc_freq)
test_q_tf_idf = compute_tf_idf(test_doc_ques['Q_Tokens'].tolist(), test_doc_freq)

(62, 62)

In [99]:
train_doc_ques.head()

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER
0,a partly submerged glacier cave on perito more...,how are glacier caves formed?,"[[2.930701, -0.3543262, 1.688855, -1.1701261, ...","[[-2.889528, -1.7223865, 0.31415024, -1.810862...","[a, partly, submerged, glacier, cave, on, peri...","[how, are, glacier, caves, formed, ?]","[(a, DT), (partly, RB), (submerged, VBN), (gla...","[(how, WRB), (are, VBP), (glacier, JJ), (caves...","[(a, ), (partly, ), (submerged, ), (glacier, )...","[(how, ), (are, ), (glacier, ), (caves, ), (fo..."
1,"in physics , circular motion is a movement of ...",how are the directions of the velocity and for...,"[[-0.1628859, 0.003616946, 0.6113787, 0.151834...","[[-2.889528, -1.7223865, 0.31415024, -1.810862...","[in, physics, ,, circular, motion, is, a, move...","[how, are, the, directions, of, the, velocity,...","[(in, IN), (physics, NNS), (,, ,), (circular, ...","[(how, WRB), (are, VBP), (the, DT), (direction...","[(in, ), (physics, ), (,, ), (circular, ), (mo...","[(how, ), (are, ), (the, ), (directions, ), (o..."
2,apollo creed is a fictional character from the...,how did apollo creed die,"[[0.31325287, 0.7915107, -0.22150037, -0.34180...","[[-2.889528, -1.7223865, 0.31415024, -1.810862...","[apollo, creed, is, a, fictional, character, f...","[how, did, apollo, creed, die]","[(apollo, NNS), (creed, VBP), (is, VBZ), (a, D...","[(how, WRB), (did, VBD), (apollo, VB), (creed,...","[(apollo, ORG), (creed, ), (is, ), (a, ), (fic...","[(how, ), (did, ), (apollo, ORG), (creed, ), (..."
3,"in the united states, the title of federal jud...",how long is the term for federal judges,"[[-0.1628859, 0.003616946, 0.6113787, 0.151834...","[[-2.889528, -1.7223865, 0.31415024, -1.810862...","[in, the, united, states, ,, the, title, of, f...","[how, long, is, the, term, for, federal, judges]","[(in, IN), (the, DT), (united, JJ), (states, N...","[(how, WRB), (long, JJ), (is, VBZ), (the, DT),...","[(in, ), (the, GPE), (united, GPE), (states, G...","[(how, ), (long, ), (is, ), (the, ), (term, ),..."
4,the beretta 21a bobcat is a small pocket-sized...,how a beretta model 21 pistols magazines works,"[[0.5426823, -0.91372156, -0.005410878, 0.4399...","[[-2.889528, -1.7223865, 0.31415024, -1.810862...","[the, beretta, 21a, bobcat, is, a, small, pock...","[how, a, beretta, model, 21, pistols, magazine...","[(the, DT), (beretta, NN), (21a, CD), (bobcat,...","[(how, WRB), (a, DT), (beretta, NN), (model, N...","[(the, ), (beretta, ), (21a, ), (bobcat, ), (i...","[(how, ), (a, ), (beretta, PRODUCT), (model, )..."


In [38]:
def _one_hot_rows(tagged_items, tag_index, one_hot_matrix):
    """Replace every (token, tag) pair by the row of one_hot_matrix for its tag.

    Returns one list per item in tagged_items, each holding one row per pair.
    """
    return [
        [one_hot_matrix[tag_index[pair[1]]] for pair in item]
        for item in tagged_items
    ]


def one_hot_vectorize(pos_tagger, ner_tagger, data): # pass in the unique dict for ner or pos
    """One-hot encode the POS and NER tags of every document and question.

    pos_tagger -- dict {POS tag: index}
    ner_tagger -- dict {NER tag: index}
    data       -- mapping/frame with 'Doc_POS', 'Q_POS', 'Doc_NER' and 'Q_NER'
                  entries, each an iterable of (token, tag) sequences

    Returns the tuple (doc POS, question POS, doc NER, question NER). Each
    element is a list with one entry per item, which is itself a list with one
    one-hot row per token. The row width is the largest index in the
    corresponding dict plus one. A tag missing from its dict raises KeyError.
    """
    pos_ohv = np.eye(max(pos_tagger.values()) + 1) # create the ohv
    ner_ohv = np.eye(max(ner_tagger.values()) + 1)

    dpos_full_ohv = _one_hot_rows(data['Doc_POS'], pos_tagger, pos_ohv)
    qpos_full_ohv = _one_hot_rows(data['Q_POS'], pos_tagger, pos_ohv)
    dner_full_ohv = _one_hot_rows(data['Doc_NER'], ner_tagger, ner_ohv)
    qner_full_ohv = _one_hot_rows(data['Q_NER'], ner_tagger, ner_ohv)

    return(dpos_full_ohv, qpos_full_ohv, dner_full_ohv, qner_full_ohv)


In [39]:
# get the ohv for doc
train_doc_pos_ohv, train_q_pos_ohv, train_doc_ner_ohv, train_q_ner_ohv = one_hot_vectorize(pos_iden, ner_iden, train_doc_ques)
test_doc_pos_ohv, test_q_pos_ohv, test_doc_ner_ohv, test_q_ner_ohv = one_hot_vectorize(pos_iden, ner_iden, test_doc_ques)

In [40]:
train_doc_ques[:5]

Unnamed: 0,Document,Question,Doc_Embeddings,Q_Embeddings,Doc_Tokens,Q_Tokens,Doc_POS,Q_POS,Doc_NER,Q_NER
0,a partly submerged glacier cave on perito more...,how are glacier caves formed?,"[[2.9428787, -0.752085, 0.5050353, -1.7412678,...","[[-3.5760026, -0.99570036, 0.27545172, -1.0006...","[a, partly, submerged, glacier, cave, on, peri...","[how, are, glacier, caves, formed, ?]","[(a, DT), (partly, RB), (submerged, VBN), (gla...","[(how, WRB), (are, VBP), (glacier, JJ), (caves...","[(a, ), (partly, ), (submerged, ), (glacier, )...","[(how, ), (are, ), (glacier, ), (caves, ), (fo..."
1,"in physics , circular motion is a movement of ...",how are the directions of the velocity and for...,"[[-0.6306072, 0.5474009, 1.064857, -1.73173, -...","[[-3.5760026, -0.99570036, 0.27545172, -1.0006...","[in, physics, ,, circular, motion, is, a, move...","[how, are, the, directions, of, the, velocity,...","[(in, IN), (physics, NNS), (,, ,), (circular, ...","[(how, WRB), (are, VBP), (the, DT), (direction...","[(in, ), (physics, ), (,, ), (circular, ), (mo...","[(how, ), (are, ), (the, ), (directions, ), (o..."
2,apollo creed is a fictional character from the...,how did apollo creed die,"[[0.08698449, 0.91935104, -0.29889715, -0.3628...","[[-3.5760026, -0.99570036, 0.27545172, -1.0006...","[apollo, creed, is, a, fictional, character, f...","[how, did, apollo, creed, die]","[(apollo, NNS), (creed, VBP), (is, VBZ), (a, D...","[(how, WRB), (did, VBD), (apollo, VB), (creed,...","[(apollo, ORG), (creed, ), (is, ), (a, ), (fic...","[(how, ), (did, ), (apollo, ORG), (creed, ), (..."
3,"in the united states, the title of federal jud...",how long is the term for federal judges,"[[-0.6306072, 0.5474009, 1.064857, -1.73173, -...","[[-3.5760026, -0.99570036, 0.27545172, -1.0006...","[in, the, united, states, ,, the, title, of, f...","[how, long, is, the, term, for, federal, judges]","[(in, IN), (the, DT), (united, JJ), (states, N...","[(how, WRB), (long, JJ), (is, VBZ), (the, DT),...","[(in, ), (the, GPE), (united, GPE), (states, G...","[(how, ), (long, ), (is, ), (the, ), (term, ),..."
4,the beretta 21a bobcat is a small pocket-sized...,how a beretta model 21 pistols magazines works,"[[0.65172535, -1.1209415, 0.3399397, -0.155244...","[[-3.5760026, -0.99570036, 0.27545172, -1.0006...","[the, beretta, 21a, bobcat, is, a, small, pock...","[how, a, beretta, model, 21, pistols, magazine...","[(the, DT), (beretta, NN), (21a, CD), (bobcat,...","[(how, WRB), (a, DT), (beretta, NN), (model, N...","[(the, ), (beretta, ), (21a, ), (bobcat, ), (i...","[(how, ), (a, ), (beretta, PRODUCT), (model, )..."


In [41]:
# reduce the dataframe to just tokens and embeddings:
doc_emb_train = train_doc_ques[['Doc_Tokens','Doc_Embeddings']]
doc_pos_ner = pd.DataFrame({'Doc_POS':train_doc_pos_ohv,
              'Doc_NER':train_doc_ner_ohv})
doc_emb_train = pd.concat([doc_emb_train, doc_pos_ner], axis=1)

q_emb_train = train_doc_ques[['Q_Tokens','Q_Embeddings']]
q_pos_ner = pd.DataFrame({'Q_POS':train_q_pos_ohv,
              'Q_NER':train_q_ner_ohv})
q_emb_train = pd.concat([q_emb_train, q_pos_ner], axis=1)

In [42]:
doc_emb_test = test_doc_ques[['Doc_Tokens','Doc_Embeddings']]
doc_pos_ner = pd.DataFrame({'Doc_POS':test_doc_pos_ohv,
              'Doc_NER':test_doc_ner_ohv})
doc_emb_test = pd.concat([doc_emb_test, doc_pos_ner], axis=1)

q_emb_test = test_doc_ques[['Q_Tokens','Q_Embeddings']]
q_pos_ner = pd.DataFrame({'Q_POS':test_q_pos_ohv,
              'Q_NER':test_q_ner_ohv})
q_emb_test = pd.concat([q_emb_test, q_pos_ner], axis=1)

### Word Embeddings (Doc and Qn)
- Still have to add TF-IDF.

The embeddings of the documents and questions of the train and test set can be found here:

- Train Document - doc_emb_train
- Train Q - q_emb_train
- Test Document - doc_emb_test
- Test Q - q_emb_test

In [234]:
q_emb_test

Unnamed: 0,Q_Tokens,Q_Embeddings,Q_POS,Q_NER
0,"[how, african, americans, were, immigrated, to...","[[-3.2678661, -1.5818017, 0.79780596, -2.83997...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,"[how, large, were, early, jails]","[[-3.2678661, -1.5818017, 0.79780596, -2.83997...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"[how, a, water, pump, works]","[[-3.2678661, -1.5818017, 0.79780596, -2.83997...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"[how, old, was, sue, lyon, when, she, made, lo...","[[-3.2678661, -1.5818017, 0.79780596, -2.83997...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[how, are, antibodies, used, in]","[[-3.2678661, -1.5818017, 0.79780596, -2.83997...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...
625,"[where, is, the, brisket, from]","[[-0.8713485, 1.0382011, -0.09509377, 0.432915...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
626,"[what, is, arm, chipset]","[[-0.4813437, -3.4788215, -2.8871129, -1.69282...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
627,"[what, is, the, life, span, of, june, bugs]","[[-0.4813437, -3.4788215, -2.8871129, -1.69282...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
628,"[who, is, the, youngest, female, to, give, bir...","[[0.9287824, -1.5731605, 2.5839703, -3.4665534...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [247]:
# find max_length of the document: the longest token list across the train and
# test documents (0 if there are none)
max_len_doc = max(
    (len(tokens)
     for token_column in (doc_emb_train['Doc_Tokens'], doc_emb_test['Doc_Tokens'])
     for tokens in token_column),
    default=0,
)
max_len_doc

1755

In [252]:
# find max_length of question: the longest token list across the train and test
# questions (0 if there are none)
max_len_qn = max(
    (len(tokens)
     for token_column in (q_emb_train['Q_Tokens'], q_emb_test['Q_Tokens'])
     for tokens in token_column),
    default=0,
)
max_len_qn

23

In [279]:
questions = test_doc_ques[-2:]['Question'].values.tolist()

['what', 'is', 'an', 'open', 'mare', '?']