In [195]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
import pandas as pd

[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [196]:
# reading in the data
train_data = pd.read_csv('WikiQA-train.tsv', sep='\t')
test_data = pd.read_csv('WikiQA-test.tsv', sep='\t')

Extract the unique questions from the train and test data frames, including the documentID and the DocumentTitle

In [268]:
def get_questions_documenttag(data):
    qd = data[['Question', 'QuestionID', 'DocumentID','DocumentTitle']].drop_duplicates()
    return qd
train_question_doctag = get_questions_documenttag(train_data)
test_question_doctag = get_questions_documenttag(test_data)

In [269]:
# get unique questions
train_questions = train_question_doctag['Question']
test_questions = test_question_doctag['Question']

In [270]:
# get the unique document ids
train_docid = train_question_doctag['DocumentID']
test_docid = test_question_doctag['DocumentID']

Extract the answers to those questions.

In [328]:
def get_answers(data, questions, documentids): #(done by Finn, tweaked by Dan)
    answers = [] # list of answers
    for q in range(len(questions)):
        question = questions.iloc[q]
        doc_id = documentids.iloc[q] # add the document id
        df = data[data['Question'] == question]
        index = df.loc[df['Label'] == 1]['Sentence'].index.values
        if len(index) == 0: # if no answer found
            answers.append([question, doc_id, 'No answer'])
        elif len(index) == 1: # if 1 answer found
            answers.append([question, doc_id, df.loc[index[0], "Sentence"]])
        elif len(index) > 1: # if more than 1 answer found
            for item in range(len(index)):
                answers.append([question, doc_id, df.loc[index[item], "Sentence"]])
    return answers

train_answers = pd.DataFrame(get_answers(train_data, train_questions, train_docid))
test_answers = pd.DataFrame(get_answers(test_data, test_questions, test_docid))

The above get_answers returns train_answers and test_answers which, gives us in the following columns
- Question
- Related Document ID
- Answer (if no answer to that question, return no answer)

In [329]:
def get_documents(data, questions, documentids): # (done by Finn, tweaked by Dan)
    documents = []
    for q in range(len(questions)):
        question = questions.iloc[q]
        doc_id = documentids.iloc[q] # add the document id
        df = data[data['Question'] == question]
        sentences = df['Sentence'].tolist()
        for i in range(0, len(sentences) - 1):
            sentences[i] = sentences[i] + ' '
        documents.append([doc_id,''.join(sentences)])
    return documents

train_documents = pd.DataFrame(get_documents(train_data, train_questions, train_docid)) # return the individual document in list
test_documents = pd.DataFrame(get_documents(test_data, test_questions, test_docid)) # return the individual document in list

The above train_documents and test_documents called from the get_documents gives us in the following columns
- Document ID
- Full Document

In [334]:
# renaming all the columns for more standardised access
train_answers.columns = ['Question','DocumentID','Answer']
test_answers.columns = ['Question','DocumentID','Answer']
train_documents.columns = ['DocumentID','Document']
test_documents.columns = ['DocumentID','Document']

In [337]:
# result is 2284, 2117, 680, 630
# this is because there are questions with multiple answers
len(train_answers),len(train_documents), len(test_answers),len(test_documents)

(2284, 2117, 680, 630)

In [442]:
train_documents # for ref

Unnamed: 0,DocumentID,Document
0,D1,A partly submerged glacier cave on Perito More...
1,D2,"In physics , circular motion is a movement of ..."
2,D5,Apollo Creed is a fictional character from the...
3,D6,"In the United States, the title of federal jud..."
4,D7,The Beretta 21A Bobcat is a small pocket-sized...
...,...,...
2112,D2805,Blue Mountain State is an American comedy seri...
2113,D2806,"Apple Inc., formerly Apple Computer, Inc., is ..."
2114,D2807,Section 8 housing in the South Bronx Section 8...
2115,D2808,Restaurants categorized by type and informatio...


In [444]:
train_answers # for ref

Unnamed: 0,Question,DocumentID,Answer
0,how are glacier caves formed?,D1,A glacier cave is a cave formed within the ice...
1,How are the directions of the velocity and for...,D2,No answer
2,how did apollo creed die,D5,No answer
3,how long is the term for federal judges,D6,No answer
4,how a beretta model 21 pistols magazines works,D7,No answer
...,...,...,...
2279,When was Apple Computer founded,D2806,"The company was founded on April 1, 1976, and ..."
2280,what is section eight housing,D2807,"Section 8 of the Housing Act of 1937 (), often..."
2281,what is section eight housing,D2807,"It operates through several programs, the larg..."
2282,what is the main type of restaurant,D2808,No answer


**Prior to tagging, we should maybe clean the document and answers first:** (stopped here)

Maybe? 
- lowercase
- removing any punctuation or weird symbols
- removal of stop words? (probably not)

Make sure that the pre-processing is standardised to be the same throughout doc and ans.

In [438]:
# Method 1: Indivually tag the document (done by Finn, tweaked by Dan)

def labelling(documents, answers):
    tagged_documents = []
    for q in range(len(answers)):
        tagged_document = []
        qn = (answers['Question'].loc[q])
        doc_id = answers['DocumentID'].loc[q]
        content = documents.loc[documents['DocumentID'] == doc_id,'Document'].values[0]
        answer = answers['Answer'].loc[q]
        # if there is no answer to the question
        if answer == 'No answer':
            tokens = word_tokenize(content)
            for j in range(len(tokens)):
                tagged_document.append('N') # none 
        else:
            parts = content.partition(answer)
            for j in range(len(parts)):
                tokens = word_tokenize(parts[j])
                if j == 1:
                    tagged_document.append('S') # start of answer
                    for k in range(len(tokens) - 2):
                        tagged_document.append('I') # inside of answer
                    tagged_document.append('E') # end of answer
                else:
                    for k in range(len(tokens)):
                        tagged_document.append('N') # outside answer
        tagged_documents.append(tagged_document)
    return tagged_documents

train_doc_ans_labels = labelling(train_documents, train_answers)

In [440]:
def testing_tokens(ind, labels, documents, answers):
    tokens = labels[ind]
    doc_id = answers.loc[ind][1]
    content = documents.loc[documents['DocumentID'] == doc_id,'Document'].values[0]
    # still working on this
    return(content)

testing_tokens(28, train_doc_ans_labels, train_documents, train_answers)

'Francis Albert "Frank" Sinatra, , (December 12, 1915 – May 14, 1998) was an American singer and film actor. Beginning his musical career in the swing era with Harry James and Tommy Dorsey , Sinatra found unprecedented success as a solo artist from the early to mid-1940s after being signed to Columbia Records in 1943. Being the idol of the " bobby soxers ", he released his first album, The Voice of Frank Sinatra in 1946. His professional career had stalled by the 1950s, but it was reborn in 1953 after he won the Academy Award for Best Supporting Actor for his performance in From Here to Eternity . He signed with Capitol Records in 1953 and released several critically lauded albums (such as In the Wee Small Hours , Songs for Swingin\' Lovers! , Come Fly with Me , Only the Lonely and Nice \'n\' Easy ). Sinatra left Capitol to found his own record label, Reprise Records in 1961 (finding success with albums such as Ring-a-Ding-Ding! , Sinatra at the Sands and Francis Albert Sinatra & Anton