<h3>0. Exploratory analysis</h3>

In [1]:
import sys


In [2]:
import json
import pandas as pd

df_training=pd.read_json('project_files/training.json', encoding = 'utf8')
df_devel=pd.read_json('project_files/devel.json')
df_docs=pd.read_json('project_files/documents.json')
df_testing=pd.read_json('project_files/testing.json')


In [None]:
for item in df_training['text'][:1]:
    print(item)

In [18]:
df_training.head()

Unnamed: 0,answer_paragraph,docid,question,text
0,23,0,A kilogram could be definined as having a Plan...,6966662606895999999♠6.62606896×10−34 j⋅s
1,22,0,What is the shape of the object that establish...,cylinder
2,12,0,What example is given as another paired relati...,time vs. energy
3,1,0,What does the Planck Constant refer to?,quantum of action
4,10,0,When was the first quantized model of the atom...,1913


In [None]:
df_training.dtypes

In [8]:
df_docs.head()

Unnamed: 0,docid,text
0,0,"[First recognized in 1900 by Max Planck, it wa..."
1,1,[Public policy and political leadership helps ...
2,2,"[In a career spanning more than four decades, ..."
3,3,[Anti-aircraft warfare or counter-air defence ...
4,4,[The Endangered Species Act of 1973 (ESA; 16 U...


In [None]:
df_docs.iloc[0]['text']

In [None]:
df_testing.head()

In [None]:
df_devel.head()

<h3>1. Question processing</h3>

In [None]:
from collections import defaultdict
import nltk

def process_question(words):
    ## question type: classify the type of question
    # hand writter rule vs supervised ML - start with rules then we can move to supervised using datasets like.
    
    # default values
    question_type='UNK'
    answer_type='UNK'
    query=''
    
    # handwriten rules
    y_n_question=['is','can','could']
    wh_question=['what','who','where','when']
    
    for word in words:
        word=word.lower()
        if word in y_n_question:
            question_type='YN'
            break
        elif word in wh_question:
            question_type='WH'
            break
        else:
            question_type='UNK'
            
    
    ## focus: strings possible to be replaced in the answer
    # identify with the NER - use NLP taggin from CoreNLP
    
    ## answer type: kind of entity
    # from NER
    
    # just to start, the answer type is the same as question -- DELETE
    answer_type=question_type
    
    ## query: keywords to be used for the IR system to search in documents
    # do we need to do semantic parsing?
    # query should be built according to question type
    
    
    
    return query,answer_type

# process questions
queries=[]
for index, row in df_training.iterrows():
    question_training=nltk.word_tokenize(row['question'])
    query,answer_type=process_question(question_training)
    queries.append((query,answer_type))
    
    

In [None]:
queries[1:4]

NER

In [15]:
df_testing.tail()

Unnamed: 0,docid,id,question
3613,440,3613,Which number of ranks wore purple clothing in ...
3614,440,3614,In what year did nations standardize on red as...
3615,440,3615,In what year was the Community Party of China ...
3616,440,3616,What was Turkey red called in France?
3617,440,3617,What other pigment was Turkey red compared to?


In [3]:
import nltk
tokenized_sentences = [nltk.word_tokenize(sentence.strip()) for sentence in df_training['question']]
print "Done"

Done


In [4]:
print tokenized_sentences[0]

[u'A', u'kilogram', u'could', u'be', u'definined', u'as', u'having', u'a', u'Planck', u'constant', u'of', u'what', u'value', u'?']


In [None]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [5]:
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
print tagged_sentences[0]

[(u'A', 'DT'), (u'kilogram', 'NN'), (u'could', 'MD'), (u'be', 'VB'), (u'definined', 'VBN'), (u'as', 'IN'), (u'having', 'VBG'), (u'a', 'DT'), (u'Planck', 'NNP'), (u'constant', 'NN'), (u'of', 'IN'), (u'what', 'WP'), (u'value', 'NN'), (u'?', '.')]


In [6]:
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
print "Done"

Done


In [None]:
def extract_entity_names(t):
    entity_names = []

    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))

    return entity_names

entity_names = []
entity_name = []
for tree in chunked_sentences:
    # Print results per sentence
    print tree
    print extract_entity_names(tree)
    entity_names.extend(extract_entity_names(tree))
    entity_name.append(extract_entity_names(tree))

# Print all entity names
#print entity_names

# Print unique entity names
# print set(entity_names)

1. Find Keywords
2. Answer types - Using answer type taxonomy
3. Query formulation -> Keywords
4. Go to each document and check the frequency distribution of words and pick the document if one of the query words are present in document. Create a rank with that score
5. Find the paragraphs -> Discard irrelevant paragraphs. Use NE,Keywords, longest exact keywords. Put same weight for now and calculate the score of paragraphs. Rank each of the paragraphs in the document. We have to use the original answer and match the answer type
6. Find candidate answers -> Use supervised ML method
7. Merge candidate answers -> Use NER
8. Pick the best answer -> Logistic regression

<h3>2. Candidate answering generation</h3>

<h4> Get a score for the passage to filter the most relevant passages</h4>


In [93]:
## features relevant to this part
# number of named entities of the right type in the passage
# number of question keywords in the passage
# the longest exact sequence of question keywords
# rank of the document where the passage was extracted
# proximity of the keywords from the original query
# ngram overlap between the passage and the question

First, we will set up useful functions to extract term frequencies to build the vector space model

In [98]:
import nltk
from collections import defaultdict
from collections import Counter
from math import log

stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)
stemmer = nltk.stem.PorterStemmer() 

# get the terms for a passage
def get_terms(passage):
    terms = set()
    for token in nltk.word_tokenize(passage):
        if token not in stopwords: 
            terms.add(stemmer.stem(token.lower()))
    return terms
    
# get document_term 
def get_document_term_passsages(ds_documents):
    document_term={}
    passageID=0
    for index, row in ds_documents.iterrows():
        # every row is a document
        list_of_passages=row['text']
        for passage in list_of_passages:
            terms=get_terms(passage)
            document_term[passageID]=terms
            passageID+=1
        
    return document_term

# get the term frequency
def extract_term_freqs(doc):
    tfs = Counter()
    for token in doc:
        if token not in stopwords: 
            tfs[stemmer.stem(token.lower())] += 1
    return tfs
        
# compute idf
def compute_doc_freqs(doc_term_freqs):
    dfs = Counter()
    for tfs in doc_term_freqs.values():
        for term in tfs.keys():
            dfs[term] += 1
    return dfs
    

In [99]:
# create a document-term matrix
docs=get_document_term_passsages(df_docs.iloc[0:4])

In [100]:
# create a vector space model we need to define a score function
# first I will use tf-idf
doc_term_freqs = {}
for docid, terms in docs.items():
    term_freqs = extract_term_freqs(terms)
    doc_term_freqs[docid] = term_freqs

M = len(doc_term_freqs)

doc_freqs = compute_doc_freqs(doc_term_freqs)

<b>Improvement:</b> Use BM25

Create an inverted index for query processing. Inverted index will not change from query to query. Here we can improve how the weight is defined for the posting list tuple for each term (docid,weight)

In [101]:
## Code from WSTA_N16_information_retrieval
vsm_inverted_index = defaultdict(list)
for docid, term_freqs in doc_term_freqs.items():
    N = sum(term_freqs.values())
    length = 0
    
    # find tf*idf values and accumulate sum of squares 
    tfidf_values = []
    for term, count in term_freqs.items():
        tfidf = float(count) / N * log(M / float(doc_freqs[term]))
        tfidf_values.append((term, tfidf))
        length += tfidf ** 2

    # normalise documents by length and insert into index
    length = length ** 0.5
    for term, tfidf in tfidf_values:
        # note the inversion of the indexing, to be term -> (doc_id, score)
        vsm_inverted_index[term].append([docid, tfidf / length])
        
# ensure posting lists are in sorted order (less important here cf above)
for term, docids in vsm_inverted_index.items():
    docids.sort()


Query the VSM creating a score for each document (passage) and returning the top k

In [102]:
def query_vsm(query, index, k=10):
    accumulator = Counter()
    for term in query:
        postings = index[term]
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator.most_common(k)

## end copied code

Test the query:

In [103]:
results = query_vsm([stemmer.stem(term.lower()) for term in "First recognized in 1900 by Max Planck".split()], vsm_inverted_index)
results

[(0, 0.680116589444509),
 (1, 0.30340274351915997),
 (6, 0.24364584220578492),
 (65, 0.17637606987370624),
 (105, 0.16152465265290256),
 (10, 0.14946948590599873),
 (3, 0.14114017042707586),
 (14, 0.13829184288746416),
 (11, 0.13227012047661246),
 (19, 0.10701743930986322)]

<h3>3. Candidate answering scoring</h3>

<h3>4. Answer and confidence</h3>