<h3>0. Exploratory analysis</h3>

In [288]:
import json
import pandas as pd

#df_training=pd.read_json('project_files/training.json', encoding = 'utf8')
df_devel=pd.read_json('project_files/devel.json')
df_docs=pd.read_json('project_files/documents.json')
df_testing=pd.read_json('project_files/testing.json')

df_training=pd.read_pickle('project_files/df_training.pkl')

1. Find Keywords
2. Answer types - Using answer type taxonomy
3. Query formulation -> Keywords
4. Go to each document and check the frequency distribution of words and pick the document if one of the query words are present in document. Create a rank with that score
5. Find the paragraphs -> Discard irrelevant paragraphs. Use NE,Keywords, longest exact keywords. Put same weight for now and calculate the score of paragraphs. Rank each of the paragraphs in the document. We have to use the original answer and match the answer type
6. Find candidate answers -> Use supervised ML method
7. Merge candidate answers -> Use NER
8. Pick the best answer -> Logistic regression

<h3>1. Question processing</h3>

Configuring Stanford CoreNLP . Link -> https://blog.manash.me/configuring-stanford-parser-and-stanford-ner-tagger-with-nltk-in-python-on-windows-f685483c374a

In [269]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tag.stanford import CoreNLPNERTagger
from itertools import groupby

stopwords = set(nltk.corpus.stopwords.words('english')) 
stemmer = nltk.stem.PorterStemmer() 

def get_Name_Entity_NLTK(data):
    results=[]
    for sentence in data:
        ne_chunked_sents = ne_chunk(pos_tag(word_tokenize(sentence)))
        result = []

        for tagged_tree in ne_chunked_sents:

            if hasattr(tagged_tree, 'label'):
                entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) #
                entity_type = tagged_tree.label() # get NE category
                result.append((entity_name, entity_type))
        results.append(result)

    return results

def get_Name_Entity_Sentence(sentence):
    st = CoreNLPNERTagger(url='http://localhost:9000')
    tokenized_text = nltk.word_tokenize(sentence)
    classified_text = st.tag(tokenized_text)
    result = {}
    
    for res in classified_text:
        if len(res) > 0:
            for tag, chunk in groupby(classified_text, lambda x:x[1]):
               if tag != "O":
                    word = " ".join(w for w, t in chunk)
                    result[word.lower()] = tag
    
    
    return result

def get_Name_Entity_paragraph(paragraph):
    result = []
    tokenized_sentence = nltk.sent_tokenize(paragraph)
    for sentence in tokenized_sentence:
        result.append(get_Name_Entity_Sentence(sentence))
    
    return result

def get_Name_Entity_StanfordCoreNLP(data):
    st = CoreNLPNERTagger(url='http://localhost:9000')
    results=[]
    for sentence in data:
        tokenized_text = nltk.word_tokenize(sentence)
        classified_text = st.tag(tokenized_text)
        result = []
        for tag, chunk in groupby(classified_text, lambda x:x[1]):
            if tag != "O":
                word = " ".join(w for w, t in chunk)
                result.append((word.lower(),tag))
       
        results.append(result)
        
    return results

def addNameEntity(df,feature,func):
    if 'NE'+"_"+feature in df:
        df = df.drop('NE'+"_"+feature, axis=1)
    df["NE"+"_"+feature] = func(df[feature])
    
    return df

In [270]:
#df_training = addNameEntity(df_training,"question",get_Name_Entity_StanfordCoreNLP)
#df_training = addNameEntity(df_training,"text",get_Name_Entity_StanfordCoreNLP)

Get Keywords

In [271]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
POS = set(["NN","NNS","NNP","NNPS","CD","JJ","VB","VBD","VBG","VBN","VBP","VBZ"]) 

stopwords = set(nltk.corpus.stopwords.words('english')) 
stemmer = nltk.stem.PorterStemmer() 

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,wn.NOUN)
    if (lemma == word):
        lemma = lemmatizer.lemmatize(word,wn.VERB)
        
    return lemma

def get_keyword(data):
    result = []
    sentence=data
    tokenized_text = tokenizer.tokenize(sentence)
    tagged = nltk.pos_tag(tokenized_text)
    for text,pos in tagged:
        text = lemmatize(text.lower())
        if text not in stopwords:
            if pos in POS:
                result.append(text)
                
    return result

def get_keyword_paragraph(data):
    results=[]
    tokenized_sentence = nltk.sent_tokenize(data)
    for sentence in tokenized_sentence:
        result = get_keyword(sentence)
        results.append(result)
        
    return results

def get_keyword_all(data):
    results=[]
    for sentence in data:
        result = get_keyword(sentence)
        results.append(result)
        
    return results

def add_keywords(df,feature):
    if 'keywords'+"_"+feature in df:
        df = df.drop('keywords'+"_"+feature, axis=1)
    df['keywords'+"_"+feature]=get_keyword_all(df[feature])
    return df

def get_number_of_common_kewyords(question_keywords,answer_sentence_keywords):
    sum_keywords=0
    for qkey in question_keywords:
        if qkey in answer_sentence_keywords:
            sum_keywords+=1
    
    return sum_keywords

<h4>Train a classifier</h4>

In [272]:
import nltk
def get_answer_features(paragraph,answer,ner_answer,ner_paragraph):
    dict_answer_ner={}
    for ner in ner_answer:
        dict_answer_ner[ner[0]]=ner[1]
    
    dict_answer_sentence_ner={}
    ner_paragraph_list=[]
    for ner_list in ner_paragraph:
        for ner in ner_list:
            dict_answer_ner[ner[0]]=ner[1]
        ner_paragraph_list.append(dict_answer_ner)
            
    
    sents_passage = nltk.sent_tokenize(paragraph)
    answer_sentence_ner={'UNKNOWN':'UNKNOWN'}
    answer_found='UNKNOWN'
    answer_sentence_keywords=[]
    common_entities=tuple()
    for sentence_index in range(len(sents_passage)):
        if answer.lower() in sents_passage[sentence_index].lower():
            answer_found=sents_passage[sentence_index]
            dict_answer_sentence_ner=ner_paragraph_list[sentence_index]
            common_entities = set(dict_answer_sentence_ner.items()) & set(dict_answer_ner.items())
            
            break
    
    return answer_found,dict_answer_sentence_ner,common_entities

In [293]:
from collections import defaultdict

# get the training data set as we need it
answer_type={}
def prepare_training_data():
    
    #df_training['answer_type']='UNKNOWN'
    df_training['answer_found']='UNKNOWN'
    #df_training['common_entities']='UNKNOWN'
    #df_training['answer_sentence_ner']='UNKNOWN'
    for index, row in df_training.iterrows():
        question=row['question']
        raw_answer=row['text']
        
        paragraph=df_docs.iloc[row['docid']]['text'][row['answer_paragraph']]
        
        answer_found,dict_answer_sentence_ner,common_entities=get_answer_features(paragraph,raw_answer,row['NE_text'],row['NE_paragraph'])
        
        #if (len(common_entities)>0):
        #    answer_type=list(common_entities)[0][1]
            
        #else:
        #    answer_type='UNKNOWN'
            
        #df_training.at[index,'answer_type']=answer_type
        df_training.at[index,'answer_found']=answer_found
        #df_training.at[index,'answer_sentence_ner']=dict_answer_sentence_ner
        #df_training.at[index,'common_entities']=common_entities
        
        
    
        
        

In [294]:
prepare_training_data()

In [275]:
#pd.set_option('display.max_colwidth', -1)
#df_training[['question','text','answer_found','answer_sentence_ner','answer_type','NE_text','NE_question','common_entities']][0:10]

In [301]:
# BOW extraction for passages and questions


def get_passages_bow(passages):
    passage_bow={}
    for passage in passages:
        for token in nltk.word_tokenize(passage):
            if token not in stopwords: 
                word=stemmer.stem(token.lower())
                passage_bow[word] = passage_bow.get(word, 0) +  1
    
    return passage_bow

def get_sentences_bow(sentences):
    sentence_bow={}
    
    for sentence in sentences:
        for token in nltk.word_tokenize(sentence):
            if token not in stopwords:
                word=stemmer.stem(token.lower())
                sentence_bow[word] = sentence_bow.get(word, 0) +  1
    
    return sentence_bow

def get_question_bow(question):
    question_bow={}
    for token in nltk.word_tokenize(question):
        if token not in stopwords: 
            word=stemmer.stem(token.lower())
            question_bow[word] = question_bow.get(word, 0) +  1
                
    return question_bow

def get_training_question_bow(question,keywords,qt):
    question_bow={}
    question_bow[qt]=1
    for token in nltk.word_tokenize(question):
        if token not in stopwords: 
            word=stemmer.stem(token.lower())
            if word in keywords:
                question_bow[word] = question_bow.get(word, 0) +  1
                
    return question_bow

In [286]:
df_training.answer_type.unique()

array(['NUMBER', 'DATE', 'PERSON', 'IDEOLOGY', 'MISC', 'RELIGION',
       'UNKNOWN', 'ORGANIZATION', 'LOCATION', 'COUNTRY', 'TITLE',
       'PERCENT', 'DURATION', 'MONEY', 'CAUSE_OF_DEATH',
       'STATE_OR_PROVINCE', 'CITY', 'NATIONALITY', 'ORDINAL',
       'CRIMINAL_CHARGE', 'TIME', 'SET'], dtype=object)

In [304]:
def filter_questions():
    questions=[]
    questions_not_found=[]
    answers_not_found=[]
    classes=[]
    for index, row in df_training.iterrows():
        if len(row['common_entities'])>0 and row['answer_type']!='UNKNOWN':
            questions.append(row['question'])
            classes.append(row['answer_type'])
        else:
            questions_not_found.append(row['question'])
            answers_not_found.append(row['text'])
            
    return questions,classes,questions_not_found,answers_not_found
        

def get_feature_questions(questions, keywords,qt):
    qs = []
    for i,question in enumerate(questions):
        q_bow = get_training_question_bow(question,keywords,qt[i])
        qs.append(q_bow)
        
    return qs

In [289]:
question_learning_dataset = df_training[df_training.answer_type.notnull()]


24982


In [278]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def check_results(predictions, classifications):
    print("Accuracy:")
    print(accuracy_score(classifications,predictions))
    print(classification_report(classifications,predictions))

In [305]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer


    
# get the most common words from answer sentences (we can twek this for paragraph)
answer_sentences_bow=get_sentences_bow(df_training['answer_found'])
answer_keywords = set([word for word, count in answer_sentences_bow.items()])

#filter questions with common entities
#questions,classes,questions_not_found,answers_not_found=filter_questions()


#qs_training=get_feature_questions(questions,answer_keywords)
qs_training=get_feature_questions(list(question_learning_dataset.question),answer_keywords,list(question_learning_dataset.question_type))




In [306]:
if (len(qs)>0 and len(classes)>0):
    # fit vectorizer
    vectorizer = DictVectorizer()
    
    X_train_dtm = vectorizer.fit_transform(qs_training)
    
    # tag the answers
    # fit a logistic regression model to the data 
    # build classifier
    model = MultinomialNB(2, False, None)

    # train the model using X_train_dtm 
    model.fit(X_train_dtm, list(question_learning_dataset.answer_type))
    
    y_predicted_class = model.predict(X_train_dtm)
    
    check_results(y_predicted_class,list(question_learning_dataset.answer_type))

Accuracy:
0.7229205027619886
                   precision    recall  f1-score   support

   CAUSE_OF_DEATH       0.93      0.08      0.15       327
             CITY       0.00      0.00      0.00        12
          COUNTRY       0.80      0.39      0.52      1058
  CRIMINAL_CHARGE       0.00      0.00      0.00        64
             DATE       0.69      0.97      0.80      5801
         DURATION       0.94      0.16      0.28       464
         IDEOLOGY       1.00      0.06      0.11       232
         LOCATION       0.70      0.85      0.77      1738
             MISC       1.00      0.01      0.01       133
            MONEY       0.98      0.21      0.34       462
      NATIONALITY       0.73      0.26      0.39       858
           NUMBER       0.72      0.89      0.79      4644
          ORDINAL       0.93      0.07      0.12       406
     ORGANIZATION       0.88      0.20      0.33       496
          PERCENT       0.85      0.55      0.67       751
           PERSON       0.

  'precision', 'predicted', average, warn_for)


In [300]:
X_train_dtm.shape

(24982, 13435)

<h3>2. Candidate answering generation</h3>

<h4> Get a score for the passage to filter the most relevant passages</h4>


In [242]:
## features relevant to this part
# number of named entities of the right type in the passage
# number of question keywords in the passage
# the longest exact sequence of question keywords
# rank of the document where the passage was extracted
# proximity of the keywords from the original query
# ngram overlap between the passage and the question

First, we will set up useful functions to extract term frequencies to build the vector space model

In [309]:
import nltk
from collections import defaultdict
from collections import Counter
from math import log

stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)
stemmer = nltk.stem.PorterStemmer() 

# get the terms for a passage
def get_terms(passage):
    terms = set()
    for token in nltk.word_tokenize(passage):
        if token not in stopwords: 
            terms.add(stemmer.stem(token.lower()))
    return terms
    
# get document_term 
def get_document_term_passsages(ds_documents):
    document_term={}
    passageID=0
    for index, row in ds_documents.iterrows():
        passageID=0
        terms={}
        # every row is a document
        list_of_passages=row['text']
        for passage in list_of_passages:
            terms[passageID]=get_terms(passage)
            passageID+=1
            
        document_term[row['docid']]=terms
    return document_term

# get the term frequency
def extract_term_freqs(doc):
    tfs = Counter()
    for token in doc:
        if token not in stopwords: 
            tfs[stemmer.stem(token.lower())] += 1
    return tfs
        
# compute idf
def compute_doc_freqs(doc_term_freqs):
    doc_dic = {}
    for key, value in doc_term_freqs.items():
        dfs = Counter()
        for passage_id,tfs in value.items():
            for term in tfs.keys():
                dfs[term] += 1
        doc_dic[key] = dfs
        
    return doc_dic
    

In [310]:
# create a document-term matrix
docs=get_document_term_passsages(df_docs)
#docs

In [311]:
# create a vector space model we need to define a score function
# first I will use tf-idf
doc_term_freqs = {}
for docid,dic_passages in docs.items():
    passage_dic = {}
    for passage_id, terms in dic_passages.items():
        term_freqs = extract_term_freqs(terms)
        passage_dic[passage_id] = term_freqs
    doc_term_freqs[docid] = passage_dic

doc_freqs = compute_doc_freqs(doc_term_freqs)


In [312]:
#doc_term_freqs

<b>Improvement:</b> Use BM25

Create an inverted index for query processing. Inverted index will not change from query to query. Here we can improve how the weight is defined for the posting list tuple for each term (docid,weight)

In [313]:
def count_words(freqs):
    p_count=0
    for counter in freqs.values():
        p_count+=sum(counter.values())
    
    #print(p_count)
    return p_count

In [314]:
## Code from WSTA_N16_information_retrieval
vsm_inverted_index_all = defaultdict()
for docid, passage_freqs in doc_term_freqs.items():
    vsm_inverted_index = defaultdict(list)
    
    #N = sum(passage_freqs.values())
    N = count_words(passage_freqs)
    #print(N,passage_freqs)
    for passage_id, term_freqs in passage_freqs.items():
        length = 0
        # find tf*idf values and accumulate sum of squares 
        tfidf_values = []
        M = len(passage_freqs)
        for term, count in term_freqs.items():
            tfidf = float(count) / N * log(M / float(doc_freqs[docid][term]))
            tfidf_values.append((term, tfidf))
            length += tfidf ** 2

        # normalise documents by length and insert into index
        length = length ** 0.5
        for term, tfidf in tfidf_values:
            # note the inversion of the indexing, to be term -> (doc_id, score)
            vsm_inverted_index[term].append([passage_id, tfidf / length])
    vsm_inverted_index_all[docid] = vsm_inverted_index

# ensure posting lists are in sorted order (less important here cf above)
for key, value in vsm_inverted_index_all.items():
    for term, docids in value.items():
        docids.sort()


In [317]:
import pickle

def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
save_obj(vsm_inverted_index_all,'index')

Query the VSM creating a score for each document (passage) and returning the top k

In [250]:
# get a list of paragraphs ordered by relevance on the question
def query_vsm(query, index):
    accumulator = Counter()
    for term in query:
        postings = index[term]
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator

## end copied code

<h3>3. Candidate answering scoring</h3>

In [331]:
df_result=pd.DataFrame(columns=['id','answer'])
df_devel=df_devel.iloc[0:20]

In [332]:

for index, row in df_devel.iterrows():
    question=row['question']
    docid=row['docid']
    expected_answer=row['text']
    #print('Question: ',question)
    #print('Expected Answer:',expected_answer)
    #print('Docid:',docid)
    question_keywords=get_keyword(question)
    
    # get the most relevant documents for the question
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10) 
    #print('Top 10 paragraphs: ',documents_ranked)
    q_bow=get_question_bow(question)
    x = vectorizer.transform(q_bow)
    answer_type=model.predict(x)
    #print('Predicted answer type: ',answer_type)
    
    candidate_passages={}
    list_of_passages=[]
    answer=''
    if len(documents_ranked)>0:
        for document in documents_ranked:
            # perform a paragraph segmentation
            paragraph=df_docs.iloc[docid]['text'][document[0]]
            passages = nltk.sent_tokenize(paragraph)
            for passage in passages:
                list_of_passages.append(passage)



        ## PARAMETERS TO GET FROM TESTING DATASET AND USE A MODEL TO GET THE ANSWER PASSAGE CANDIDATES. 
        #question= df_training.loc[(df_training["docid"] == docid_query) & (df_training["answer_paragraph"] ==document[0] ),"question"][0]
        #answer_type=df_training.loc[(df_training["docid"] == docid_query) & (df_training["answer_paragraph"] ==document[0] ),"answer_type"][0]
        #print(question)
        #print(answer_type) 
        #print(sorted(get_keyword(question)))
        ###

        ## FOR NOW USING KEYWORDS AND GET JUST ONE DEFINITE ANSWER PASSAGE CANDIDATE
        indexPassage=0
        for indexPassage in range(len(list_of_passages)):
            NER_passage=get_Name_Entity_Sentence(list_of_passages[indexPassage])
            for entity in NER_passage.items():
                if (entity[1]==answer_type):
                    candidate_passages[indexPassage]=get_number_of_common_kewyords(get_keyword(question),get_keyword(list_of_passages[indexPassage]))
                    break


        if len(candidate_passages)>0:
            best_candidate_passage=list_of_passages[max(candidate_passages, key=candidate_passages.get)]
        else:
            if len(list_of_passages)>0:
                best_candidate_passage=list_of_passages[0]
        #print("Candidate Passage Answer:")
        #print(best_candidate_passage)

        NER_answer_passage=get_Name_Entity_Sentence(best_candidate_passage)
        for entity in NER_answer_passage.items():
                if (entity[1]==answer_type):
                    answer=entity[0]

        #print('Predicted answer:',answer)
        df_result.loc[len(df_result)]=[index,answer]
    break
    
df_result.to_csv('prediction/output.csv',index=False)
    
    

In [333]:
df_testing.head()

Unnamed: 0,docid,id,question
0,410,0,Modern browser support standards-based and defacto what?
1,410,1,What do people typically call a web browser?
2,410,2,What is it called when content is changed from markup to an interactive document?
3,410,3,What platform is a browser used on?
4,410,4,When was Firefox released?


In [None]:
for index, row in df_testing.iterrows():
    question=row['question']
    docid=row['docid']
    ida=row['id']
    #print('Question: ',question)
    #print('Expected Answer:',expected_answer)
    #print('Docid:',docid)
    question_keywords=get_keyword(question)
    
    # get the most relevant documents for the question
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10) 
    #print('Top 10 paragraphs: ',documents_ranked)
    q_bow=get_question_bow(question)
    x = vectorizer.transform(q_bow)
    answer_type=model.predict(x)
    #print('Predicted answer type: ',answer_type)
    
    candidate_passages={}
    list_of_passages=[]
    answer=''
    if len(documents_ranked)>0:
        for document in documents_ranked:
            # perform a paragraph segmentation
            paragraph=df_docs.iloc[docid]['text'][document[0]]
            passages = nltk.sent_tokenize(paragraph)
            
            for passage in passages:
                list_of_passages.append(passage)



        ## PARAMETERS TO GET FROM TESTING DATASET AND USE A MODEL TO GET THE ANSWER PASSAGE CANDIDATES. 
        #question= df_training.loc[(df_training["docid"] == docid_query) & (df_training["answer_paragraph"] ==document[0] ),"question"][0]
        #answer_type=df_training.loc[(df_training["docid"] == docid_query) & (df_training["answer_paragraph"] ==document[0] ),"answer_type"][0]
        #print(question)
        #print(answer_type) 
        #print(sorted(get_keyword(question)))
        ###

        ## FOR NOW USING KEYWORDS AND GET JUST ONE DEFINITE ANSWER PASSAGE CANDIDATE
        indexPassage=0
        for indexPassage in range(len(list_of_passages)):
            NER_passage=get_Name_Entity_Sentence(list_of_passages[indexPassage])
            for entity in NER_passage.items():
                if (entity[1]==answer_type):
                    candidate_passages[indexPassage]=get_number_of_common_kewyords(get_keyword(question),get_keyword(list_of_passages[indexPassage]))
                    break


        if len(candidate_passages)>0:
            best_candidate_passage=list_of_passages[max(candidate_passages, key=candidate_passages.get)]
        else:
            if len(list_of_passages)>0:
                best_candidate_passage=list_of_passages[0]
        #print("Candidate Passage Answer:")
        #print(best_candidate_passage)

       
        NER_answer_passage=get_Name_Entity_Sentence(best_candidate_passage)
        for entity in NER_answer_passage.items():
                if (entity[1]==answer_type):
                    answer=entity[0]
    
    #print('Predicted answer:',answer)
    print(ida)
    df_result.loc[len(df_result)]=[ida,answer]
    
    

    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89


In [None]:
print("done")

In [324]:
df_result.to_csv('prediction/output.csv',index=False)

Unnamed: 0,id,answer
0,0,"june 16 , 1911"
