<h3>0. Exploratory analysis</h3>

In [7]:
# tools
import pickle

def save_obj(obj, name ):
    """Pickle ``obj`` to ``obj/<name>.pkl`` (relative to the working directory).

    Parameters:
        obj: any picklable Python object.
        name: file stem; the object is written to ``obj/<name>.pkl``.

    The ``obj`` directory is created when missing, so the first save on a
    fresh checkout no longer fails with ``FileNotFoundError``.
    """
    import os  # local import: keeps this helper self-contained

    os.makedirs('obj', exist_ok=True)
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Read back the object stored in ``obj/<name>.pkl``.

    Parameters:
        name: file stem of the pickle inside the ``obj`` directory.

    Returns:
        The unpickled object.
    """
    pickle_path = 'obj/' + name + '.pkl'
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [8]:
import json
import pandas as pd
import spacy

# spaCy pipeline; used further down for question/sentence similarity.
nlp = spacy.load('en_core_web_md')

# Project datasets, read from the project_files directory.
df_devel=pd.read_json('project_files/devel.json')
df_docs=pd.read_json('project_files/documents.json')
df_testing=pd.read_json('project_files/testing.json')

# Training frame comes from a pickle rather than JSON.
# NOTE(review): presumably a pre-processed version of the training set that
# already carries answer_type / question_type / answer_found columns - confirm.
df_training=pd.read_pickle('project_files/df_training.pkl')


# Keep only the training rows that have an answer type assigned.
question_learning_dataset = df_training[df_training.answer_type.notnull()]

# Cached named-entity annotations, indexed below as
# NER_corpus[docid][paragraph_id][sentence_index].
NER_corpus=load_obj('ner_corpus')

In [9]:
from nltk.tokenize.punkt import PunktSentenceTokenizer,PunktTrainer

# Sentence tokenizer restored from obj/punk_tokenizer.pkl.
# NOTE(review): presumably a trained PunktSentenceTokenizer - confirm.
tokenizer = load_obj('punk_tokenizer')
# Register extra abbreviations on the tokenizer's parameters.
tokenizer._params.abbrev_types.add('ii')
tokenizer._params.abbrev_types.add('dr')

# Wh-words checked first by get_question_type (all lowercase).
questionwords = set(["who", "what", "where", "when", "why", "how", "whose", "which", "whom","whats","what's","whos"])
# Fallback cue words, only consulted when no entry of questionwords matched.
# NOTE(review): get_question_type compares single tokens, so the multi-word
# entry "give an example" looks unreachable - confirm.
passiveQuestions = set(["can", "could", "would", 
                   "was", "were","am","is", "are", "will","shall",
                   "did","do","does",
                   "had", "have","has",
                   "as","that","in",
                   "give an example","name"])


1. Find Keywords
2. Answer types - Using answer type taxonomy
3. Query formulation -> Keywords
4. Go to each document, check the frequency distribution of its words, and pick the document if any of the query words is present in it. Rank the documents by that score.
5. Find the paragraphs -> Discard irrelevant paragraphs. Use named entities (NE), keywords, and the longest exact keyword sequence. Give all features the same weight for now and calculate a score for each paragraph. Rank the paragraphs in the document. We have to use the original answer and match the answer type.
6. Find candidate answers -> Use supervised ML method
7. Merge candidate answers -> Use NER
8. Pick the best answer -> Logistic regression

<h3>1. Question processing</h3>

Configuring Stanford CoreNLP: see https://blog.manash.me/configuring-stanford-parser-and-stanford-ner-tagger-with-nltk-in-python-on-windows-f685483c374a

In [10]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tag.stanford import CoreNLPNERTagger
from itertools import groupby

# English stop-word list as a set; the same assignment is repeated in later cells.
stopwords = set(nltk.corpus.stopwords.words('english')) 


def get_Name_Entity_NLTK(data):
    """Extract named entities from each sentence with NLTK's ``ne_chunk``.

    Parameters:
        data: iterable of sentence strings.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(entity_text, entity_label)`` tuples in order of appearance.
    """
    def _entities_of(sentence):
        chunk_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
        # Only subtrees expose ``label``; plain (word, tag) leaves are skipped.
        return [
            (' '.join(leaf[0] for leaf in node.leaves()), node.label())
            for node in chunk_tree
            if hasattr(node, 'label')
        ]

    return [_entities_of(sentence) for sentence in data]

def get_Name_Entity_Sentence(sentence):
    """Tag one sentence with the CoreNLP NER server at http://localhost:9000.

    Consecutive tokens sharing the same tag are merged into a single entity.

    Parameters:
        sentence: the sentence text.

    Returns:
        List of ``(lowercased_entity_text, tag)`` tuples; tokens tagged
        ``"O"`` are left out.
    """
    tagger = CoreNLPNERTagger(url='http://localhost:9000')
    tagged_tokens = tagger.tag(nltk.word_tokenize(sentence))

    entities = []
    for label, run in groupby(tagged_tokens, key=lambda pair: pair[1]):
        if label == "O":
            continue
        phrase = " ".join(token for token, _ in run)
        entities.append((phrase.lower(), label))
    return entities


def addNameEntity(df,feature,func):
    """Attach a ``NE_<feature>`` column computed by ``func(df[feature])``.

    Parameters:
        df: DataFrame holding the ``feature`` column.
        feature: name of the source column.
        func: callable that receives the column and returns the new values.

    Returns:
        The frame carrying the (re)computed ``NE_<feature>`` column. An
        existing column of that name is dropped first.
    """
    target_column = 'NE' + "_" + feature
    already_present = target_column in df
    if already_present:
        df = df.drop(target_column, axis=1)
    entities = func(df[feature])
    df[target_column] = entities
    return df

In [11]:
def get_question_type(question):
    """Return the cue word that characterises ``question``.

    ``questionwords`` is consulted first and ``passiveQuestions`` only when
    no question word occurs. Within a vocabulary the LAST matching token
    wins. Matching is exact and case-sensitive on the NLTK tokens.

    Parameters:
        question: the question text.

    Returns:
        The matching token, or ``'other'`` when nothing matches.
    """
    tokens = nltk.word_tokenize(question)
    for vocabulary in (questionwords, passiveQuestions):
        hits = [token for token in tokens if token in vocabulary]
        if hits:
            return hits[-1]
    return 'other'

Get Keywords

In [12]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

# WordNet lemmatizer shared by lemmatize().
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Part-of-speech tags that get_keyword accepts as keywords.
POS = set(["NN","NNS","NNP","NNPS","CD","JJ","VB","VBD","VBG","VBN","VBP","VBZ"]) 

# Rebinds the name `stopwords` (imported above as the corpus reader) to the
# set of English stop words.
stopwords = set(nltk.corpus.stopwords.words('english')) 


def lemmatize(word):
    """Lemmatize ``word`` as a noun, falling back to the verb reading.

    The verb lemma is used only when the noun lemmatization leaves the word
    unchanged.
    """
    noun_form = lemmatizer.lemmatize(word, wn.NOUN)
    if noun_form != word:
        return noun_form
    return lemmatizer.lemmatize(word, wn.VERB)

def get_keyword(data):
    """Extract keyword lemmas from a single sentence.

    Each token is POS-tagged, lowercased and lemmatized. It is kept when the
    lemma is not a stop word and the tag belongs to ``POS``.

    Parameters:
        data: sentence text.

    Returns:
        List of keyword lemmas in sentence order (duplicates preserved).
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(data))
    keywords = []
    for token, tag in tagged_tokens:
        lemma = lemmatize(token.lower())
        if lemma in stopwords or tag not in POS:
            continue
        keywords.append(lemma)
    return keywords

def get_keyword_paragraph(data):
    """Split a paragraph into sentences and extract keywords per sentence.

    Parameters:
        data: paragraph text.

    Returns:
        List of keyword lists, one per sentence found by ``tokenizer``.
    """
    return [get_keyword(sentence) for sentence in tokenizer.tokenize(data)]

def get_keyword_all(data):
    """Run ``get_keyword`` over every sentence of ``data``.

    Parameters:
        data: iterable of sentence strings.

    Returns:
        List of keyword lists in the same order as the input.
    """
    return [get_keyword(sentence) for sentence in data]

def add_keywords(df,feature):
    """Attach a ``keywords_<feature>`` column with the keywords of ``df[feature]``.

    An existing column of that name is dropped before it is recomputed.

    Returns:
        The frame carrying the keyword column.
    """
    target_column = 'keywords' + "_" + feature
    if target_column in df:
        df = df.drop(target_column, axis=1)
    extracted = get_keyword_all(df[feature])
    df[target_column] = extracted
    return df

def get_number_of_common_kewyords(question_keywords,answer_sentence_keywords):
    """Count the question keywords that also occur in the answer sentence.

    Every occurrence in ``question_keywords`` is counted, so a keyword that
    is repeated in the question contributes once per repetition.
    """
    return sum(
        1 for keyword in question_keywords
        if keyword in answer_sentence_keywords
    )

<h4>Train a classifier</h4>

In [13]:
# BOW extraction for passages and questions
def get_passages_bow(passages):
    """Build one bag of words over all ``passages``.

    Tokens are tested against ``stopwords`` in their original casing, then
    lowercased and lemmatized before being counted.

    Returns:
        dict mapping lemma -> number of occurrences.
    """
    lemmas = [
        lemmatize(token.lower())
        for passage in passages
        for token in nltk.word_tokenize(passage)
        if token not in stopwords
    ]
    bag = {}
    for lemma in lemmas:
        bag[lemma] = 1 + bag.get(lemma, 0)
    return bag

def get_sentences_bow(sentences):
    """Build one bag of words over all ``sentences``.

    Tokens are tested against ``stopwords`` in their original casing, then
    lowercased and lemmatized before being counted.

    Returns:
        dict mapping lemma -> number of occurrences.
    """
    lemmas = [
        lemmatize(token.lower())
        for sentence in sentences
        for token in nltk.word_tokenize(sentence)
        if token not in stopwords
    ]
    bag = {}
    for lemma in lemmas:
        bag[lemma] = 1 + bag.get(lemma, 0)
    return bag

def get_question_bow(question):
    """Bag of words for a question, seeded with its question type.

    The result of ``get_question_type`` is stored with count 1. Every token
    that is not a stop word (checked in its original casing) is then
    lowercased, lemmatized and counted.

    Returns:
        dict mapping feature -> count.
    """
    bag = {get_question_type(question): 1}
    kept_tokens = [
        token for token in nltk.word_tokenize(question)
        if token not in stopwords
    ]
    for lemma in (lemmatize(token.lower()) for token in kept_tokens):
        bag[lemma] = 1 + bag.get(lemma, 0)
    return bag

def get_training_question_bow(question,keywords,qt):
    """Bag of words for a training question, restricted to ``keywords``.

    Parameters:
        question: question text.
        keywords: container of lemmas allowed as features.
        qt: question type, stored as a feature with count 1.

    Returns:
        dict mapping feature -> count. It holds ``qt`` plus each lemma of a
        non-stop-word token (stop-word check on the original casing) that
        is present in ``keywords``.
    """
    bag = {qt: 1}
    for token in nltk.word_tokenize(question):
        if token in stopwords:
            continue
        lemma = lemmatize(token.lower())
        if lemma not in keywords:
            continue
        bag[lemma] = 1 + bag.get(lemma, 0)
    return bag

In [14]:
def get_feature_questions(questions, keywords,qt):
    """Build the training bag of words for every question.

    Parameters:
        questions: sequence of question texts.
        keywords: container of lemmas allowed as features.
        qt: question types, indexed by the position of the question.

    Returns:
        List of feature dicts, one per question.
    """
    return [
        get_training_question_bow(question, keywords, qt[position])
        for position, question in enumerate(questions)
    ]

In [52]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def check_results(predictions, classifications):
    """Print the accuracy and the per-class report of ``predictions``.

    Parameters:
        predictions: predicted labels.
        classifications: true labels (passed to sklearn as ``y_true``).
    """
    print("Accuracy:")
    accuracy = accuracy_score(classifications, predictions)
    print(accuracy)
    report = classification_report(classifications, predictions)
    print(report)

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer


    
# Collect the vocabulary of the answer sentences (this could be tweaked to use
# whole paragraphs): a bag of words over every non-null `answer_found` value.
answer_sentences_bow=get_sentences_bow(question_learning_dataset[question_learning_dataset['answer_found'].notnull()]['answer_found'])
# Only the words themselves are used; the counts are discarded.
answer_keywords = set([word for word, count in answer_sentences_bow.items()])

#qs_training=get_feature_questions(questions,answer_keywords)
# One feature dict per training question: its question type plus the question
# lemmas that also occur in the answer-sentence vocabulary.
qs_training=get_feature_questions(list(question_learning_dataset.question),answer_keywords,list(question_learning_dataset.question_type))




In [17]:
from sklearn.ensemble import RandomForestClassifier

# Train the answer-type classifier, but only when there is training data.
if (len(qs_training)>0 and len(list(question_learning_dataset.question_type))>0):
    # Turn the feature dicts into a document-term matrix.
    vectorizer = DictVectorizer()
    
    X_train_dtm = vectorizer.fit_transform(qs_training)
    
    

    # Random forest classifier; no random_state is set, so results may vary
    # between runs.
    model=RandomForestClassifier(n_estimators = 300, max_depth = 60, criterion = 'entropy')
    
    # Earlier experiment, kept for reference:
    #model = MultinomialNB(2, False, None)

    # Fit on the answer types of the training questions.
    model.fit(X_train_dtm, list(question_learning_dataset.answer_type))
    
    # NOTE: predictions are made on the same matrix the model was fitted on,
    # so the report below is training accuracy, not held-out accuracy.
    y_predicted_class = model.predict(X_train_dtm)
    
    check_results(y_predicted_class,list(question_learning_dataset.answer_type))

Accuracy:
0.8369626130814186
                   precision    recall  f1-score   support

   CAUSE_OF_DEATH       1.00      0.47      0.64       327
             CITY       1.00      0.08      0.15        12
          COUNTRY       0.94      0.54      0.69      1058
  CRIMINAL_CHARGE       1.00      0.33      0.49        64
             DATE       0.76      0.99      0.86      5801
         DURATION       0.96      0.64      0.77       464
         IDEOLOGY       1.00      0.56      0.71       232
         LOCATION       0.79      0.91      0.85      1738
             MISC       1.00      0.50      0.67       133
            MONEY       1.00      0.83      0.91       462
      NATIONALITY       0.99      0.44      0.61       858
           NUMBER       0.93      0.91      0.92      4644
          ORDINAL       1.00      0.65      0.79       406
     ORGANIZATION       1.00      0.51      0.68       496
          PERCENT       0.98      0.85      0.91       751
           PERSON       0.

<h3>2. Candidate answering generation</h3>

<h4> Get a score for the passage to filter the most relevant passages</h4>


In [18]:
## features relevant to this part
# number of named entities of the right type in the passage
# number of question keywords in the passage
# the longest exact sequence of question keywords
# rank of the document where the passage was extracted
# proximity of the keywords from the original query
# ngram overlap between the passage and the question

First, we will set up useful functions to extract term frequencies to build the vector space model

In [19]:
import nltk
from collections import defaultdict
from collections import Counter
from math import log

# English stop words held in a set (same assignment as in earlier cells).
stopwords = set(nltk.corpus.stopwords.words('english'))


# get the terms for a passage
def get_terms(passage):
    """Return the set of distinct lemmas of a passage.

    Tokens found in ``stopwords`` (checked in their original casing) are
    dropped; the rest are lowercased and lemmatized.
    """
    return {
        lemmatize(token.lower())
        for token in nltk.word_tokenize(passage)
        if token not in stopwords
    }
    
# get document_term 
def get_document_term_passsages(ds_documents):
    """Map every document to the term sets of its passages.

    Parameters:
        ds_documents: DataFrame whose rows have a ``docid`` and a ``text``
            column, where ``text`` is an iterable of passage strings.

    Returns:
        ``{docid: {passage_position: set_of_terms}}``, with passages numbered
        from 0 inside each document.
    """
    document_term = {}
    for _, document in ds_documents.iterrows():
        # Every row is one document; its passages are numbered by position.
        document_term[document['docid']] = {
            position: get_terms(passage)
            for position, passage in enumerate(document['text'])
        }
    return document_term

# get the term frequency
def extract_term_freqs(doc):
    """Count the lemmas of the tokens in ``doc``.

    Parameters:
        doc: iterable of tokens.

    Returns:
        ``Counter`` mapping lemma -> count. Tokens found in ``stopwords``
        are skipped; the rest are lowercased and lemmatized first.
    """
    return Counter(
        lemmatize(token.lower())
        for token in doc
        if token not in stopwords
    )
        
# compute idf
def compute_doc_freqs(doc_term_freqs):
    """For every document, count in how many of its passages each term occurs.

    Parameters:
        doc_term_freqs: ``{docid: {passage_id: {term: count}}}``.

    Returns:
        ``{docid: Counter(term -> number of passages containing the term)}``.
    """
    return {
        doc_key: Counter(
            term
            for passage_terms in passages.values()
            for term in passage_terms.keys()
        )
        for doc_key, passages in doc_term_freqs.items()
    }
    

In [23]:
# Build the document -> passage -> term-set mapping for the whole corpus.
docs=get_document_term_passsages(df_docs)
#docs

In [24]:
# To build a vector space model we need a scoring function; tf-idf is used
# first. This cell computes the per-passage term frequencies and the
# per-document passage frequencies it needs.
doc_term_freqs = {}
for docid,dic_passages in docs.items():
    passage_dic = {}
    for passage_id, terms in dic_passages.items():
        # NOTE: `terms` is the set produced by get_terms, so the counts here
        # do not reflect how often a term is repeated inside the passage.
        term_freqs = extract_term_freqs(terms)
        passage_dic[passage_id] = term_freqs
    doc_term_freqs[docid] = passage_dic

# Per document: the number of passages in which each term occurs.
doc_freqs = compute_doc_freqs(doc_term_freqs)


In [25]:
#doc_term_freqs

<b>Improvement:</b> Use BM25

Create an inverted index for query processing. Inverted index will not change from query to query. Here we can improve how the weight is defined for the posting list tuple for each term (docid,weight)

In [26]:
def count_words(freqs):
    """Total number of counted term occurrences across all passages.

    Parameters:
        freqs: ``{passage_id: {term: count}}``.

    Returns:
        Sum of all counts (0 for an empty mapping).
    """
    return sum(sum(term_counts.values()) for term_counts in freqs.values())

In [314]:
## Code from WSTA_N16_information_retrieval
# Build one tf-idf inverted index per document: term -> [[passage_id, weight], ...].
# Here a "document" of the vector space model is a passage, and idf is computed
# over the passages of the same source document.
vsm_inverted_index_all = defaultdict()  # no default factory: behaves like a plain dict
for docid, passage_freqs in doc_term_freqs.items():
    vsm_inverted_index = defaultdict(list)

    # N: total number of counted terms over all passages of this document.
    N = count_words(passage_freqs)
    # M: number of passages in this document.
    M = len(passage_freqs)

    for passage_id, term_freqs in passage_freqs.items():
        # tf*idf per term; doc_freqs[docid][term] is the number of passages
        # of this document that contain the term.
        tfidf_values = [
            (term, float(count) / N * log(M / float(doc_freqs[docid][term])))
            for term, count in term_freqs.items()
        ]
        # Euclidean length of the passage vector, used for normalisation.
        length = sum(tfidf ** 2 for _, tfidf in tfidf_values) ** 0.5

        for term, tfidf in tfidf_values:
            # The vector has zero length when every term occurs in all M
            # passages (always the case for a single-passage document);
            # store weight 0.0 instead of dividing by zero.
            weight = tfidf / length if length > 0 else 0.0
            # note the inversion of the indexing, to be term -> (passage_id, score)
            vsm_inverted_index[term].append([passage_id, weight])
    vsm_inverted_index_all[docid] = vsm_inverted_index

# ensure posting lists are in sorted order
for key, value in vsm_inverted_index_all.items():
    for term, docids in value.items():
        docids.sort()


In [20]:
# Replace the index with the cached copy stored in obj/vsm_inverted_index_corpus.pkl.
vsm_inverted_index_all=load_obj('vsm_inverted_index_corpus')

Query the VSM creating a score for each document (passage) and returning the top k

In [21]:
from collections import Counter
# get a list of paragraphs ordered by relevance on the question
def query_vsm(query, index):
    """Score passages against a query using an inverted index.

    Parameters:
        query: iterable of query terms.
        index: mapping ``term -> [[passage_id, weight], ...]``.

    Returns:
        ``Counter`` mapping passage_id -> accumulated weight. Use
        ``.most_common(k)`` to get the top-k passages.

    Terms missing from the index are ignored. The lookup goes through
    ``.get`` so that a plain dict does not raise ``KeyError`` and a
    ``defaultdict`` index is not grown with empty posting lists.
    """
    accumulator = Counter()
    for term in query:
        for docid, weight in index.get(term, ()):
            accumulator[docid] += weight
    return accumulator

## end copied code

<h3>3. Candidate answering scoring</h3>

In [221]:
import re
def correct_answer_space(predicted,predicted_answer_sentence):
    """Restore the spacing of ``predicted`` as it appears in the sentence.

    The answer is tokenized and searched for in
    ``predicted_answer_sentence`` (case-insensitive), allowing optional
    whitespace after each token.

    Parameters:
        predicted: candidate answer text.
        predicted_answer_sentence: sentence the answer was taken from.

    Returns:
        The stripped matching span of the sentence, or ``predicted``
        unchanged when the tokens are not found.

    Tokens are escaped with ``re.escape`` so that regex metacharacters in
    an answer (``$``, ``(``, ``+`` ...) are matched literally instead of
    raising ``re.error`` or silently changing the pattern.
    """
    tokens=nltk.word_tokenize(predicted)
    pattern='.*(' + ''.join(re.escape(token) + r'\s*' for token in tokens) + ').*'

    match=re.search(pattern,predicted_answer_sentence,re.IGNORECASE)
    if match is not None:
        predicted=match.group(1).strip()

    return predicted

def correct_answer_pattern(predicted):
    """Re-space percentages and full dates inside an answer string.

    Two case-insensitive rewrites are tried:

    * percentage: a space is inserted between a digit and a following ``%``
      (``"50%"`` -> ``"50 %"``);
    * date: ``<Month> <day>, ... <yyyy>`` is rebuilt with a space before the
      comma (``"March 4, 2005"`` -> ``"March 4 , 2005"``).

    The date rewrite is applied to the original input, so when both patterns
    match the date result wins.

    Returns:
        The rewritten string, or ``predicted`` when neither pattern matches.
    """
    percent_regex = re.compile(r'(.*[0-9])(%.*)', re.IGNORECASE)
    date_regex = re.compile(
        r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+([0-9]{1,2})(,.*)([0-9]{4})',
        re.IGNORECASE,
    )

    corrected = predicted

    percent_match = percent_regex.search(predicted)
    if percent_match is not None:
        number_part, percent_part = percent_match.groups()
        corrected = number_part + ' ' + percent_part

    date_match = date_regex.search(predicted)
    if date_match is not None:
        month, day, separator, year = date_match.groups()
        corrected = month + ' ' + day + ' ' + separator + year

    return corrected
    
    
def isAnswerInSentence(answer,answer_sentence):
    """Tell whether the tokens of ``answer`` occur, in order, in the sentence.

    The match is case-insensitive and allows optional whitespace after each
    token.

    Parameters:
        answer: answer text to look for.
        answer_sentence: sentence to search in.

    Returns:
        ``True`` when the token sequence is found, ``False`` otherwise.

    Tokens are escaped with ``re.escape`` so that answers containing regex
    metacharacters (``$``, ``(``, ``+`` ...) are matched literally instead
    of raising ``re.error`` or producing false matches.
    """
    tokens=nltk.word_tokenize(answer)
    pattern='.*(' + ''.join(re.escape(token) + r'\s*' for token in tokens) + ').*'

    return re.search(pattern,answer_sentence,re.IGNORECASE) is not None




In [229]:
# Development set restored from a pickle; the evaluation loop below reads its
# 'predicted_answer_type' column instead of running the classifier.
df_devel=load_obj('df_devel_predicted_answer_type')

In [231]:
import time

#df_devel=pd.read_json('project_files/devel.json')
# Evaluate the pipeline on the first 100 development questions.
df_result_devel=pd.DataFrame(columns=['id','question','paragraph','retrieved paras','predicted_paragraph','paragraph_found','sentence','predicted_sentence','answer','predicted_answer'])
df_devel=df_devel.iloc[0:100]

sentence_columns=['doc_id','para_id','sentence_id','sentence_text','score']

for index, row in df_devel.iterrows():
    t=time.process_time()
    question=row['question']
    docid=row['docid']
    ida=index

    question_keywords=get_keyword(question)

    # rank the paragraphs of the question's document and keep the top 10
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10)

    # the answer type was predicted beforehand and stored in the pickle
    answer_type=row['predicted_answer_type']

    # Reset the per-question outputs. Without this, a question for which no
    # sentence is scored would reuse the values of the previous question, or
    # raise NameError on the very first iteration.
    answer=''
    best_paragraph_id=None
    best_sentence_text=''

    document_paragraphs=df_docs.iloc[docid]['text']
    question_doc=nlp(question)  # parsed once instead of once per sentence

    # score every sentence of every retrieved paragraph
    sentence_rows=[]
    for paragraph_id, _weight in documents_ranked:
        sentences_list=tokenizer.tokenize(document_paragraphs[paragraph_id])
        for sentence_index, sentence_text in enumerate(sentences_list):
            NER_sentence=NER_corpus[docid][paragraph_id][sentence_index]
            common_keywords=get_number_of_common_kewyords(question_keywords,get_keyword(sentence_text))
            similarity=question_doc.similarity(nlp(sentence_text))
            # share of question keywords found in the sentence + spaCy similarity
            score=common_keywords/len(question_keywords)+similarity

            # bonus when the sentence holds an entity of the expected type
            if any(entity[1]==answer_type for entity in NER_sentence):
                score+=1

            sentence_rows.append([docid,paragraph_id,sentence_index,sentence_text,score])

    # built in one go rather than appended row by row
    df_best_sentences=pd.DataFrame(sentence_rows,columns=sentence_columns)

    if len(df_best_sentences)>0: # answer='' otherwise
        best_row=df_best_sentences.loc[df_best_sentences.score.idxmax()]
        best_paragraph_id=int(best_row['para_id'])
        best_sentence_id=int(best_row['sentence_id'])
        best_sentence_text=best_row['sentence_text']
        NER_answer_passage=NER_corpus[docid][best_paragraph_id][best_sentence_id]
        # answer = first entity of the expected type in the best sentence
        for entity in NER_answer_passage:
            if (entity[1]==answer_type):
                answer=correct_answer_pattern(entity[0])
                break

    ## only for testing purposes - check whether the gold paragraph was retrieved
    ## and look up where the gold answer occurs
    possible_par=[par[0] for par in documents_ranked]
    par_retrieved=row['answer_paragraph'] in possible_par

    sent_ans=''
    for para in document_paragraphs:
        sent_doc=tokenizer.tokenize(para)
        for sent in sent_doc:
            if isAnswerInSentence(row['text'],sent):
                # NOTE(review): this stores the paragraph's whole sentence list
                # (not the matching sentence) and the last matching paragraph
                # wins - kept as in the original, confirm that is intended.
                sent_ans=sent_doc
                break
    ## END -only for testing purposes

    print(ida,time.process_time()-t)
    df_result_devel.loc[len(df_result_devel)]=[docid,question,row['answer_paragraph'],possible_par,best_paragraph_id,par_retrieved,sent_ans,best_sentence_text,row['text'],answer]

print('done')

0 3.5392589999974007
1 3.3307419999982812
2 3.3604879999984405
3 3.9082460000026913
4 3.0011059999997087
5 3.475613999999041
6 3.3105370000012044
7 3.996875999997428
8 3.899578999997175
9 1.514630000001489
10 3.8269880000007106
11 2.999355000003561
12 3.1588709999959974
13 3.085860999999568
14 2.8706829999973706
15 2.9493809999985388
16 3.6533020000024408
17 3.508529999999155
18 3.928050999998959
19 3.2745749999994587
20 3.4435240000020713
21 3.1076389999980165
22 3.6096010000001115
23 3.0504929999988235
24 3.417021000001114
25 3.8880890000000363
26 2.785570000003645
27 3.3679830000000948
28 3.0594180000007327
29 2.7411190000020724
30 2.8585219999986293
31 3.849736000000121
32 0.5951779999995779
33 3.82673799999975
34 3.145799000001716
35 3.437248999998701
36 3.229137999998784
37 3.0559509999984584
38 3.2198129999997036
39 2.562997999997606
40 4.312031999997998
41 2.6488329999992857
42 3.9322269999975106
43 3.954521999999997
44 3.182372999999643
45 3.3912950000012643
46 3.5637460000034

In [236]:
df_best_sentences

Unnamed: 0,doc_id,para_id,sentence_id,sentence_text,score
0,440,16,0,As the Industrial Revolution spread across Eur...,1.047488
1,440,16,1,One popular color imported into Europe from Tu...,2.441764
2,440,16,2,"Beginning in the 1740s, this bright red color ...",2.266892
3,440,16,3,"Turkey red used madder as the colorant, but th...",1.4842
4,440,16,4,The fabric was more expensive but resulted in ...,1.267104
5,440,16,5,The fabric was widely exported from Europe to ...,1.003722
6,440,16,6,"In 19th century America, it was widely used in...",2.001293
7,440,19,0,Rothko also began using the new synthetic pigm...,1.064508
8,440,19,1,In 1962 he donated to Harvard University a ser...,1.818375
9,440,19,2,He mixed mostly traditional colors to make the...,1.051713


In [232]:
# Share of questions whose gold paragraph is among the retrieved paragraphs.
print('Accuracy - paragraph found:',df_result_devel.loc[df_result_devel['paragraph_found']==True]['id'].count()/len(df_result_devel)*100,'%')
# NOTE: despite the label, this compares the gold paragraph with the paragraph
# of the predicted sentence, not the sentences themselves.
print('Accuracy - sentence predicted',len(df_result_devel.loc[df_result_devel.paragraph==df_result_devel.predicted_paragraph])/len(df_result_devel)*100,'%')
# Share of questions whose predicted answer string equals the gold answer exactly.
print('Accuracy - answer found:',len(df_result_devel.loc[df_result_devel.predicted_answer==df_result_devel.answer])/len(df_result_devel)*100,'%')

Accuracy - paragraph found: 99.0 %
Accuracy - sentence predicted 57.99999999999999 %
Accuracy - answer found: 17.0 %


In [None]:
df_result_devel

<h3>Testing Dataset</h3>

In [None]:
#### DELETE CELL
# Earlier version of the test-set prediction loop, marked for deletion by its
# author. It runs on the first 10 test questions only and tags sentences with
# the CoreNLP server (get_Name_Entity_Sentence) instead of the cached
# NER_corpus.
df_result=pd.DataFrame(columns=['id','answer'])
df_testing=pd.read_json('project_files/testing.json')
df_testing=df_testing.iloc[0:10]
NER_dict={}
for index, row in df_testing.iterrows():
    question=row['question']
    docid=row['docid']
    ida=row['id']

    #print('Question: ',question)
    #print('Expected Answer:',expected_answer)
    #print('Docid:',docid)
    question_keywords=get_keyword(question)

    # get the most relevant paragraphs for the question
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10) 
    #print('Top 10 paragraphs: ',documents_ranked)
    # predict the answer type from the question's bag of words
    q_bow=get_question_bow(question)
    x = vectorizer.transform(q_bow)
    answer_type=model.predict(x)
    #print('Predicted answer type: ',answer_type)

    candidate_passages={}
    list_of_passages=[]
    answer=''

    if len(documents_ranked)>0:
        for document in documents_ranked:
            # split the paragraph into sentences and pool them in one flat list
            paragraph=df_docs.iloc[docid]['text'][document[0]]
            passages = tokenizer.tokenize(paragraph)

            for i in range(len(passages)):
                list_of_passages.append(passages[i])




        ## PARAMETERS TO GET FROM TESTING DATASET AND USE A MODEL TO GET THE ANSWER PASSAGE CANDIDATES. 
        #question= df_training.loc[(df_training["docid"] == docid_query) & (df_training["answer_paragraph"] ==document[0] ),"question"][0]
        #answer_type=df_training.loc[(df_training["docid"] == docid_query) & (df_training["answer_paragraph"] ==document[0] ),"answer_type"][0]
        #print(question)
        #print(answer_type) 
        #print(sorted(get_keyword(question)))
        ###

        ## FOR NOW USING KEYWORDS AND GET JUST ONE DEFINITE ANSWER PASSAGE CANDIDATE
        indexPassage=0
        score=0
        for indexPassage in range(len(list_of_passages)):
            NER_passage=get_Name_Entity_Sentence(list_of_passages[indexPassage])
            common_keywords=get_number_of_common_kewyords(get_keyword(question),get_keyword(list_of_passages[indexPassage]))
            similarity=nlp(question).similarity(nlp(list_of_passages[indexPassage]))
            # raw keyword overlap (not normalised here) plus spaCy similarity
            score=common_keywords+similarity

            # bonus when the sentence holds an entity of the predicted type
            for entity in NER_passage:
                if (entity[1]==answer_type):
                    score+=1
                    break

            candidate_passages[indexPassage]=score


        # pick the highest-scoring sentence
        if len(candidate_passages)>0:
            best_candidate_passage=list_of_passages[max(candidate_passages, key=candidate_passages.get)]
        else:
            if len(list_of_passages)>0:
                best_candidate_passage=list_of_passages[0]
        #print("Candidate Passage Answer:")
        #print(best_candidate_passage)



        # no break below: the LAST entity of the predicted type becomes the answer
        NER_answer_passage=get_Name_Entity_Sentence(best_candidate_passage)
        for entity in NER_answer_passage:
                if (entity[1]==answer_type):
                    answer=entity[0]

    #print('Predicted answer:',answer)
    print(ida)

    df_result.loc[len(df_result)]=[ida,answer]
    
    



In [233]:
# Predict an answer for every question of the test set.
df_result=pd.DataFrame(columns=['id','answer'])
df_testing=pd.read_json('project_files/testing.json')
#df_testing=df_testing.iloc[0:2]
NER_dict={}
sentence_columns=['doc_id','para_id','sentence_id','sentence_text','score']

for index, row in df_testing.iterrows():
    t=time.process_time()
    question=row['question']
    docid=row['docid']
    ida=row['id']
    question_keywords=get_keyword(question)

    # rank the paragraphs of the question's document and keep the top 10
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10)

    # predict the answer type; predict() returns an array, keep the single label
    q_bow=get_question_bow(question)
    x = vectorizer.transform(q_bow)
    answer_type=model.predict(x)[0]

    # Reset the per-question state so that a question without any scored
    # sentence cannot reuse the entities or sentence of the previous question.
    answer=''
    best_sentence_text=''
    NER_answer_passage=[]

    document_paragraphs=df_docs.iloc[docid]['text']
    question_doc=nlp(question)  # parsed once instead of once per sentence

    # score every sentence of every retrieved paragraph
    sentence_rows=[]
    for paragraph_id, _weight in documents_ranked:
        sentences_list=tokenizer.tokenize(document_paragraphs[paragraph_id])
        for sentence_index, sentence_text in enumerate(sentences_list):
            NER_sentence=NER_corpus[docid][paragraph_id][sentence_index]
            common_keywords=get_number_of_common_kewyords(question_keywords,get_keyword(sentence_text))
            similarity=question_doc.similarity(nlp(sentence_text))
            # share of question keywords found in the sentence + spaCy similarity
            score=common_keywords/len(question_keywords)+similarity

            # bonus when the sentence holds an entity of the predicted type
            if any(entity[1]==answer_type for entity in NER_sentence):
                score+=1

            sentence_rows.append([docid,paragraph_id,sentence_index,sentence_text,score])

    # built in one go rather than appended row by row
    df_best_sentences=pd.DataFrame(sentence_rows,columns=sentence_columns)

    if len(df_best_sentences)>0: # answer='' otherwise
        best_row=df_best_sentences.loc[df_best_sentences.score.idxmax()]
        best_paragraph_id=int(best_row['para_id'])
        best_sentence_id=int(best_row['sentence_id'])
        best_sentence_text=best_row['sentence_text']
        NER_answer_passage=NER_corpus[docid][best_paragraph_id][best_sentence_id]
        # answer = first entity of the predicted type in the best sentence
        for entity in NER_answer_passage:
            if (entity[1]==answer_type):
                answer=correct_answer_pattern(entity[0])
                break

    # Fallback: no entity of the predicted type - use the first entity of the
    # best sentence, with its spacing restored from the sentence text.
    # (The original called an undefined `correct_answer`; the helper defined
    # in this notebook is `correct_answer_space`.)
    if answer=='' and len(NER_answer_passage)>0:
        answer=correct_answer_space(NER_answer_passage[0][0],best_sentence_text)

    print(ida,time.process_time()-t)
    df_result.loc[len(df_result)]=[ida,answer]
print('done')

0 2.7543970000006084
1 2.565867999997863
2 1.5330140000005485
3 2.5455389999988256
4 2.254015000002255
5 2.9898969999958354
6 2.3659389999993437
7 1.4402770000015153
8 1.4750459999995655
9 2.0905970000021625
10 0.6452010000029986
11 2.3589349999965634
an 0
12 0.2549350000008417
13 2.448075999996945
14 2.400934999997844
15 2.1029149999994843
16 0.7967410000019299
an 0
17 0.5714179999995395
18 3.3107120000022405
an 4
19 0.086738999998488
20 2.652033999998821
21 2.893501999998989
22 1.3205999999954656
23 2.918040000000474
24 2.833560999999463
25 2.4954479999978503
26 2.7794279999980063
27 1.8130249999994703
28 3.039138999996794
29 2.8151050000014948
30 3.7789649999976973
an 1
31 0.6693999999988591
32 0.8507559999998193
33 3.9725929999985965
34 3.0715929999969376
35 2.659086000003299
36 1.4299960000025749
37 2.871159999998781
38 2.8100019999983488
39 1.2943799999993644
40 1.9294540000009874
41 2.5153489999938756
42 1.4867290000038338
43 3.724852000003011
44 2.151464999995369
an 1
45 0.5558

In [235]:
df_result

Unnamed: 0,id,answer
0,0,1993
1,1,marc andreessen
2,2,once
3,3,1998
4,4,late 2004
5,5,andreessen
6,6,1993
7,7,once
8,8,marc andreessen
9,9,tim berners-lee


In [234]:
# Write the predictions (columns id, answer) without the index column.
# NOTE: the 'prediction' directory must already exist.
df_result.to_csv('prediction/output.csv',index=False)

In [216]:
# Keep an independent copy of the latest predictions.
df_last_result=df_result.copy()

In [220]:
# Number of rows whose answer is null (empty strings are not counted).
len(df_last_result[df_last_result.answer.isnull()])

0

In [495]:
def balanced_subsample(x,y,subsample_size=1.0):
    """Draw the same number of samples from every class.

    Parameters:
        x: array of samples, indexable with a boolean mask along axis 0.
        y: labels, one per sample (any array-like, including strings).
        subsample_size: when below 1, the per-class sample count is
            ``int(min_class_size * subsample_size)``; otherwise the size of
            the smallest class is used.

    Returns:
        ``(xs, ys)``: the selected samples and their labels, grouped by class
        in the order of ``np.unique(y)``. Classes larger than the target
        size are shuffled with ``np.random.shuffle`` before truncation.

    Changes from the previous version: labels are emitted with the dtype of
    ``y`` (the old ``np.empty(...).fill(label)`` produced floats and raised
    for string labels), ``y`` may be a plain list, and empty input returns
    empty arrays instead of failing in ``np.concatenate``.
    """
    import numpy as np  # local import: the notebook has no visible module-level numpy import

    y = np.asarray(y)

    # one (label, samples-of-that-label) pair per class
    class_xs = [(label, x[(y == label)]) for label in np.unique(y)]
    if not class_xs:
        return x[:0], y[:0]

    min_elems = min(elems.shape[0] for _, elems in class_xs)

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems*subsample_size)

    xs = []
    ys = []

    for ci,this_xs in class_xs:
        # this_xs is a copy produced by the boolean mask, so shuffling it in
        # place does not touch the caller's array
        if len(this_xs) > use_elems:
            np.random.shuffle(this_xs)

        xs.append(this_xs[:use_elems])
        ys.append(np.full(use_elems, ci, dtype=y.dtype))

    xs = np.concatenate(xs)
    ys = np.concatenate(ys)

    return xs,ys

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 

df_devel=pd.read_json('project_files/devel.json')

    
def get_answer_rank_features(dataset):
    """Build sentence-level feature rows and labels from the training set.

    For every row of ``dataset`` the answer paragraph is looked up in
    ``df_docs`` and ``get_answer_features`` is called with the row's
    pre-computed ``NE_text`` / ``NE_paragraph`` columns to obtain the answer
    sentence and the shared entities. Four features are then computed:

    * number of entities returned as common by ``get_answer_features``;
    * number of question keywords that also occur among the answer-sentence
      keywords;
    * number of positions ``i`` at which question keyword ``i`` is contained
      in answer-sentence keyword ``i``;
    * position of the answer paragraph among the top 10 results of
      ``query_vsm`` for that document (equal to the number of results
      inspected when the paragraph is not among them).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``question``, ``text``, ``docid``,
        ``answer_paragraph``, ``NE_text`` and ``NE_paragraph``.

    Returns
    -------
    tuple of (list, list)
        ``X`` holds one 4-element feature list per sentence of each answer
        paragraph; ``Y`` holds 1 where the sentence equals the located answer
        sentence and 0 otherwise.
    """
    X=[]
    Y=[]
    for index, row in dataset.iterrows():
        question=row['question']
        raw_answer=row['text']

        paragraph=df_docs.iloc[row['docid']]['text'][row['answer_paragraph']]

        # This call was commented out, leaving `answer_found` and
        # `common_entities` undefined (or stale globals shared by every row).
        answer_found,_,common_entities=get_answer_features(paragraph,raw_answer,row['NE_text'],row['NE_paragraph'])

        # number of named entities in the passage
        num_entities=len(common_entities)

        # number of question keywords in the passage
        question_keywords=get_keyword(question)
        answer_passage_keywords=get_keyword(answer_found)
        num_qkp=sum(1 for qk in question_keywords if qk in answer_passage_keywords)

        # NOTE(review): labelled "longest exact sequence" but this counts
        # position-aligned containment matches; kept as is so the feature
        # stays identical to get_answer_rank_features_devel.
        longest_exact_sequence=0
        for i in range(len(question_keywords)):
            if i < len(answer_passage_keywords):
                if question_keywords[i] in answer_passage_keywords[i]:
                    longest_exact_sequence+=1

        # rank of the paragraph where the answer sentence was extracted
        results = query_vsm(question_keywords, vsm_inverted_index_all[row['docid']])
        documents_ranked=results.most_common(10)
        rank_of_paragraph=0
        for document in documents_ranked:
            if (document[0]==row['answer_paragraph']):
                break
            else:
                rank_of_paragraph+=1

        # NOTE(review): every sentence of the paragraph receives the same
        # feature vector; only the label differs. The devel variant labels by
        # substring containment rather than equality -- confirm which rule is
        # intended before comparing the two sets.
        tokenized_sentence = nltk.sent_tokenize(paragraph)
        for sentence in tokenized_sentence:
            X.append([num_entities,num_qkp,longest_exact_sequence,rank_of_paragraph])
            if (sentence==answer_found):
                Y.append(1)
            else:
                Y.append(0)

    return X,Y


    
def get_answer_rank_features_devel(dataset):
    """Build sentence-level feature rows and labels for the development set.

    For each row of ``dataset`` the answer paragraph is fetched from
    ``df_docs``, named entities are extracted for the raw answer and for the
    paragraph, and ``get_answer_features`` returns the answer sentence plus
    the shared entities. The feature vector is
    ``[num_entities, num_qkp, longest_exact_sequence, rank_of_paragraph]``
    and is appended once per sentence produced by ``nltk.sent_tokenize``.
    A sentence is labelled 1 when it contains the answer sentence text,
    otherwise 0. The row index is printed as each row is processed.

    Returns the pair ``(X, Y)`` of feature rows and labels, both lists.
    """
    X, Y = [], []

    for index, row in dataset.iterrows():
        question = row['question']
        raw_answer = row['text']
        print(index)
        paragraph = df_docs.iloc[row['docid']]['text'][row['answer_paragraph']]

        NE_answer = get_Name_Entity_Sentence(raw_answer)
        NE_paragraph = get_Name_Entity_paragraph(paragraph)
        answer_found, dict_answer_sentence_ner, common_entities = get_answer_features(
            paragraph, raw_answer, NE_answer, NE_paragraph)

        # Feature 1: how many entities get_answer_features reports as common.
        num_entities = len(common_entities)

        # Feature 2: question keywords that also appear among the keywords of
        # the answer sentence.
        question_keywords = get_keyword(question)
        answer_passage_keywords = get_keyword(answer_found)
        num_qkp = len([kw for kw in question_keywords if kw in answer_passage_keywords])

        # Feature 3: positions where the question keyword is contained in the
        # answer-sentence keyword at the same index.
        longest_exact_sequence = 0
        for q_kw, p_kw in zip(question_keywords, answer_passage_keywords):
            if q_kw in p_kw:
                longest_exact_sequence += 1

        # Feature 4: position of the answer paragraph within the top-10
        # retrieval results for this document.
        results = query_vsm(question_keywords, vsm_inverted_index_all[row['docid']])
        target_paragraph = row['answer_paragraph']
        rank_of_paragraph = 0
        for entry in results.most_common(10):
            if entry[0] == target_paragraph:
                break
            rank_of_paragraph += 1

        # One (features, label) pair per sentence of the paragraph.
        for sentence in nltk.sent_tokenize(paragraph):
            X.append([num_entities, num_qkp, longest_exact_sequence, rank_of_paragraph])
            Y.append(1 if answer_found in sentence else 0)

    return X, Y

# Train a logistic-regression answer-sentence classifier on the training
# features and evaluate it on the development set.
LogReg = LogisticRegression()

# Feature rows and 0/1 labels for the training questions.
X_train,Y_train=get_answer_rank_features(df_training)
#X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, stratify=Y_train, test_size=0.2)


#print(X_train)
#print(Y_train)
LogReg.fit(X_train, Y_train)
print('done training')
# Feature rows and labels for the development questions; X and Y stay as
# module-level names and Y is summed in a later cell.
X,Y=get_answer_rank_features_devel(df_devel)


y_predicted_class = LogReg.predict(X)


classifications=Y
predictions=y_predicted_class

# NOTE(review): accuracy_score and classification_report are not imported in
# this cell -- presumably from sklearn.metrics in an earlier cell; confirm.
print("Accuracy:")
print(accuracy_score(classifications,predictions))
print(classification_report(classifications,predictions))

In [504]:
def get_passage_features(passage,question,answer_type,entities=None,docid=None,answer_paragraph=None):
    """Compute the four ranking features for one candidate passage.

    The original body ended in ``return ...]`` (a SyntaxError) and read the
    undefined names ``answer_found``, ``common_entities`` and ``row`` instead
    of its own arguments. This version uses ``passage`` for the keyword
    features and takes the remaining inputs as optional parameters.

    Parameters
    ----------
    passage : str
        Candidate passage text; its keywords are extracted with
        ``get_keyword``.
    question : str
        Question text; its keywords are extracted with ``get_keyword``.
    answer_type
        Currently unused; kept so existing call sites keep working.
    entities : sequence, optional
        Entities shared by question/answer and passage. When omitted, the
        notebook-level ``common_entities`` is used, as the original body did.
    docid, answer_paragraph : optional
        Document id and paragraph id used for the retrieval-rank feature.
        When omitted, ``row['docid']`` / ``row['answer_paragraph']`` are read
        from the notebook-level ``row``, as the original body did.

    Returns
    -------
    list
        ``[num_entities, num_qkp, longest_exact_sequence, rank_of_paragraph]``,
        the same layout the answer-rank feature builders append to ``X``.
    """
    # NOTE(review): the fallbacks below depend on notebook globals; prefer
    # passing the values explicitly.
    if entities is None:
        entities = common_entities
    if docid is None:
        docid = row['docid']
    if answer_paragraph is None:
        answer_paragraph = row['answer_paragraph']

    # number of named entities in the passage
    num_entities=len(entities)

    # number of question keywords in the passage
    question_keywords=get_keyword(question)
    answer_passage_keywords=get_keyword(passage)
    num_qkp=sum(1 for qk in question_keywords if qk in answer_passage_keywords)

    # NOTE(review): labelled "longest exact sequence" but this counts
    # position-aligned containment matches; kept identical to the feature
    # builders so the features stay comparable.
    longest_exact_sequence=0
    for i in range(len(question_keywords)):
        if i < len(answer_passage_keywords):
            if question_keywords[i] in answer_passage_keywords[i]:
                longest_exact_sequence+=1

    # rank of the paragraph among the top-10 retrieval results for the document
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10)
    rank_of_paragraph=0
    for document in documents_ranked:
        if (document[0]==answer_paragraph):
            break
        else:
            rank_of_paragraph+=1

    return [num_entities,num_qkp,longest_exact_sequence,rank_of_paragraph]
    

5396

In [None]:
# Sum of the label list Y; with the 0/1 labels built by the feature
# extractors this is the number of positive sentences.
print(sum(Y))

In [394]:
df_devel.head()

Unnamed: 0,answer_paragraph,docid,question,text
0,5,380,On what date did the companies that became the Computing-Tabulating-Recording Company get consolidated?,"june 16 , 1911"
1,22,380,What percentage of its desktop PCs does IBM plan to install Open Client on to?,5 %
2,16,380,What year did IBM hire its first black salesman?,1946
3,4,380,"IBM made an acquisition in 2009, name it.",spss
4,2,380,"This IBM invention is known by the acronym UPC, what is the full name?",universal product code


In [426]:
a={'a':1,'b':2,'c':3}

In [427]:
# NOTE(review): scratch cell -- `a` is a dict with string keys, so looking up
# the integer key 0 raises KeyError (see the cell output).
a[0]

KeyError: 0

In [428]:
df_training.head()

Unnamed: 0,answer_paragraph,docid,question,text,NE_question,NE_text,NE_paragraph,answer_type,keywords_question,question_type,POS_questions,answer_found
0,23,0,A kilogram could be definined as having a Planck constant of what value?,6966662606895999999♠6.62606896×10−34 j⋅s,[],"[(6966662606895999999 ♠ 6.62606896, NUMBER), (10 − 34, NUMBER)]","[[(general, TITLE), (2011, DATE)], [], [(one, NUMBER)], [(7050135639273999999 ♠ 135639274 ×, NUMBER), (1042, DATE), (6966662606895999999 ♠ 6.62606896, NUMBER), (10 − 34, NUMBER), (⋅, NUMBER)]]",NUMBER,"[kilogram, definined, planck, constant, value]",what,"[NN, VBN, NNP, NN, NN]","Possible new definitions include ""the mass of a body at rest whose equivalent energy equals the energy of photons whose frequencies sum to 7050135639273999999♠135639274×1042 Hz"", or simply ""the kilogram is defined so that the Planck constant equals 6966662606895999999♠6.62606896×10−34 J⋅s""."
1,22,0,What is the shape of the object that establishes the base unit of the kilogram?,cylinder,[],[],"[[], [], [(1889, DATE), (paris, CITY)], [(1889, DATE), (1, NUMBER), (one, NUMBER), (million, NUMBER)], [(one, NUMBER), (current, DATE), (planck, LOCATION)]]",,"[shape, object, establish, base, unit, kilogram]",what,"[NN, NN, VBZ, JJ, NN, NN]","The most urgent unit on the list for redefinition is the kilogram, whose value has been fixed for all science (since 1889) by the mass of a small cylinder of platinum–iridium alloy kept in a vault just outside Paris."
2,12,0,What example is given as another paired relationship of uncertainly related to standard deviation?,time vs. energy,[],[],"[[], [], [(one, NUMBER)], [(fourier, LOCATION)]]",,"[example, give, pair, relationship, relate, standard, deviation]",what,"[NN, VBN, JJ, NN, VBN, JJ, NN]",One example is time vs. energy.
3,1,0,What does the Planck Constant refer to?,quantum of action,[],[],"[[], [(planck, PERSON)], [(now, DATE)], [], []]",,"[doe, planck, constant, refer]",what,"[VBZ, NNP, NNP, NN]","Instead, it must be some multiple of a very small quantity, the ""quantum of action"", now called the Planck constant."
4,10,0,When was the first quantized model of the atom introduced?,1913,"[(first, ORDINAL), (model, TITLE)]","[(1913, DATE)]","[[(niels bohr, PERSON), (first, ORDINAL), (model, TITLE), (1913, DATE), (rutherford, PERSON), (model, TITLE)], [], [], [(bohr, PERSON), (planck, PERSON), (bohr, PERSON)]]",DATE,"[wa, first, quantize, model, atom, introduce]",when,"[VBD, JJ, JJ, NN, NN, VBD]","Niels Bohr introduced the first quantized model of the atom in 1913, in an attempt to overcome a major shortcoming of Rutherford's classical model."


In [534]:
# Number of training rows whose answer_found column holds the 'UNKNOWN' marker.
len(df_training[df_training.answer_found=='UNKNOWN'])

2743

In [542]:
df_training.iloc[3105]

answer_paragraph     19                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
docid                28                                                                                                                                                                                         

In [538]:
# Scratch check: does the lower-cased sentence contain the lower-cased answer?
a='The winner of the 2014 Nobel Prize in Literature, Patrick Modiano–who lives in Paris–, based most of his literary work on the depiction of the city during World War II and the 1960s-1970s.'

if a.lower().find('patrick modiano') != -1:
    print(True)

True
