<h3>0. Exploratory analysis</h3>

In [7]:
# tools
import pickle

def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [8]:
import json
import pandas as pd
import spacy

nlp = spacy.load('en_core_web_md')

df_devel=pd.read_json('project_files/devel.json')
df_docs=pd.read_json('project_files/documents.json')
df_testing=pd.read_json('project_files/testing.json')

df_training=pd.read_pickle('project_files/df_training.pkl')


question_learning_dataset = df_training[df_training.answer_type.notnull()]

NER_corpus=load_obj('ner_corpus')

In [9]:
from nltk.tokenize.punkt import PunktSentenceTokenizer,PunktTrainer

tokenizer = load_obj('punk_tokenizer')
tokenizer._params.abbrev_types.add('ii')
tokenizer._params.abbrev_types.add('dr')

questionwords = set(["who", "what", "where", "when", "why", "how", "whose", "which", "whom","whats","what's","whos"])
passiveQuestions = set(["can", "could", "would", 
                   "was", "were","am","is", "are", "will","shall",
                   "did","do","does",
                   "had", "have","has",
                   "as","that","in",
                   "give an example","name"])


1. Find Keywords
2. Answer types - Using answer type taxonomy
3. Query formulation -> Keywords
4. Go to each document and check the frequency distribution of words and pick the document if one of the query words are present in document. Create a rank with that score
5. Find the paragraphs -> Discard irrelevant paragraphs. Use NE,Keywords, longest exact keywords. Put same weight for now and calculate the score of paragraphs. Rank each of the paragraphs in the document. We have to use the original answer and match the answer type
6. Find candidate answers -> Use supervised ML method
7. Merge candidate answers -> Use NER
8. Pick the best answer -> Logistic regression

<h3>1. Question processing</h3>

Configuring Stanford CoreNLP . Link -> https://blog.manash.me/configuring-stanford-parser-and-stanford-ner-tagger-with-nltk-in-python-on-windows-f685483c374a

In [10]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tag.stanford import CoreNLPNERTagger
from itertools import groupby

stopwords = set(nltk.corpus.stopwords.words('english')) 


def get_Name_Entity_NLTK(data):
    results=[]
    for sentence in data:
        ne_chunked_sents = ne_chunk(pos_tag(word_tokenize(sentence)))
        result = []

        for tagged_tree in ne_chunked_sents:

            if hasattr(tagged_tree, 'label'):
                entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) #
                entity_type = tagged_tree.label() # get NE category
                result.append((entity_name, entity_type))
        results.append(result)

    return results

def get_Name_Entity_Sentence(sentence):
    st = CoreNLPNERTagger(url='http://localhost:9000')
    tokenized_text = nltk.word_tokenize(sentence)
    classified_text = st.tag(tokenized_text)
    result = []
    
    for tag, chunk in groupby(classified_text, lambda x:x[1]):
       if tag != "O":
            word = " ".join(w for w, t in chunk)
            result.append((word.lower(), tag))
    
    return result


def addNameEntity(df,feature,func):
    if 'NE'+"_"+feature in df:
        df = df.drop('NE'+"_"+feature, axis=1)
    df["NE"+"_"+feature] = func(df[feature])
    
    return df

In [11]:
def get_question_type(question):
    found  = False
    result = 'other'
    question_tokens = nltk.word_tokenize(question)
    for token in question_tokens:
        if token in questionwords:
            found = True
            result = token
    if not found:
        for token in question_tokens:
            if token in passiveQuestions:
                found = True
                result = token

    return result

Get Keywords

In [12]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

POS = set(["NN","NNS","NNP","NNPS","CD","JJ","VB","VBD","VBG","VBN","VBP","VBZ"]) 

stopwords = set(nltk.corpus.stopwords.words('english')) 


def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,wn.NOUN)
    if (lemma == word):
        lemma = lemmatizer.lemmatize(word,wn.VERB)
        
    return lemma

def get_keyword(data):
    result = []
    sentence=data
    tokenized_text = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokenized_text)
    for text,pos in tagged:
        text = lemmatize(text.lower())
        if text not in stopwords:
            if pos in POS:
                result.append(text)
                
    return result

def get_keyword_paragraph(data):
    results=[]
    tokenized_sentence = tokenizer.tokenize(data)
    for sentence in tokenized_sentence:
        result = get_keyword(sentence)
        results.append(result)
        
    return results

def get_keyword_all(data):
    results=[]
    for sentence in data:
        result = get_keyword(sentence)
        results.append(result)
        
    return results

def add_keywords(df,feature):
    if 'keywords'+"_"+feature in df:
        df = df.drop('keywords'+"_"+feature, axis=1)
    df['keywords'+"_"+feature]=get_keyword_all(df[feature])
    return df

def get_number_of_common_kewyords(question_keywords,answer_sentence_keywords):
    sum_keywords=0
    for qkey in question_keywords:
        if qkey in answer_sentence_keywords:
            sum_keywords+=1
    
    return sum_keywords

<h4>Train a classifier</h4>

In [13]:
# BOW extraction for passages and questions
def get_passages_bow(passages):
    passage_bow={}
    for passage in passages:
        for token in nltk.word_tokenize(passage):
            if token not in stopwords: 
                word=lemmatize(token.lower())
                passage_bow[word] = passage_bow.get(word, 0) +  1
    
    return passage_bow

def get_sentences_bow(sentences):
    sentence_bow={}
    
    for sentence in sentences:
        for token in nltk.word_tokenize(sentence):
            if token not in stopwords:
                word=lemmatize(token.lower())
                sentence_bow[word] = sentence_bow.get(word, 0) +  1
    
    return sentence_bow

def get_question_bow(question):
    question_bow={}
    question_bow[get_question_type(question)]=1
    for token in nltk.word_tokenize(question):
        if token not in stopwords: 
            word=lemmatize(token.lower())
            question_bow[word] = question_bow.get(word, 0) +  1
                
    return question_bow

def get_training_question_bow(question,keywords,qt):
    question_bow={}
    question_bow[qt]=1
    for token in nltk.word_tokenize(question):
        if token not in stopwords: 
            word=lemmatize(token.lower())
            if word in keywords:
                question_bow[word] = question_bow.get(word, 0) +  1
                
    return question_bow

In [14]:
def get_feature_questions(questions, keywords,qt):
    qs = []
    for i,question in enumerate(questions):
        q_bow = get_training_question_bow(question,keywords,qt[i])
        qs.append(q_bow)
        
    return qs

In [52]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def check_results(predictions, classifications):
    print("Accuracy:")
    print(accuracy_score(classifications,predictions))
    print(classification_report(classifications,predictions))

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer


    
# get the most common words from answer sentences (we can twek this for paragraph)
answer_sentences_bow=get_sentences_bow(question_learning_dataset[question_learning_dataset['answer_found'].notnull()]['answer_found'])
answer_keywords = set([word for word, count in answer_sentences_bow.items()])

#qs_training=get_feature_questions(questions,answer_keywords)
qs_training=get_feature_questions(list(question_learning_dataset.question),answer_keywords,list(question_learning_dataset.question_type))




In [17]:
from sklearn.ensemble import RandomForestClassifier

if (len(qs_training)>0 and len(list(question_learning_dataset.question_type))>0):
    # fit vectorizer
    vectorizer = DictVectorizer()
    
    X_train_dtm = vectorizer.fit_transform(qs_training)
    
    

    model=RandomForestClassifier(n_estimators = 300, max_depth = 60, criterion = 'entropy')
    
    # tag the answers
    # fit a logistic regression model to the data 
    # build classifier
    #model = MultinomialNB(2, False, None)

    # train the model using X_train_dtm 
    model.fit(X_train_dtm, list(question_learning_dataset.answer_type))
    
    y_predicted_class = model.predict(X_train_dtm)
    
    check_results(y_predicted_class,list(question_learning_dataset.answer_type))

Accuracy:
0.8369626130814186
                   precision    recall  f1-score   support

   CAUSE_OF_DEATH       1.00      0.47      0.64       327
             CITY       1.00      0.08      0.15        12
          COUNTRY       0.94      0.54      0.69      1058
  CRIMINAL_CHARGE       1.00      0.33      0.49        64
             DATE       0.76      0.99      0.86      5801
         DURATION       0.96      0.64      0.77       464
         IDEOLOGY       1.00      0.56      0.71       232
         LOCATION       0.79      0.91      0.85      1738
             MISC       1.00      0.50      0.67       133
            MONEY       1.00      0.83      0.91       462
      NATIONALITY       0.99      0.44      0.61       858
           NUMBER       0.93      0.91      0.92      4644
          ORDINAL       1.00      0.65      0.79       406
     ORGANIZATION       1.00      0.51      0.68       496
          PERCENT       0.98      0.85      0.91       751
           PERSON       0.

<h3>2. Candidate answering generation</h3>

<h4> Get a score for the passage to filter the most relevant passages</h4>


In [18]:
## features relevant to this part
# number of named entities of the right type in the passage
# number of question keywords in the passage
# the longest exact sequence of question keywords
# rank of the document where the passage was extracted
# proximity of the keywords from the original query
# ngram overlap between the passage and the question

First, we will set up useful functions to extract term frequencies to build the vector space model

In [19]:
import nltk
from collections import defaultdict
from collections import Counter
from math import log

stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)


# get the terms for a passage
def get_terms(passage):
    terms = set()
    for token in nltk.word_tokenize(passage):
        if token not in stopwords: 
            terms.add(lemmatize(token.lower()))
    return terms
    
# get document_term 
def get_document_term_passsages(ds_documents):
    document_term={}
    passageID=0
    for index, row in ds_documents.iterrows():
        passageID=0
        terms={}
        # every row is a document
        list_of_passages=row['text']
        for passage in list_of_passages:
            terms[passageID]=get_terms(passage)
            passageID+=1
            
        document_term[row['docid']]=terms
    return document_term

# get the term frequency
def extract_term_freqs(doc):
    tfs = Counter()
    for token in doc:
        if token not in stopwords: 
            tfs[lemmatize(token.lower())] += 1
    return tfs
        
# compute idf
def compute_doc_freqs(doc_term_freqs):
    doc_dic = {}
    for key, value in doc_term_freqs.items():
        dfs = Counter()
        for passage_id,tfs in value.items():
            for term in tfs.keys():
                dfs[term] += 1
        doc_dic[key] = dfs
        
    return doc_dic
    

In [23]:
# create a document-term matrix
docs=get_document_term_passsages(df_docs)
#docs

In [24]:
# create a vector space model we need to define a score function
# first I will use tf-idf
doc_term_freqs = {}
for docid,dic_passages in docs.items():
    passage_dic = {}
    for passage_id, terms in dic_passages.items():
        term_freqs = extract_term_freqs(terms)
        passage_dic[passage_id] = term_freqs
    doc_term_freqs[docid] = passage_dic

doc_freqs = compute_doc_freqs(doc_term_freqs)


In [25]:
#doc_term_freqs

<b>Improvement:</b> Use BM25

Create an inverted index for query processing. Inverted index will not change from query to query. Here we can improve how the weight is defined for the posting list tuple for each term (docid,weight)

In [26]:
def count_words(freqs):
    p_count=0
    for counter in freqs.values():
        p_count+=sum(counter.values())
    
    #print(p_count)
    return p_count

In [314]:
## Code from WSTA_N16_information_retrieval
vsm_inverted_index_all = defaultdict()
for docid, passage_freqs in doc_term_freqs.items():
    vsm_inverted_index = defaultdict(list)
    
    #N = sum(passage_freqs.values())
    N = count_words(passage_freqs)
    #print(N,passage_freqs)
    for passage_id, term_freqs in passage_freqs.items():
        length = 0
        # find tf*idf values and accumulate sum of squares 
        tfidf_values = []
        M = len(passage_freqs)
        for term, count in term_freqs.items():
            tfidf = float(count) / N * log(M / float(doc_freqs[docid][term])) # should be number of documents (paragraphs) with term
            tfidf_values.append((term, tfidf))
            length += tfidf ** 2

        # normalise documents by length and insert into index
        length = length ** 0.5
        for term, tfidf in tfidf_values:
            # note the inversion of the indexing, to be term -> (doc_id, score)
            vsm_inverted_index[term].append([passage_id, tfidf / length])
    vsm_inverted_index_all[docid] = vsm_inverted_index

# ensure posting lists are in sorted order (less important here cf above)
for key, value in vsm_inverted_index_all.items():
    for term, docids in value.items():
        docids.sort()


In [20]:
vsm_inverted_index_all=load_obj('vsm_inverted_index_corpus')

Query the VSM creating a score for each document (passage) and returning the top k

In [21]:
from collections import Counter
# get a list of paragraphs ordered by relevance on the question
def query_vsm(query, index):
    accumulator = Counter()
    for term in query:
        postings = index[term]
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator

## end copied code

<h3>3. Candidate answering scoring</h3>

In [221]:
import re
def correct_answer_space(predicted,predicted_answer_sentence):
    
    tokens=nltk.word_tokenize(predicted)
    pattern='.*('
    for token in  tokens:
        pattern=pattern+token+'\s*'
    pattern=pattern+').*'

    reg=re.compile(pattern,re.IGNORECASE)
    if len(re.findall(reg,predicted_answer_sentence)):
        predicted=re.findall(reg,predicted_answer_sentence)[0].strip()
    
    return predicted

def correct_answer_pattern(predicted):
    corrected=predicted
    
    # symbol % based pattern
    pattern_percentaje='(.*[0-9])(%.*)'
    reg=re.compile(pattern_percentaje,re.IGNORECASE)
    result=re.findall(reg,predicted)
    if len(result)>0:
        groups=result[0]
        corrected=groups[0]+' '+groups[1]
    
    
    # date pattern
    #pattern_date='(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\s+(\d{1,2})(,.*)'
    pattern_date='(January|February|March|April|May|June|July|August|September|October|November|December)\s+([0-9]{1,2})(,.*)([0-9]{4})'


    reg=re.compile(pattern_date,re.IGNORECASE)
    result=re.findall(reg,predicted)

    if len(result)>0:
        groups=result[0]   
        corrected=(groups[0]+' '+groups[1]+' '+groups[2]+groups[3])


    return corrected
    
    
def isAnswerInSentence(answer,answer_sentence):
    #print('Eval:',answer)
    #print('In:',answer_sentence)
    tokens=nltk.word_tokenize(answer)
    pattern='.*('
    for token in  tokens:
        pattern=pattern+token+'\s*'
    pattern=pattern+').*'

    
    reg=re.compile(pattern,re.IGNORECASE)
    if len(re.findall(reg,answer_sentence))>0:
        return True
    
    return False




In [None]:
import time

#df_devel=pd.read_json('project_files/devel.json')
df_result_devel=pd.DataFrame(columns=['id','question','paragraph','retrieved paras','predicted_paragraph','paragraph_found','sentence','predicted_sentence','answer','predicted_answer'])
df_devel=df_devel.iloc[0:100]

for index, row in df_devel.iterrows():
    t=time.process_time()
    question=row['question']
    docid=row['docid']
    ida=index
    
    
    question_keywords=get_keyword(question)
    
    
    
    # get the most relevant documents for the question
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10) 
    
    
    # prediction --not necessary if loading from pickle
    q_bow=get_question_bow(question)
    x = vectorizer.transform(q_bow)
    answer_type=model.predict(x)
    #answer_type=row['predicted_answer_type']
    
    passages_dict={}
    sentences_dict={}
    score_dict={}
    answer=''
    df_best_sentences=pd.DataFrame(columns=['doc_id','para_id','sentence_id','sentence_text','score'])
    if len(documents_ranked)>0:
        
        for document in documents_ranked:
            # perform a paragraph segmentation
            paragraph_id=document[0]
            paragraph_text=df_docs.iloc[docid]['text'][paragraph_id]
            sentences_dict[paragraph_id]=tokenizer.tokenize(paragraph_text)
        
        score=0
        for paragraph_dict in sentences_dict.items():
            paragraph_id=paragraph_dict[0]
            sentences_list=paragraph_dict[1]
            
            for sentence_index in range(len(sentences_list)):
                NER_sentence=NER_corpus[docid][paragraph_id][sentence_index]
                common_keywords=get_number_of_common_kewyords(question_keywords,get_keyword(sentences_list[sentence_index]))
                similarity=nlp(question).similarity(nlp(sentences_list[sentence_index]))
                score=common_keywords/len(question_keywords)+similarity
                
                for entity in NER_sentence:
                    if (entity[1]==answer_type):
                        score+=1
                        break
                        
                df_best_sentences.loc[len(df_best_sentences)]=[docid,paragraph_id,sentence_index,sentences_list[sentence_index],score]
                
            
        
        
        if len(df_best_sentences)>0: # answer='' otherwise
                   
            best_paragraph_id=df_best_sentences.loc[df_best_sentences.score.idxmax()]['para_id']
            best_sentence_id=df_best_sentences.loc[df_best_sentences.score.idxmax()]['sentence_id']
            best_sentence_text=df_best_sentences.loc[df_best_sentences.score.idxmax()]['sentence_text']
            NER_answer_passage=NER_corpus[docid][best_paragraph_id][best_sentence_id]
            for entity in NER_answer_passage:
                if (entity[1]==answer_type):
                    #print('answer:',entity[0])
                    answer=correct_answer_pattern(entity[0])
                    #print('corrected:',answer)
                    break
                    
        
    ## only for testing purposes - get the answer sentence and check the retrieved paragraph agains the selected
    possible_par=[par[0] for par in documents_ranked]
    par_retrieved=False
    if row['answer_paragraph'] in possible_par:
        par_retrieved=True
        
    sent_ans=''
    
    
    for para in df_docs.iloc[docid]['text']:
        sent_doc=tokenizer.tokenize(para)
        #print(sent_doc)
        for sent in sent_doc:
            
            if isAnswerInSentence(row['text'],sent):       
                sent_ans=sent_doc
                break
    ## END -only for testing purposes
    
    print(ida,time.process_time()-t)
    df_result_devel.loc[len(df_result_devel)]=[docid,question,row['answer_paragraph'],possible_par,best_paragraph_id,par_retrieved,sent_ans,best_sentence_text,row['text'],answer]

print('done')

In [None]:
df_best_sentences

In [232]:
print('Accuracy - paragraph found:',df_result_devel.loc[df_result_devel['paragraph_found']==True]['id'].count()/len(df_result_devel)*100,'%')
print('Accuracy - sentence predicted',len(df_result_devel.loc[df_result_devel.paragraph==df_result_devel.predicted_paragraph])/len(df_result_devel)*100,'%')
print('Accuracy - answer found:',len(df_result_devel.loc[df_result_devel.predicted_answer==df_result_devel.answer])/len(df_result_devel)*100,'%')

Accuracy - paragraph found: 99.0 %
Accuracy - sentence predicted 57.99999999999999 %
Accuracy - answer found: 17.0 %


In [None]:
df_result_devel

<h3>Testing Dataset</h3>

In [None]:
df_result=pd.DataFrame(columns=['id','answer'])
df_testing=pd.read_json('project_files/testing.json')
#df_testing=df_testing.iloc[0:2]
NER_dict={}
for index, row in df_testing.iterrows():
    t=time.process_time()
    question=row['question']
    docid=row['docid']
    ida=row['id']
    question_keywords=get_keyword(question)
    
    
    
    # get the most relevant documents for the question
    results = query_vsm(question_keywords, vsm_inverted_index_all[docid])
    documents_ranked=results.most_common(10) 
    
    
    # prediction --not necessary if loading from pickle
    q_bow=get_question_bow(question)
    x = vectorizer.transform(q_bow)
    answer_type=model.predict(x)
    
    passages_dict={}
    sentences_dict={}
    score_dict={}
    answer=''
    df_best_sentences=pd.DataFrame(columns=['doc_id','para_id','sentence_id','sentence_text','score'])
    if len(documents_ranked)>0:
        
        for document in documents_ranked:
            # perform a paragraph segmentation
            paragraph_id=document[0]
            paragraph_text=df_docs.iloc[docid]['text'][paragraph_id]
            sentences_dict[paragraph_id]=tokenizer.tokenize(paragraph_text)
        
        score=0
        for paragraph_dict in sentences_dict.items():
            paragraph_id=paragraph_dict[0]
            sentences_list=paragraph_dict[1]
            
            for sentence_index in range(len(sentences_list)):
                NER_sentence=NER_corpus[docid][paragraph_id][sentence_index]
                common_keywords=get_number_of_common_kewyords(question_keywords,get_keyword(sentences_list[sentence_index]))
                similarity=nlp(question).similarity(nlp(sentences_list[sentence_index]))
                score=common_keywords/len(question_keywords)+similarity
            
                for entity in NER_sentence:
                    if (entity[1]==answer_type):
                        score+=1
                        break
                        
                df_best_sentences.loc[len(df_best_sentences)]=[docid,paragraph_id,sentence_index,sentences_list[sentence_index],score]
                
            
        
        
        if len(df_best_sentences)>0: # answer='' otherwise
                   
            best_paragraph_id=df_best_sentences.loc[df_best_sentences.score.idxmax()]['para_id']
            best_sentence_id=df_best_sentences.loc[df_best_sentences.score.idxmax()]['sentence_id']
            best_sentence_text=df_best_sentences.loc[df_best_sentences.score.idxmax()]['sentence_text']
            NER_answer_passage=NER_corpus[docid][best_paragraph_id][best_sentence_id]
            for entity in NER_answer_passage:
                if (entity[1]==answer_type):
                    answer=correct_answer_pattern(entity[0])
                    break
                    
            
                    
        
            
    if (answer==''):
        
        for entity in NER_answer_passage:
            answer=correct_answer(entity[0],best_sentence_text)
            break
                    
    print(ida,time.process_time()-t)
    df_result.loc[len(df_result)]=[ida,answer]
print('done')

In [234]:
df_result.to_csv('prediction/output.csv',index=False)

In [216]:
df_last_result=df_result.copy()

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn import svm
import time
import numpy as np

df_training=df_training[0:200]

    
def get_answer_rank_features_train():
    X=[]
    Y=[]
    t=time.process_time()
    for index, row in df_training.iterrows():
        question=row['question']
        raw_answer=row['text']
        answer_sentence=row['answer_found']
        paragraph=df_docs.iloc[row['docid']]['text'][row['answer_paragraph']]
        common_entities=0
        
        if (answer_sentence != None):
            
            #nlp
            question_nlp=nlp(question)
            answer_sentence_nlp=nlp(answer_sentence)

            for entity in answer_sentence_nlp.sents:
                if entity in question_nlp.ents:
                    common_entities+=1


            # number of question keywords in the passage
            question_keywords=get_keyword(question)
            answer_sentence_keywords=get_keyword(answer_sentence)
            common_keywords=0
            for qk in question_keywords:
                if qk in answer_sentence_keywords:
                    common_keywords+=1


            # longest exact sequence of keywords
            longest_exact_sequence=0
            for i in range(len(question_keywords)):
                if i < len(answer_sentence_keywords):
                    if question_keywords[i] in answer_sentence_keywords[i]:
                        longest_exact_sequence+=1


            # similarity 
            similarity=question_nlp.similarity(answer_sentence_nlp)

            # rank of the paragraph where the answer sentence was extracted
            results = query_vsm(question_keywords, vsm_inverted_index_all[row['docid']])
            documents_ranked=results.most_common(10) 
            rank_of_paragraph=0
            for document in documents_ranked:
                if (document[0]==row['answer_paragraph']):
                    break
                else:
                    rank_of_paragraph+=1

            # proximity
            proximity=0
            question_keywords_span=question_keywords.copy()
            index_qk=0
            while len(question_keywords_span)>0 and index_qk<len(question_keywords_span):
                proximity+=1
                if question_keywords_span[index_qk] in answer_sentence_keywords:
                    question_keywords_span.pop(index_qk)
                index_qk+=1

            # n-gram overlap
            bigrams_question =  nltk.bigrams([lemmatize(token) for token in nltk.word_tokenize(question)])
            ngram_overlap=0

            for bigram_question in bigrams_question:
                bigrams_sentence = nltk.bigrams([lemmatize(token) for token in nltk.word_tokenize(answer_sentence)])
                for bigram_sentence in bigrams_sentence:
                    if bigram_question == bigram_sentence:
                        ngram_overlap+=1



            #print (num_entities,num_qkp,longest_exact_sequence,rank_of_paragraph)

            tokenized_sentence = tokenizer.tokenize(paragraph)
            for sentence in tokenized_sentence:
                X.append([common_entities,common_keywords,longest_exact_sequence,rank_of_paragraph,proximity,ngram_overlap,similarity])
                #print(sentence)
                if (sentence==answer_sentence):
                    Y.append(1)
                else:
                    Y.append(0)

        print(index,time.process_time()-t)
    
    return X,Y

X_train,y_train=get_answer_rank_features_train()

# fit the model and get the separating hyperplane using weighted classes
wclf = svm.SVC(kernel='linear', class_weight={1:4})
wclf.fit(X, y)

y_predicted_class = wclf.predict(X)


classifications=y
predictions=y_predicted_class

print("Accuracy:")
print(accuracy_score(classifications,predictions))
print(classification_report(classifications,predictions))

In [None]:
X=[]
y=[]
t=time.process_time()
df_devel=df_devel[0:50]
for index, row in df_devel.iterrows():
    question=row['question']
    raw_answer=row['text']
    answer_sentence=row['answer_found']
    paragraph=df_docs.iloc[row['docid']]['text'][row['answer_paragraph']]
    common_entities=0

    if (answer_sentence != None):

        #nlp
        question_nlp=nlp(question)
        answer_sentence_nlp=nlp(answer_sentence)

        for entity in answer_sentence_nlp.sents:
            if entity in question_nlp.ents:
                common_entities+=1


        # number of question keywords in the passage
        question_keywords=get_keyword(question)
        answer_sentence_keywords=get_keyword(answer_sentence)
        common_keywords=0
        for qk in question_keywords:
            if qk in answer_sentence_keywords:
                common_keywords+=1


        # longest exact sequence of keywords
        longest_exact_sequence=0
        for i in range(len(question_keywords)):
            if i < len(answer_sentence_keywords):
                if question_keywords[i] in answer_sentence_keywords[i]:
                    longest_exact_sequence+=1


        # similarity 
        similarity=question_nlp.similarity(answer_sentence_nlp)

        # rank of the paragraph where the answer sentence was extracted
        results = query_vsm(question_keywords, vsm_inverted_index_all[row['docid']])
        documents_ranked=results.most_common(10) 
        rank_of_paragraph=0
        for document in documents_ranked:
            if (document[0]==row['answer_paragraph']):
                break
            else:
                rank_of_paragraph+=1

        # proximity
        proximity=0
        question_keywords_span=question_keywords.copy()
        index_qk=0
        while len(question_keywords_span)>0 and index_qk<len(question_keywords_span):
            proximity+=1
            if question_keywords_span[index_qk] in answer_sentence_keywords:
                question_keywords_span.pop(index_qk)
            index_qk+=1

        # n-gram overlap
        bigrams_question =  nltk.bigrams([lemmatize(token) for token in nltk.word_tokenize(question)])
        ngram_overlap=0

        for bigram_question in bigrams_question:
            bigrams_sentence = nltk.bigrams([lemmatize(token) for token in nltk.word_tokenize(answer_sentence)])
            for bigram_sentence in bigrams_sentence:
                if bigram_question == bigram_sentence:
                    ngram_overlap+=1



       

        tokenized_sentence = tokenizer.tokenize(paragraph)
        for sentence in tokenized_sentence:
            X.append([common_entities,common_keywords,longest_exact_sequence,rank_of_paragraph,proximity,ngram_overlap,similarity])
            #print(sentence)
            if (sentence==answer_sentence):
                y.append(1)
            else:
                y.append(0)
                
                

y_predicted_class = wclf.predict(X)


classifications=y
predictions=y_predicted_class

print("Accuracy:")
print(accuracy_score(classifications,predictions))
print(classification_report(classifications,predictions))