In [1]:
import json,os,time
import spacy
import csv
import unicodedata
from collections import Counter
from whoosh.qparser import QueryParser
from whoosh import scoring,qparser
from whoosh.index import open_dir
from whoosh.collectors import TimeLimitCollector, TimeLimit
import warnings
warnings.simplefilter("ignore", UserWarning)
nlp = spacy.load("en_core_web_sm")
nlp_large = spacy.load("en_core_web_lg")

In [28]:
'''
These functions are used to extract key information in the claim to make it as a query.
'''
def spacy_ner(string):
    """Extract named-entity search terms from ``string``.

    PERSON entities contribute one term per token. Entities labelled LOC,
    EVENT, WORK_OF_ART, NORP, FAC, ORG, GPE, LAW, LANGUAGE or PRODUCT
    contribute a single term made of their non-stop-word, non-punctuation
    tokens joined by spaces. Entities with any other label are ignored.

    Returns:
        list of str, in the order the entities appear in the parsed text.
        Entities that consist only of stop words / punctuation are skipped,
        so the list never contains an empty string.
    """
    phrase_labels = ('LOC', 'EVENT', 'WORK_OF_ART', 'NORP', 'FAC', 'ORG',
                     'GPE', 'LAW', 'LANGUAGE', 'PRODUCT')
    doc = nlp(string)
    result = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            # every token of a person's name becomes its own term
            result.extend(str(token) for token in ent)
        elif ent.label_ in phrase_labels:
            pieces = [
                str(token).strip()
                for token in ent
                if not token.is_stop and not token.is_punct
            ]
            text = ' '.join(piece for piece in pieces if piece)
            # An entity such as "the" would otherwise yield '' and end up as
            # an empty quoted term in the search query.
            if text:
                result.append(text)
    return result
    
def spacy_pos(string):
    """Group consecutive nouns / proper nouns of ``string`` into phrases.

    Tokens that are NOUN or PROPN and neither stop words nor punctuation are
    accumulated into a space-separated phrase. A token whose shape starts
    with an uppercase letter begins a new phrase when one is already open;
    any other kind of token closes the open phrase.

    Returns:
        list of str phrases in document order.
    """
    phrases = []
    current = ''
    for tok in nlp(string):
        is_content_noun = (
            tok.pos_ in ['PROPN', 'NOUN']
            and not tok.is_stop
            and not tok.is_punct
        )
        if is_content_noun:
            if tok.shape_[0] == 'X' and current != '':
                # capitalised token: flush what we have and start over
                phrases.append(current)
                current = str(tok)
            else:
                current = (current + ' ' + str(tok)).strip()
            continue
        # non-noun token terminates the running phrase
        if current != '':
            phrases.append(current)
            current = ''
    if current != '':
        phrases.append(current)
    return phrases
    
def spacy_chunk(string):
    """Build search phrases from the noun chunks of ``string``.

    Within each noun chunk, stop words and punctuation are dropped and the
    remaining tokens are joined with spaces. A token whose shape starts with
    an uppercase letter starts a new phrase if one is already in progress.

    Returns:
        list of str phrases in document order.
    """
    phrases = []
    for noun_chunk in nlp(string).noun_chunks:
        current = ''
        for tok in noun_chunk:
            if tok.is_stop or tok.is_punct:
                continue
            if tok.shape_[0] == 'X' and current != '':
                # capitalised token splits the chunk
                phrases.append(current)
                current = str(tok)
            else:
                current = (current + ' ' + str(tok)).strip()
        if current != '':
            phrases.append(current)
    return phrases

def spacy_pos_single(string):
    """Return the text of every capitalised noun / proper noun in ``string``.

    A token is kept when it is NOUN or PROPN, its shape starts with an
    uppercase letter, and it is neither a stop word nor punctuation.
    """
    return [
        tok.text
        for tok in nlp(string)
        if tok.pos_ in ['PROPN', 'NOUN']
        and tok.shape_[0] == 'X'
        and not tok.is_stop
        and not tok.is_punct
    ]

def spacy_pos_final(string):
    """Return single-token search terms from ``string``.

    Kept tokens (stop words and punctuation are always excluded):
      * ADJ / INTJ / NUM / VERB / PRON tokens whose shape starts with an
        uppercase letter;
      * every NOUN / PROPN token.

    Returns:
        list of token texts in document order.
    """
    capitalised_only = ['ADJ', 'INTJ', 'NUM', 'VERB', 'PRON']
    always = ['PROPN', 'NOUN']
    result = []
    for tok in nlp(string):
        if tok.pos_ in capitalised_only:
            keep = tok.shape_[0] == 'X' and not tok.is_stop and not tok.is_punct
        elif tok.pos_ in always:
            keep = not tok.is_stop and not tok.is_punct
        else:
            keep = False
        if keep:
            result.append(tok.text)
    return result

def spacy_chunk_single(string):
    """Return every non-stop-word, non-punctuation token found in the noun
    chunks of ``string``, one list entry per token, in document order."""
    return [
        str(tok)
        for noun_chunk in nlp(string).noun_chunks
        for tok in noun_chunk
        if not tok.is_stop and not tok.is_punct
    ]

In [3]:
#For solving encoding issue in this project
import unicodedata

def nfd(string):
    """Return ``string`` in Unicode canonical decomposition form (NFD)."""
    decomposed = unicodedata.normalize('NFD', string)
    return decomposed

def nfc(string):
    """Return ``string`` in Unicode canonical composition form (NFC)."""
    composed = unicodedata.normalize('NFC', string)
    return composed

In [4]:
#A method to calculate the similarity between two sentences
#spacy similarity
def similarity_spacy_large(sentence1,sentence2):
    """Return the spaCy similarity between two sentences.

    Both sentences are parsed with the module-level ``nlp_large`` pipeline
    and compared with ``Doc.similarity``.
    """
    return nlp_large(sentence1).similarity(nlp_large(sentence2))

In [5]:
#A method to calculate the similarity between two sentences
#gensim - word2vec - average weight

import numpy as np
import scipy
from scipy.linalg import norm
from gensim.models import Word2Vec

def vector_similarity(claim, sentence):
    """Cosine similarity between the averaged word vectors of two texts.

    A fresh Word2Vec model is trained on just the lower-cased,
    whitespace-split words of ``sentence`` and ``claim``; each text is then
    represented by the mean of its word vectors.

    Args:
        claim: claim text.
        sentence: candidate sentence text.

    Returns:
        The cosine of the two mean vectors, or ``0.0`` when either text has
        no words or one of the mean vectors has zero length.
    """
    claim_words = claim.lower().split()
    sentence_words = sentence.lower().split()
    if not claim_words or not sentence_words:
        # Nothing to average (and possibly nothing to build a vocabulary from).
        return 0.0

    text_dict = [sentence_words + claim_words]
    model = Word2Vec(min_count=1)
    model.build_vocab(text_dict)  # prepare the model vocabulary
    model.train(text_dict, total_examples=model.corpus_count, epochs=model.epochs)  # train word vectors

    def sentence_vector(words):
        # Word vectors live on ``model.wv``; size follows the model rather
        # than a hard-coded 100.
        v = np.zeros(model.wv.vector_size)
        for word in words:
            v += model.wv[word]
        v /= len(words)
        return v

    v1, v2 = sentence_vector(claim_words), sentence_vector(sentence_words)
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

In [6]:
#A method to calculate the similarity between two sentences
#gensim - word2vec -cosine

import numpy as np
import scipy
from gensim.models import Word2Vec

def cosine_similarity(claim, sentence):
    """Cosine similarity of two texts using mean Word2Vec vectors.

    A fresh Word2Vec model is trained on just the lower-cased,
    whitespace-split words of ``sentence`` and ``claim``; each text is
    represented by the mean of its word vectors and the result is
    ``1 - scipy.spatial.distance.cosine(v_claim, v_sentence)``.

    Returns:
        The similarity, or ``0.0`` when either text has no words.
    """
    # Imported here so the submodule is loaded even if only ``import scipy``
    # was executed at module level.
    from scipy.spatial import distance

    claim_words = claim.lower().split()
    sentence_words = sentence.lower().split()
    if not claim_words or not sentence_words:
        return 0.0

    text_dict = [sentence_words + claim_words]
    model = Word2Vec(min_count=1)
    model.build_vocab(text_dict)  # prepare the model vocabulary
    model.train(text_dict, total_examples=model.corpus_count, epochs=model.epochs)  # train word vectors

    # Word vectors are exposed through ``model.wv``.
    vector_1 = np.mean([model.wv[word] for word in claim_words], axis=0)
    vector_2 = np.mean([model.wv[word] for word in sentence_words], axis=0)
    return 1 - distance.cosine(vector_1, vector_2)

In [7]:
#A method to rank the sentences by the similarity with the claim
#gensim - lsi
#many evidence input to text_dict

from gensim import corpora, models, similarities
from gensim import models

def gensim_lsi(claim,sentences,num_topics=2):
    """Rank ``sentences`` by LSI similarity to ``claim``.

    The sentences are lower-cased and whitespace-tokenised, turned into a
    bag-of-words corpus, and projected with an LSI model; the claim is
    projected the same way and compared against every sentence.

    Note: the LSI model is fitted on raw bag-of-words counts. No tf-idf
    weighting is applied (the original built a ``TfidfModel`` but never used
    it, so it has been dropped as dead work).

    Args:
        claim: claim text.
        sentences: list of candidate sentence strings.
        num_topics: number of LSI topics (default 2, the original value).

    Returns:
        list of int: indices into ``sentences`` ordered from most to least
        similar. Empty when ``sentences`` is empty.
    """
    if len(sentences) == 0:
        return []

    text_dict = [sentence.lower().split() for sentence in sentences]
    dictionary = corpora.Dictionary(text_dict)
    corpus = [dictionary.doc2bow(text) for text in text_dict]
    lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)

    query_vec = lsi_model[dictionary.doc2bow(claim.lower().split())]
    index = similarities.MatrixSimilarity(lsi_model[corpus])
    sims = index[query_vec]

    # (document_number, similarity) pairs, best first; ties keep input order
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
    return [document_number for document_number, _ in ranked]

In [8]:
'''
A dumb function that searches the topN documents by the input claim.
return a dict in form: value = {"claim":claim, "label":"SUPPORTS", "evidence":evidence}
where the evidence is a list, in which every doc's first sentence is taken.
transformer is the function used to extract query terms from the claim, e.g. spacy_pos
'''
def search_by_query_list(claim, transformer, topN, parser, searcher):
    """Search the index for documents matching ``claim``.

    ``transformer(claim)`` yields the query terms; they are single-quoted and
    OR-ed together, parsed with ``parser`` and run on ``searcher`` under a
    30 second time limit (partial results are used if the limit is hit).

    Returns:
        ``{"claim", "label", "evidence"}``. With no query terms the label is
        "NOT ENOUGH INFO" and evidence is empty. Otherwise the label is
        always "SUPPORTS" and evidence holds, for each of the first ``topN``
        hits, ``[NFD-normalised title, int(second whitespace-separated field
        of the first content line)]``.
    """
    terms = transformer(claim)
    if len(terms) == 0:
        return {"claim":claim, "label":"NOT ENOUGH INFO", "evidence":[]}

    # 'term one' OR 'term two' OR ...
    query_text = " OR ".join("'" + term + "'" for term in terms)
    query = parser.parse(query_text)

    collector = TimeLimitCollector(
        searcher.collector(limit=topN, optimize=False), timelimit=30
    )
    try:
        searcher.search_with_collector(query, collector)
    except TimeLimit:
        # keep whatever was collected before the timeout
        pass
    hits = collector.results()

    evidence = []
    for rank in range(min(topN, len(hits))):
        hit = hits[rank]
        first_line = hit['content'].split("\n")[0]
        number = int(first_line.split()[1])
        evidence.append([unicodedata.normalize('NFD', hit['title']), number])

    return {"claim":claim, "label":"SUPPORTS", "evidence":evidence}

In [33]:
'''
The function searches the top N documents for document recall test
Input is json file containing claim
Output file is a corresponding json in standard format.
'''
def find_doc(openfile, savefile, transfomer, topN):
    """Run document retrieval for every claim in ``openfile``.

    Args:
        openfile: path of a JSON file mapping claim id -> object with a
            ``'claim'`` entry.
        savefile: path of the JSON file to write; maps each claim id to the
            dict returned by ``search_by_query_list``.
        transfomer: function turning a claim into a list of query terms.
        topN: maximum number of documents retrieved per claim.

    The Whoosh index is read from ``"indexdir"`` and queried on the
    ``content`` field with BM25F.
    """
    ix = open_dir("indexdir")
    searcher = ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2))
    try:
        parser = QueryParser(None, ix.schema)
        parser.add_plugin(qparser.MultifieldPlugin(["content"]))

        with open(openfile,'r') as f:
            load_dict = json.load(f)

        # Stream one entry per line. The separator is written *before* every
        # entry but the first, so no trailing comma has to be trimmed
        # afterwards. (The previous seek(-3)/truncate trick assumed a 3-byte
        # ",\r\n" tail; with "\n" line endings it cut off the last entry's
        # closing brace, and it raised on an empty input.)
        with open(savefile,"w",encoding='ascii') as dump_f:
            dump_f.write('{')
            for position, (claim_number, val) in enumerate(load_dict.items()):
                value = search_by_query_list(val['claim'], transfomer, topN, parser, searcher)
                if position > 0:
                    dump_f.write(',\n')
                dump_f.write(json.dumps(claim_number)+':'+json.dumps(value))
            dump_f.write('\n}')
    finally:
        # release the index reader even if a search or write fails
        searcher.close()

In [10]:
'''
Find required number(specified by sentence_number) of most relevant sentences with the claim
sentence processed by simple similarity
sim_matrix is a function that takes one pair of sentences as input, e.g. similarity_spacy_large

'''
def search_sentence_simple(claim, transformer, topN, parser, searcher, sim_matrix, sentence_number):
    """Retrieve documents for ``claim`` and pick its most similar sentences.

    Query terms come from ``transformer(claim)``; they are single-quoted,
    OR-ed, parsed and searched with a 30 second time limit. Every non-empty
    line of each of the first ``topN`` hits' ``content`` is split on single
    spaces into ``identifier``, integer ``pagenumber`` and sentence text, and
    scored with ``sim_matrix(sentence, claim)``.

    Returns:
        ``{"claim", "label", "evidence"}``: "NOT ENOUGH INFO" with empty
        evidence when there are no query terms, otherwise "SUPPORTS" with the
        ``sentence_number`` best-scoring ``[NFD identifier, pagenumber]``
        pairs.
    """
    terms = transformer(claim)
    if len(terms) == 0:
        return {"claim":claim, "label":"NOT ENOUGH INFO", "evidence":[]}

    # 'term one' OR 'term two' OR ...
    query = parser.parse(" OR ".join("'" + term + "'" for term in terms))

    collector = TimeLimitCollector(
        searcher.collector(limit=topN, optimize=False), timelimit=30
    )
    try:
        searcher.search_with_collector(query, collector)
    except TimeLimit:
        # keep whatever was collected before the timeout
        pass
    hits = collector.results()

    scores = Counter()
    for rank in range(min(topN, len(hits))):
        for raw_line in hits[rank]['content'].split("\n"):
            if raw_line == "":
                continue
            fields = raw_line.split(" ")
            identifier = fields[0]
            pagenumber = int(fields[1])
            sentence = " ".join(fields[2:])
            score = sim_matrix(sentence, claim)
            scores.update({(nfd(identifier), pagenumber): score})

    evidence = [list(key) for key, _score in scores.most_common(sentence_number)]
    return {"claim":claim, "label":"SUPPORTS", "evidence":evidence}

In [11]:
'''
Find required number(specified by sentence_number) of most relevant sentences with the claim
sentence processed by complex similarity(tf-idf)
sim_matrix is a function that takes a claim and a list of sentences as input, e.g. gensim_lsi

'''
def search_sentence_complex(claim, transformer, topN, parser, searcher, sim_matrix, sentence_number):
    """Retrieve documents for ``claim`` and rank all their sentences at once.

    Query terms come from ``transformer(claim)``; they are single-quoted,
    OR-ed, parsed and searched with a 30 second time limit. Every non-empty
    content line of the first ``topN`` hits is split on single spaces into
    ``identifier``, integer ``pagenumber`` and sentence text. The whole
    sentence list is then handed to ``sim_matrix(claim, sentences)``, which
    must return sentence indices ordered best-first.

    Returns:
        ``{"claim", "label", "evidence"}``: "NOT ENOUGH INFO" with empty
        evidence when there are no query terms or no sentences, otherwise
        "SUPPORTS" with up to ``sentence_number``
        ``[NFD identifier, pagenumber]`` pairs.
    """
    terms = transformer(claim)
    if len(terms) == 0:
        return {"claim":claim, "label":"NOT ENOUGH INFO", "evidence":[]}

    # 'term one' OR 'term two' OR ...
    query = parser.parse(" OR ".join("'" + term + "'" for term in terms))

    collector = TimeLimitCollector(
        searcher.collector(limit=topN, optimize=False), timelimit=30
    )
    try:
        searcher.search_with_collector(query, collector)
    except TimeLimit:
        # keep whatever was collected before the timeout
        pass
    hits = collector.results()

    locations = {}   # sentence index -> [identifier, pagenumber]
    sentences = []
    for rank in range(min(topN, len(hits))):
        for raw_line in hits[rank]['content'].split("\n"):
            if raw_line == "":
                continue
            fields = raw_line.split(" ")
            pagenumber = int(fields[1])
            locations[len(sentences)] = [nfd(fields[0]), pagenumber]
            sentences.append(" ".join(fields[2:]))

    if len(sentences) == 0:
        return {"claim":claim, "label":"NOT ENOUGH INFO", "evidence":[]}

    ranking = sim_matrix(claim, sentences)
    evidence = [
        locations[ranking[position]]
        for position in range(min(len(ranking), sentence_number))
    ]
    return {"claim":claim, "label":"SUPPORTS", "evidence":evidence}

In [12]:
'''
Function that calls the above two methods to process a file.
Output is the sentences searched by the similarity matrix.
'''

def find_sentence(openfile, savefile, transformer, topN, sim_type, sim_matrix, sentence_number):
    """Run sentence retrieval for every claim in ``openfile``.

    Args:
        openfile: path of a JSON file mapping claim id -> object with a
            ``'claim'`` entry.
        savefile: path of the JSON file to write; maps each claim id to the
            dict returned by ``sim_type``.
        transformer: function turning a claim into a list of query terms.
        topN: maximum number of documents retrieved per claim.
        sim_type: search routine called as ``sim_type(claim, transformer,
            topN, parser, searcher, sim_matrix, sentence_number)``.
        sim_matrix: similarity function forwarded to ``sim_type``.
        sentence_number: number of sentences to keep per claim.

    The Whoosh index is read from ``"indexdir"`` and queried on the
    ``title_remove_underline`` field with BM25F.
    """
    ix = open_dir("indexdir")
    searcher = ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2))
    try:
        parser = QueryParser(None, ix.schema)
        parser.add_plugin(qparser.MultifieldPlugin(["title_remove_underline"]))

        with open(openfile,'r') as f:
            load_dict = json.load(f)

        # Stream one entry per line. The separator is written *before* every
        # entry but the first, so no trailing comma has to be trimmed
        # afterwards. (The previous seek(-3)/truncate trick assumed a 3-byte
        # ",\r\n" tail; with "\n" line endings it cut off the last entry's
        # closing brace, and it raised on an empty input.)
        with open(savefile,"w",encoding='ascii') as dump_f:
            dump_f.write('{')
            for position, (claim_number, val) in enumerate(load_dict.items()):
                value = sim_type(val['claim'], transformer, topN, parser, searcher, sim_matrix, sentence_number)
                if position > 0:
                    dump_f.write(',\n')
                dump_f.write(json.dumps(claim_number)+':'+json.dumps(value))
            dump_f.write('\n}')
    finally:
        # release the index reader even if a search or write fails
        searcher.close()

In [13]:
'''
For training the data, create train.tsv and evaluation.tsv for the model 1 (first layer) used in BERT
'''

import csv
def create_evidence_train_tsv(train_file,tsv_file,transformer, topN, sim_matrix, sentence_number):
    """Write the sentence-selection training TSV (model 1).

    For every claim in ``train_file`` (JSON: claim id -> object with
    ``'claim'`` and ``'evidence'``), documents are retrieved from the
    ``"indexdir"`` index on the ``title_remove_underline`` field, every
    sentence of the first ``topN`` hits is scored with
    ``sim_matrix(sentence, claim)``, and the ``sentence_number`` best ones
    are written as rows::

        claim_id, identifier, pagenumber, 'yes'|'no', claim, sentence

    where the flag is 'yes' when ``[identifier, pagenumber]`` appears in the
    claim's gold evidence. Claims that yield no query terms are skipped.
    """
    ix = open_dir("indexdir")
    searcher = ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2))
    try:
        parser = QueryParser(None, ix.schema)
        parser.add_plugin(qparser.MultifieldPlugin(["title_remove_underline"]))

        with open(train_file,'r') as f, \
                open(tsv_file,"w",newline='',encoding='utf-8') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            load_dict = json.load(f)
            for claim_number, val in load_dict.items():
                claim = val['claim']
                true_evidence = val['evidence']

                query_str_list = transformer(claim)
                if len(query_str_list) == 0:
                    continue

                # 'term one' OR 'term two' OR ...
                query_str = " OR ".join("'" + term + "'" for term in query_str_list)
                query = parser.parse(query_str)

                # search with collector; partial results are used on timeout
                my_collector = searcher.collector(limit=topN,optimize=False)
                tlc = TimeLimitCollector(my_collector, timelimit=30)
                try:
                    searcher.search_with_collector(query,tlc)
                except TimeLimit:
                    pass
                results = tlc.results()

                sim_counter = Counter()
                for i in range(min(topN,len(results))):
                    for line in results[i]['content'].split("\n"):
                        if line == "":
                            continue
                        fields = line.split(" ")
                        identifier = fields[0]
                        pagenumber = int(fields[1])
                        sentence = " ".join(fields[2:])
                        score = sim_matrix(sentence,claim)
                        sim_counter.update({(nfd(identifier),pagenumber,sentence):score})

                for (identifier, pagenumber, sentence), _score in sim_counter.most_common(sentence_number):
                    is_gold = 'yes' if [identifier, pagenumber] in true_evidence else 'no'
                    tsv_writer.writerow([claim_number, identifier, pagenumber, is_gold, claim, sentence])
    finally:
        # release the index reader even if a search or write fails
        searcher.close()

In [14]:
'''
function used to create training file for second layer(label judgment).
tsv_file_1 includes all the claims by "SUPPORTS" and "REFUTES" labels.
tsv_file_2 includes all the claims by "NOT ENOUGH INFO" labels.
'''
def create_label_train_tsv(train_file,tsv_file_1,tsv_file_2,transformer, topN, sim_matrix, sentence_number):
    """Write the label-classification training TSVs (model 2).

    ``train_file`` is JSON mapping claim id -> object with ``'claim'``,
    ``'label'`` and ``'evidence'``.

    * Claims labelled "SUPPORTS" or "REFUTES": each gold evidence
      ``[title, pagenumber]`` is looked up by ``title`` (NFC-normalised) in
      the ``"indexdir"`` index; the matching sentence of the top hit is
      appended (each prefixed by a space) and one row
      ``claim_id, label, claim, sentences`` is written to ``tsv_file_1``.
    * Any other label: documents are retrieved on
      ``title_remove_underline`` using ``transformer(claim)``, sentences are
      scored with ``sim_matrix(sentence, claim)`` and the
      ``sentence_number`` best are written to ``tsv_file_2`` as
      ``claim_id, identifier, pagenumber, 'unknown', claim, sentence``.
      Claims with no query terms are skipped.
    """
    ix = open_dir("indexdir")
    searcher = ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2))
    try:
        parser = QueryParser(None, ix.schema)
        parser.add_plugin(qparser.MultifieldPlugin(["title_remove_underline"]))
        parser2 = QueryParser(None, ix.schema)
        parser2.add_plugin(qparser.MultifieldPlugin(["title"]))

        with open(train_file,'r') as f, \
                open(tsv_file_1,"w",newline='',encoding='utf-8') as out_file_1, \
                open(tsv_file_2,"w",newline='',encoding='utf-8') as out_file_2:
            tsv_writer_1 = csv.writer(out_file_1, delimiter='\t')
            tsv_writer_2 = csv.writer(out_file_2, delimiter='\t')
            load_dict = json.load(f)
            for claim_number, val in load_dict.items():
                claim = val['claim']
                true_label = val['label']
                true_evidence = val['evidence']

                if true_label == "SUPPORTS" or true_label == "REFUTES":
                    sentence_combination = ""
                    for one_evidence in true_evidence:
                        query_pagenumber = one_evidence[1]
                        query = parser2.parse(nfc(one_evidence[0]))

                        # only the best-matching document is inspected
                        my_collector = searcher.collector(limit=1,optimize=False)
                        tlc = TimeLimitCollector(my_collector, timelimit=30)
                        try:
                            searcher.search_with_collector(query,tlc)
                        except TimeLimit:
                            pass
                        results = tlc.results()
                        if len(results) == 0:
                            continue

                        for line in results[0]['content'].split("\n"):
                            if line == "":
                                continue
                            fields = line.split(" ")
                            if int(fields[1]) == query_pagenumber:
                                sentence_combination = sentence_combination+" "+" ".join(fields[2:])
                                break
                    tsv_writer_1.writerow([claim_number,true_label, claim, sentence_combination])
                    continue

                # the label is "NOT ENOUGH INFO"
                query_str_list = transformer(claim)
                if len(query_str_list) == 0:
                    continue

                # 'term one' OR 'term two' OR ...
                query_str = " OR ".join("'" + term + "'" for term in query_str_list)
                query = parser.parse(query_str)

                # search with collector; partial results are used on timeout
                my_collector = searcher.collector(limit=topN,optimize=False)
                tlc = TimeLimitCollector(my_collector, timelimit=30)
                try:
                    searcher.search_with_collector(query,tlc)
                except TimeLimit:
                    pass
                results = tlc.results()

                sim_counter = Counter()
                for i in range(min(topN,len(results))):
                    for line in results[i]['content'].split("\n"):
                        if line == "":
                            continue
                        fields = line.split(" ")
                        identifier = fields[0]
                        pagenumber = int(fields[1])
                        sentence = " ".join(fields[2:])
                        score = sim_matrix(sentence,claim)
                        sim_counter.update({(nfd(identifier),pagenumber,sentence):score})

                for sentence_tuple, _score in sim_counter.most_common(sentence_number):
                    tsv_writer_2.writerow([claim_number,sentence_tuple[0],sentence_tuple[1],'unknown', claim, sentence_tuple[2]])
    finally:
        # The original never closed the searcher in this function.
        searcher.close()



In [15]:
'''
Process tsv_file_2 in last method, filter 20 sentences by model 1 for each claim
probility_file is the output of model 1 and input undone_file is tsv_file_2
prediction_file should be combined with tsv_file_1 as the input file for the second layer training in BERT(model 2)
'''
import csv
def create_no_enough_info_training_set(undone_file,probility_file,predition_file):
    """Build "NOT ENOUGH INFO" training rows from model-1 predictions.

    ``undone_file`` (TSV rows ``claim_id, identifier, pagenumber, flag,
    claim, sentence``) and ``probility_file`` (TSV, first column a
    probability) are read in lock-step. Sentences whose probability is
    >= 0.5 are kept; consecutive kept sentences of the same claim id are
    joined with spaces and written to ``predition_file`` as::

        claim_id, "NOT ENOUGH INFO", claim, sentences

    Nothing is written when no sentence passes the threshold (the original
    emitted one row with empty fields in that case).
    """
    with open(undone_file,'r',encoding='utf-8') as tsvfile_1, \
            open(probility_file,'r') as tsvfile_2, \
            open(predition_file,'w',newline='',encoding='utf-8') as tsvfile_3:
        tsv_reader_1 = csv.reader(tsvfile_1, delimiter="\t")
        tsv_reader_2 = csv.reader(tsvfile_2, delimiter="\t")
        tsv_writer = csv.writer(tsvfile_3, delimiter="\t")

        claim_number = ""
        claim = ""
        current_sentences = ""
        for undone_sentence, prob_sentence in zip(tsv_reader_1, tsv_reader_2):
            if float(prob_sentence[0]) < 0.5:
                continue  # rejected by model 1
            if undone_sentence[0] == claim_number:
                # another kept sentence for the claim being accumulated
                current_sentences = current_sentences + " " + undone_sentence[5]
                continue
            if claim_number != "":
                # a new claim starts: flush the previous one
                tsv_writer.writerow([claim_number,"NOT ENOUGH INFO", claim, current_sentences])
            claim_number = undone_sentence[0]
            claim = undone_sentence[4]
            current_sentences = undone_sentence[5]

        # flush the last claim, if there was one at all
        if claim_number != "":
            tsv_writer.writerow([claim_number,"NOT ENOUGH INFO", claim, current_sentences])

In [16]:
'''
The function that process the test json file in the first step.
tsv_file_step1 is the input of the model 1, that can filter 20 sentences to evidence
json_file_step1 is a JSON file that records the claims that yield no query terms or no matching documents in the Whoosh index
'''

def process_test_step1(test_file,tsv_file_step1,json_file_step1,transformer, topN, sim_matrix, sentence_number):
    """Step 1 of test processing: retrieve candidate sentences per claim.

    ``test_file`` is JSON mapping claim id -> object with ``'claim'``. For
    each claim, documents are retrieved from the ``"indexdir"`` index on the
    ``title_remove_underline`` field, every sentence of the first ``topN``
    hits is scored with ``sim_matrix(sentence, claim)`` and the
    ``sentence_number`` best are written to ``tsv_file_step1`` as::

        claim_id, identifier, pagenumber, 'unknown', claim, sentence

    Claims that yield no query terms or no search results are instead
    recorded in ``json_file_step1`` with label "NOT ENOUGH INFO" and empty
    evidence.
    """
    ix = open_dir("indexdir")
    searcher = ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2))
    try:
        parser = QueryParser(None, ix.schema)
        parser.add_plugin(qparser.MultifieldPlugin(["title_remove_underline"]))

        with open(test_file,'r') as f, \
                open(tsv_file_step1,"w",newline='',encoding='utf-8') as out_file, \
                open(json_file_step1,"w") as jsonfile:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            load_dict = json.load(f)
            new_dict = {}
            for claim_number, val in load_dict.items():
                claim = val['claim']

                query_str_list = transformer(claim)
                if len(query_str_list) == 0:
                    new_dict[claim_number] = {"claim":claim, 'label':"NOT ENOUGH INFO", "evidence":[]}
                    continue

                # 'term one' OR 'term two' OR ...
                query_str = " OR ".join("'" + term + "'" for term in query_str_list)
                query = parser.parse(query_str)

                # search with collector; partial results are used on timeout
                my_collector = searcher.collector(limit=topN,optimize=False)
                tlc = TimeLimitCollector(my_collector, timelimit=30)
                try:
                    searcher.search_with_collector(query,tlc)
                except TimeLimit:
                    pass
                results = tlc.results()

                if len(results) == 0:
                    new_dict[claim_number] = {"claim":claim, 'label':"NOT ENOUGH INFO", "evidence":[]}
                    continue

                sim_counter = Counter()
                for i in range(min(topN,len(results))):
                    for line in results[i]['content'].split("\n"):
                        if line == "":
                            continue
                        fields = line.split(" ")
                        identifier = fields[0]
                        pagenumber = int(fields[1])
                        sentence = " ".join(fields[2:])
                        score = sim_matrix(sentence,claim)
                        sim_counter.update({(nfd(identifier),pagenumber,sentence):score})

                for sentence_tuple, _score in sim_counter.most_common(sentence_number):
                    tsv_writer.writerow([claim_number,sentence_tuple[0],sentence_tuple[1],'unknown', claim, sentence_tuple[2]])
            json.dump(new_dict,jsonfile,indent=2)
    finally:
        # release the index reader even if a search or write fails
        searcher.close()

In [17]:
'''
The function that process the output of the step1 and model 1
prediction_of_step1 is the output tsv file of tsv_file_step1 through model 1
json_file_step2 is the output json file which records all the evidence, but no label.
'''

def process_test_step2(tsv_file_step1,json_file_step1,prediction_of_step1,tsv_file_step2, json_file_step2):
    """Step 2 of test processing: keep the sentences model 1 accepted.

    ``tsv_file_step1`` (rows ``claim_id, identifier, pagenumber, flag,
    claim, sentence``) and ``prediction_of_step1`` (TSV, first column a
    probability) are read in lock-step and grouped by consecutive claim id.
    A sentence is kept when its probability is >= 0.5.

    * Claims with at least one kept sentence get a row
      ``claim_id, "unknown", claim, sentences`` in ``tsv_file_step2``
      (sentences joined by single spaces) and a "SUPPORTS" entry with
      ``[NFD identifier, int(pagenumber)]`` evidence in the output JSON.
    * Claims with no kept sentence get a "NOT ENOUGH INFO" entry with empty
      evidence.

    The entries loaded from ``json_file_step1`` are carried over, and the
    merged dict is written to ``json_file_step2``. Empty input adds nothing
    (the original added an entry under the empty-string key).
    """
    from itertools import groupby

    with open(tsv_file_step1,'r',encoding='utf-8') as t_1, \
            open(json_file_step1,'r') as j_1, \
            open(prediction_of_step1,'r') as p, \
            open(tsv_file_step2, 'w',newline='',encoding='utf-8') as t_2, \
            open(json_file_step2, 'w', encoding='ascii') as j_2:
        tsv_reader_1 = csv.reader(t_1, delimiter="\t")
        tsv_reader_p = csv.reader(p, delimiter="\t")
        tsv_writer = csv.writer(t_2, delimiter="\t")
        load_dict = json.load(j_1)

        # Rows of one claim are contiguous, so group on the claim id column.
        paired_rows = zip(tsv_reader_1, tsv_reader_p)
        for claim_number, group in groupby(paired_rows, key=lambda pair: pair[0][0]):
            claim = None
            kept_sentences = []
            kept_evidence = []
            # State is rebuilt per claim, so nothing leaks from the previous one.
            for unfiltered_sentence, prob_sentence in group:
                if claim is None:
                    claim = unfiltered_sentence[4]
                if float(prob_sentence[0]) >= 0.5:
                    kept_sentences.append(unfiltered_sentence[5])
                    kept_evidence.append([nfd(unfiltered_sentence[1]), int(unfiltered_sentence[2])])

            if kept_evidence:
                load_dict[claim_number] = {"claim":claim, 'label':"SUPPORTS", "evidence":kept_evidence}
                tsv_writer.writerow([claim_number,"unknown", claim, " ".join(kept_sentences)])
            else:
                load_dict[claim_number] = {"claim":claim, 'label':"NOT ENOUGH INFO", "evidence":[]}

        # new judgement after sentence filter
        json.dump(load_dict,j_2,indent=2)
                    

In [18]:
'''
The function that process the output of the step2 and model 2
prediction_of_step2 is the output tsv file of tsv_file_step2 through model 2
json_file_step3 is the final output json file which records all the evidence, and labels.
'''
def process_test_step3(tsv_file_step2, json_file_step2, prediction_of_step2, json_file_step3):
    """Assign the final label to each claim listed in the step-2 TSV.

    Rows of ``tsv_file_step2`` and ``prediction_of_step2`` are paired by
    position; pairing stops at the end of the shorter file. Column 0 of a TSV
    row is used as the claim id and column 2 as the claim text. Columns 0, 1
    and 2 of the matching prediction row are compared as the scores for
    "SUPPORTS", "REFUTES" and "NOT ENOUGH INFO" respectively:

    * third score >= both others: the claim's entry is replaced by a
      "NOT ENOUGH INFO" record with an empty evidence list;
    * otherwise only the entry's ``label`` is overwritten, with "SUPPORTS" when
      the first score is strictly greater than the second, else "REFUTES"
      (a tie therefore yields "REFUTES"). The existing evidence is kept.

    Entries of ``json_file_step2`` that have no row in the TSV are written out
    unchanged.

    Args:
        tsv_file_step2: path of the tab-separated claim file (read as UTF-8).
        json_file_step2: path of the JSON object keyed by claim id.
        prediction_of_step2: path of the tab-separated score file.
        json_file_step3: path of the JSON file to write (ASCII, indent=2).

    Raises:
        IndexError: a TSV or prediction row has fewer than three columns.
        ValueError: a score cannot be parsed as a float.
        KeyError: a claim that is not labelled "NOT ENOUGH INFO" is missing
            from ``json_file_step2``.
    """
    # newline='' is what the csv module asks for when it is handed a file
    # object, so that it can do its own line-ending handling.
    with open(tsv_file_step2, 'r', encoding='utf-8', newline='') as t_2, \
            open(json_file_step2, 'r', encoding='utf-8') as j_2, \
            open(prediction_of_step2, 'r', newline='') as p:
        load_dict = json.load(j_2)
        tsv_reader_1 = csv.reader(t_2, delimiter="\t")
        tsv_reader_p = csv.reader(p, delimiter="\t")

        for filtered_sentence, prob_sentence in zip(tsv_reader_1, tsv_reader_p):
            claim_number = filtered_sentence[0]
            claim = filtered_sentence[2]

            # Parse every score exactly once instead of once per comparison.
            prob_nei = float(prob_sentence[2])
            prob_supports = float(prob_sentence[0])
            prob_refutes = float(prob_sentence[1])

            if prob_nei >= max(prob_supports, prob_refutes):
                # New judgement after the sentence filter: drop the evidence.
                load_dict[claim_number] = {"claim": claim, 'label': "NOT ENOUGH INFO", "evidence": []}
            elif prob_supports > prob_refutes:
                load_dict[claim_number]['label'] = "SUPPORTS"
            else:
                load_dict[claim_number]['label'] = "REFUTES"

    # The output is opened only after every row was processed successfully, so
    # a malformed input can no longer leave a truncated/empty result file.
    with open(json_file_step3, 'w', encoding='ascii') as j_3:
        json.dump(load_dict, j_3, indent=2)
                
                

In [19]:
'''
A test for binary classification in model 2
'''
def process_test_step3_binary(tsv_file_step2, json_file_step2, prediction_of_step2, json_file_step3):
    """Relabel each claim listed in the step-2 TSV as "SUPPORTS" or "REFUTES".

    Rows of ``tsv_file_step2`` and ``prediction_of_step2`` are paired by
    position; pairing stops at the end of the shorter file. Column 0 of a TSV
    row is used as the claim id (no other TSV column is read). The entry's
    ``label`` becomes "SUPPORTS" when column 0 of the prediction row is
    strictly greater than column 1, otherwise "REFUTES" (ties included).
    Claim text and evidence already stored in the JSON are left untouched, as
    are entries that have no row in the TSV.

    Args:
        tsv_file_step2: path of the tab-separated claim file (read as UTF-8).
        json_file_step2: path of the JSON object keyed by claim id.
        prediction_of_step2: path of the tab-separated two-score file.
        json_file_step3: path of the JSON file to write (ASCII, indent=2).

    Raises:
        IndexError: a TSV row is empty or a prediction row has fewer than two
            columns.
        ValueError: a score cannot be parsed as a float.
        KeyError: a claim id from the TSV is missing from ``json_file_step2``.
    """
    # newline='' is what the csv module asks for when it is handed a file
    # object, so that it can do its own line-ending handling.
    with open(tsv_file_step2, 'r', encoding='utf-8', newline='') as t_2, \
            open(json_file_step2, 'r', encoding='utf-8') as j_2, \
            open(prediction_of_step2, 'r', newline='') as p:
        load_dict = json.load(j_2)
        tsv_reader_1 = csv.reader(t_2, delimiter="\t")
        tsv_reader_p = csv.reader(p, delimiter="\t")

        for filtered_sentence, prob_sentence in zip(tsv_reader_1, tsv_reader_p):
            claim_number = filtered_sentence[0]
            prob_supports = float(prob_sentence[0])
            prob_refutes = float(prob_sentence[1])
            load_dict[claim_number]['label'] = (
                "SUPPORTS" if prob_supports > prob_refutes else "REFUTES"
            )

    # The output is opened only after every row was processed successfully, so
    # a malformed input can no longer leave a truncated/empty result file.
    with open(json_file_step3, 'w', encoding='ascii') as j_3:
        json.dump(load_dict, j_3, indent=2)
                
                

In [20]:
'''
A dumb baseline that labels every claim 'NOT ENOUGH INFO', used to check the proportion of 'NEI' claims in the test file (for error analysis).
'''

import json
def dumb_test_for_NEI(raw_json,dumb_json):
    """Write a copy of ``raw_json`` in which every claim is "NOT ENOUGH INFO".

    For every entry of the JSON object stored in ``raw_json`` the ``label`` is
    set to 'NOT ENOUGH INFO' and the ``evidence`` to an empty list; all other
    fields of the entry are kept. The result is written to ``dumb_json`` as
    ASCII JSON with indent=2.

    The input is read completely before the output is opened, so
    ``raw_json`` and ``dumb_json`` may be the same path and a failure while
    reading leaves an existing output file untouched.

    Args:
        raw_json: path of the input JSON object keyed by claim id (UTF-8).
        dumb_json: path of the JSON file to write.
    """
    with open(raw_json, 'r', encoding='utf-8') as j_1:
        load_dict = json.load(j_1)

    for entry in load_dict.values():
        entry['label'] = 'NOT ENOUGH INFO'
        entry['evidence'] = []

    with open(dumb_json, 'w', encoding='ascii') as j_2:
        json.dump(load_dict, j_2, indent=2)

In [2]:
# Produce the all-'NOT ENOUGH INFO' baseline: reads test-unlabelled.json, writes test-dumb-test.json.
dumb_test_for_NEI('test-unlabelled.json','test-dumb-test.json')

In [None]:
'''
Below is the main process for the training set.
'''

In [15]:
# Run create_evidence_train_tsv with devset2000.json / devset2000.tsv as its file
# arguments and print the elapsed wall-clock seconds.
# NOTE(review): the trailing 10 and 20 are presumably retrieval cut-offs — confirm against the definition.
time_start=time.time()
create_evidence_train_tsv("devset2000.json","devset2000.tsv",spacy_pos_single,10,similarity_spacy_large,20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 4122.98531627655 s


In [16]:
# Run create_label_train_tsv with train15000.json and the two TSV names
# (train15000-1.tsv, train15000-2.tsv) as its file arguments; print the elapsed seconds.
time_start=time.time()
create_label_train_tsv("train15000.json","train15000-1.tsv","train15000-2.tsv",spacy_pos_single,10,similarity_spacy_large,20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 8682.938073396683 s


In [17]:
# Same evidence-TSV run as for devset2000, here with devset2001-5001.json /
# devset2001-5001.tsv as the file arguments; prints the elapsed seconds.
time_start=time.time()
create_evidence_train_tsv("devset2001-5001.json","devset2001-5001.tsv",spacy_pos_single,10,similarity_spacy_large,20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 6336.4470064640045 s


In [18]:
# Run create_label_train_tsv with train35000-50000.json and the two TSV names
# (train35000-50000-1.tsv, train35000-50000-2.tsv); print the elapsed seconds.
time_start=time.time()
create_label_train_tsv("train35000-50000.json","train35000-50000-1.tsv","train35000-50000-2.tsv",spacy_pos_single,10,similarity_spacy_large,20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 9246.978297710419 s


In [None]:
'''
Below is the main process for the test set (run after models 1 and 2 have been built with BERT).
'''

In [22]:
# Test pipeline, step 1: call process_test_step1 with test-unlabelled.json,
# final-test-step1.tsv and testoutput-1.json as its file arguments; print the elapsed seconds.
# NOTE(review): the recorded run took ~33,600 s (over 9 hours) — consider caching its outputs.
time_start=time.time()
process_test_step1("test-unlabelled.json","final-test-step1.tsv","testoutput-1.json", \
                   spacy_pos_single, 10, similarity_spacy_large, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 33634.59951758385 s


In [38]:
# Test pipeline, step 2: call process_test_step2 on the step-1 files plus
# final-test-step1-prob.tsv (presumably model 1's predictions — confirm), naming
# final-test-step2.tsv and testoutput-2.json as the remaining arguments.
time_start=time.time()
process_test_step2("final-test-step1.tsv","testoutput-1.json","final-test-step1-prob.tsv","final-test-step2.tsv", "testoutput-2.json")
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 3.9154000282287598 s


In [39]:
# Test pipeline, step 3: combine final-test-step2.tsv, testoutput-2.json and the
# score file final-test-step2-prob.tsv into the final testoutput-3.json.
time_start=time.time()
process_test_step3("final-test-step2.tsv", "testoutput-2.json", "final-test-step2-prob.tsv", "testoutput-3.json")
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 0.6721642017364502 s


In [41]:
# Binary variant of step 3: same step-2 inputs, but scores come from
# final-test-step2-prob-binary.tsv and the result goes to testoutput-3-binary.json.
time_start=time.time()
process_test_step3_binary("final-test-step2.tsv", "testoutput-2.json", "final-test-step2-prob-binary.tsv", "testoutput-3-binary.json")
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 0.8081531524658203 s


In [None]:
'''
Below are some document-recall tests from the early stage.
'''

In [5]:
# Document-recall run: find_doc on train5000.json with spacy_pos_single and 20,
# output name train5000-pos-single-20-1.2-noascii.json; prints the elapsed seconds.
time_start=time.time()
find_doc("train5000.json","train5000-pos-single-20-1.2-noascii.json", spacy_pos_single, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')


time cost 993.3926341533661 s


In [21]:
# Document-recall run: find_doc on devset.json with spacy_pos_single and 10,
# output name devset-pos-single-10.json; prints the elapsed seconds.
time_start=time.time()
find_doc("devset.json","devset-pos-single-10.json", spacy_pos_single, 10)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 917.1151354312897 s


In [32]:
# Document-recall run: find_doc on devset.json with spacy_pos_single and 10,
# output name devset-pos-single-10-combination.json.
# NOTE(review): the call itself does not select a "combination" mode; presumably
# find_doc (or the search it uses) was edited before this run — confirm, since it
# cannot be reproduced from this cell alone.
time_start=time.time()
find_doc("devset.json","devset-pos-single-10-combination.json", spacy_pos_single, 10)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 6458.375021457672 s


In [34]:
# Document-recall run: find_doc on devset.json with spacy_pos_single and 10,
# output name devset-pos-single-10-content.json.
# NOTE(review): the call itself does not select a "content" mode; presumably
# find_doc (or the search it uses) was edited before this run — confirm, since it
# cannot be reproduced from this cell alone.
time_start=time.time()
find_doc("devset.json","devset-pos-single-10-content.json", spacy_pos_single, 10)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 5078.554181814194 s


In [22]:
# Document-recall run: find_doc on devset.json using spacy_chunk (noun-chunk
# based extraction) and 10, output name devset-chunk-10.json.
time_start=time.time()
find_doc("devset.json","devset-chunk-10.json", spacy_chunk, 10)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 1301.2530605793 s


In [23]:
# Document-recall run: find_doc on devset.json using spacy_ner (named-entity
# based extraction) and 10, output name devset-ner-10.json.
time_start=time.time()
find_doc("devset.json","devset-ner-10.json", spacy_ner, 10)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 625.510498046875 s


In [24]:
# Document-recall run: find_doc on devset.json with spacy_pos_single and 5
# (instead of 10), output name devset-pos-single-5.json.
time_start=time.time()
find_doc("devset.json","devset-pos-single-5.json", spacy_pos_single, 5)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 832.1719839572906 s


In [25]:
# Document-recall run: find_doc on devset.json with spacy_pos_single and 20
# (instead of 10), output name devset-pos-single-20.json.
time_start=time.time()
find_doc("devset.json","devset-pos-single-20.json", spacy_pos_single, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 969.7718818187714 s


In [29]:
# Document-recall run: find_doc on devset.json using spacy_pos_final and 10,
# output name devset-pos-final-10.json.
time_start=time.time()
find_doc("devset.json","devset-pos-final-10.json", spacy_pos_final, 10)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 1867.9652078151703 s


In [30]:
# Document-recall run: find_doc on devset.json using spacy_chunk_single and 10,
# output name devset-chunk-single-10.json.
time_start=time.time()
find_doc("devset.json","devset-chunk-single-10.json", spacy_chunk_single, 10)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 1977.4455802440643 s


In [None]:
'''
Below are some sentence-recall tests from the early stage.
'''

In [17]:
# gensim-based sentence ranking: find_sentence on train5000.json with
# search_sentence_complex and gensim_lsi, output name train5000-sentence-gensim_lsi.json.
# NOTE(review): this comment originally read "genism TF-IDF", but the scorer passed
# is gensim_lsi — confirm which model it builds.
time_start=time.time()
find_sentence("train5000.json","train5000-sentence-gensim_lsi.json", spacy_pos_single,10,\
              search_sentence_complex, gensim_lsi, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 1186.3261201381683 s


In [1]:
# Sentence-recall run: find_sentence on train5000.json with search_sentence_simple
# and similarity_spacy_large, output name train5000-sentence-spacy.json.
time_start=time.time()
find_sentence("train5000.json","train5000-sentence-spacy.json", spacy_pos_single,10,\
              search_sentence_simple, similarity_spacy_large, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

In [None]:
# Sentence-recall run: find_sentence on train5000.json with search_sentence_simple
# and vector_similarity, output name train5000-sentence-vector.json.
time_start=time.time()
find_sentence("train5000.json","train5000-sentence-vector.json", spacy_pos_single,10,\
              search_sentence_simple, vector_similarity, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

In [None]:
# Sentence-recall run: find_sentence on train5000.json with search_sentence_simple
# and cosine_similarity, output name train5000-sentence-cosine.json.
time_start=time.time()
find_sentence("train5000.json","train5000-sentence-cosine.json", spacy_pos_single,10,\
              search_sentence_simple, cosine_similarity, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')

In [None]:
# Sentence-recall run on the unlabelled test file: find_sentence on test-unlabelled.json
# with search_sentence_simple and similarity_spacy_large, output name test-sentence-spacy.json.
time_start=time.time()
find_sentence("test-unlabelled.json","test-sentence-spacy.json", spacy_pos_single,10,\
              search_sentence_simple, similarity_spacy_large, 20)
time_end=time.time()
print('time cost',time_end-time_start,'s')