#### The Data
The test data is in 'paper_test.jsonl'

The data is copied from the directory in the image to 'data/data/'

The annotations are available in the test dataset.

In [1]:
ls data/data/fever-data/

ls: cannot access 'data/data/fever-data/': No such file or directory


In [6]:
!wc -l data/data/fever-data/paper_test.jsonl

9999 data/data/fever-data/paper_test.jsonl


In [7]:
!head -2 data/data/fever-data/paper_test.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, null, null]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Ukrainian_Soviet_Socialist_Republic", 7]], [[298602, 290067, "Ukrainian_Soviet_Socialist_Republic", 7], [298602, 290067, "United_Nations", 0]], [[300696, 291816, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344347, 327887, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344994, 328433, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344997, 328435, "Ukrainian_Soviet_Socialist_Republic", 7]]]}


Load the index

In [9]:
from drqa import retriever
tdidf_npz_file = 'data/data/index/fever-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz'
ranker = retriever.get_class('tfidf')(tfidf_path=tdidf_npz_file)

In [28]:
ranker

<drqa.retriever.tfidf_doc_ranker.TfidfDocRanker at 0x7f63d9908390>

Format the data, sample randomly for not enough info class

In [19]:
import json
from tqdm import tqdm

def prepare_dataset(split, k=5):
    fever_root = 'data/'
    working_dir = 'working/data/'
    print("Saving prepared dataset to {}".format("training/{0}.ns.pages.p{1}.jsonl".format(split,k)))
    with open(fever_root + "data/fever-data/{0}.jsonl".format(split),"r") as f_in:
        with open(working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
            for line in tqdm(f_in.readlines()):
                line = json.loads(line)
                if line["label"] == "NOT ENOUGH INFO":
                        doc_names, doc_scores = ranker.closest_docs(line['claim'], k)
                        pp = list(doc_names)

                        for idx,evidence_group in enumerate(line['evidence']):
                            for evidence in evidence_group:
                                if idx<len(pp):
                                    evidence[2] = pp[idx]
                                    evidence[3] = -1
                
                f_out.write(json.dumps(line) + "\n")

In [20]:
!rm -rf training/test.ns.pages.p5.jsonl
prepare_dataset('paper_test', 5)

  0%|          | 0/9999 [00:00<?, ?it/s]

Saving prepared dataset to training/paper_test.ns.pages.p5.jsonl


100%|██████████| 9999/9999 [02:36<00:00, 63.69it/s] 


In [22]:
ls working/data/training

paper_dev.ns.pages.p5.jsonl   train.ns.pages.p5.jsonl
paper_test.ns.pages.p5.jsonl  train.pages.p5.jsonl


In [23]:
!wc -l working/data/training/paper_test.ns.pages.p5.jsonl

9999 working/data/training/paper_test.ns.pages.p5.jsonl


In [24]:
from nltk import word_tokenize

class LabelSchema:
    def __init__(self,labels):
        self.labels = {self.preprocess(val):idx for idx,val in enumerate(labels)}
        self.idx = {idx:self.preprocess(val) for idx,val in enumerate(labels)}

    def get_id(self,label):
        if self.preprocess(label) in self.labels:
            return self.labels[self.preprocess(label)]
        return None

    def preprocess(self,item):
        return item.lower()

class FEVERLabelSchema(LabelSchema):
    def __init__(self):
        super().__init__(["supports", "refutes", "not enough info"])

def nltk_tokenizer(text):
    return " ".join(word_tokenize(text))

class training_line_formatter():
    def __init__(self):
        self.tokenize = nltk_tokenizer
        
    def format(self, lines):
        formatted = []
        for line in tqdm(lines):
            fl = self.format_line(line)
            if fl is not None:
                if isinstance(fl,list):
                    formatted.extend(fl)
                else:
                    formatted.append(fl)
        return formatted

    def format_line(self, line):
        label_schema = FEVERLabelSchema()
        # get the label, i.e. SUPPORTS etc.
        annotation = line["label"]
        if annotation is None:
            annotation = line["verifiable"]
        pages = []
        # did we get the closest sentences to the claim text? is this the sentence or the line number from the doc text?
        if 'predicted_sentences' in line:
            pages.extend([(ev[0], ev[1]) for ev in line["predicted_sentences"]])
        elif 'predicted_pages' in line:
            pages.extend([(ev[0], -1) for ev in line["predicted_pages"]])
        else:
            # these are the human annotated evidence available in the original training file
            for evidence_group in line["evidence"]:
                pages.extend([(ev[2], ev[3]) for ev in evidence_group])

        return {"claim": self.tokenize(line["claim"]), "evidence": pages, "label": label_schema.get_id(annotation),
                "label_text": annotation}
    
class Reader:
    def __init__(self,encoding="utf-8"):
        self.enc = encoding

    def read(self,file):
        with open(file,"r",encoding = self.enc) as f:
            return self.process(f)

    def process(self,f):
        pass

class JSONLineReader(Reader):
    def process(self,fp):
        data = []
        for line in tqdm(fp.readlines()):
            data.append(json.loads(line.strip()))
        return data

In [26]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
import json
from tqdm import tqdm
jlr = JSONLineReader()
split = 'paper_test'
working_dir = 'working/data/'
k = 5
test_data_file = working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split, k)
test_data = jlr.read(test_data_file)

formatter = training_line_formatter()
formatted_test_data = formatter.format(test_data)

test_data_formatted = []
test_data_formatted.extend(filter(lambda record: record is not None, formatted_test_data))
test_data_formatted[:3]

100%|██████████| 9999/9999 [00:00<00:00, 180319.31it/s]
100%|██████████| 9999/9999 [00:01<00:00, 8289.27it/s]


[{'claim': 'Grease had bad reviews .',
  'evidence': [('Grease_gun_-LRB-tool-RRB-', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Ukrainian Soviet Socialist Republic was a founding participant of the UN .',
  'evidence': [('Ukrainian_Soviet_Socialist_Republic', 7),
   ('Ukrainian_Soviet_Socialist_Republic', 7),
   ('United_Nations', 0),
   ('Ukrainian_Soviet_Socialist_Republic', 7),
   ('Ukrainian_Soviet_Socialist_Republic', 7),
   ('Ukrainian_Soviet_Socialist_Republic', 7),
   ('Ukrainian_Soviet_Socialist_Republic', 7)],
  'label': 0,
  'label_text': 'SUPPORTS'},
 {'claim': '2 Hearts is a musical composition by Minogue .',
  'evidence': [('2_Hearts_-LRB-Kylie_Minogue_song-RRB-', 0),
   ('2_Hearts_-LRB-Kylie_Minogue_song-RRB-', 0),
   ('2_Hearts_-LRB-Kylie_Minogue_song-RRB-', 0),
   ('2_Hearts_-LRB-Kylie_Minogue_song-RRB-', 0)],
  'label': 0,
  'label_text': 'SUPPORTS'}]

##### PIPELINE setting dataset

In [101]:
import json
from tqdm import tqdm

def prepare_dataset_test(split, k=5):
    fever_root = 'data/'
    working_dir = 'working/data/'
    print("Saving prepared dataset to {}".format("training/{0}.ns.pages.p{1}.jsonl".format(split,k)))
    with open(fever_root + "data/fever-data/{0}.jsonl".format(split),"r") as f_in:
        with open(working_dir + "training/{0}_pipeline.ns.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
            for line in tqdm(f_in.readlines()):
                line = json.loads(line)
                
                doc_names, doc_scores = ranker.closest_docs(line['claim'], k)
                pp = list(doc_names)

                for idx,evidence_group in enumerate(line['evidence']):
                    for evidence in evidence_group:
                        if idx<len(pp):
                            evidence[2] = pp[idx]
                            if line["label"] == "NOT ENOUGH INFO":
                                evidence[3] = -1
                            else:
                                evidence[3] = -2
                        else:
                            evidence[2] = pp[-1] #repeat the last one
                            evidence[3] = -2
                if len(pp) > idx:
                    for i in range(len(pp)-1-idx):
                        ev = [[-1, None, pp[i], -2]]
                        evidence_group.extend(ev)
                #setting evidence of all samples to -1 so that during dataset preparation, we sample lines from the document as per nearest match DrQA
                f_out.write(json.dumps(line) + "\n")

In [98]:
!rm -rf working/data/training/paper_test_pipeline.ns.pages.p5.jsonl
!wc -l working/data/training/paper_test_pipeline.ns.pages.p5.jsonl

wc: working/data/training/paper_test_pipeline.ns.pages.p5.jsonl: No such file or directory


In [102]:
!rm -rf training/paper_test_pipeline.ns.pages.p5.jsonl
prepare_dataset_test('paper_test', 5)

  0%|          | 0/9999 [00:00<?, ?it/s]

Saving prepared dataset to training/paper_test.ns.pages.p5.jsonl


100%|██████████| 9999/9999 [07:16<00:00, 22.90it/s]


In [1]:
!head working/data/training/paper_test_pipeline.ns.pages.p5.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1], [-1, null, "Grease_gun_-LRB-tool-RRB-", -2], [-1, null, "Nasal_sebum", -2], [-1, null, "Grease", -2], [-1, null, "Thermal_interface_material", -2]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Emblem_of_the_Ukrainian_Soviet_Socialist_Republic", -2]], [[298602, 290067, "Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic", -2], [298602, 290067, "Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic", -2]], [[300696, 291816, "Ukrainian_Republic", -2]], [[344347, 327887, "List_of_Presidents_of_Ukraine", -2]], [[344994, 328433, "United_Nations_General_Assembly_Resolution_377", -2]], [[344997, 328435, "United_Nations_General_Assembly_Resolution_377", -2]]]}
{"id": 70041, 

In [55]:
!head data/data/fever-data/paper_test.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, null, null]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Ukrainian_Soviet_Socialist_Republic", 7]], [[298602, 290067, "Ukrainian_Soviet_Socialist_Republic", 7], [298602, 290067, "United_Nations", 0]], [[300696, 291816, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344347, 327887, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344994, 328433, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344997, 328435, "Ukrainian_Soviet_Socialist_Republic", 7]]]}
{"id": 70041, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "2 Hearts is a musical composition by Minogue.", "evidence": [[[225394, 230056, "2_Hearts_-LRB-Kylie_Minogue_song-RRB-", 0]], [[317953, 306972, "2_Hearts_-LRB-Kylie_Minogue_song-RRB-", 0]], [[319638

In [86]:
fever_root = 'data/'
working_dir = 'working/data/'
cnt = 0
print("Saving prepared dataset to {}".format("training/{0}.ns.pages.p{1}.jsonl".format(split,k)))
with open(fever_root + "data/fever-data/{0}.jsonl".format(split),"r") as f_in:
    with open(working_dir + "training/{0}_pipeline.ns.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
        for line in tqdm(f_in.readlines()):
            line = json.loads(line)
            print("line...")
            print(line)
            print("....")
            doc_names, doc_scores = ranker.closest_docs(line['claim'], k)
            pp = list(doc_names)
            print("docs....")
            print(pp)
            print("LENGTH =", len(line['evidence']))
            for idx,evidence_group in enumerate(line['evidence']):
                print("for idx and evidence ....... {}".format(idx))
                print(idx,evidence_group)
                for evidence in evidence_group:
                    if idx<len(pp):
                        evidence[2] = pp[idx]
                        evidence[3] = -1
                    else:
                        evidence[2] = pp[-1]
                        evidence[3] = -1
                print(">>>>>")
                print(idx,evidence_group)
#             if (idx < len(line['evidence'])):
#                 ev = [-1, -1, pp[idx]]
            print("IDX = ", idx, len(pp))
            if len(pp) > idx:
                for i in range(len(pp)-1-idx):
                    ev = [[-1, None, pp[i], -1]]
                    evidence_group.extend(ev)
            print("************* FINAL LINE *********")
            print(line)
            cnt += 1
            if cnt > 3:
                break

  0%|          | 3/9999 [00:00<09:15, 17.99it/s]

Saving prepared dataset to training/paper_test.ns.pages.p5.jsonl
line...
{'id': 113501, 'verifiable': 'NOT VERIFIABLE', 'label': 'NOT ENOUGH INFO', 'claim': 'Grease had bad reviews.', 'evidence': [[[133128, None, None, None]]]}
....
docs....
['Grease_gun_-LRB-tool-RRB-', 'Nasal_sebum', 'Grease', 'Thermal_interface_material', 'Grease_trap']
LENGTH = 1
for idx and evidence ....... 0
0 [[133128, None, None, None]]
>>>>>
0 [[133128, None, 'Grease_gun_-LRB-tool-RRB-', -1]]
IDX =  0 5
************* FINAL LINE *********
{'id': 113501, 'verifiable': 'NOT VERIFIABLE', 'label': 'NOT ENOUGH INFO', 'claim': 'Grease had bad reviews.', 'evidence': [[[133128, None, 'Grease_gun_-LRB-tool-RRB-', -1], [-1, None, 'Grease_gun_-LRB-tool-RRB-', -1], [-1, None, 'Nasal_sebum', -1], [-1, None, 'Grease', -1], [-1, None, 'Thermal_interface_material', -1]]]}
line...
{'id': 163803, 'verifiable': 'VERIFIABLE', 'label': 'SUPPORTS', 'claim': 'Ukrainian Soviet Socialist Republic was a founding participant of the UN.',




#### Test dataset prepare

In [9]:
ls /local/fever-common/data/fever-data/ 

paper_dev.jsonl   shared_task_dev.jsonl   train.jsonl
paper_test.jsonl  shared_task_test.jsonl


In [10]:
!head -2 /local/fever-common/data/fever-data/paper_test.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, null, null]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Ukrainian_Soviet_Socialist_Republic", 7]], [[298602, 290067, "Ukrainian_Soviet_Socialist_Republic", 7], [298602, 290067, "United_Nations", 0]], [[300696, 291816, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344347, 327887, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344994, 328433, "Ukrainian_Soviet_Socialist_Republic", 7]], [[344997, 328435, "Ukrainian_Soviet_Socialist_Republic", 7]]]}


In [6]:
### Get the ranker, that tfidf ranker that can rank a documenr given a claim
from drqa import retriever
tdidf_npz_file = '/local/fever-common/data/index/fever-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz'
ranker = retriever.get_class('tfidf')(tfidf_path=tdidf_npz_file)

In [9]:
from drqa.retriever import DocDB, utils
class FeverDocDB(DocDB):

    def __init__(self,path=None):
        super().__init__(path)

    def get_doc_lines(self, doc_id):
        """Fetch the raw text of the doc for 'doc_id'."""
        cursor = self.connection.cursor()
        cursor.execute(
            "SELECT lines FROM documents WHERE id = ?",
            (utils.normalize(doc_id),)
        )
        result = cursor.fetchone()
        cursor.close()
        return result if result is None else result[0]

    def get_non_empty_doc_ids(self):
        """Fetch all ids of docs stored in the db."""
        cursor = self.connection.cursor()
        cursor.execute("SELECT id FROM documents WHERE length(trim(text)) > 0")
        results = [r[0] for r in cursor.fetchall()]
        cursor.close()
        return results
database_path = '/local/fever-common/data/fever/fever.db'
#database_path = 'data/data/fever/fever.db'
database = FeverDocDB(database_path)

In [10]:
import random
import os

class SimpleRandom():
    instance = None

    def __init__(self,seed):
        self.seed = seed
        self.random = random.Random(seed)

    def next_rand(self,a,b):
        return self.random.randint(a,b)

    @staticmethod
    def get_instance():
        if SimpleRandom.instance is None:
            SimpleRandom.instance = SimpleRandom(SimpleRandom.get_seed())
        return SimpleRandom.instance

    @staticmethod
    def get_seed():
        return int(os.getenv("RANDOM_SEED", 12459))

    @staticmethod
    def set_seeds():

        torch.manual_seed(SimpleRandom.get_seed())
        if gpu():
            torch.cuda.manual_seed_all(SimpleRandom.get_seed())
        np.random.seed(SimpleRandom.get_seed())
        random.seed(SimpleRandom.get_seed())

In [6]:
import json
from tqdm import tqdm

def prepare_dataset_test(split, k=5):
    cnt = 0
    fever_root = '/local/fever-common/'
    working_dir = 'working/data/'
    print("Saving prepared dataset to {}".format("training/{0}.ns.pages.p{1}.jsonl".format(split,k)))
    with open(fever_root + "data/fever-data/{0}.jsonl".format(split),"r") as f_in:
        with open(working_dir + "training/{0}_pipeline.pp.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
            for line in tqdm(f_in.readlines()):
                line = json.loads(line)
                doc_names, doc_scores = ranker.closest_docs(line['claim'], k)
                pp = list(doc_names)
                for idx,evidence_group in enumerate(line['evidence']):
                    for evidence in evidence_group:
                        if idx<len(pp):
                            evidence[2] = pp[idx]
                            if line["label"] == "NOT ENOUGH INFO":
                                evidence[3] = -1
                            else:
                                evidence[3] = -2
                        else:
                            evidence[2] = pp[-1] #repeat the last one
                            evidence[3] = -2
                if len(pp) > idx:
                    for i in range(len(pp)-1-idx):
                        ev = [[-1, None, pp[i], -2]]
                        evidence_group.extend(ev)
                #setting evidence of all samples to -1 so that during dataset preparation, we sample lines from the document as per nearest match DrQA
                f_out.write(json.dumps(line) + "\n")
                #line["predicted_pages"] = pp
                #nearest_lines = find_nearest_lines(line['claim'], pp)
#                 print(line['evidence'])
#                 nearest_lines = find_nearest_lines(line['claim'], line['evidence'][0])
#                 print(nearest_lines)
#                 cnt += 1
#                 #f_out.write(json.dumps(line) + "\n")
#                 print(line)
#                 if cnt > 3:
#                     break

In [23]:
#rm -rf working/data/training/paper_test_pipeline.pp.pages.p5.jsonl

In [22]:
prepare_dataset_test('paper_test')

  0%|          | 3/9999 [00:00<07:52, 21.14it/s]

Saving prepared dataset to training/paper_test.ns.pages.p5.jsonl


100%|██████████| 9999/9999 [08:33<00:00, 19.46it/s]


In [24]:
!head working/data/training/paper_test_pipeline.pp.pages.p5.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1], [-1, null, "Grease_gun_-LRB-tool-RRB-", -2], [-1, null, "Nasal_sebum", -2], [-1, null, "Grease", -2], [-1, null, "Thermal_interface_material", -2]]]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Emblem_of_the_Ukrainian_Soviet_Socialist_Republic", -2]], [[298602, 290067, "Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic", -2], [298602, 290067, "Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic", -2]], [[300696, 291816, "Ukrainian_Republic", -2]], [[344347, 327887, "List_of_Presidents_of_Ukraine", -2]], [[344994, 328433, "United_Nations_General_Assembly_Resolution_377", -2]], [[344997, 328435, "United_Nations_General_Assembly_Resolution_377", -2]]]}
{"id": 70041, 

In [137]:
import json
from tqdm import tqdm
def get_lines(a):
    lns = np.array([])
    if isinstance(a, str):
        return [a]
    for l in a:
        lns = np.append(lns, l)
    return lns.tolist()
def prepare_dataset_test_ps(split, k=5):
    cnt = 0
    fever_root = '/local/fever-common/'
    working_dir = 'working/data/'
    print("Saving prepared dataset to {}".format("training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k)))
    with open(working_dir + "training/{0}_pipeline.pp.pages.p{1}.jsonl".format(split, k),"r") as f_in:
        with open(working_dir + "training/{0}_pipeline.ps.pages.p{1}.jsonl".format(split,k),"w+") as f_out:
            for line in tqdm(f_in.readlines()):
                line = json.loads(line)
                claim = line['claim']
                for idx,evidence_group in enumerate(line['evidence']):
                    claims = [claim for i in range(len(evidence_group))]
                    with ThreadPool(4) as threads:
                        results = threads.map(find_nearest, zip(claims, evidence_group))
                    docs = [r[0] for r in results]
                   
                    line_matches = [r[1] for r in results]
                
                    line_matches = [get_lines(ln) for ln in line_matches]
                    line_ids = [r[2] for r in results]
     
                    predicted_lines = [[a, b] for a, b in zip(line_matches, line_ids)]
                    ## match the number of lines matches to evidence
                    lines_needed = len(evidence_group)
                    for i in range(len(predicted_lines), lines_needed):
                        predicted_lines[i] = []
                   
                    for idx, evidence in enumerate(evidence_group):
                        evidence.append(predicted_lines[idx])
                f_out.write(json.dumps(line) + "\n")

In [138]:
prepare_dataset_test_ps("paper_test")

  0%|          | 0/9999 [00:00<?, ?it/s]

Saving prepared dataset to training/paper_test_pipeline.ps.pages.p5.jsonl


100%|██████████| 9999/9999 [12:49:26<00:00,  4.62s/it]   


In [24]:
!head working/data/training/paper_test_pipeline.ps.pages.p5.jsonl

{"id": 113501, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Grease had bad reviews.", "evidence": [[[133128, null, "Grease_gun_-LRB-tool-RRB-", -1], [-1, null, "Grease_gun_-LRB-tool-RRB-", -2], [-1, null, "Nasal_sebum", -2], [-1, null, "Grease", -2], [-1, null, "Thermal_interface_material", -2]]], "predicted_lines": ["The aperture may be of a type that fits closely with a receiving aperture on any number of mechanical devices ."]}
{"id": 163803, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Ukrainian Soviet Socialist Republic was a founding participant of the UN.", "evidence": [[[296950, 288668, "Emblem_of_the_Ukrainian_Soviet_Socialist_Republic", -2]], [[298602, 290067, "Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic", -2], [298602, 290067, "Flag_of_the_Moldavian_Autonomous_Soviet_Socialist_Republic", -2]], [[300696, 291816, "Ukrainian_Republic", -2]], [[344347, 327887, "List_of_Presidents_of_Ukraine", -2]], [[344994, 328433, "United_N

In [2]:
from drqascripts.retriever.build_tfidf_lines import OnlineTfidfDocRanker
import math
from multiprocessing.pool import ThreadPool
import numpy as np
class RankArgs:
    def __init__(self):
        self.ngram = 2
        self.hash_size = int(math.pow(2,24))
        self.tokenizer = "simple"
        self.num_workers = None
args = RankArgs()

In [41]:
def get_doc_line_test(data_map):
    
    docs, claims = data_map
    claim = claims[-1]
    print(docs)
    
    lines = database.get_doc_lines(doc)
    
    ### if this is from annotated evidences
    if line > -1: #we will not hit this for the test dataset, we have cleared off all the page indices from the test dataset
        return lines.split("\n")[line].split("\t")[1] #get all the lines from the document and match with the line ids that were annotated as evidence by the human annotators
    elif line <= -2:
        #TODO: nearest 5 sentences from the document
        non_empty_lines = [line.split("\t")[1] for line in lines.split("\n") if len(line.split("\t"))>1 and len(line.split("\t")[1].strip())]
        tfidf = OnlineTfidfDocRanker(args,[line for line in non_empty_lines],None)
        line_ids,scores = tfidf.closest_docs(claim,5)
        return non_empty_lines[SimpleRandom.get_instance().next_rand(0,len(non_empty_lines)-1)]
    else: ### if this is from not enough info evidences, NearestP method, to sample "a" single sentence randomly from the nearest page match
        non_empty_lines = [line.split("\t")[1] for line in lines.split("\n") if len(line.split("\t"))>1 and len(line.split("\t")[1].strip())]
        return non_empty_lines[SimpleRandom.get_instance().next_rand(0,len(non_empty_lines)-1)]
    

def find_nearest(claim_doc):
    claim, evidence = claim_doc
    doc = evidence[2]
    tag = evidence[3]
    lines = database.get_doc_lines(doc)
    non_empty_lines = [line.split("\t")[1] for line in lines.split("\n") if len(line.split("\t"))>1 and len(line.split("\t")[1].strip())]
    if tag == -2:
        tfidf = OnlineTfidfDocRanker(args,[line for line in non_empty_lines],None)
        res = tfidf.closest_docs(claim, 5)
        line_ids,scores = tfidf.closest_docs(claim, 5)
        return doc, np.array(non_empty_lines)[line_ids], line_ids
    else:
        return doc, non_empty_lines[SimpleRandom.get_instance().next_rand(0,len(non_empty_lines)-1)], []
    
    
    
def tfidf_claim(data_map):
    #print(data_map)
#     docs = data['docs']
#     claim = data['claim']
    claims, doc = data_map
    claim = claims[-1]
    #print(docs)
#     print(docs)
    ranked_lines = []
    for doc in docs:
        lines = database.get_doc_lines(doc)
        non_empty_lines = [line.split("\t")[1] for line in lines.split("\n") if len(line.split("\t"))>1 and len(line.split("\t")[1].strip())]
        tfidf = OnlineTfidfDocRanker(args,[line for line in non_empty_lines],None)
        line_ids,scores = tfidf.closest_docs(claim, 5)
#         print(line_ids)
#         print(non_empty_lines)
        ranked_lines.extend(np.array(non_empty_lines)[line_ids])
    return ranked_lines

# def find_nearest_lines(data):
#     claim = data['claim']
#     claims = [claim for i in range(len(data['evidence']))]
#     with ThreadPool(4) as threads:
#         results = threads.map(find_nearest, zip(claims, data['evidence']))
#     return results

def find_nearest_lines(claim, pp):
    #claim = data['claim']
    claims = [claim for i in range(len(pp))]
    with ThreadPool(4) as threads:
        results = threads.map(find_nearest, zip(claims, pp))
    return results


In [33]:
%%time
pp = [[[133128, None, 'Grease_gun_-LRB-tool-RRB-', -1], [-1, None, 'Grease_gun_-LRB-tool-RRB-', -2], 
       [-1, None, 'Nasal_sebum', -2], [-1, None, 'Grease', -2], [-1, None, 'Thermal_interface_material', -2]]]
print(pp[0])
claim = "Grease had bad reviews."
results = find_nearest_lines(claim, pp[0])
print("Line count = {}".format(len(results)))
print(results)
# print(matches)
# lines = np.array([])
# for l in matches:
#     lines = np.append(lines, l)

[[133128, None, 'Grease_gun_-LRB-tool-RRB-', -1], [-1, None, 'Grease_gun_-LRB-tool-RRB-', -2], [-1, None, 'Nasal_sebum', -2], [-1, None, 'Grease', -2], [-1, None, 'Thermal_interface_material', -2]]
Line count = 5
[('Grease_gun_-LRB-tool-RRB-', 'Hand-powered , where there is no trigger mechanism , and the grease is forced through the aperture by the back-pressure built up by pushing on the butt of the grease gun , which slides a piston through the body of the tool , pumping grease out of the aperture .', None), ('Grease_gun_-LRB-tool-RRB-', array([], dtype='<U261'), []), ('Nasal_sebum', array([], dtype='<U304'), []), ('Grease', array([], dtype='<U139'), []), ('Thermal_interface_material', array([], dtype='<U258'), [])]
CPU times: user 7.52 s, sys: 2.65 s, total: 10.2 s
Wall time: 3.03 s


In [17]:
lines

array(['Air-powered -LRB- pneumatic -RRB- , where compressed air is directed to the gun by hoses , the air pressure serving to force the grease through the aperture .'],
      dtype='<U304')