In [1]:
# Name of the JSON input file (file_to_docstore looks it up under ./data).
fname = "2.json"
# Passed to PreProcessor as split_length.
split_length = 500
# Model identifier passed to FARMReader as model_name_or_path.
# Alternative that was tried: "deepset/roberta-base-squad2"
model = "deepset/minilm-uncased-squad2"

# top_k settings to benchmark, as (Retriever top_k, Reader top_k) pairs in run order.
scenarios = [
    {'Retriever': retriever_top_k, 'Reader': reader_top_k}
    for retriever_top_k, reader_top_k in (
        (100, 100),
        (100, 20),
        (100, 10),
        (100, 5),
        (20, 10),
        (20, 5),
        (10, 10),
        (10, 5),
        (5, 5),
    )
]

In [2]:
import os
import json
from loguru import logger
import pandas as pd

In [3]:
def read_file(path):
    """Load and return the parsed contents of a JSON file.

    The file is opened in binary mode so that ``json`` detects the encoding
    (UTF-8, UTF-16 or UTF-32, with or without a BOM) itself instead of
    relying on the platform's default text encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, "rb") as f:
        return json.load(f)

In [4]:
from haystack.schema import Document
from haystack.nodes import PreProcessor

def file_to_doc(path, preprocessor):
    """Turn the 'texts' entries of a JSON file into preprocessed documents.

    Args:
        path: Path of the JSON file; it is loaded with read_file and must
            contain a 'texts' entry that can be iterated.
        preprocessor: Object exposing ``process(docs)``; it receives the list
            of document dicts built from the texts.

    Returns:
        Whatever ``preprocessor.process`` returns for the document dicts.
    """
    raw_dicts = []
    for text in read_file(path)['texts']:
        # Each text is wrapped in a Document and handed over in dict form.
        raw_dicts.append(Document(content=text).to_dict())
    return preprocessor.process(raw_dicts)

INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [5]:
# Shared preprocessor: file_to_docstore passes it to file_to_doc, which calls
# its process() method on the raw document dicts.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",  # unit in which split_length is counted
    split_length=split_length,  # value set in the configuration cell
    split_respect_sentence_boundary=True,
)

In [6]:
from haystack.nodes import TfidfRetriever
from haystack.document_stores import InMemoryDocumentStore

def file_to_docstore(name, data_dir='./data'):
    """Index one JSON file in a fresh in-memory store and build a retriever on it.

    The file is preprocessed with the module-level ``preprocessor``. Each
    resulting document's id gets its ``_split_id`` appended, so the id encodes
    which split of the source text the document is.

    Args:
        name: File name of the JSON input, relative to ``data_dir``.
        data_dir: Directory containing the input file. Defaults to './data',
            the location that was previously hard-coded.

    Returns:
        A tuple ``(document_store, retriever, query)`` where ``query`` is the
        value stored under the 'query' key of the JSON file.
    """
    document_store = InMemoryDocumentStore()

    fpath = f'{data_dir}/{name}'
    print(f"Processing {name}")
    docs = []
    for d in file_to_doc(fpath, preprocessor):
        # Append the split index to the id of every split document.
        d['id'] = f"{d['id']}-{d['meta']['_split_id']}"
        print(d['id'])
        docs.append(d)
    print(f"{len(docs)} documents found")
    # The file is loaded a second time here only to pick up its query.
    data = read_file(fpath)
    document_store.write_documents(docs)
    retriever = TfidfRetriever(document_store=document_store)

    return document_store, retriever, data['query']

In [7]:
# Build the in-memory store and TF-IDF retriever for the configured file and
# keep the query that the same file carries under its 'query' key.
document_store, retriever, query = file_to_docstore(fname)
document_store

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


Processing 2.json


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 272.58docs/s]
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c9835cb59cca32d4b844354a1c678823-0' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c9835cb59cca32d4b844354a1c678823-1' already exists in index 'document'
INFO - haystack.nodes.retriever.sparse -  Found 1716 candidate paragraphs from 46 docs in DB


2efe5ab632d27e5c2cb22473a301a199-0
2efe5ab632d27e5c2cb22473a301a199-1
9ea162454948a3205ef1988bc59ed085-0
9ea162454948a3205ef1988bc59ed085-1
6f6a3e88cdca10dd6cb564ef52b996fc-0
c4f86bdf2adcd239fe380b07885fb5fe-0
c4f86bdf2adcd239fe380b07885fb5fe-1
332729009d1883e04f8c9f8d73bb3999-0
332729009d1883e04f8c9f8d73bb3999-1
c9835cb59cca32d4b844354a1c678823-0
c9835cb59cca32d4b844354a1c678823-1
503562acab66597abe0af7717bd409e6-0
503562acab66597abe0af7717bd409e6-1
503562acab66597abe0af7717bd409e6-2
503562acab66597abe0af7717bd409e6-3
503562acab66597abe0af7717bd409e6-4
503562acab66597abe0af7717bd409e6-5
503562acab66597abe0af7717bd409e6-6
503562acab66597abe0af7717bd409e6-7
503562acab66597abe0af7717bd409e6-8
503562acab66597abe0af7717bd409e6-9
503562acab66597abe0af7717bd409e6-10
503562acab66597abe0af7717bd409e6-11
503562acab66597abe0af7717bd409e6-12
503562acab66597abe0af7717bd409e6-13
503562acab66597abe0af7717bd409e6-14
503562acab66597abe0af7717bd409e6-15
503562acab66597abe0af7717bd409e6-16
503562acab665

<haystack.document_stores.memory.InMemoryDocumentStore at 0x7f369487a5b0>

In [8]:
from haystack.nodes import FARMReader, TransformersReader
# Load the reader from the model chosen in the configuration cell, requesting GPU use.
reader = FARMReader(model_name_or_path=model, use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/minilm-uncased-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded deepset/minilm-uncased-squad2
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 15 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
INFO - haystack.modeling.infer -  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /|\  /w\  /w\  /w\  /w\  /w\  /w\  /|\
IN

In [9]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and the retriever into one extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

In [10]:
import time

# Time one pipeline run with fixed top_k values (Retriever 20, Reader 10).
# perf_counter() is used rather than time() because it is monotonic and has
# the highest available resolution, so the interval cannot be distorted by
# system clock adjustments.
start = time.perf_counter()
prediction = pipe.run(
    query=query, params={"Retriever": {"top_k": 20}, "Reader": {"top_k": 10}}
)
end = time.perf_counter()
print("Total time: ", end - start)

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.66 Batches/s]
Inferencing Samples: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 135.97 Batches/s]
Inferencing Samples: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 67.80 Batches/s]
Inferencing Samples: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:0

Total time:  0.9846179485321045





In [11]:
def get_scores(prediction):
    """Tabulate the score and text of every answer in a prediction.

    Args:
        prediction: Mapping with an 'answers' entry; each item must expose
            ``score`` and ``answer`` attributes.

    Returns:
        A DataFrame with the columns 'scores' and 'answers', one row per
        answer, in the order the answers appear in the prediction.
    """
    score_column = []
    answer_column = []
    for candidate in prediction['answers']:
        score_column.append(candidate.score)
        answer_column.append(candidate.answer)
    return pd.DataFrame({"scores": score_column, "answers": answer_column})

In [12]:
# Show the scores and answer texts of the timed run above as a table.
get_scores(prediction)

Unnamed: 0,scores,answers
0,0.960126,12 %
1,0.811425,2.3 g CO2/km to 155.7 g CO2/km
2,0.557649,14.5 g
3,0.45409,significantly
4,0.106902,sharp
5,0.044859,average specific CO2 emissions below their 2020 target level
6,0.009043,2
7,0.008317,122.3
8,0.006359,15%
9,0.004562,2019


In [13]:
import time

# Wall-clock benchmark: run the pipeline once per scenario and record the
# top_k settings together with the elapsed seconds.
bench = {'Retriever': [], 'Reader': [], 'time': []}

for scenario in scenarios:
    # perf_counter() is monotonic and high-resolution, which suits interval
    # timing better than time().
    start = time.perf_counter()
    # The result is deliberately not stored: assigning it to `prediction`
    # would silently replace the prediction displayed by the earlier cells.
    pipe.run(
        query=query, params={"Retriever": {"top_k": scenario['Retriever']}, "Reader": {"top_k": scenario['Reader']}}
    )
    end = time.perf_counter()

    bench['Retriever'].append(scenario['Retriever'])
    bench['Reader'].append(scenario['Reader'])
    bench['time'].append(end - start)

df = pd.DataFrame(bench)
df

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.56 Batches/s]
Inferencing Samples: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.40 Batches/s]
Inferencing Samples: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 33.99 Batches/s]
Inferencing Samples: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:

Unnamed: 0,Retriever,Reader,time
0,100,100,1.905555
1,100,20,1.99528
2,100,10,1.967901
3,100,5,2.094036
4,20,10,0.439198
5,20,5,0.432379
6,10,10,0.193254
7,10,5,0.247128
8,5,5,0.122715


In [14]:
# Display the benchmark table (top_k settings and elapsed seconds per scenario) again.
df

Unnamed: 0,Retriever,Reader,time
0,100,100,1.905555
1,100,20,1.99528
2,100,10,1.967901
3,100,5,2.094036
4,20,10,0.439198
5,20,5,0.432379
6,10,10,0.193254
7,10,5,0.247128
8,5,5,0.122715


In [15]:
# Pairwise correlation between the Retriever top_k, Reader top_k and time columns.
df.corr()

Unnamed: 0,Retriever,Reader,time
Retriever,1.0,0.457081,0.998442
Reader,0.457081,1.0,0.419074
time,0.998442,0.419074,1.0
