In [1]:
fname = "2.json"
split_length = 100
model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
#"deepset/minilm-uncased-squad2"
# model = "deepset/roberta-base-squad2-distilled"
# "deepset/electra-base-squad2"
# "deepset/tinyroberta-6l-768d"
# "distilbert-base-uncased-distilled-squad"
tokenizer_model = None
# "distilbert-base-uncased"
# "deepset/bert-base-cased-squad2"
# "deepset/electra-base-squad2"
# "distilbert-base-uncased-finetuned-sst-2-english"
#"distilbert-base-uncased"

scenarios = [
    {'Retriever': 100, 'Reader': 100},
    {'Retriever': 100, 'Reader': 20},
    {'Retriever': 100, 'Reader': 10},
    {'Retriever': 100, 'Reader': 5},
    {'Retriever': 20, 'Reader': 10},
    {'Retriever': 20, 'Reader': 5},
    {'Retriever': 10, 'Reader': 10},
    {'Retriever': 10, 'Reader': 5},
    {'Retriever': 5, 'Reader': 5}
]

In [2]:
import os
import json
from loguru import logger
import pandas as pd
from haystack.nodes import TfidfRetriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.schema import Document
from haystack.nodes import PreProcessor

INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [3]:
def read_file(path):
    with open(path) as f:
        j = json.load(f)
        return j

In [4]:
def file_to_doc(path, preprocessor):
    data = read_file(path)
    docs = [Document(content=t).to_dict() for t in data['texts']]
    
    return preprocessor.process(docs)

In [5]:
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=split_length,
    split_respect_sentence_boundary=True,
)

In [6]:
def file_to_docstore(name):
    document_store = InMemoryDocumentStore()
    
    fpath = f'./data/{name}'
    print(f"Processing {name}") 
    docs = []
    for d in file_to_doc(fpath, preprocessor):
        d['id'] = f"{d['id']}-{d['meta']['_split_id']}"
        # print(d['id'])
        docs.append(d)
    print(f"{len(docs)} documents found")
    data = read_file(fpath)
    document_store.write_documents(docs)
    retriever = TfidfRetriever(document_store=document_store)
    
    return document_store, retriever, data['query']

In [7]:
def get_scores(prediction):
    scores = ["{:.3f}".format(a.score) for a in prediction['answers']]
    answers = [a.answer for a in prediction['answers']]
    df = pd.DataFrame(dict(scores=scores, answers=answers))
    return df           

In [8]:
document_store, retriever, query = file_to_docstore(fname)
document_store

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


Processing 2.json


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 264.50docs/s]
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c9835cb59cca32d4b844354a1c678823-0' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c9835cb59cca32d4b844354a1c678823-1' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c9835cb59cca32d4b844354a1c678823-2' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c9835cb59cca32d4b844354a1c678823-3' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'c9835cb59cca32d4b844354a1c678823-4' already exists in index 'document'
INFO - h

214 documents found


<haystack.document_stores.memory.InMemoryDocumentStore at 0x7f6cc43b55b0>

In [9]:
from haystack.nodes import FARMReader, TransformersReader

# reader = FARMReader(model_name_or_path=model, use_gpu=True)

reader = TransformersReader(model_name_or_path=model, tokenizer=tokenizer_model, use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1


Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at sentence-transformers/multi-qa-MiniLM-L6-cos-v1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

In [11]:
import time
start = time.time()
prediction = pipe.run(
    query=query, params={"Retriever": {"top_k": 20}, "Reader": {"top_k": 10}}
)
end = time.time()
print("Total time: ", end - start)



Total time:  0.5447385311126709




In [12]:
get_scores(prediction)

Unnamed: 0,scores,answers
0,0.5,
1,0.011,CO2
2,0.011,CO2
3,0.011,passenger cars and
4,0.011,passenger cars and
5,0.011,passenger cars and
6,0.011,passenger cars and
7,0.011,passenger cars and
8,0.011,passenger cars and
9,0.011,passenger cars and


In [13]:
import time

bench = {'Retriever': [], 'Reader': [], 'time': []}

for scenario in scenarios:
    start = time.time()
    prediction = pipe.run(
        query=query, params={"Retriever": {"top_k": scenario['Retriever']}, "Reader": {"top_k": scenario['Reader']}}
    )
    end = time.time()
    
    bench['Retriever'].append(scenario['Retriever'])
    bench['Reader'].append(scenario['Reader'])
    bench['time'].append(end - start)
    
df = pd.DataFrame(bench)
df



Unnamed: 0,Retriever,Reader,time
0,100,100,0.463635
1,100,20,0.437975
2,100,10,0.447068
3,100,5,0.462767
4,20,10,0.101146
5,20,5,0.100081
6,10,10,0.05517
7,10,5,0.053197
8,5,5,0.032903


In [14]:
df

Unnamed: 0,Retriever,Reader,time
0,100,100,0.463635
1,100,20,0.437975
2,100,10,0.447068
3,100,5,0.462767
4,20,10,0.101146
5,20,5,0.100081
6,10,10,0.05517
7,10,5,0.053197
8,5,5,0.032903


In [15]:
df.corr()

Unnamed: 0,Retriever,Reader,time
Retriever,1.0,0.457081,0.999283
Reader,0.457081,1.0,0.472118
time,0.999283,0.472118,1.0


In [16]:
read_file(f"./data/{fname}")['query']

'How much have new cars co2 emissions decreased in 2020?'