In [1]:
import logging
from typing import Dict, List, Optional, Union
from pprint import pprint
import pandas as pd

from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor, EmbeddingRetriever, DensePassageRetriever
from haystack.utils import convert_files_to_docs, print_answers
from haystack.document_stores import InMemoryDocumentStore, FAISSDocumentStore
from haystack.nodes import FARMReader, TransformersReader, RAGenerator, Seq2SeqGenerator
from haystack.pipelines import GenerativeQAPipeline
from haystack.schema import Document

import torch
from transformers import PreTrainedTokenizer, BatchEncoding
import datasets

%load_ext autoreload
%autoreload 2

In [2]:
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
torch.__version__

'1.12.1+cu113'

In [4]:
torch.version.cuda

'11.3'

# GET documents

In [3]:
# eli5 = datasets.load_dataset('eli5')

# wikitext = datasets.load_dataset('wikitext','wikitext-2-raw-v1')



  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# wikitext['train']['text'][:30]

In [3]:
# len(wikitext['train'])

In [5]:
DATA_PATH = '../data/bitcoin_articles.csv'

df = pd.read_csv(DATA_PATH)
df.shape

(2500, 17)

In [6]:
df.head()

Unnamed: 0,article_id,title,author,published_date,link,clean_url,excerpt,summary,rights,article_rank,topic,country,language,authors,media,twitter_account,article_score
0,57a00c1140cbd3af79e77bf0e4e6af48,62% of Bitcoin Has Not Moved in a Year as Long-Term Holders Refuse to Sell,Jamie McNeill,04-10-2022 17:15,https://www.business2community.com/crypto-news/62-of-bitcoin-has-not-moved-i...,business2community.com,"Over the course of the last few years, there has been an impressive trend wh...","Over the course of the last few years, there has been an impressive trend wh...",business2community.com,1595,finance,US,en,Jamie McNeill,https://www.business2community.com/wp-content/uploads/2022/10/btcc.webp,@Jamie_DeFi,8.556426
1,21b48b3731c03466be3fac4be6c7dc67,The Orange Party Issue Playlist,Bitcoin Magazine,05-10-2022 21:17,https://bitcoinmagazine.com/culture/orange-party-issue-playlist,bitcoinmagazine.com,News Links: Russia Legalizing Bitcoin And Crypto Is A Matter Of Time Says Mi...,Russia Legalizing Bitcoin And Crypto Is A Matter Of Time Says Minister Of In...,bitcoinmagazine.com,6284,news,US,en,Bitcoin Magazine,https://bitcoinmagazine.com/.image/t_share/MTkyODIyNDQ0Mjc5NDczMzcz/playlist...,,8.507881
2,77030740ee160ad68c25e4e63515dd77,How Many Bitcoins Are There?,AOL Staff,04-10-2022 21:44,https://www.gobankingrates.com/investing/crypto/how-many-bitcoins-are-there/,gobankingrates.com,"Bitcoin has a maximum supply of 21 million. However, that doesn't tell the w...","Bitcoin has a maximum supply of 21 million. However, that doesn't tell the w...",aol.com,5044,news,US,en,"AOL Staff,David Granahan",https://s.yimg.com/ny/api/res/1.2/wPK4V8gjwjrDYoMuUGxoBw--/YXBwaWQ9aGlnaGxhb...,@AOL,8.483973
3,19285dde79599b6a9b4aa5a58805159b,Bitcoin 2008–2022: From Bouncing Baby To Troubled Teenager,Mark Hooson,03-10-2022 15:45,https://www.forbes.com/advisor/au/investing/bitcoin-2008-2022-from-bouncing-...,forbes.com,"Conceived in 2008 and launched in 2009, bitcoin is the world's first and lar...","Conceived in 2008 and launched in 2009, bitcoin is the world's first and lar...",forbes.com,49,business,US,en,"Forbes Staff,Kevin Pratt,Mark Hooson,Sophie Venz",https://thumbor.forbes.com/thumbor/fit-in/x/https://www.forbes.com/advisor/a...,,8.466393
4,ad1822c7ff1bbd7e38f7f9dbdbce245f,"Bitcoin Lightning Network capacity strikes 5,000 BTC",Joseph Hall,03-10-2022 12:09,https://cointelegraph.com/news/bitcoin-lightning-network-capacity-strikes-5-...,cointelegraph.com,"First created in 2018, the Lighting Network has come under fire recently, wi...",Bear markets are for building out capacity on the layer-2 Lightning Network....,cointelegraph.com,1696,news,US,en,Joseph Hall,https://images.cointelegraph.com/images/1200_aHR0cHM6Ly9zMy5jb2ludGVsZWdyYXB...,@cointelegraph,8.446906


In [7]:
# Use data to initialize Document objects

ids = list(df["article_id"].values)
texts = list(df["summary"].values)
titles = list(df["title"].values)
dates = list(df["published_date"].values)
links = list(df["link"].values)

all_docs = []
for i, title, text, date, link in zip(ids, titles, texts, dates, links):
    all_docs.append(Document(id=i, 
                             content=text, 
                             meta={"name": title or "", 
                                   "link": link or "", 
                                   "date": date or ""}))


# all_docs = [
#     {
#         'id':i, 
#         'content': c,
#         'meta': {'article_link': l, 'article_title': t, 'article_date': d}
#     } for i, c, t, d, l in zip(df['article_id'], df['summary'], df['title'], df['published_date'], df['link'])
# ]

In [8]:
len(all_docs)

2500

In [None]:
# PDFS_PATH="/data/kg_pdfs_test/"
# all_docs = convert_files_to_docs(dir_path=PDFS_PATH)

## Preprocessing 

In [9]:
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=256,  # smaller splits works better? 
    split_respect_sentence_boundary=True,
)

all_docs_process = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(all_docs_process)}")

Preprocessing:   0%|          | 0/2500 [00:00<?, ?docs/s]

n_files_input: 2500
n_docs_output: 2500


In [10]:
all_docs_process[2:5]

[<Document: {'content': "Bitcoin has a maximum supply of 21 million. However, that doesn't tell the whole story. As time goes on, it becomes increasingly difficult to acquire bitcoin due to the asset's economics. Here is a full breakdown of how bitcoin works and exactly why it has a limited supply.Read: Looking To Diversify In A Bear Market? Consider These 6 Alternative InvestmentsBitcoin Tokenomics: Proof of Work, Mining and Halving CyclesTokenomics, as one can imagine, refers to the economics of a token. Similar to how fiat currencies, like the dollar, are issued by governments and regulated through monetary policy, tokenomics refers to the rules and functions revolving around cryptocurrency.", 'content_type': 'text', 'score': None, 'meta': {'name': 'How Many Bitcoins Are There?', 'link': 'https://www.gobankingrates.com/investing/crypto/how-many-bitcoins-are-there/', 'date': '04-10-2022 21:44', '_split_id': 0}, 'embedding': None, 'id': '7afbcf9ee51ca07473ee35af09165d3b'}>,
 <Document

## Document Store 

In [12]:
# In-Memory Document Store
# document_store = InMemoryDocumentStore()


# The FAISSDocumentStore uses a SQL(SQLite in-memory be default) database under-the-hood to store the document text and other meta data. 
# The vector embeddings of the text are indexed on a FAISS Index that later is queried for searching answers.

doc_store_dpr = FAISSDocumentStore(sql_url = "sqlite:///faiss_document_store_dpr.db", 
                                    faiss_index_factory_str="Flat", similarity="dot_product", return_embedding=True)

# doc_store_sbertr = FAISSDocumentStore(sql_url = "sqlite:///faiss_document_store_SB.db", 
#                                     faiss_index_factory_str="Flat", similarity="dot_product", return_embedding=False)

In [13]:
doc_store_dpr.write_documents(all_docs_process)

print(doc_store_dpr.get_document_count())

doc_store_dpr.get_all_documents()[101]

Writing Documents:   0%|          | 0/2500 [00:00<?, ?it/s]

1932


<Document: {'content': 'Bitcoin is the most valuable and well-known cryptocurrency in the world. But this popularity has given way to a lot of traffic on the Bitcoin blockchain, which, in turn, has led to long transaction times. Because of this, Bitcoin transaction accelerators have become widely used. But what is a Bitcoin transaction accelerator, and which is best for you? What Is a Bitcoin Transaction Accelerator? Bitcoin transaction accelerators are websites you can use to hopefully process a Bitcoin transaction faster.', 'content_type': 'text', 'score': None, 'meta': {'name': 'The Best Bitcoin Accelerators to Speed Up Your Transactions', 'link': 'https://www.makeuseof.com/best-bitcoin-accelerators/', 'date': '04-10-2022 11:45', '_split_id': '0'}, 'embedding': None, 'id': '1adc1f10d5d8a889be8079fc401d725e'}>

In [None]:
doc_store_sbertr.write_documents(all_docs_process)

print(doc_store_sbertr.get_document_count())

doc_store_sbertr.get_all_documents()[101]

## Retriever


In [14]:
# Initialize DPR Retriever to encode documents, encode question and query documents

dpr_retriever = DensePassageRetriever(
    document_store=doc_store_dpr,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True,
)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO:haystack.modeling.model.language_model:Auto-detected model language: english


In [15]:
# Add / update documents embeddings to index

doc_store_dpr.update_embeddings(retriever=dpr_retriever)

INFO:haystack.document_stores.faiss:Updating embeddings for 1932 docs...


Updating Embedding:   0%|          | 0/1932 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/1936 [00:00<?, ? Docs/s]

In [None]:
# Sentence BERT embeddings retriever

# SENT_TRANS_MODEL = "sentence-transformers/all-mpnet-base-v2"
SENT_TRANS_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

embedd_retriever = EmbeddingRetriever(
    document_store=doc_store_sbertr,
    embedding_model=SENT_TRANS_MODEL,
    model_format="sentence_transformers",
    use_gpu=True,
    #embed_meta_fields=False
)

In [None]:
# Important:
# Now that we initialized the Retriever, we need to call update_embeddings() to iterate over all previously indexed documents 
# and update their embedding representation.
# While this can be a time consuming operation (depending on the corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing document embeddings, which is very fast.

doc_store_sbertr.update_embeddings(embedd_retriever)

In [16]:
# save doc store

# doc_store_sbertr.save("faiss_index_SB.faiss")
doc_store_dpr.save("faiss_index_DPR.faiss")

In [9]:
query = "When will the mining difficulty of bitcoin increase?"
query1 = "How to use the Lightining Network?"
query2 = "What are some ethical considerations of Bitcoin?"

DPR retrieve

In [32]:
dpr_ls = [(doc.content, doc.meta) for doc in dpr_retriever.retrieve(query2, top_k=5)]

pprint(dpr_ls)

[('According to a study that was published in Scientific Reports, the '
  'environmental costs of mining the digital currency Bitcoin are more '
  'equivalent to the climatic harm caused by raising beef than they are to the '
  'costs of mining gold when expressed as a percentage of the market price.\n'
  'According to the authors, Bitcoin should be compared to considerably more '
  'energy-intensive goods like meat, natural gas, and crude oil rather than '
  "being likened to 'digital gold.'Bitcoin had a market cap of around 960 "
  'billion US dollars in December 2021 and accounted for almost 41% of all '
  'cryptocurrencies worldwide.',
  {'_split_id': '0',
   'date': '03-10-2022 13:30',
   'link': 'https://www.popularmechanics.co.za/science/bitcoin-as-dangerous-to-the-environment-as-production-of-beef/',
   'name': 'Bitcoin as dangerous to the environment as production of beef',
   'vector_id': '1447'}),
 ('Bitcoin is the currency of the Internet: a distributed, worldwide, '
  'dec

sbert retrieve

In [33]:
sb_ls = [(doc.content, doc.meta) for doc in embedd_retriever.retrieve(query2, top_k=5)]

pprint(sb_ls)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[('It is important to stay safe when navigating the crypto ecosystem. Looking '
  'out for projects like Big Eyes Coin with security verification or Bitcoin '
  'with their user-verified reports of safety is crucial, as without them, you '
  'could be at risk.\n'
  'Scams, rug-pulls and other nefarious activity is an unfortunate side effect '
  'of an economy that is run anonymously through the Internet with little '
  'governance, making it crucial to know how to spot red flags in projects and '
  'which projects to turn to instead for a secure investment.',
  {'_split_id': '0',
   'date': '08-10-2022 01:04',
   'link': 'https://techtelegraph.co.uk/three-crypto-projects-keeping-your-investments-safe-in-2022-bitcoin-big-eyes-coin-and-algorand/',
   'name': 'Three Crypto Projects Keeping Your Investments Safe In 2022: '
           'Bitcoin, Big Eyes Coin, and Algorand',
   'vector_id': '1454'}),
 ('Bitcoin has come a long way since its inception in 2008, becoming the most '
  'popular a

# CHECKPOINT 

In [5]:
document_store = FAISSDocumentStore.load("faiss_index_DPR.faiss", 
                                         "faiss_index_DPR.json")

## Retriever


In [6]:
# Initialize DPR Retriever to encode documents, encode question and query documents

dpr_retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True,
)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO:haystack.modeling.model.language_model:Auto-detected model language: english


In [7]:
document_store.get_all_documents()[555] #.embedding.shape

<Document: {'content': 'A drug trafficking group allegedly used cryptocurrency, including Bitcoin, to launder more than $5.35 million in a scheme that also allegedly involved the distribution of fake prescription drugs and other illegal substances.According to the statement, Khuu is charged with distributing phoney prescription medicines and other banned narcotics across the country using the dark web as a marketing channel.A federal grand jury accused John Khuu, 27, of San Francisco, California, on counts of conspiring to launder money, according to a statement released by the Department of Justice on Friday (Oct.', 'content_type': 'text', 'score': None, 'meta': {'vector_id': '555', 'name': 'Alleged $5.4M Drug Conspiracy Case Using Crypto', 'link': 'https://www.bollyinside.com/news/alleged-5-4m-drug-conspiracy-case-using-crypto', 'date': '08-10-2022 05:18', '_split_id': '0'}, 'embedding': '<embedding of shape (768,)>', 'id': '58eb8dad443a0e269add55baaae7153'}>

# Generator

## Retrieval Augmented Generator

In [None]:
# Initialize RAG Generator

rag_generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=100,
    min_length=2,
    embed_title=True,
    num_beams=2,
)

In [None]:
Q = query2

ans = rag_generator.predict(Q, 
                            documents=dpr_retriever.retrieve(Q, top_k=5), 
                            top_k=3)

# pprint(ans.get('answers'))
pprint(ans)

## T5-large

In [7]:
class _T5Converter:
    """
    A sequence-to-sequence model input converter (https://huggingface.co/yjernite/bart_eli5) based on the BART architecture fine-tuned on ELI5 dataset (https://arxiv.org/abs/1907.09190).
    The converter takes documents and a query as input and formats them into a single sequence that a seq2seq model can use it as input for its generation step.
    This includes model-specific prefixes, separation tokens and the actual conversion into tensors. 
    For more details refer to Yacine Jernite's excellent LFQA contributions at https://yjernite.github.io/lfqa.html
    """
    def __call__(self, tokenizer: PreTrainedTokenizer, query: str, documents: List[Document], top_k: Optional[int] = None) -> BatchEncoding:
        conditioned_doc = "<P> " + " <P> ".join([d.content for d in documents])
        # print(conditioned_doc)

        # concatenate question and support document into T5 input
        query_and_docs = "question: {} context: {}".format(query, conditioned_doc)
        max_source_length = 512

        # return tokenizer([(query_and_docs, "A")], truncation=True, padding=True, max_length=max_source_length, return_tensors="pt")
        return tokenizer([query_and_docs], truncation=True, padding=True, max_length=max_source_length, return_tensors="pt")

In [8]:
# /data/t5-large; google/t5-large-lm-adapt

t5_generator = Seq2SeqGenerator(
    model_name_or_path="t5-large",
    input_converter=_T5Converter(),
    use_gpu=True,
    top_k=1,
    max_length=100,
    min_length=2,
    num_beams=2,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
query = "When will the mining difficulty of bitcoin increase?"
query1 = "How to use the Lightining Network?"
query2 = "What are some ethical considerations of Bitcoin?"

In [11]:
Q = query2

ans = t5_generator.predict(Q, 
                           documents=dpr_retriever.retrieve(Q, top_k=5), 
                           top_k=3)

# pprint(ans.get('answers'))
pprint(ans)



{'answers': [<Answer {'answer': 'more resistant to wild inflation and corrupt banks', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['c5cc43a36291e5923c46b7e83780336a', 'ae53227a774cdcffd656e84ba112500b', 'e62b58a98cf006806cca2ad57e6e8552', 'e3144e1df39c1a70b5db871317867bd2', 'cd5347b76b43ff36e0b177a571140f68'], 'doc_scores': [0.6930337833109571, 0.6887692681935076, 0.6887352327507607, 0.6884563466591075, 0.6867845451601701], 'content': ["According to a study that was published in Scientific Reports, the environmental costs of mining the digital currency Bitcoin are more equivalent to the climatic harm caused by raising beef than they are to the costs of mining gold when expressed as a percentage of the market price.\nAccording to the authors, Bitcoin should be compared to considerably more energy-intensive goods like meat, natural gas, and crude oil rather than being likened to 'd

In [29]:
ans['answers'][0].answer

'more resistant to wild inflation and corrupt banks'

In [27]:
ans['answers'][0].meta.get('content')

["According to a study that was published in Scientific Reports, the environmental costs of mining the digital currency Bitcoin are more equivalent to the climatic harm caused by raising beef than they are to the costs of mining gold when expressed as a percentage of the market price.\nAccording to the authors, Bitcoin should be compared to considerably more energy-intensive goods like meat, natural gas, and crude oil rather than being likened to 'digital gold.'Bitcoin had a market cap of around 960 billion US dollars in December 2021 and accounted for almost 41% of all cryptocurrencies worldwide.",
 'Bitcoin is the currency of the Internet: a distributed, worldwide, decentralized digital money. Unlike traditional currencies such as dollars, bitcoins are issued and managed without any central authority whatsoever: there is no government, company, or bank in charge of Bitcoin. As such, it is more resistant to wild inflation and corrupt banks. With Bitcoin, you can be your own bank.',
 "

## Pipeline

In [29]:
QUESTIONS = [
    "When was Bitcoin created?",
    "How to use the Lightining Network?",
    "What is the process of Bitcoin mining?",
    "How is Bitcoin different from Ethereum?",
    "What is Binance?",
    "What are Bitcoin whales?",
    "What was Bitcoin's price in 2020?",
    "How much was Bitcoin's price in 2021?",
    "Who is the founder of Ethereum?",
    "What is ADA?",
    "How did the Terra blockchain collaspe?"
]

### T5

SB retriever


In [70]:
pipe_GQA = GenerativeQAPipeline(generator=t5_generator, retriever=embedd_retriever)

for question in QUESTIONS:
    res = pipe_GQA.run(query=question, 
                       params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
    
    (print_answers(res, details="all"))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: When was Bitcoin created?
Answers:
[   <Answer {'answer': 'January 3, 2009', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['e78f855ac58aef6ca16f380a6667e07b', '159700ae048958151db904999f347ecc', 'b67559552250c34a8cc12af06ccdc6da', 'de2e2bc7d6532714d5ad2334942149c9', '6b62132efdd8d476494cdfd74228a5f3'], 'doc_scores': [0.5637134904684233, 0.557681140786695, 0.5531961057522989, 0.553056531842014, 0.5528918234010709], 'content': ['January 3, 2009 is not an ordinary date, but the beginning of a revolution that changed the course of money as we know it. Satoshi Nakamoto created the Bitcoin network by mining the first block of the chain, called the Genesis block. Bitcoin started its journey as A Peer-to-Peer Electronic Cash System and now has a trillion-dollar market capitalization with thousands of different cryptocurrencies that followed. In fact, in November 2021, the total ma

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: How to use the Lightining Network?
Answers:
[   <Answer {'answer': 'send and receive payments on the Lightning Network', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['27cd0beeb996647097d0f2a71f42998f', '634567283bec87c354ca5b7a13798a38', '60f8ff7cde10b42596184c40a9768db9', 'b138da8a414ccb813888ebf5296084c0', '75262b2512dc66f1f765aa9cf1f45c65'], 'doc_scores': [0.5639476669566864, 0.549129193527416, 0.5476153015536874, 0.546535798698747, 0.5459032613460874], 'content': ['Lighting Network is a scaling solution that allows transactions on the Bitcoin network to be faster and cheaper. With a larger Lightning Network capacity, more Bitcoin can be tran …\nRead Full Story', 'Lightning Labs is building the ability to use stablecoins on the Lightning Network using the new Taro protocol due to demand in emerging markets.', "Source: Neutronpay\n\nNeutronpay, a Vancouver, Canada and H

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What is Bitcoin mining?
Answers:
[   <Answer {'answer': 'process used to generate new coins and verify new transactions', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['da3d945c6fa1283e0701d984dce654ad', '6c2792ac0167a5155e69fe5c162bb271', 'b956b8ce0694edd16d5ea23adf6f08c', '65d306259283584b6307fd16c1263c54', 'e62b58a98cf006806cca2ad57e6e8552'], 'doc_scores': [0.571245242044047, 0.5688219552394569, 0.5676323396865649, 0.5611203927050911, 0.5607578647151138], 'content': ['Bitcoin mining is the process used to generate new coins and verify new transactions. The process involves vast, decentralized networks of computers …\nRead Full Story', "Bitcoin mining is the process used to generate new coins and verify new transactions. The process involves vast, decentralized networks of computers around the world that verify and secure blockchains, the virtual ledgers that document cr

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What is the difference between Bitcoin and Ethereum?
Answers:
[   <Answer {'answer': 'Bitcoin revolutionized and proved the concept of artificial digital scarcity, but now other cryptocurrencies rule the ball', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['ae9ea104155ec5216cec696a4bc8d004', 'a814905d65efac13bb2e787a824d5181', '8a08d62047fd0597670f655eb0f2efa9', 'd7f6780685c3c3d241bbcec2ae96879e', '69ad142dd7184b01629ece65266768d2'], 'doc_scores': [0.5571493068961343, 0.5555615101339246, 0.5540567330375912, 0.5535426304547756, 0.5534943617496249], 'content': ['In explaining the difference between Bitcoin and other cryptocurrencies, Matt Hamilton, former director of development at Ripple, approached the answer from a nonstandard angle.\nThus, the developer stated that the main difference is that Bitcoin revolutionized and proved the concept of artificial digital scarcity, b

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What is Binance?
Answers:
[   <Answer {'answer': 'cryptocurrency exchange giant', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['274207503808a926dd5b7adc38e77065', '5d2b05314c4a5b94a7994b400624b6e0', '15f60124d1995b2d3fe9ac27c77e605d', '4e47e8f947712d7212c6ffb2258903b2', 'a66c459a7c4de842e5e941a0a33450af'], 'doc_scores': [0.5518496030809471, 0.5513631079408906, 0.5509074803817374, 0.548587592408547, 0.5479703706172456], 'content': ['Joaquin Victor Tacla, Tech Times 07 October 2022, 07:10 am Binance, a cryptocurrency exchange giant, momentarily shut down its blockchain network after cyber hackers stole BNB tokens valued at about $570 million, as reported first by\xa0CNBC. The crypto company reported on Thursday, Oct. 6, that a\xa0cross-chain bridge\xa0connecting with its BNB Chain had been compromised, allowing hackers to remove BNB coins from the network.\xa0 This form of 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What are Bitcoin whales?
Answers:
[   <Answer {'answer': 'crypto whales', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['af27664cc144a92deb1ebf58a3892ae5', 'd859c798bbaf2551f4958487b7ee5be4', 'f7fbb342e39a90969ad4b4c390aa00e', 'cd5347b76b43ff36e0b177a571140f68', '3a3740cbcb81597b9c9fd0d2453ad0c4'], 'doc_scores': [0.5643794611174318, 0.5618503706323196, 0.5611575136922107, 0.5610819417244418, 0.5609328891011023], 'content': ['Bitcoin (BTC) worth millions of dollars is suddenly being purchased by a crypto whale.In early Q4, 2022, Bitcoin, like all the other major cryptocurrencies, failed to end its recession.Some whales, though, are certain that dips are for purchase and the statistics of an unusual address on the Bitcoin network were revealed by anonymous Bitcoin analyst and investor @Capital15C on Twitter.The unknown address began making a lot of aggressive purchases of di

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What was Bitcoin's price in 2020?
Answers:
[   <Answer {'answer': '$20k', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['e6b6cfec233d9202d470f7ccd9ca3e20', '963dc93ae03d5e4e8936321a56f2a555', 'a0a73947090a58c905098f7c3e354ecb', '13129967218780584891ae9bad8d07b4', '85369e9d635ec94e48daa98f2f7d0ddd'], 'doc_scores': [0.561720021880423, 0.5615765767571395, 0.5606089573617276, 0.5605384815476503, 0.5599611084072811], 'content': ["The Bitcoin price is currently trading just under $20k, and many analysts believe that there isn't much resistance before it breaks back over this crucial area and overcomes it.\n\nBitcoin will break $20k again\n\nAs the ATH during the 2018 bull market, the $20k level is a significant one, and one that investors are extremely interested in breaking above once more.\n\nIn an inflationary environment, even with interest rates rising, Bitcoin is well-pois

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: How much was Bitcoin's price in 2021?
Answers:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: Who is the creator of Ethereum?
Answers:
[   <Answer {'answer': 'Satoshi Nakamoto', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['f0997fb8a948048dcf813c6107e120d', '9091eef42f4176eb7ccf9f2bb1ce9ce2', 'e78f855ac58aef6ca16f380a6667e07b', 'd5fa4690e7f34570828770d7d1072955', '67dc8539a6198ef096f979aa4bbc43d8'], 'doc_scores': [0.5594395060398699, 0.5554381180678135, 0.5540113687498042, 0.5537281963043288, 0.5525070231928333], 'content': ["John McAfee is the creator of Bitcoin. This is the remarkable claim of a TikTok influencer who calls himself Harry the Soul Coach.\nHe says, 'John McAfee is the creator of Bitcoin. If we go back and look at his past, he created internet security software McAfee in 1987. So he had backdoor access then.'The influencer goes on to say that he thinks McAfee is still alive.\nJohn McAfee and Conspiracy Theories\nWhile there have been some very wild 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What is ADA?
Answers:
[   <Answer {'answer': 'Cardano', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['5dc5418da69f08d3492e712cd29de81', '743c039131af338dd8f2af771f24d17a', '636f84d7be76456de0ab857c30daba54', 'fedb9dbf090e117ee81e82fcab59f0a3', 'd29ddf60ea9657c80539b359d66b20a0'], 'doc_scores': [0.5400985328535065, 0.5380991941051138, 0.5380864511176288, 0.5377526529776744, 0.5371861440740896], 'content': ["Cardano (ADA) creator Charles Hoskinson is unveiling his take on Bitcoin's (BTC) value proposition as it increasingly becomes used in the realm of decentralized finance (DeFi).\nHoskinson believes 'wrapped' coins that are pegged to the price of Bitcoin will spread like wildfire across a wide swath of smart contract-enabled blockchains. Wrapped tokens have grown in popularity by essentially giving investors – in this case Bitcoin holders – a way to trade and utilize thei

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: How did the Terra blockchain collaspe?
Answers:
[   <Answer {'answer': 'an arrest warrant was issued against him', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['69a4f1f92e18f832d6b49779145b91b3', '28448a2601a767df4c07531171250672', '58fea8106900050732c2e38dbf5c0cb8', 'e78f855ac58aef6ca16f380a6667e07b', '6d96aa22eb5ef2759a643f9d0bde5505'], 'doc_scores': [0.5608760807676864, 0.5540187111711634, 0.5528620903422082, 0.5513777574493502, 0.5484529546644917], 'content': ['Buterin has shared his opinion about the Terra collapseButerin has previously criticized "algostable" and called it "a propaganda term"Do Kwon is the CEO and co-founder of Terraform Labs, the developer behind Terra blockchainFollowing Ethereum\'s monumental success on The Merge, one of its founders, Vitalik Buterin, explained the reason behind Terra\'s collapse and why it caused a major setback in the decentral

DPR retriever


In [30]:
pipe_GQA = GenerativeQAPipeline(generator=t5_generator, retriever=dpr_retriever)

for question in QUESTIONS:
    res = pipe_GQA.run(query=question, 
                       params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
    
    (print_answers(res, details="all"))


Query: When was Bitcoin created?
Answers:
[   <Answer {'answer': '2008', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['b67559552250c34a8cc12af06ccdc6da', 'de2e2bc7d6532714d5ad2334942149c9', '3107a00846fef491728266abbabb5c59', 'a25cc76071b03efed19fe3535373d9a9', '49d82e74e302615bdbb8ee19a9b35291'], 'doc_scores': [0.6909430707528474, 0.683088018888498, 0.6829614265625314, 0.682790060932464, 0.6827812038214758], 'content': ["By Lyle Opolentisima | source:Here Oct 4th, 2022 The history of the world's first crypto began in 2008, when its creator, a Japanese named Satoshi Nakamoto, published the Bitcoin protocol openly.\nFounder Holds Bitcoins worth $1.1 billion\nSatoshi Nakamoto, the founder of bitcoins is said to hold 1 million bitcoins. Today, the value of these bitcoins stands at $1.1 billion.\nNobel Prize for Satoshi Nakamoto\nIn 2015 the bitcoin founder Satoshi Nakamoto got nomi

### RAG

RAG generator did not perform so well

In [34]:
pipe_GQA_2 = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)

for question in QUESTIONS:
    res = pipe_GQA_2.run(query=question, 
                         params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
    
    (print_answers(res, details="all"))


Query: When was Bitcoin created?
Answers:
[   <Answer {'answer': ' 2009', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['de2e2bc7d6532714d5ad2334942149c9', '3107a00846fef491728266abbabb5c59', 'a25cc76071b03efed19fe3535373d9a9', '49d82e74e302615bdbb8ee19a9b35291', 'edd3c9c1fd287eb22d2a730539d82d42'], 'doc_scores': [0.6950108059107912, 0.6878239914940308, 0.6878239914940308, 0.6878239914940308, 0.6874300817937259], 'content': ['Released on April 31st, 2009, Bitcoin is a cryptocurrency (a.k.a. digital money) that allows users to facilitate pseudonymous transactions over the internet. Unlike Fiat money, Bitcoin is decentralized, which means that it is not regulated by a central governing body. Bitcoin users can, therefore, facilitate peer-to-peer cash transactions without the approval of any financial institution.\nGamers enjoy numerous benefits by using Bitcoin. For instance, this c