In [1]:
import logging
from typing import Dict, List, Optional, Union
from pprint import pprint
import pandas as pd

from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor, EmbeddingRetriever, DensePassageRetriever
from haystack.utils import convert_files_to_docs, print_answers
from haystack.document_stores import InMemoryDocumentStore, FAISSDocumentStore
from haystack.nodes import FARMReader, TransformersReader, RAGenerator, Seq2SeqGenerator
from haystack.pipelines import GenerativeQAPipeline
from haystack.schema import Document

import torch
from transformers import PreTrainedTokenizer, BatchEncoding
# import datasets

%load_ext autoreload
%autoreload 2

In [2]:
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
torch.__version__

'1.12.1+cu113'

In [4]:
torch.version.cuda

'11.3'

#### set proxy (optional)

In [5]:
import getpass
import os

user = getpass.getpass(prompt='username: ')
pw = getpass.getpass(prompt='password: ')

username:  ··········
password:  ·········


In [6]:
proxy1 = f"http://{user}:{pw}@proxy.us.dell.com:80"

os.environ['http_proxy'] = proxy1
os.environ['https_proxy'] = proxy1
os.environ['HTTP_PROXY'] = proxy1
os.environ['HTTPs_PROXY'] = proxy1

# GET documents

In [4]:
# wikitext['train']['text'][:30]

In [3]:
# len(wikitext['train'])

In [7]:
DATA_PATH = '../data/bitcoin_articles.csv'

df = pd.read_csv(DATA_PATH)
df.shape

(2500, 17)

In [8]:
df.head()

Unnamed: 0,article_id,title,author,published_date,link,clean_url,excerpt,summary,rights,article_rank,topic,country,language,authors,media,twitter_account,article_score
0,57a00c1140cbd3af79e77bf0e4e6af48,62% of Bitcoin Has Not Moved in a Year as Long-Term Holders Refuse to Sell,Jamie McNeill,04-10-2022 17:15,https://www.business2community.com/crypto-news/62-of-bitcoin-has-not-moved-i...,business2community.com,"Over the course of the last few years, there has been an impressive trend wh...","Over the course of the last few years, there has been an impressive trend wh...",business2community.com,1595,finance,US,en,Jamie McNeill,https://www.business2community.com/wp-content/uploads/2022/10/btcc.webp,@Jamie_DeFi,8.556426
1,21b48b3731c03466be3fac4be6c7dc67,The Orange Party Issue Playlist,Bitcoin Magazine,05-10-2022 21:17,https://bitcoinmagazine.com/culture/orange-party-issue-playlist,bitcoinmagazine.com,News Links: Russia Legalizing Bitcoin And Crypto Is A Matter Of Time Says Mi...,Russia Legalizing Bitcoin And Crypto Is A Matter Of Time Says Minister Of In...,bitcoinmagazine.com,6284,news,US,en,Bitcoin Magazine,https://bitcoinmagazine.com/.image/t_share/MTkyODIyNDQ0Mjc5NDczMzcz/playlist...,,8.507881
2,77030740ee160ad68c25e4e63515dd77,How Many Bitcoins Are There?,AOL Staff,04-10-2022 21:44,https://www.gobankingrates.com/investing/crypto/how-many-bitcoins-are-there/,gobankingrates.com,"Bitcoin has a maximum supply of 21 million. However, that doesn't tell the w...","Bitcoin has a maximum supply of 21 million. However, that doesn't tell the w...",aol.com,5044,news,US,en,"AOL Staff,David Granahan",https://s.yimg.com/ny/api/res/1.2/wPK4V8gjwjrDYoMuUGxoBw--/YXBwaWQ9aGlnaGxhb...,@AOL,8.483973
3,19285dde79599b6a9b4aa5a58805159b,Bitcoin 2008–2022: From Bouncing Baby To Troubled Teenager,Mark Hooson,03-10-2022 15:45,https://www.forbes.com/advisor/au/investing/bitcoin-2008-2022-from-bouncing-...,forbes.com,"Conceived in 2008 and launched in 2009, bitcoin is the world's first and lar...","Conceived in 2008 and launched in 2009, bitcoin is the world's first and lar...",forbes.com,49,business,US,en,"Forbes Staff,Kevin Pratt,Mark Hooson,Sophie Venz",https://thumbor.forbes.com/thumbor/fit-in/x/https://www.forbes.com/advisor/a...,,8.466393
4,ad1822c7ff1bbd7e38f7f9dbdbce245f,"Bitcoin Lightning Network capacity strikes 5,000 BTC",Joseph Hall,03-10-2022 12:09,https://cointelegraph.com/news/bitcoin-lightning-network-capacity-strikes-5-...,cointelegraph.com,"First created in 2018, the Lighting Network has come under fire recently, wi...",Bear markets are for building out capacity on the layer-2 Lightning Network....,cointelegraph.com,1696,news,US,en,Joseph Hall,https://images.cointelegraph.com/images/1200_aHR0cHM6Ly9zMy5jb2ludGVsZWdyYXB...,@cointelegraph,8.446906


In [9]:
# Use data to initialize Document objects

ids = list(df["article_id"].values)
texts = list(df["summary"].values)
titles = list(df["title"].values)
dates = list(df["published_date"].values)
links = list(df["link"].values)

all_docs = []
for i, title, text, date, link in zip(ids, titles, texts, dates, links):
    all_docs.append(Document(id=i, 
                             content=text, 
                             meta={"name": title or "", 
                                   "link": link or "", 
                                   "date": date or ""}))


# all_docs = [
#     {
#         'id':i, 
#         'content': c,
#         'meta': {'article_link': l, 'article_title': t, 'article_date': d}
#     } for i, c, t, d, l in zip(df['article_id'], df['summary'], df['title'], df['published_date'], df['link'])
# ]

In [10]:
len(all_docs)

2500

In [None]:
# PDFS_PATH="/data/kg_pdfs_test/"
# all_docs = convert_files_to_docs(dir_path=PDFS_PATH)

## Preprocessing 

In [11]:
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=256,  # smaller splits works better? 
    split_respect_sentence_boundary=True,
)

all_docs_process = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(all_docs_process)}")

Preprocessing:   0%|          | 0/2500 [00:00<?, ?docs/s]

n_files_input: 2500
n_docs_output: 2500


In [12]:
all_docs_process[2:5]

[<Document: {'content': "Bitcoin has a maximum supply of 21 million. However, that doesn't tell the whole story. As time goes on, it becomes increasingly difficult to acquire bitcoin due to the asset's economics. Here is a full breakdown of how bitcoin works and exactly why it has a limited supply.Read: Looking To Diversify In A Bear Market? Consider These 6 Alternative InvestmentsBitcoin Tokenomics: Proof of Work, Mining and Halving CyclesTokenomics, as one can imagine, refers to the economics of a token. Similar to how fiat currencies, like the dollar, are issued by governments and regulated through monetary policy, tokenomics refers to the rules and functions revolving around cryptocurrency.", 'content_type': 'text', 'score': None, 'meta': {'name': 'How Many Bitcoins Are There?', 'link': 'https://www.gobankingrates.com/investing/crypto/how-many-bitcoins-are-there/', 'date': '04-10-2022 21:44', '_split_id': 0}, 'embedding': None, 'id': '7afbcf9ee51ca07473ee35af09165d3b'}>,
 <Document

## Document Store 

In [14]:
# In-Memory Document Store
# document_store = InMemoryDocumentStore()


# The FAISSDocumentStore uses a SQL(SQLite in-memory be default) database under-the-hood to store the document text and other meta data. 
# The vector embeddings of the text are indexed on a FAISS Index that later is queried for searching answers.

doc_store_dpr = FAISSDocumentStore(sql_url = "sqlite:///faiss_document_store_dpr.db", 
                                    faiss_index_factory_str="Flat", similarity="dot_product", return_embedding=True)

# doc_store_sbertr = FAISSDocumentStore(sql_url = "sqlite:///faiss_document_store_SB.db", 
#                                     faiss_index_factory_str="Flat", similarity="dot_product", return_embedding=False)

In [15]:
doc_store_dpr.write_documents(all_docs_process)

print(doc_store_dpr.get_document_count())

doc_store_dpr.get_all_documents()[101]

Writing Documents:   0%|          | 0/2500 [00:00<?, ?it/s]

1932


<Document: {'content': 'Bitcoin is the most valuable and well-known cryptocurrency in the world. But this popularity has given way to a lot of traffic on the Bitcoin blockchain, which, in turn, has led to long transaction times. Because of this, Bitcoin transaction accelerators have become widely used. But what is a Bitcoin transaction accelerator, and which is best for you? What Is a Bitcoin Transaction Accelerator? Bitcoin transaction accelerators are websites you can use to hopefully process a Bitcoin transaction faster.', 'content_type': 'text', 'score': None, 'meta': {'name': 'The Best Bitcoin Accelerators to Speed Up Your Transactions', 'link': 'https://www.makeuseof.com/best-bitcoin-accelerators/', 'date': '04-10-2022 11:45', '_split_id': '0'}, 'embedding': None, 'id': '1adc1f10d5d8a889be8079fc401d725e'}>

## Retriever


DPR retriever

In [16]:
# Initialize DPR Retriever to encode documents, encode question and query documents

dpr_retriever = DensePassageRetriever(
    document_store=doc_store_dpr,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO:haystack.modeling.model.language_model:Auto-detected model language: english


In [17]:
# Add / update documents embeddings to index

doc_store_dpr.update_embeddings(retriever=dpr_retriever)

INFO:haystack.document_stores.faiss:Updating embeddings for 1932 docs...


Updating Embedding:   0%|          | 0/1932 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/1936 [00:00<?, ? Docs/s]

SBERT retriever

In [None]:
# Sentence BERT embeddings retriever

# SENT_TRANS_MODEL = "sentence-transformers/all-mpnet-base-v2"
SENT_TRANS_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

embedd_retriever = EmbeddingRetriever(
    document_store=doc_store_sbertr,
    embedding_model=SENT_TRANS_MODEL,
    model_format="sentence_transformers",
    use_gpu=True,
    #embed_meta_fields=False
)

In [None]:
# Important:
# Now that we initialized the Retriever, we need to call update_embeddings() to iterate over all previously indexed documents 
# and update their embedding representation.
# While this can be a time consuming operation (depending on the corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing document embeddings, which is very fast.

doc_store_sbertr.update_embeddings(embedd_retriever)

__save doc store__


In [18]:
# save doc store

# doc_store_sbertr.save("faiss_inadex_SB.faiss")
doc_store_dpr.save("faiss_index_DPR.faiss")

In [19]:
query = "When will the mining difficulty of bitcoin increase?"
query1 = "How to use the Lightining Network?"
query2 = "What are some ethical considerations of Bitcoin?"

DPR retrieve

In [20]:
dpr_ls = [(doc.content, doc.meta) for doc in dpr_retriever.retrieve(query2, top_k=5)]

pprint(dpr_ls)

[('According to a study that was published in Scientific Reports, the '
  'environmental costs of mining the digital currency Bitcoin are more '
  'equivalent to the climatic harm caused by raising beef than they are to the '
  'costs of mining gold when expressed as a percentage of the market price.\n'
  'According to the authors, Bitcoin should be compared to considerably more '
  'energy-intensive goods like meat, natural gas, and crude oil rather than '
  "being likened to 'digital gold.'Bitcoin had a market cap of around 960 "
  'billion US dollars in December 2021 and accounted for almost 41% of all '
  'cryptocurrencies worldwide.',
  {'_split_id': '0',
   'date': '03-10-2022 13:30',
   'link': 'https://www.popularmechanics.co.za/science/bitcoin-as-dangerous-to-the-environment-as-production-of-beef/',
   'name': 'Bitcoin as dangerous to the environment as production of beef',
   'vector_id': '1447'}),
 ('Bitcoin is the currency of the Internet: a distributed, worldwide, '
  'dec

sbert retrieve

# CHECKPOINT 

In [5]:
document_store = FAISSDocumentStore.load("faiss_index_DPR.faiss", 
                                         "faiss_index_DPR.json")

## Retriever


In [6]:
# Initialize DPR Retriever to encode documents, encode question and query documents

dpr_retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
ERROR:posthog:error uploading: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


In [7]:
document_store.get_all_documents()[51] #.embedding.shape

<Document: {'content': "The payroll and job reports from the world's largest economy were slightly above expectations for September.\nHowever, BTC reacted with a sharp price decline, as it typically happens when the US publishes any sort of data lately.\n\nAlthough the numbers for September are below August and the average for 2022, the States added 263,000 new jobs during the month, and the unemployment rate decreased to 3.5%, said the Bureau of Labor Statistics.\nThese figures are slightly above expectations, especially given the aggressive interest rate hike by the nation's central bank following the record-setting inflation.", 'content_type': 'text', 'score': None, 'meta': {'vector_id': '51', 'name': 'Increased Bitcoin Volatility as US Jobs Data Exceeds Expectations', 'link': 'https://cryptopotato.com/increased-bitcoin-volatility-as-us-jobs-data-exceeds-expectations/', 'date': '07-10-2022 14:10', '_split_id': '0'}, 'embedding': '<embedding of shape (768,)>', 'id': '15040e35ce802eb3

# Generator

## Retrieval Augmented Generator

In [None]:
# Initialize RAG Generator

rag_generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=100,
    min_length=2,
    embed_title=True,
    num_beams=2,
)

In [None]:
Q = query2

ans = rag_generator.predict(Q, 
                            documents=dpr_retriever.retrieve(Q, top_k=5), 
                            top_k=3)

# pprint(ans.get('answers'))
pprint(ans)

## T0 Generator

uses ~15gb of GPU ram on A100

In [23]:
class _T0Converter:
    """
    The converter takes documents and a query as input and formats them into a single sequence that a seq2seq model can use it as input for its generation step.
    This includes model-specific prefixes, separation tokens and the actual conversion into tensors. 
    """
    def __call__(self, tokenizer: PreTrainedTokenizer, query: str, documents: List[Document], top_k: Optional[int] = None) -> BatchEncoding:
        # conditioned_doc = "\n".join([d.content for d in documents])
        conditioned_doc = "<P> " + " <P> ".join([d.content for d in documents])

        # concatenate question and support document as a prompt for T0 input
        query_and_docs = "Question: {} \nContext: {} \nAnswer: ".format(query, conditioned_doc,)
        max_source_length = 512
        return tokenizer([query_and_docs], truncation=True, padding=True, max_length=max_source_length, return_tensors="pt")

In [24]:
# bigscience/T0_3B; bigscience/T0pp

t0_generator = Seq2SeqGenerator(
    model_name_or_path="bigscience/T0_3B",
    input_converter=_T0Converter(),
    use_gpu=True,
    top_k=1,
    max_length=100,
    min_length=2,
    num_beams=3,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
ERROR:posthog:error uploading: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


In [25]:
query = "When will the mining difficulty of Bitcoin increase?"
query1 = "How to use the Lightining Network?"
query2 = "What are some ethical considerations of Bitcoin?"

In [26]:
Q = query

ans = t0_generator.predict(Q, 
                           documents=dpr_retriever.retrieve(Q, top_k=5), 
                           top_k=3)

# pprint(ans.get('answers'))
pprint(ans)

{'answers': [<Answer {'answer': 'The halving could occur in 2023', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['10e5b838662e35ca49dbae711ce6679e', '8c28e8b509cba1fc52d4ecf134e39916', 'f4c709b8437af8d09b15fb0522071d88', '3917ec5c8ddea6b9d43c866002b97a7', '1a528b0056f77e0fd0e7f34ec8a1f4e0'], 'doc_scores': [0.6889382212548366, 0.6868771489778508, 0.6856622400220389, 0.684761930812981, 0.6835654031073286], 'content': ["On October 3, the Bitcoin hash rate soared to a new all-time high of 244.25 EH/s. Commenting on this, Binance CEO Changpeng Zhao said, 'miners know something we don't.'Two days later, the hash rate surged yet again, smashing the previous record to print a new all-time high of 314.58 EH/s, further signaling miner confidence despite growing price uncertainty amid crypto winter.Meanwhile, since the November 2021 market top, mining difficulty has also increased but not to

## Pipeline

In [32]:
QUESTIONS = [
    "When was Bitcoin created?",
    "How to use the Lightining Network?",
    "What is Bitcoin mining used for?",
    "How is Bitcoin different from Ethereum?",
    "What is Binance?",
    "What are crypto whales?",
    "What was Bitcoin's price in 2020?",
    "How much was Bitcoin's price in 2021?",
    "Who is the founder of Ethereum?",
    "What is ADA?",
    "How did the Terra blockchain collaspe?"
]

### T0

DPR retriever


In [33]:
pipe_GQA = GenerativeQAPipeline(generator=t0_generator, retriever=dpr_retriever)

for question in QUESTIONS:
    res = pipe_GQA.run(query=question, 
                       params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
    
    (print_answers(res, details="all"))
    # print(res)


Query: When was Bitcoin created?
Answers:
[   <Answer {'answer': 'In 2009, Satoshi Nakamoto published the Bitcoin protocol openly.', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['b67559552250c34a8cc12af06ccdc6da', 'de2e2bc7d6532714d5ad2334942149c9', '3107a00846fef491728266abbabb5c59', 'a25cc76071b03efed19fe3535373d9a9', '49d82e74e302615bdbb8ee19a9b35291'], 'doc_scores': [0.6909430055853804, 0.683087985856457, 0.6829614596016413, 0.6827900113594502, 0.6827811377231348], 'content': ["By Lyle Opolentisima | source:Here Oct 4th, 2022 The history of the world's first crypto began in 2008, when its creator, a Japanese named Satoshi Nakamoto, published the Bitcoin protocol openly.\nFounder Holds Bitcoins worth $1.1 billion\nSatoshi Nakamoto, the founder of bitcoins is said to hold 1 million bitcoins. Today, the value of these bitcoins stands at $1.1 billion.\nNobel Prize for Satoshi Na

ERROR:posthog:error uploading: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))



Query: How to use the Lightining Network?
Answers:
[   <Answer {'answer': 'Bitcoin Lightning Network (BLN) is a layer 2 payment protocol which sits atop Bitcoin and facilitates instantly settled payments measured in milliseconds, is capable of millions of transactions per second and allows for exceptionally low fees, which all combined makes it the ideal payment network of any type, period.', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['60f8ff7cde10b42596184c40a9768db9', '5b6b8aacfcde31dac029ba028451e791', 'd38b30ace360f84e2463abb1c8e96e32', 'ed7963635a701a435ff26667946ebfde', '8e786f9757b067b055e2af81a3c98fcc'], 'doc_scores': [0.6589387135271702, 0.6563229087361326, 0.6556152855590885, 0.6552289254729342, 0.6543070146600914], 'content': ["Source: Neutronpay\n\nNeutronpay, a Vancouver, Canada and Ho Chi Minh City, Vietnam based startup that enables consumers and businesses to s

SB retriever


### RAG

RAG generator did not perform so well

## experiments 

In [21]:
torch.cuda.empty_cache()

In [22]:
del t0_generator