In [53]:
from haystack.pipelines import Pipeline
from haystack.nodes import Crawler, PreProcessor, DensePassageRetriever, FARMReader, Seq2SeqGenerator
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import print_answers

In [36]:
!export TOKENIZERS_PARALLELISM=false

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [44]:
#
# Step 1: Get the data, clean it, and store it.
#

# NOTE: You need to run this code just once, every time you create a new Elasticsearch container. Comment it out afterwards.

# Let's create the indexing pipeline. It will contain:
#  1. A Crawler node that fetches text from a website.
#  2. A PreProcessor that makes the documents friendly to the Retriever.
#  3. The Document Store at the end, that receives the documents and stores them. 

crawler = Crawler(
    urls=["https://stackoverflow.com/questions/tagged/pytorch?tab=Votes"],   # Websites to crawl
    crawler_depth=1,    # How many links to follow
    filter_urls=[r"""\/questions\/\d{8}"""],
    output_dir="crawled_files",  # The directory to store the crawled files, not very important, we don't use the files in this example
)

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=500,
    split_respect_sentence_boundary=True,
)

document_store = InMemoryDocumentStore(similarity="dot_product")

indexing_pipeline = Pipeline() 
indexing_pipeline.add_node(component=crawler, name="crawler", inputs=['File'])
indexing_pipeline.add_node(component=preprocessor, name="preprocessor", inputs=['crawler'])
indexing_pipeline.add_node(component=document_store, name="document_store", inputs=['preprocessor'])

indexing_pipeline.run(params={"crawler": {'return_documents': True}})

Preprocessing:   0%|          | 0/15 [00:00<?, ?docs/s]



{'documents': [<Document: {'content': '\n\nStack Overflow\n\nAbout\n\nProducts\n\nFor Teams\n\nStack Overflow\nPublic questions & answers\n\nStack Overflow for Teams\nWhere developers & technologists share private knowledge with coworkers\n\nTalent\n\nBuild your employer brand\n\nAdvertising\nReach developers & technologists worldwide\n\nAbout the company\n\nLoading…\n\ncurrent community\n\nStack Overflow\n\nhelp\nchat\n\nMeta Stack Overflow\n\nyour communities\n\nSign up or log in to customize your list.\n\nmore stack exchange communities\n\ncompany blog\n\nLog in\n\nSign up\n\nStackExchange.ready(function () { StackExchange.topbar.init(); });\nStackExchange.scrollPadding.setPaddingTop(50, 10);\n\nHome\n\nPublic\n\nQuestions\n\nTags\n\nUsers\n\nCompanies\n\nCollectives\n\nExplore Collectives\n\nTeams\n\nStack Overflow for Teams\n– Start collaborating and sharing organizational knowledge.\n\nCreate a free Team\nWhy Teams?\n\nTeams\n\nCreate free Team\n\nCollectives™ on Stack Overflow\n

In [59]:
#
# Step 2: Use the data to answer questions.
#

# NOTE: You can run this code as many times as you like.

# Let's create a query pipeline. It will contain:
#  1. A Retriever that gets the relevant documents from the document store.
#  2. A Reader that locates the answers inside the documents. 

query_model = "facebook/dpr-question_encoder-single-nq-base"
passage_model = "facebook/dpr-ctx_encoder-single-nq-base"

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model=query_model,
    passage_embedding_model=passage_model,
    max_seq_len_query=64,
    max_seq_len_passage=256,
)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [46]:
document_store.update_embeddings(retriever)

Updating Embedding:   0%|          | 0/142 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/144 [00:00<?, ? Docs/s]

In [51]:
reader =  FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

query_pipeline = Pipeline()
query_pipeline.add_node(component=reloaded_retriever, name="retriever", inputs=["Query"])
query_pipeline.add_node(component=reader, name="reader", inputs=["retriever"])

queries = ["How do I initialize weights in PyTorch?","How do I check if PyTorch is using the GPU?","How do I print the model summary in PyTorch?", "How do I save a trained model in PyTorch?","What does .contiguous() do in PyTorch?","What does model.eval() do in pytorch?", "What does model.train() do in PyTorch?", "What does .view() do in PyTorch?", "What's the difference between reshape and view in pytorch?", "Why do we need to call zero_grad() in PyTorch?"]

for query in queries:
    results = query_pipeline.run(query=query)
    print("\n\n")
    print_answers(results, details="medium")
    print("\n\n")




Inferencing Samples:   0%|          | 0/3 [00:00<?, ? Batches/s]





Query: How do I initialize weights in PyTorch?
Answers:
[   {   'answer': 'automatically',
        'context': 'adges9797 bronze badges\n'
                   '\n'
                   '1\n'
                   '\n'
                   '1\n'
                   '\n'
                   'PyTorch often initializes the weights automatically.\n'
                   '\n'
                   '–\xa0Mateen Ulhaq\n'
                   '\n'
                   'Apr 12, 2021 at 11:07\n'
                   '\n'
                   'Add a comment\n'
                   '|\n'
                   '\n'
                   '10 Answer',
        'score': 0.8810760974884033},
    {   'answer': 'set them to be close to zero without being too small',
        'context': 'l rule for setting weights\n'
                   '\n'
                   'The general rule for setting the weights in a neural '
                   'network is to set them to be close to zero without being '
                   'too small.\n'
          

Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]





Query: How do I check if PyTorch is using the GPU?
Answers:
[   {   'answer': "From the official site's get started page",
        'context': "From the official site's get started page, you can check "
                   'if the GPU is available for PyTorch like so:\n'
                   'import torch\n'
                   'torch.cuda.is_available()\n'
                   '\n'
                   'Referenc',
        'score': 0.5022277235984802},
    {   'answer': 'True status',
        'context': '13]: import  torch\n'
                   '\n'
                   'In [14]: torch.cuda.is_available()\n'
                   'Out[14]: True\n'
                   '\n'
                   'True status means that PyTorch is configured correctly and '
                   'is using the GPU alth',
        'score': 0.3514541685581207},
    {   'answer': 'tensor',
        'context': 'ate on tensors allocated in different devices. To see how '
                   'to allocate a tensor to the GPU, see he

Inferencing Samples:   0%|          | 0/3 [00:00<?, ? Batches/s]





Query: How do I print the model summary in PyTorch?
Answers:
[   {   'answer': 'MaxPool2d',
        'context': '_size=(3, 3), stride=(1, 1), padding=(1, 1))\n'
                   '(11): ReLU(inplace)\n'
                   '(12): MaxPool2d(kernel_size=3, stride=2, padding=0, '
                   'dilation=1, ceil_mode=False)\n'
                   ')\n'
                   '(a',
        'score': 0.06543083488941193},
    {   'answer': '<input class="s-input" id="display-name" '
                  'name="display-name" maxlength="30" type="text" value="" '
                  'tabindex="105" placeholder="" />',
        'context': 'relative">\n'
                   '<input class="s-input" id="display-name" '
                   'name="display-name" maxlength="30" type="text" value="" '
                   'tabindex="105" placeholder="" />\n'
                   '</div>\n'
                   '</',
        'score': 0.06323054432868958},
    {   'answer': 'Torchinfo',
                   '\n'
     

Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]





Query: How do I save a trained model in PyTorch?
Answers:
[   {   'answer': '.eval()',
        'context': 'Norm Layers etc. You need to turn them off during model '
                   'evaluation, and .eval() will do it for you. In addition, '
                   'the common practice for evaluating/va',
        'score': 0.29799848794937134},
    {   'answer': 'using either a .pt or .pth file extension',
        'context': 's post.\n'
                   '\n'
                   'A common PyTorch convention is to save models using either '
                   'a .pt or .pth file extension.\n'
                   'Save/Load Entire Model\n'
                   'Save:\n'
                   'path = "username/direct',
        'score': 0.2841210663318634},
    {   'answer': 'the module is in the opposite state, eval mode',
        'context': 'train mode by default. When self.training is False, the '
                   'module is in the opposite state, eval mode.\n'
                   'Of the mo

Inferencing Samples:   0%|          | 0/3 [00:00<?, ? Batches/s]





Query: What does .contiguous() do in PyTorch?
Answers:
[   {   'answer': 'rearrange the memory allocation so that the tensor is C '
                  'contiguous',
        'context': '.is_contiguous()\n'
                   'False\n'
                   '\n'
                   'contiguous() will rearrange the memory allocation so that '
                   'the tensor is C contiguous:\n'
                   '\n'
                   '>>> t.T.contiguous().stride()\n'
                   '(3, 1)\n'
                   '\n'
                   'S',
        'score': 0.5028234720230103},
    {   'answer': 'Returns a contiguous tensor containing the same data as '
                  'self\n'
                  'tensor',
        'context': 'rch documentation:\n'
                   '\n'
                   'contiguous() → Tensor\n'
                   'Returns a contiguous tensor containing the same data as '
                   'self\n'
                   'tensor. If self tensor is contiguous, this fun

Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]





Query: What does model.eval() do in pytorch?
Answers:
[   {   'answer': 'model.load_state_dict(torch.load(filepath))\n'
                  'model.eval()\n'
                  "Note: Don't forget the last line model.eval() this is "
                  'crucial after loading the model',
        'context': 'ter\n'
                   'model.load_state_dict(torch.load(filepath))\n'
                   'model.eval()\n'
                   "Note: Don't forget the last line model.eval() this is "
                   'crucial after loading the model.\n'
                   'Al',
        'score': 0.1589842140674591},
    {   'answer': '"give me a tensor that has these many columns and you '
                  'compute the appropriate number of rows that is necessary to '
                  'make this happen".',
        'context': 'ing the library: "give me a tensor that has these many '
                   'columns and you compute the appropriate number of rows '
                   'that is necessary 

Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]





Query: What does model.train() do in PyTorch?
Answers:
[   {   'answer': 'tells your model that you are training the model',
        'context': 'nswer.\n'
                   '\n'
                   'Show activity on this post.\n'
                   '\n'
                   'model.train() tells your model that you are training the '
                   'model. This helps inform layers such as Dropout and Batc',
        'score': 0.5722594261169434},
    {   'answer': 'Highest score (default)\n'
                  '\n'
                  'Trending (recent votes count more)\n'
                  '\n'
                  'Date modified (newest first)\n'
                  '\n'
                  'Date created (oldest first)\n'
                  '\n'
                  'This answer is useful\n'
                  '\n'
                  '253\n'
                  '\n'
                  'This answer is not useful\n'
                  '\n'
                  'Save this answer.\n'
                  '\n'


Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]





Query: What does .view() do in PyTorch?
Answers:
[   {   'answer': 'reshapes the tensor without copying memory',
        'context': 'ave this answer.\n'
                   '\n'
                   'Show activity on this post.\n'
                   '\n'
                   'view() reshapes the tensor without copying memory, similar '
                   "to numpy's reshape().\n"
                   'Given a tensor a with ',
        'score': 0.568334698677063},
    {   'answer': 'return a tensor with the new shape',
        'context': 'is post.\n'
                   '\n'
                   'torch.view has existed for a long time. It will return a '
                   'tensor with the new shape. The returned tensor will share '
                   'the underling data with t',
        'score': 0.36859554052352905},
    {   'answer': 'It will return a tensor with the new shape. The returned '
                  'tensor will share the underling data with the original '
                  'tenso

Inferencing Samples:   0%|          | 0/3 [00:00<?, ? Batches/s]





Query: What's the difference between reshape and view in pytorch?
Answers:
[   {   'answer': "doesn't guarantee data sharing",
        'context': "reshape doesn't impose any contiguity constraints, but "
                   "also doesn't guarantee data sharing. The new tensor may be "
                   'a view of the original tensor, or it',
        'score': 0.535275399684906},
    {   'answer': 'reshape() can operate on both contiguous and non-contiguous '
                  'tensor',
        'context': 'w beforehand.\n'
                   '\n'
                   'Another difference is that reshape() can operate on both '
                   'contiguous and non-contiguous tensor while view() can only '
                   'operate on contiguo',
        'score': 0.4322892129421234},
    {   'answer': 'more robust',
        'context': '\n'
                   '\n'
                   'Save this answer.\n'
                   '\n'
                   'Show activity on this post.\n'
        

Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]





Query: Why do we need to call zero_grad() in PyTorch?
Answers:
[   {   'answer': 'to not face the wrong accumulated results',
        'context': 'optimizer to zero every time the past value it may get add '
                   'up and changes the result.\n'
                   'So we use zero_grad to not face the wrong accumulated '
                   'results.\n'
                   '\n',
        'score': 0.7640361189842224},
    {   'answer': 'we do not want past gardients or past results to interfere '
                  'with our current results',
        'context': 'use when we start a training loop we do not want past '
                   'gardients or past results to interfere with our current '
                   'results beacuse how PyTorch works as it ',
        'score': 0.5619258284568787},
    {   'answer': 'in each train step we want to compute new gradients',
        'context': 'rain loop we would call optim.zero_grad() because in each '
                   "train step we 

# Fine Tuned Retriever

In [66]:
# save_dir = "./saved_models/dpr"
# reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=document_store)


In [67]:
# document_store.update_embeddings(reloaded_retriever)

In [68]:
# reader =  FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# query_pipeline = Pipeline()
# query_pipeline.add_node(component=reloaded_retriever, name="retriever", inputs=["Query"])
# query_pipeline.add_node(component=reader, name="reader", inputs=["retriever"])

# queries = ["How do I initialize weights in PyTorch?","How do I check if PyTorch is using the GPU?","How do I print the model summary in PyTorch?", "How do I save a trained model in PyTorch?","What does .contiguous() do in PyTorch?","What does model.eval() do in pytorch?", "What does model.train() do in PyTorch?", "What does .view() do in PyTorch?", "What's the difference between reshape and view in pytorch?", "Why do we need to call zero_grad() in PyTorch?"]

# for query in queries:
#     results = query_pipeline.run(query=query)
#     print("\n\n")
#     print_answers(results, details="medium")
#     print("\n\n")


# Generative QA

In [63]:
# generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

In [64]:
# from haystack.pipelines import GenerativeQAPipeline

# pipe = GenerativeQAPipeline(generator, retriever)

In [65]:
# queries = ["How do I initialize weights in PyTorch?","How do I check if PyTorch is using the GPU?","How do I print the model summary in PyTorch?", "How do I save a trained model in PyTorch?","What does .contiguous() do in PyTorch?","What does model.eval() do in pytorch?", "What does model.train() do in PyTorch?", "What does .view() do in PyTorch?", "What's the difference between reshape and view in pytorch?", "Why do we need to call zero_grad() in PyTorch?"]

# for query in queries:
#     results = pipe.run(
#         query=query, params={"Retriever": {"top_k": 3}}
#     )
#     print("\n\n")
#     print_answers(results, details="medium")
#     print("\n\n")
    