In [25]:
import os
import pickle
import re
import pandas as pd
from tqdm import tqdm
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, ServiceContext, PromptHelper, LLMPredictor, QuestionAnswerPrompt
from langchain.vectorstores.faiss import FAISS
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding



## llama index using custom embeddings and custom llm

In [2]:
file_paths = ['knowledgebase/docs/pages.pkl', 'knowledgebase/so/pages.pkl', 
              'knowledgebase/blogs/pages.pkl', 'knowledgebase/discussion_forum/pages.pkl']

documents = SimpleDirectoryReader(input_files = file_paths).load_data()

In [3]:
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


In [4]:

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

#download model using
#wget https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin

llm = LlamaCpp(model_path="ggml-model-q4_0.bin", n_ctx=4096, callback_manager=callback_manager)

llm_predictor = LLMPredictor(llm=llm)

llama.cpp: loading model from ggml-model-q4_0.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = 'ggml' (old version with low tokenizer quality and no mmap support)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113739.11 KB
llama_model_load_internal: mem required  = 5809.32 MB (+ 2052.00 MB per state)
...................................................................................................
.
AVX = 1 | AV

In [5]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embed_model, chunk_size_limit=4096)

In [6]:

index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 12376668 tokens


In [7]:
QA_PROMPT_TMPL = (
    """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context_str}

Question: {query_str}
Answer: """
)
QA_PROMPT = QuestionAnswerPrompt(QA_PROMPT_TMPL)


In [16]:
# so questions
df = pd.read_csv('top100questions.csv')

queries = df[0:4]['question'].tolist()

queries

['How do I check if PyTorch is using the GPU?\n',
 'How do I save a trained model in PyTorch?\n',
 'What does .view() do in PyTorch?\n',
 'Why do we need to call zero_grad() in PyTorch?\n']

In [23]:
for i in queries:
    print("QUERY: ", i)
    output = index.query(query_str=i, text_qa_template=QA_PROMPT)
    print(output)
    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n')
    print(output.source_nodes)
    print('---------------------------------------------------------------------------')

QUERY:  How do I check if PyTorch is using the GPU?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]


llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =    62.16 ms /    74 runs   (    0.84 ms per run)
llama_print_timings: prompt eval time =  2249.28 ms /    48 tokens (   46.86 ms per token)
llama_print_timings:        eval time =  8860.40 ms /    73 runs   (  121.38 ms per run)
llama_print_timings:       total time = 11180.86 ms

llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =   140.53 ms /   161 runs   (    0.87 ms per run)
llama_print_timings: prompt eval time = 19053.06 ms /   398 tokens (   47.87 ms per token)
llama_print_timings:        eval time = 21743.64 ms /   160 runs   (  135.90 ms per run)
llama_print_timings:       total time = 40966.18 ms
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4471 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 14 tokens



hhWa(h
hPiperdeine pipelines for training a TensorFlow Lite model. The input
   tensor should be the model's parameter or buffer which will be copied
   to the appropriate device before running the module.
"""
import torch
from typing import Any, List

def check_pipeline(model):
    for d in range(1, len(torch.distributed_utils.get_devices())):
        if (d not in torch.distributed_utils.get_devices() or
            model.cuda()) :
            break
    
    return torch.distributed_utils.get_pipeline(model)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

---------------------------------------------------------------------------
QUERY:  How do I save a trained model in PyTorch?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]


llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =   221.31 ms /   252 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 16557.08 ms /   347 tokens (   47.71 ms per token)
llama_print_timings:        eval time = 33875.49 ms /   251 runs   (  134.96 ms per run)
llama_print_timings:       total time = 50704.57 ms

llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =    99.75 ms /   114 runs   (    0.87 ms per run)
llama_print_timings: prompt eval time = 18651.22 ms /   390 tokens (   47.82 ms per token)
llama_print_timings:        eval time = 15169.39 ms /   113 runs   (  134.24 ms per run)
llama_print_timings:       total time = 33939.38 ms
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4905 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 13 tokens


oh {
  function check(url) {
    return torch.hub.check_url(url);
   }
}h
We have an opportunity to refine the existing answer (only if needed) with some more context below.
------------
This function will take a given URL as an argument, and then check it against the list of trusted URLs in the cache. If it is not present in that list, the behaviour will fall back onto the "trust_repo=Fals" option. 
}h
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

---------------------------------------------------------------------------
QUERY:  What does .view() do in PyTorch?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]


llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =   168.10 ms /   256 runs   (    0.66 ms per run)
llama_print_timings: prompt eval time =  5122.03 ms /   109 tokens (   46.99 ms per token)
llama_print_timings:        eval time = 32043.21 ms /   255 runs   (  125.66 ms per run)
llama_print_timings:       total time = 37378.65 ms

llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =    43.40 ms /    50 runs   (    0.87 ms per run)
llama_print_timings: prompt eval time = 18137.59 ms /   380 tokens (   47.73 ms per token)
llama_print_timings:        eval time =  6462.57 ms /    49 runs   (  131.89 ms per run)
llama_print_timings:       total time = 24652.04 ms
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4954 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 12 tokens


view(data){
    console.log("view called") // Not supported: "*=*", "<<=", ">>=", "%=", "^=", "@=", "&=", "//=", "%" operator for some
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[NodeWithScore(node=Node(text='                     |\n+-----------------------------------+-----------------------------------+-----------------------------------+h\n}h\x0c9https://pytorch.org/docs/stable/jit_python_reference.htmlsuh\x0e(h\x08h\nh\x10}ubh\x03)}(h\x06}(h\x08X\x03\x00\x00+-----------------------------------+-----------------------------------+-----------------------------------+\n| 1.2. Notation                     | Not Relevant                      |                                   |\n+-----------------------------------+-----------------------------------+-----------------------------------+\n| 2. Lexical analysis               | Not Relevant                      |                                   |\n+-----------------------------------+---------------------

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =    18.31 ms /    21 runs   (    0.87 ms per run)
llama_print_timings: prompt eval time =  6965.30 ms /   148 tokens (   47.06 ms per token)
llama_print_timings:        eval time =  2461.81 ms /    20 runs   (  123.09 ms per run)
llama_print_timings:       total time =  9448.64 ms

llama_print_timings:        load time =   397.86 ms
llama_print_timings:      sample time =    12.18 ms /    14 runs   (    0.87 ms per run)
llama_print_timings: prompt eval time = 23518.32 ms /   339 tokens (   69.38 ms per token)
llama_print_timings:        eval time =  1690.67 ms /    13 runs   (  130.05 ms per run)
llama_print_timings:       total time = 25224.85 ms
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4347 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 16 tokens


d<br>Answer: False
------------
"""
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

---------------------------------------------------------------------------


### LlamaCpp using langchain

In [26]:

# hf embeddings for embedding data

EMBED = "hf"
embeddings = HuggingFaceEmbeddings()

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(model_path="ggml-model-q4_0.bin", n_ctx=4096, max_tokens = 128, temperature = 0.2, callback_manager=callback_manager)

template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer: """

prompt = PromptTemplate(template=template, input_variables=["question","context"])

llm_chain = LLMChain(prompt=prompt, llm=llm)


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
llama.cpp: loading model from ggml-model-q4_0.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = 'ggml' (old version with low tokenizer quality and no mmap support)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113739.11 

In [27]:

for query in queries:
    print("QUERY: ", query)
    for vectordb in os.listdir('vectorstore/hf_embeddings'):
        source = os.path.splitext(vectordb)[0]
        vectordb = 'vectorstore/hf_embeddings/'+vectordb
        if 'ipynb_checkpoints' in vectordb:
            continue
        db = FAISS.load_local(vectordb, embeddings)
        #db = pickle.load(open(vectordb, 'rb'))
        relevant_docs = db.similarity_search(query, k=4)
        print("From ", source)
        print(llm_chain.run(question=query, context=relevant_docs))
        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n')
        print(relevant_docs)
        print('---------------------------------------------------------------------------')
    print("\n============\n")

INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.


QUERY:  How do I check if PyTorch is using the GPU?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  discussion_forum_faiss_index
 Simply checking whether a GPU is “used” might be dangerous as it might be a race with something else that is contending for a GPU. However, if you are confident about the scheduling of jobs, you can try something like nvidia-smi --query-compute-apps=pid,process_name,used_memory,gpu_bus_id --format=csv.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

---------------------------------------------------------------------------



llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    71.90 ms /    82 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 63340.72 ms /  1243 tokens (   50.96 ms per token)
llama_print_timings:        eval time = 13824.06 ms /    81 runs   (  170.67 ms per run)
llama_print_timings:       total time = 77255.88 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  blogs_faiss_index

You can use the following command to check if PyTorch is using the GPU:

import torch

if torch.device('GPU'):
    print('PyTorch is using the GPU')
else:
    print('PyTorch is not using the GPU')
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='The new PyTorch Profiler (```torch.profiler```) is a tool that brings both types of information together and then builds experience that realizes the full potential of that information. This new profiler collects both GPU hardware and PyTorch related information, correlates them, performs automatic detection of bottlenecks in the model, and generates recommendations on how to resolve these bottlenecks. All of this information from the profiler is visualized for the user in TensorBoard. The new Profiler API is natively supported in PyTorch and delivers the simplest experience available to date where users can profile their models without installing any additional packages


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    55.66 ms /    63 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 60050.76 ms /  1187 tokens (   50.59 ms per token)
llama_print_timings:        eval time = 10380.39 ms /    62 runs   (  167.43 ms per run)
llama_print_timings:       total time = 70503.04 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  so_faiss_index
 These functions should help:

import torch

true

torch.cuDA.is_available()
torch.cuDA.device_count()
torch.cuDA.current_device()
torch.cuDA.device(0)
torch.cuDA.get_device_name(0)

This tells us:
cuda is available and can be used by one device.  Device 0 refers to the GPU geforce gtx 950m, and it is currently chosen by PyTorch.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content="QUESTION: how do i check if pytorch is using the gpu? ANSWER: these functions should help:\n&gt;&gt;&gt; import torch\n\n&gt;&gt;&gt; torch.cuda.is_available()\ntrue\n\n&gt;&gt;&gt; torch.cuda.device_count()\n1\n\n&gt;&gt;&gt; torch.cuda.current_device()\n0\n\n&gt;&gt;&gt; torch.cuda.device(0)\n&lt;torch.cuda.device at 0x7efce0b03be0&gt;\n\n&gt;&gt;&gt; torch.cuda.get_device_name(0)\n'geforce gtx 950m'\n\nthis tells us:\n\ncuda is available and can be used by one device.\ndevice 0 refers to the gpu geforce gtx 950m, and it is currently chosen


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   107.68 ms /   123 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 38151.29 ms /   776 tokens (   49.16 ms per token)
llama_print_timings:        eval time = 18256.06 ms /   122 runs   (  149.64 ms per run)
llama_print_timings:       total time = 56537.82 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  docs_faiss_index



llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    54.44 ms /    62 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 43779.62 ms /   883 tokens (   49.58 ms per token)
llama_print_timings:        eval time =  9352.97 ms /    61 runs   (  153.33 ms per run)
llama_print_timings:       total time = 53200.91 ms



You can check whether PyTorch is using the GPU by calling the `is_available()` function. If it returns True, then PyTorch is using the GPU. You can also use the `current_device()` function to get the currently used device and check if it is a GPU.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

---------------------------------------------------------------------------


QUERY:  How do I save a trained model in PyTorch?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  discussion_forum_faiss_index

You can save the model by using the torch.save() function. The syntax is as follows:

\begin{code}
torch.save(state, filename)
\end{code}

Where state is a dictionary containing the parameters of your model and filename is the path to where you want to save the model. If you want to save the best model, you can use the following syntax:

\begin{code}
torch.save(state, 'model_best.pt')
\end{code}
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content="QUESTION: Saving and loading a model in Pytorch?? If I have a model class and a trainer class. I create an instance of the model and train it. \n model = mymodel()\ntrain = trainer.train(model...) \n \n How can I save the model to a file, after it has been trained and how can I then reload it and continue training? I searched for this but didn’t get an answer. ANSWER: @Rinku_Jadhav2014 unfortunately that tutorial is incomplete to resume training. It will only al


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   102.72 ms /   117 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 67645.73 ms /  1325 tokens (   51.05 ms per token)
llama_print_timings:        eval time = 20244.10 ms /   116 runs   (  174.52 ms per run)
llama_print_timings:       total time = 88019.35 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  blogs_faiss_index
1. Build the PyTorch Runtime in **instrumentation mode** (this is called an **instrumentation build** of PyTorch). This will record the used operators, kernel and features.
2. Run your models through this instrumentation build by using the provided **model_tracer** binary. This will generate a single YAML file that stores all the features used by your model. These features will be preserved in the minimal runtime.
3. Build PyTorch using this selectively-built PyToarc library, to reduce the size of your mobile application!
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='2. Choose a default handler that fits your task (e.g image classification, etc) or author a [custom handler](https://github.com/pytorch/serve/blob/master/docs/custom_service.md#custom-handlers).\n3. [Package your model](https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers#create-model-archive-eager-mode) artifacts (traine


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   109.60 ms /   124 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 63120.46 ms /  1242 tokens (   50.82 ms per token)
llama_print_timings:        eval time = 21060.42 ms /   123 runs   (  171.22 ms per run)
llama_print_timings:       total time = 84318.72 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  so_faiss_index

Found this page on their GitHub repo:

recommended approach for saving a model.

There are two main approaches for serializing and restoring a model.

The first (recommended) saves and loads only the model parameters:

torch.save(the_model.state_dict(), path)

then later:

the_model = themodelclass(*args, **kwargs)

the_model.load_state_dict(torch.load(path))

however in this case, the serialized data is bound to the specific classes and
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='QUESTION: how to save a pytorch model? ANSWER: to save:\n# save the weights of the model to a .pt file\ntorch.save(model.state_dict(), &quot;your_model_path.pt&quot;)\n\nto load:\n# load your model architecture/module\nmodel = yourmodel()\n# fill your architecture with the trained weights\nmodel.load_state_dict(torch.load(&quot;your_model_path.pt&quot;))', metadata={'source': 'https://stackoverflow.com/questions/66821329/'}), Documen


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   112.15 ms /   128 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 56242.07 ms /  1114 tokens (   50.49 ms per token)
llama_print_timings:        eval time = 20971.86 ms /   127 runs   (  165.13 ms per run)
llama_print_timings:       total time = 77353.91 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  docs_faiss_index
 To save a trained model in PyTorch, you can use the torch.save() function. This function takes as an argument the model to be saved and the name of the file where it should be stored. Additionally, you can specify the pickle protocol version (defualt is 4) and the module used for pickling metadata and objects (usually pickle). For example:

torch.save(model, "my_model.pt")
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

---------------------------------------------------------------------------


QUERY:  What does .view() do in PyTorch?




llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    86.82 ms /    98 runs   (    0.89 ms per run)
llama_print_timings: prompt eval time = 74627.90 ms /  1447 tokens (   51.57 ms per token)
llama_print_timings:        eval time = 17289.14 ms /    97 runs   (  178.24 ms per run)
llama_print_timings:       total time = 92027.71 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  discussion_forum_faiss_index

view() takes a tensor and reshape it. A requirement being that the product of the lengths of each dimension in the new shape equals that of the original. Hence a tensor with shape (4,3) can be reshaped with view to one of shape (1,2,4,3). This is useful when you want to create a view of a tensor without creating a copy of it.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='QUESTION: What does .view(-1) do?? I see this line of code \n lab=lab.view(-1)\n \n I am wondering what this code does? I put a breakpoint on it, and as far as I can see the contents of lab are the same both before and after I execute the code. So does this code make any difference? ANSWER: The view(-1) operation flattens the tensor, if it wasn’t already flattened as seen here: \n x = torch.randn(2, 3, 4)\nprint(x.shape)\n> torch.Size([2, 3, 4])\nx = x.view(-1)\nprint(x.shape)\n> torch.Size([24])\n \n It’ll modify the tensor metada


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    73.43 ms /    84 runs   (    0.87 ms per run)
llama_print_timings: prompt eval time = 41912.72 ms /   848 tokens (   49.43 ms per token)
llama_print_timings:        eval time = 12671.39 ms /    83 runs   (  152.67 ms per run)
llama_print_timings:       total time = 54674.44 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  blogs_faiss_index
 .view() is a method of the Tensor object which allows you to view the values stored in a tensor. It returns a list containing the values at each index of the tensor.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='Detailed design discussion on GitHub can be found [here](https://github.com/pytorch/pytorch/issues/30632).\n\n## Python 2 no longer supported\n\nStarting PyTorch 1.5.0, we will no longer support Python 2, specifically version 2.7. Going forward support for Python will be limited to Python 3, specifically Python 3.5, 3.6, 3.7 and 3.8 (first enabled in PyTorch 1.4.0).\n\n\n*We’d like to thank the entire PyTorch team and the community for all their contributions to this work.*\n\nCheers!\n\nTeam PyTorch', metadata={'source': 'https://pytorch.org/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/'}), Document(page_content='## Tracing Mode\n\nThe PyTorch tracer, `torch.jit.trace`, is a function that 


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    33.62 ms /    38 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 61059.54 ms /  1205 tokens (   50.67 ms per token)
llama_print_timings:        eval time =  6224.65 ms /    37 runs   (  168.23 ms per run)
llama_print_timings:       total time = 67330.26 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  so_faiss_index

view() reshape the tensor without copying memory, similar to numpy's reshape().
Given a tensor a with 16 elements:

import torch
a = torch.range(1, 16)

to reshape this tensor to make it a 4 x 4 tensor, use:
a = a.view(4, 4)

now a will be a 4 x 4 tensor. note that after the reshape the total number of elements need to remain the same. Reshaping the tensor to a 3 x 5 tensor would not
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='QUESTION: what is the formal definition for "view" in this context? ANSWER: a &quot;view&quot; is how you interpret this data, or more precisely, the shape of the tensor. for example, given a memory block with 40 contiguous bytes (10 contiguous floats), you can either view it as a 2x5 tensor, or a 5x2 tensor.\nin pytorch, the api to change the view of a tensor is view(). some examples:\npython 3.8.10 (default, sep 28 2021, 16:10:42) \n[gcc 9.3.0] on linux\ntype &quot;help&quot;, &quot;co


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   113.04 ms /   128 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 60514.96 ms /  1195 tokens (   50.64 ms per token)
llama_print_timings:        eval time = 21454.89 ms /   127 runs   (  168.94 ms per run)
llama_print_timings:       total time = 82111.18 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  docs_faiss_index
 .view() is a PyTorch op which allows a tensor to be a "View" of an existing tensor. Viewing a tensor shares the same underlying data with its base tensor, thus avoids explicit data copy, allowing for fast and memory efficient reshape, slipping and element-wise operations. For example, to get a view of an existing tensor "t", you can call t.view(...). 

Supporting "View" allows us to do fast and memory efficient reshape, slipping and element-wise operations.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='Tensor Views\n************\n\nPyTorch allows a tensor to be a "View" of an existing tensor. View\ntensor shares the same underlying data with its base tensor.\nSupporting "View" avoids explicit data copy, thus allows us to do fast\nand memory efficient reshaping, slicing and element-wise operations.\n\nFor example, to get a view of an existing tensor "t", you can call\n"t.view(...)".\n\n   >>> t = torch.rand(4, 


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   100.37 ms /   113 runs   (    0.89 ms per run)
llama_print_timings: prompt eval time = 79778.33 ms /  1536 tokens (   51.94 ms per token)
llama_print_timings:        eval time = 20310.61 ms /   112 runs   (  181.34 ms per run)
llama_print_timings:       total time = 100217.32 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  discussion_forum_faiss_index

In PyTorch, zero_grad() is a context manager which prevents calculating gradients for the given parameters. This can be used when you are evaluating your model and don't need to backward() to calculate gradients and update corresponding parameters. It can also be used to initialize weights with torch.nn.init functions, since you don't need gradients there either. 

In general, it is better to use zero_grad() when possible as it helps reduce the amount of computation needed for training.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='QUESTION: Why is grad None? In the following program, b.grad is None but a.grad is not None. Why is that? \n     a = torch.tensor([[[[1.0]]]]).requires_grad_()     \n    b = torch.nn.functional.interpolate(a, size=(2, 2))\n    loss = torch.sum(b)           \n    loss.backward()               \n    b.grad                        \n    a.grad\n ANSWER: @Hovnatan_Karapetyan\


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    97.20 ms /   111 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 51041.08 ms /  1019 tokens (   50.09 ms per token)
llama_print_timings:        eval time = 17939.17 ms /   110 runs   (  163.08 ms per run)
llama_print_timings:       total time = 69101.06 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  blogs_faiss_index

We need to call `zero_grad()` in PyTorch when we want to clear the gradients of a tensor. This is useful for example when you want to backpropagate through a layer and don't want the gradients to be calculated. It can also be used to reset the gradients of a tensor to 0.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='```python\n>>> x = torch.zeros(1, requires_grad=True)\n>>> with torch.no_grad():\n...     y = x * 2\n>>> y.requires_grad\nFalse\n>>>\n>>> is_train = False\n>>> with torch.set_grad_enabled(is_train):\n...     y = x * 2\n>>> y.requires_grad\nFalse\n>>> torch.set_grad_enabled(True)  # this can also be used as a function\n>>> y = x * 2\n>>> y.requires_grad\nTrue\n>>> torch.set_grad_enabled(False)\n>>> y = x * 2\n>>> y.requires_grad\nFalse\n```\n\n## [`dtypes`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.dtype), [`devices`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#tor


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =    64.20 ms /    72 runs   (    0.89 ms per run)
llama_print_timings: prompt eval time = 82840.42 ms /  1589 tokens (   52.13 ms per token)
llama_print_timings:        eval time = 12995.41 ms /    71 runs   (  183.03 ms per run)
llama_print_timings:       total time = 95921.01 ms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  so_faiss_index



llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   106.05 ms /   120 runs   (    0.88 ms per run)
llama_print_timings: prompt eval time = 44874.64 ms /   904 tokens (   49.64 ms per token)
llama_print_timings:        eval time = 18486.58 ms /   119 runs   (  155.35 ms per run)
llama_print_timings:       total time = 63491.95 ms


 In PyTorch, for every mini-batch during the training phase, we typically want to explicitly set the gradient to zero before starting to do backpropagation (i.e., updating the weights and bias) because PyTorch accumulates the gradient on subsequent backward passes. This accumulating behavior is convenient when training RNNs or when we want to compute the gradient of the loss summated over multiple mini-batches. So, the default action has been set to accumulate (i.e. sum) the gradient on every loss.backward() call.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='QUESTION: why do we need to call zero_grad() in pytorch? ANSWER: in pytorch, for every mini-batch during the training phase, we typically want to explicitly set the gradients to zero before starting to do backpropragation (i.e., updating the weights and biases) because pytorch accumulates the gradients on subsequent backward passes. this accumulating behaviour is convenient whil

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

From  docs_faiss_index
 We need to call zero_grad() in PyTorch because it sets the gradients of all optimized "torch.Tensor" s to zero, which can improve performance and change certain behaviors. For example, if the user tries to access a gradient and perform manual ops on it, a None attribute or a Tensor full of 0s will behave differently. If we don't call zero_grad(), then gradients may not be set to 0 for parameters that did not receive a gradient, which can lead to unexpected results.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<< context >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

[Document(page_content='zero_grad(set_to_none=False)\n\n      Sets the gradients of all optimized "torch.Tensor" s to zero.\n\n      Parameters:\n         **set_to_none** (*bool*) -- instead of setting to zero, set\n         the grads to None. This will in general have lower memory\n         footprint, and can modestly improve performance. However, it\n         changes certain behaviors. For example: 1. When the user\n         


llama_print_timings:        load time =   399.55 ms
llama_print_timings:      sample time =   101.39 ms /   114 runs   (    0.89 ms per run)
llama_print_timings: prompt eval time = 65086.22 ms /  1278 tokens (   50.93 ms per token)
llama_print_timings:        eval time = 19376.65 ms /   113 runs   (  171.47 ms per run)
llama_print_timings:       total time = 84590.08 ms
