In [1]:
import os
import pickle
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import pandas as pd
import re
from tqdm import tqdm
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


In [2]:
# hf embeddings for embedding data

EMBED = "hf"
embeddings = HuggingFaceEmbeddings()


### LlamaCpp using langchain

In [4]:

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [13]:
#download model using
#wget https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin

llm = LlamaCpp(model_path="ggml-model-q4_0.bin", n_ctx=4096, max_tokens = 128, temperature = 0.2, callback_manager=callback_manager)

llama.cpp: loading model from ggml-model-q4_0.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = 'ggml' (old version with low tokenizer quality and no mmap support)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113739.11 KB
llama_model_load_internal: mem required  = 5809.32 MB (+ 1026.00 MB per state)
...................................................................................................
.
llama_init_f

In [100]:

#https://python.langchain.com/en/latest/modules/chains/index_examples/vector_db_qa.html#custom-prompts

template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer: """

prompt = PromptTemplate(template=template, input_variables=["question","context"])

In [101]:
llm_chain = LLMChain(prompt=prompt, llm=llm)


### prediction

In [4]:
# so questions
df = pd.read_csv('top100questions.csv')
df

Unnamed: 0,question
0,How do I check if PyTorch is using the GPU?\n
1,How do I save a trained model in PyTorch?\n
2,What does .view() do in PyTorch?\n
3,Why do we need to call zero_grad() in PyTorch?\n
4,How do I print the model summary in PyTorch?\n
...,...
95,CUDA error: device-side assert triggered on Co...
96,CUDA error: CUBLAS_STATUS_ALLOC_FAILED when ca...
97,dropout(): argument 'input' (position 1) must ...
98,"Issues installing PyTorch 1.4 - ""No matching d..."


In [5]:
queries = df[0:4]['question'].tolist()

In [6]:
queries

['How do I check if PyTorch is using the GPU?\n',
 'How do I save a trained model in PyTorch?\n',
 'What does .view() do in PyTorch?\n',
 'Why do we need to call zero_grad() in PyTorch?\n']

In [122]:

for query in queries:
    print("QUERY: ", query)
    for vectordb in os.listdir('vectorstore/hf_embeddings'):
        source = os.path.splitext(vectordb)[0]
        vectordb = 'vectorstore/hf_embeddings/'+vectordb
        if 'ipynb_checkpoints' in vectordb:
            continue
        db = FAISS.load_local(vectordb, embeddings)
        #db = pickle.load(open(vectordb, 'rb'))
        relevant_docs = db.similarity_search(query, k=4)
        print("From ", source)
        print(llm_chain.run(question=query, context=relevant_docs))
        print("------")
    print("\n============\n")

QUERY:  How do I check if PyTorch is using the GPU?

From  discussion_forum_faiss_index
 Simply checking whether a GPU is “used” might be dangerous as it might be a race with something else that is contending for a GPU. However, if you are confident about the scheduling of jobs, you can try something like nvidia-smi --query-compute-apps=pid,process_name,used_memory,gpu_bus_id --format=csv.
------
From  blogs_faiss_index

You can check if PyTorch is using the GPU by looking at the trace or kinetic trace of your model. You can also use tools such as TensorBoard to monitor how your model is utilizing the GPU.
------
From  so_faiss_index
 These functions should help:

import torch

true

torch.cuDA.is_available()
torch.cuDA.device_count()
torch.cuDA.current_device()
torch.cuDA.device(0)
torch.cuDA.get_device_name(0)

This tells us:
cuda is available and can be used by one device.  Device 0 refers to the GPU geforce gtx 950m, and it is currently chosen by PyTorch.
------
From  docs_faiss_in

In [76]:

queries = [
    "Does PyTorch work on windows 32-bit?",
    "How do I make my experiment deterministic?",
    "How should I scale up my Pytorch models?",
    "Why is my training so slow?"
]

for query in queries:
    print("QUERY: ", query)
    for vectordb in os.listdir('vectorstore/hf_embeddings'):
        source = os.path.splitext(vectordb)[0]
        vectordb = 'vectorstore/hf_embeddings/'+vectordb
        if 'ipynb_checkpoints' in vectordb:
            continue
        db = FAISS.load_local(vectordb, embeddings)
        #db = pickle.load(open(vectordb, 'rb'))
        relevant_docs = db.similarity_search(query, k=4)
        print("From ", source)
        print(llm_chain.run(question=query, context=relevant_docs))
        print("------")
    print("\n============\n")

QUERY:  Does PyTorch work on windows 32-bit?
From  discussion_forum_faiss_index
 Yes, PyTorch works on Windows 32-bit.
------
From  blogs_faiss_index
 Yes, PyTorch does work on Windows 32-bit.
------
From  so_faiss_index
 Yes, PyTorch does work on Windows 32-bit. It has been tested and confirmed to be working by several users.
------
From  docs_faiss_index
 No, PyTorch doesn't work on Windows 32-bit system. Please use Windows and Python 64-bit version.
------


QUERY:  How do I make my experiment deterministic?
From  discussion_forum_faiss_index
 In order to make computationds deterministic on your specific problem on one specific platform and PyTohr, there are a couple of steps to take. […] A number of operations have backwards that use atomicAdd , in particular […] many forms of pooling, padding, and sampling. There currently is no simple way of avoiding non-determinism in these functions.\n \n Does this mean if I follow the guidelines, I will get deterministic results between indivi