In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
from langchain import HuggingFacePipeline
from langchain.document_loaders import TextLoader, DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings 

from langchain.vectorstores import FAISS

from langchain.chains.question_answering import load_qa_chain

### Loading LLM -> tiiuae/falcon-7b-instruct

In [4]:
repo_id = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
base_model = AutoModelForCausalLM.from_pretrained(repo_id, load_in_8bit=True, device_map="auto", trust_remote_code=True)

pipe = transformers.pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=512,
    max_new_tokens = 300,
    do_sample=True,
    num_return_sequences=1,
    eos_token_id = tokenizer.eos_token_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"temprature": 0.1})

2023-06-12 19:05:21.258270: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-12 19:05:21.764967: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-12 19:05:21.765012: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/srivatsa/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda112.so
CUDA SETUP: CUDA runtime path found: /home/srivatsa/anaconda3/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 112
CUDA SETUP: Loading binary /home/srivatsa/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda112.so...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The model 'RWForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForC

### Creating Embeddings

In [5]:
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

### Creating Documents

In [6]:
try:
    vector_db = FAISS.load_local("faiss_index", embedding)
except FileIOReader as err:
    loader = DirectoryLoader("./research_papers/", glob="./*.pdf", loader_cls=PyPDFLoader)
    research_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(research_documents)
    print(len(texts))

### Creating DB from Documents

In [7]:
# embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# vector_db = FAISS.from_documents(
#     documents=texts,
#     embedding=embedding
# )

In [7]:
#vector_db.save_local("faiss_index")

try:
    vector_db = FAISS.load_local("faiss_index", embedding)
except Exception as exp:
    vector_db = FAISS.from_documents(
    documents=texts,
    embedding=embedding
    )
    vector_db.save_local("faiss_index")
    vector_db = FAISS.load_local("faiss_index", embeddings)

In [8]:
# vector_db.similarity_search("what is self attention ?")

### Creating LLM Chain

In [9]:
qa_chain = load_qa_chain(llm=llm, chain_type="stuff")

In [10]:
def answer_question(query):    
    answer = qa_chain.run(
        input_documents=vector_db.similarity_search(query),
        question=query
    )
    print(answer)

In [11]:
query = "what is a transformer?"
answer_question(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


 A transformer is a type of ML model that typically converts the input text to a numerical value and outputs a representation of the text as a numerical value. These text-to-numeric models are popular in NLP tasks such as language translation and summarization.

The Transformer model is a popular NLP model that is used to generate text. It is able to generate text from previous texts and can be used to answer questions asked on the Transformer model. It is often used for summarization and translation tasks.
The Transformer model is a type of Neural Network used to answer language-related questions such as translation, summarization, or classification. The Transformer model is composed of three parts: a data model, an encoding model and a decoding model.
A common type of text summarization model.
The transformer model is used to generate text with or without any training dataset. It is able to generate text with many different structures and can be used in natural language processing ta

In [13]:
query = "How is a transformer different from recurrent neural networks ?"
answer_question(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



A transformer is a multi-layered neural network with one encoder layer and multiple decoder layers. Instead of using the traditional self-attention of neural networks, transformers use position-based attention as well as context representation to encode and decode sequences. So, while recurrent neural networks may use the same input sequence to generate sequence-to-sequence tasks, transformers use the position of the elements in the input sequence to generate translation tasks.


In [14]:
query = "what are the different kinds of transformer networks ? Can you explain each of them ?"
answer_question(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


 A Transformer network is composed of several layers of neural networks, each stacked
on top of each other to recreate the functionality of a transformer. The number and types of layers
in the network depend on its desired functionality, the number of tasks it will be handling
and, most importantly, its accuracy. They are divided into (1) Encoder, (2) Decoder, (3) Encoder-Decoder (E-D),
Convolutional Neural Networks (CNNs), and Recurrent Neural Networks (RNNs). The main difference
between the last two (the convolutional neural networks and recurrent neural networks) is that
CNNs can use variable-sized input data and use more computing power than fixed 2D trans-
formers, while the former can be used with a very wide range of input sizes, with no
computing power penalty and with little information loss.
4 Encoder: A single-layer (2-dimensional) neural network that performs each of the basic computa-
tional layers. This layer is used to generate the low-level features of the model, while 

In [29]:
# import gc

# effnet_b7 = None
# gc.collect()

# torch.cuda.empty_cache()