In [None]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change into the Google Drive folder that contains the datasets.
# This folder will also be used to save the model.
%cd /content/drive/MyDrive/Langchain_Llama2_Lab

In [None]:
# Install python packages
!pip install -r requirements.txt

In [None]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.schema import TextNode
import torch
import os

In [None]:

# Runtime configuration: Hugging Face environment variables, project paths
# and the compute device used by the embedding model.
print("Please replace the text with your hugging face access token:")
os.environ.update(
    {
        "HF_HOME": "/content/huggingface",  # Replace with your desired directory
        # NOTE(review): "HF_HOME_TOKEN" does not look like a variable the
        # Hugging Face libraries read (their docs describe HF_TOKEN) — confirm
        # that something actually consumes it.
        "HF_HOME_TOKEN": "PLEASE_REPLACE_IT_WITH_YOUR_HF_ACCESS_TOKEN",
    }
)

# Locations inside the mounted Drive project folder.
result_dir = '/content/drive/MyDrive/Langchain_Llama2_Lab/results'
data_folder_path = '/content/drive/MyDrive/Langchain_Llama2_Lab/data/'
vectorstore_path = '/content/drive/MyDrive/Langchain_Llama2_Lab/vectorstore/db_faiss/'

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
class Document:
    """Minimal record pairing a piece of text with an identifier.

    Note: a later cell imports ``Document`` from ``llama_index.schema``,
    which rebinds this name from that point on.
    """

    def __init__(self, text, id):
        """Store ``text`` and ``id`` unchanged as instance attributes."""
        # `id` shadows the builtin inside this method only; the parameter name
        # is part of the constructor's signature, so it is kept as is.
        self.text, self.id = text, id

In [None]:
def get_documents():
    """Load the files matching ``*.pdf`` in ``data_folder_path``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when each matched file is
        read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(
        data_folder_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=False,
    )
    return pdf_loader.load()

In [None]:
def build_vectorstore(chunk_size=120, chunk_overlap=0):
    """Build a FAISS vector store from the project's PDFs and save it to disk.

    The PDFs returned by ``get_documents()`` are split into chunks, embedded
    with the ``sentence-transformers/all-MiniLM-L6-v2`` model on ``device``,
    indexed with FAISS and written to ``vectorstore_path``.

    Args:
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 120.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to 0.

    Raises:
        ValueError: If splitting the loaded PDFs produced no chunks (for
            example because the data folder holds no PDF files).
    """
    # Load PDF files from data directory
    documents = get_documents()

    # Split text from PDF into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Fail with an explicit message instead of an IndexError further down.
        raise ValueError(
            f"No text chunks were produced from the PDF files in {data_folder_path!r}"
        )

    # Load embeddings model
    embedding_function = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': device},
    )

    # Build and persist FAISS vector store
    vector_database = FAISS.from_documents(texts, embedding_function)
    vector_database.save_local(vectorstore_path)
    print("Vector store created in: ", vectorstore_path)

In [None]:
def get_vectorstore():
    """Reload the FAISS vector store saved at ``vectorstore_path``.

    The store is loaded with a ``sentence-transformers/all-MiniLM-L6-v2``
    embedding model placed on ``device``.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    # NOTE(review): some newer LangChain releases require an explicit opt-in
    # flag before loading a pickled store — confirm against the pinned version.
    return FAISS.load_local(
        vectorstore_path,
        HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': device},
        ),
    )

In [None]:
# Build the FAISS vector store from the PDFs and save it under vectorstore_path
build_vectorstore()

In [None]:
# Reload the saved vector store; generateResponseText() reads this global
loaded_vectorstore = get_vectorstore()

In [None]:
def generateResponseText(prompt, top_k=1):
    """Return the text of the stored chunk(s) most similar to ``prompt``.

    Args:
        prompt: Query string searched against ``loaded_vectorstore``.
        top_k: Number of most-similar chunks to retrieve. Defaults to 1.

    Returns:
        The ``page_content`` of the retrieved chunks joined with newlines
        (just the single chunk's text for the default ``top_k=1``), or an
        empty string when the search returns nothing.
    """
    # The vector store's similarity_search takes the result count as `k`.
    # The previous `top_k=1` keyword was not that parameter, so the intended
    # limit of one result was never applied.
    matches = loaded_vectorstore.similarity_search(prompt, k=top_k)
    return "\n".join(match.page_content for match in matches)

In [None]:
# Example questions to try:
#   "What is the name of your company?"
#   "What is the product lines of your company?"
#   "What are your services?"
#
# Interactive query loop: each input is answered from the vector store.
# Submitting an empty input ends the loop so the remaining cells can run;
# without an exit condition the cell could only be stopped by interrupting
# the kernel.
print("Submit an empty input to stop asking questions.")
while True:
    prompt = input("Enter your input (press Enter when done): " + " " * 5)
    if not prompt.strip():
        break
    print(generateResponseText(prompt))

In [None]:
from llama_index import (
    SimpleDirectoryReader,
    LLMPredictor,
    PromptHelper, GPTListIndex
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.schema import BaseNode, Document


In [None]:
# Sizing values handed positionally to PromptHelper.
# NOTE(review): the meaning of each positional slot depends on the installed
# llama_index version — confirm the order matches the constructor in use.
max_input_size = 1024  # NOTE(review): the LlamaCPP cell uses context_window=3900 — confirm the difference is intended
num_output = 100
max_chunk_overlap = 0
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

In [None]:
# https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF
# Create the LlamaCPP LLM from the llama-2-13b-chat.Q4_0.gguf file at model_url.
llm = LlamaCPP(
    model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf",
    model_path=None,  # no local model file is given; only model_url is set
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    # presumably forwarded to llama.cpp, with 1 being the number of layers
    # offloaded to the GPU — TODO confirm
    model_kwargs={"n_gpu_layers": 1},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

In [None]:
# Wrap the LlamaCPP model in an LLMPredictor; it is passed to GPTListIndex in a later cell
llm_predictor = LLMPredictor(llm)

In [None]:
def add_id_to_documents(documents):
    """Convert loaded documents into ``TextNode`` objects with positional ids.

    Args:
        documents: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A list with one ``TextNode`` per input document, in input order. Each
        node's text is the document's ``page_content`` and its ``id_`` is the
        document's zero-based position rendered as a string ("0", "1", ...).
    """
    # The id is passed as an explicit string rather than a bare int, so the
    # result does not depend on the node class coercing the value.
    return [
        TextNode(text=document.page_content, id_=str(position))
        for position, document in enumerate(documents)
    ]

In [None]:
# Reload the PDFs and convert them into TextNode objects with positional ids,
# then print the whole list.
docs = add_id_to_documents(get_documents())
print(docs)

In [None]:
# Print the nodes one at a time (one print call per node)
for doc in docs:
    print(doc)

In [None]:
# Build a list index over the TextNode objects in `docs`.
# NOTE(review): llm_predictor / prompt_helper are passed directly as keyword
# arguments; some llama_index releases expect them to be supplied through a
# ServiceContext instead — confirm the installed version honours them here.
index = GPTListIndex(docs, llm_predictor=llm_predictor, prompt_helper=prompt_helper)

In [None]:
# Ask one question through the index's default query engine and print the answer
response = index.as_query_engine().query("What is your product lines?")
print(response)