In [1]:
import os

import streamlit as st
from dotenv import load_dotenv

In [2]:
# !pip install llama-cpp-python

In [3]:
# !pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

In [4]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

In [5]:
# Location of the GGUF model file shared by the embedding function and the LLM.
# Set the LLAMA_MODEL_PATH environment variable to point at your own copy of the
# model; when it is unset, the original author's local path is used as a fallback.
my_model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    r"D:\_programlama\ML\LangChainTutorial_1\models\chat\llama-2-7b-chat.Q3_K_M.gguf",
)

# Load and split document

In [6]:
from langchain.document_loaders import TextLoader

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [7]:
# Plain-text source document that gets loaded and indexed.
original_document_path = "./data/new_faq.txt"

# Chunk size passed to the tiktoken-based text splitter.
token_size = 250

# Directory where the Chroma store is persisted; its name embeds the chunk size
# so stores built with different chunk sizes live side by side.
vector_store_persist_directory = (
    f"./vector-store/asu_ai_db_tr_{token_size}_llama_en"
)

## Embedding function

In [8]:
from langchain_community.embeddings import LlamaCppEmbeddings

In [9]:
# Embedding function handed to Chroma for both building and reopening the store.
# It loads the same GGUF file (my_model_path) that the LlamaCpp LLM uses further down.
# NOTE(review): this is a chat model reused for embeddings — confirm that is intended.
embedding_function = LlamaCppEmbeddings(model_path=my_model_path)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [None]:
# Build the Chroma store only when no persisted copy exists on disk yet;
# otherwise reopen the persisted one. Re-embedding is expensive, and building
# into an already populated directory would insert the same chunks again.
if not os.path.exists(vector_store_persist_directory):
    # Load the source document
    loader = TextLoader(original_document_path, encoding="UTF-8")
    documents = loader.load()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=token_size)

    # Split into chunks, embed them and persist the resulting store
    docs = text_splitter.split_documents(documents)
    db = Chroma.from_documents(
        docs, embedding_function, persist_directory=vector_store_persist_directory
    )
    db.persist()
else:
    # A persisted store is already present: open it with the same embedding function
    db = Chroma(persist_directory=vector_store_persist_directory, embedding_function=embedding_function)

Created a chunk of size 322, which is longer than the specified 250
Created a chunk of size 251, which is longer than the specified 250
Created a chunk of size 260, which is longer than the specified 250


## Callbacks support token-wise streaming

In [None]:
# Callbacks support token-wise streaming: the stdout handler registered here is
# attached to the LLM (via callback_manager=...) when it is constructed below.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

## LLM

In [None]:
from langchain.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate, ChatPromptTemplate

In [None]:
# Tunable llama.cpp settings.
n_gpu_layers = 40  # Adjust to suit your model and the VRAM available on your GPU.
n_batch = 512  # Keep within 1..n_ctx; also limited by the VRAM of your GPU.

# Instantiate the local model; verify that my_model_path is valid on this machine.
llm = LlamaCpp(
    verbose=True,  # Verbose is required to pass to the callback manager
    callback_manager=callback_manager,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    model_path=my_model_path,
)

In [None]:

from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# Example question used to exercise the retriever.
sample_query = "I failed the thesis defense exam."

# Wrap the Chroma retriever in a MultiQueryRetriever driven by the local LLM.
retriever_from_llm = MultiQueryRetriever.from_llm(llm=llm, retriever=db.as_retriever())

# Fetch the documents for the example question and show them.
unique_docs = retriever_from_llm.get_relevant_documents(query=sample_query)
print(unique_docs)
# from langchain.prompts import PromptTemplate

# def answer_my_question(question):
#     unique_docs = retriever_from_llm.get_relevant_documents(query=question)
#     print("unique_docs",unique_docs)
    # template = """Answer the {question} question in accordance with the information in the context given below.\n{context}.
    #  Return only the shortest answer to the question in the answer. If the question is not clear enough, ask it to be asked again by giving question alternatives."""
    # prompt = PromptTemplate(template=template, input_variables=["question","context"])

   
    # llm_chain = LLMChain(prompt=prompt, llm=llm)
    # llm_chain.run(question=question, context=unique_docs)

In [None]:
# answer_my_question("Which master's programs are free?")