# A basic test of the LangChain framework

## Loading variables

In [None]:
from dotenv import load_dotenv, find_dotenv
# Locate and read the local .env file.
# The return value is bound to `_`; presumably this keeps the notebook from echoing it — confirm.
_ = load_dotenv(find_dotenv())

## Implementing retrieval-augmented generation ([RAG](https://python.langchain.com/docs/use_cases/question_answering/))

![Retrieval-augmented generation](RAG.png)

### Document Loading

In [None]:
from langchain.document_loaders import DirectoryLoader

In [None]:
# NOTE: machine-specific absolute path; point it at your own checkout root.
directory = "/Users/bernal/Documents/ext/GitRepos"

# Pick up every README.md found anywhere below `directory`.
loader = DirectoryLoader(
    path=directory,
    glob="**/README.md",
)

### Splitting (optional)

In [None]:
# Optional chunking step, currently disabled. To enable it, uncomment the three
# lines below and remove the final `splits = loader.load()` assignment.
#from langchain.text_splitter import RecursiveCharacterTextSplitter
#text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
#splits = text_splitter.split_documents(loader.load())
# Without splitting, `splits` is simply whatever the loader returns.
splits = loader.load()

### Storage (Embed and store splits)

In [None]:
from langchain.vectorstores import Chroma
#from langchain.embeddings import OpenAIEmbeddings
# gpt4all
#from langchain.embeddings import GPT4AllEmbeddings
# sentence-transformers
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Alternative embedding back-ends, kept for reference:
#emb = OpenAIEmbeddings()
#emb = GPT4AllEmbeddings()

# Active back-end: a BGE model configured for the CPU with normalized embeddings.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)
emb = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Embed the documents in `splits` and store them in a Chroma vector store.
vectorstore = Chroma.from_documents(embedding=emb, documents=splits)

#### Quick retrieval test

In [None]:
#question = "What is active metadata?"
#docs = vectorstore.similarity_search_with_relevance_scores(question, k=8)

### Retrieval

In [None]:
# Wrap the vector store as a retriever (no arguments passed, so library defaults apply);
# the RAG chain further down uses it to supply the prompt's context.
retriever = vectorstore.as_retriever()

#### Quick retrieval check using the retriever

In [None]:
#question = "What is active metadata?"
#docs = retriever.invoke(question)

### Output (Generate response)

##### LLM: OpenAI model case

In [None]:
from langchain.chat_models import ChatOpenAI
# Bind the chat model to `llm`, the name the chains in the later cells read.
# The binding used to be misspelled `lmm`, so this model was never picked up
# (and `llm` stayed undefined if only this LLM cell was run).
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)
# Old misspelled name kept as an alias in case other code still refers to it.
lmm = llm

##### LLM: LlamaCpp case (local execution)

In [None]:
# llama-cpp-python
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

def get_llm(
    model_path="/Users/bernal/Documents/ext/GitRepos/genrative-ai-knowledge/notebooks/llama2/llama2-13b-tiefighter.Q3_K_S.gguf",
    n_gpu_layers=0,
    n_batch=512,
    n_ctx=4096,
):
    """Build a local ``LlamaCpp`` model that streams generated tokens to stdout.

    Every parameter defaults to the value that used to be hard-coded, so a bare
    ``get_llm()`` call behaves exactly as before, while other machines can pass
    their own model location and tuning values.

    Args:
        model_path: Location of the GGUF model file. Make sure it is correct
            for your system.
        n_gpu_layers: Forwarded to ``LlamaCpp`` as ``n_gpu_layers``. When
            running on Apple Metal, setting it to 1 is enough.
        n_batch: Forwarded as ``n_batch``. Should be between 1 and ``n_ctx``;
            consider the amount of RAM of your Apple Silicon chip.
        n_ctx: Forwarded to ``LlamaCpp`` as ``n_ctx``.

    Returns:
        The configured ``LlamaCpp`` instance.
    """
    # NOTE(review): the default path spells the repository "genrative-ai-knowledge",
    # while the GPT4All cell uses "generative-ai-knowledge" — confirm which one
    # exists on disk.
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

    return LlamaCpp(
        model_path=model_path,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        n_ctx=n_ctx,
        f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
        callback_manager=callback_manager,
        verbose=True,
    )

# Instantiate the LlamaCpp model and bind it to `llm`, the name the chains below read.
llm = get_llm()

##### LLM: GPT4All (local execution, no GPU)

In [None]:
# gpt4all
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

def get_llm(
    model_path="/Users/bernal/Documents/ext/GitRepos/generative-ai-knowledge/notebooks/gpt4all/mistral-7b-openorca.Q4_0.gguf",
):
    """Build a local ``GPT4All`` model that streams generated tokens to stdout.

    This definition shares its name with the LlamaCpp factory from the previous
    cell, so whichever cell is executed last decides what ``get_llm`` returns.

    Args:
        model_path: Local path of the model file. Defaults to the location that
            used to be hard-coded; replace it with your desired local file path.

    Returns:
        The configured ``GPT4All`` instance.
    """
    # Callbacks support token-wise streaming
    callbacks = [StreamingStdOutCallbackHandler()]

    # Verbose is required to pass to the callback manager
    return GPT4All(model=model_path, callbacks=callbacks, verbose=True)

# Instantiate the GPT4All model and bind it to `llm`, replacing any model bound by an earlier cell.
llm = get_llm()

##### Simple LLM test

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Minimal prompt that asks the model to reason step by step.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)

# `question` is the template's only placeholder.
prompt = PromptTemplate(input_variables=["question"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Run one question through the chain; the call stays the last expression of the cell.
question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
llm_chain.run(question)

#### Generation execution

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate

# Prompt for the RAG chain. The text (including the trailing space before each
# of the first four line breaks) is unchanged; it is only written as explicit literals.
template = (
    "Use the following pieces of context to answer the question at the end. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n"
    "Use three sentences maximum and keep the answer as concise as possible. \n"
    'Always say "thanks for asking!" at the end of the answer. \n'
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)
rag_prompt_custom = PromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: Documents returned by the retriever; only ``page_content`` is read.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever's output used to be placed into {context} as-is, i.e. as the
# repr of a list of Document objects (metadata included). Formatting it first
# gives the model plain text only.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt_custom
    | llm
)

rag_chain.invoke("What is active metadata?")