# RAG

## Initialize

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import AutoModel, pipeline, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
import torch
import transformers
transformers.set_seed(42)
import dotenv
import wandb
import os

# Load environment variables from the local .env file; override=True lets
# values from the file replace variables that are already set.
dotenv.load_dotenv("./.env", override=True)
# Log in to Weights & Biases with the key read from WANDB_API_KEY.
wandb.login(key=os.getenv('WANDB_API_KEY'))

In [None]:

# Checkpoint to load. The chat-tuned LeoLM model is the active choice; the
# smaller German GPT-2 checkpoint is kept as a commented-out alternative.
#model_name = "dbmdz/german-gpt2"
model_name = "LeoLM/leo-hessianai-7b-chat"

# Load the causal LM with 4-bit NF4 quantization (bitsandbytes).
# The quantization settings are passed only through `quantization_config`:
# recent transformers releases raise a ValueError when `load_in_4bit` /
# `load_in_8bit` are given as keyword arguments together with a
# `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    # Cap the memory that may be used on every visible GPU.
    max_memory={i: '24000MB' for i in range(torch.cuda.device_count())},
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    ),
)

# Load the tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import chromadb.utils.embedding_functions as embedding_functions

# Hugging Face API key, read from the HUGGINGFACE_API_KEY_READ environment variable
hf_api_key = os.getenv('HUGGINGFACE_API_KEY_READ')
# Embedding function used when creating the Chroma collection.
embedding_function_hf = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=hf_api_key,
    model_name="sentence-transformers/all-MiniLM-L6-v2", # use an embedding model here, not an LLM
)

In [None]:
# if you're having an sqlite3 error, you may have to uncomment this line see: https://docs.trychroma.com/troubleshooting#sqlite
#import settings 
import chromadb

# Persistent client, so the collection data is kept between sessions.
chroma_client = chromadb.PersistentClient()
# In-memory alternative (disabled):
#chroma_client = chromadb.EphemeralClient()

# Open the 'ams_content' collection, creating it if it does not exist yet.
ams_content_collection = chroma_client.get_or_create_collection(name='ams_content', embedding_function=embedding_function_hf)


In [None]:
# Select the compute device. `device` is defined in both branches so that
# later cells can rely on it on CPU-only machines as well.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('No GPU available, using the CPU instead.')
    

## Inference

### Helper Functions etc.

In [None]:
def query_llm(system_prompt, chroma_list, model):
    """Run one text-generation call for a system prompt plus user content.

    Args:
        system_prompt: Instruction text placed on the ``system:`` line.
        chroma_list: User content (retrieved contexts and question) placed
            on the ``user:`` line.
        model: Model handed to the ``text-generation`` pipeline; the
            module-level ``tokenizer`` is used alongside it.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    # NOTE(review): the prompt is a plain "role: content" transcript; an
    # earlier ChatML-style variant (<|im_start|>/<|im_end|> markers) had been
    # commented out here — confirm which format the model expects.
    input_text = f"system: {system_prompt}\nuser: {chroma_list}"

    # A fresh pipeline is built on every call, as before.
    generator = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=8000)
    outputs = generator(input_text)
    return outputs[0]['generated_text']
    
def chromadb_to_list(search_result, query=None):
    """Join the retrieved documents into a single context string.

    Args:
        search_result: Chroma query result; only
            ``search_result['documents'][0]`` (the documents returned for the
            first query text) is read.
        query: Optional user question. When given, it is appended after the
            contexts, separated by a ``-----`` rule.

    Returns:
        The documents separated by ``---`` rules, followed by the question
        when ``query`` is not ``None``.
    """
    # The question is now an explicit argument: the previous implementation
    # read an undefined global named `query` and failed with a NameError.
    contexts = "\n\n---\n\n".join(search_result['documents'][0])
    if query is None:
        return contexts
    return contexts + "\n\n-----\n\n" + query


def search_docs(user_query, model):
    """Retrieve matching documents for a question and let the LLM answer it.

    Args:
        user_query: The user's question; used both as the retrieval query and
            as the question appended to the retrieved contexts.
        model: Model forwarded to ``query_llm``.

    Returns:
        Whatever ``query_llm`` returns for the module-level ``system_prompt``
        and the assembled context string.
    """
    search_result = ams_content_collection.query(query_texts=user_query, n_results=10, include=['documents'])
    chroma_list = chromadb_to_list(search_result, user_query)
    return query_llm(system_prompt, chroma_list, model=model)

### Prompt & execution

In [None]:
# German system prompt for the career-counselling Q&A bot.
# NOTE(review): "Berufsausichten" looks like a typo for "Berufsaussichten";
# the text is left unchanged here.
system_prompt = (
    "Du bist ein Q&A Berufsberatungsbot. "
    "Du beantwortest die Fragen des Users anhand der Informationen über jeder Frage. "
    "Gehe auf Tätigkeitsmerkmale, Anforderungen, Ausbildung, Beschäftigungsmöglichkeiten, Gehalt und Berufsausichten ein. "
    'Wenn du etwas nicht weißt, sage wahrheitsgemäß "Ich weiß es nicht".\n'
)

In [None]:
# Example question (German) sent through retrieval and generation.
user_query = "Ich bin 15 Jahre alt, gut im Umgang mit Menschen und möchte gerne einen Beruf erlernen, in dem ich viel mit Menschen zu tun habe."
# `result` holds the text returned by search_docs for this question.
result = search_docs(user_query, model=model)

In [None]:
# Render the generated answer as Markdown in the notebook output
from IPython.display import Markdown, display
display(Markdown(result))

In [None]:
# Ad-hoc retrieval check: fetch the 10 best-matching documents for "Tiere".
search_result = ams_content_collection.query(query_texts="Tiere", n_results=10, include=['documents']) 
# Bare expression so the notebook shows the raw query result.
search_result
# Alternative ways to inspect the joined contexts (disabled):
#display(Markdown(chromadb_to_list(search_result)))
#print(chromadb_to_list(search_result))

# Langchain

In [None]:
import os
from langchain.agents import SimpleAgent
from langchain.toolkits import CustomToolkit
from langchain.tools import CustomTool
from langchain.utilities import LangChainLM

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

import torch

# Check if CUDA is available for GPU support
# NOTE(review): the result of this call is discarded, so it has no effect on
# the settings below.
torch.cuda.is_available()

n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
# NOTE(review): the path below is a hard-coded absolute path on one specific
# machine and must be adapted before this cell can run elsewhere.
llm = LlamaCpp(
    model_path="/Users/rlm/Desktop/Code/llama.cpp/models/openorca-platypus2-13b.gguf.q4_0.bin",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

class JobExtractionTool(CustomTool):
    """Tool that asks the LLM to name the job that best fits a user's question."""

    def __init__(self, llm):
        """Build the extraction prompt and the LLM chain.

        Args:
            llm: Language model handed to ``LLMChain``.
        """
        # NOTE(review): the base-class initializer is not called (unchanged
        # from the original) — confirm that CustomTool does not require it.
        # System prompt that sets the context for the LLM
        self.system_prompt = (
            "system\n"
            "Extrahiere aus der Frage des Users den für ihn am besten passenden Job und retouniere genau diesen und nur diesen.\n\n"
        )
        # The user's question is a template variable, so each call renders
        # system prompt + user turn + assistant cue exactly once. Previously
        # the template had no input variable, while `run` still passed a
        # prompt that repeated the system text.
        template = self.system_prompt + "user\n{user_query}\nassistant\n"
        self.llm_chain = LLMChain(
            prompt=PromptTemplate(template=template, input_variables=["user_query"]),
            llm=llm,
        )

    def run(self, user_query):
        """Return the first word of the LLM's answer as the extracted job.

        Args:
            user_query: The user's free-text question.

        Returns:
            The first whitespace-separated token of the response, or ``None``
            when the response is empty or contains only whitespace.
        """
        # Get the LLM's response for the rendered prompt
        response = self.llm_chain.run(user_query=user_query)

        # Extract the first word as the job (customize this part as needed).
        # Splitting first avoids an IndexError on whitespace-only responses.
        words = response.split() if response else []
        return words[0] if words else None

# Tool meant to query the AMS content collection in ChromaDB
class AMSQueryTool(CustomTool):
    """Placeholder lookup tool for AMS content about a job."""

    def run(self, job):
        """Return a dummy document string for ``job``.

        No ChromaDB query is made yet; the real lookup still has to be
        implemented here.
        """
        return f"Dummy documents related to {job}"

# Initialize the tools: the extraction tool wraps the LlamaCpp model `llm`,
# the AMS query tool takes no arguments.
job_extraction_tool = JobExtractionTool(llm)
ams_query_tool = AMSQueryTool()

# Toolkit that bundles the two career-counselling tools
class CareerCounselingToolkit(CustomToolkit):
    """Toolkit exposing the job-extraction tool and the AMS query tool."""

    def get_tools(self):
        """Return the module-level tool instances, extraction tool first."""
        tools = [job_extraction_tool, ams_query_tool]
        return tools

# Initialize the toolkit
toolkit = CareerCounselingToolkit()

# Create the agent
# NOTE(review): `agent` is not referenced again in the visible part of this
# file; handle_user_query calls the tools directly — confirm it is needed.
agent = SimpleAgent(llm, [toolkit])

# Entry point that answers a single user query
def handle_user_query(query):
    """Extract a job from ``query`` and build a reply from the AMS lookup.

    Args:
        query: The user's free-text question.

    Returns:
        A sentence combining the extracted job with the documents returned by
        the AMS query tool, or a fixed fallback message when the extraction
        tool yields nothing truthy.
    """
    job = job_extraction_tool.run(query)
    if job:
        # Look up content for the extracted job and wrap it in the reply.
        ams_documents = ams_query_tool.run(job)
        return f"Based on your interest in {job}, here is some information: {ams_documents}"
    return "Unable to identify a specific job in your query."

# Example usage
# NOTE: this rebinds the notebook-level `user_query` that the RAG cells
# earlier in this file also assign.
user_query = "I'm interested in becoming a software engineer. What should I know?"
response = handle_user_query(user_query)
print(response)
