In [None]:
%pip install langchain llama-cpp-python chromadb sentence-transformers

In [7]:
import pandas as pd
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_community.llms import LlamaCpp
#from langchain_community.embeddings import LlamaCppEmbeddings
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain.chains import LLMChain

df = pd.read_csv('./data/oscars.csv')
df = df.loc[df['year_ceremony'] == 2023]
df = df.dropna(subset=['film'])
df.loc[:, 'category'] = df['category'].str.lower()
df.loc[:, 'text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' to win the award'
df.loc[df['winner'] == False, 'text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' but did not win'               

client = chromadb.Client()
collection = client.get_or_create_collection("oscars-2023")

docs = df["text"].tolist() 

vector_db = Chroma(
    "langchain_store",
    collection
)

# Load the LlamaCpp language model, adjust GPU usage based on your hardware
llm = LlamaCpp(
    model_path="../../../.cache/lm-studio/models/monal04/llama-2-7b-chat.Q4_0.gguf-GGML/llama-2-7b-chat.Q4_0.gguf",
    n_gpu_layers=40,
    n_batch=512,  # Batch size for model processing
    #verbose=False,  # Enable detailed logging for debugging
)

# Define the prompt template with a placeholder for the question
system_template_str = """You are a helpful AI assistant that can answer questions on Oscar 2023 awards. Answer based on the context provided. If you cannot find the correct answerm, say I don't know. Be concise and just include the response.

{context}
"""

system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context"],
        template=system_template_str,
    )
)

human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["question"],
        template="{question}",
    )
)
messages = [system_prompt, human_prompt]

prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"],
    messages=messages,
)

# Create an LLMChain to manage interactions with the prompt and model
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

context = vector_db.as_retriever(k=3)

print("Chatbot initialized, ready to chat...")
while True:
    #context="I am healthy!"
    question = input("> ")
    context = context
    answer = llm_chain.invoke({"context": context, "question": question})
    print(answer, '\n')

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../../../.cache/lm-studio/models/monal04/llama-2-7b-chat.Q4_0.gguf-GGML/llama-2-7b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - 

Chatbot initialized, ready to chat...



llama_print_timings:        load time =   24881.55 ms
llama_print_timings:      sample time =       7.72 ms /    33 runs   (    0.23 ms per token,  4272.40 tokens per second)
llama_print_timings: prompt eval time =   24881.01 ms /   121 tokens (  205.63 ms per token,     4.86 tokens per second)
llama_print_timings:        eval time =    8794.03 ms /    32 runs   (  274.81 ms per token,     3.64 tokens per second)
llama_print_timings:       total time =   33813.75 ms /   153 tokens


{'context': VectorStoreRetriever(tags=['Chroma', 'Collection'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000014D8B2AFB10>), 'question': 'Did Lady Gaga win the oscars?', 'text': '\nAI Assistant: Yes, Lady Gaga won the Oscars in 2023 for her role in "A Star is Born." '} 



Llama.generate: prefix-match hit

llama_print_timings:        load time =   24881.55 ms
llama_print_timings:      sample time =       5.50 ms /    22 runs   (    0.25 ms per token,  3999.27 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    6170.73 ms /    22 runs   (  280.49 ms per token,     3.57 tokens per second)
llama_print_timings:       total time =    6261.41 ms /    23 tokens


{'context': VectorStoreRetriever(tags=['Chroma', 'Collection'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000014D8B2AFB10>), 'question': '', 'text': ' Can you tell me who won the Best Director award at the 2023 Oscars?'} 



Llama.generate: prefix-match hit


KeyboardInterrupt: 