In [None]:
!pip install langchain_huggingface langchain_ollama langchain langchain_community
!pip install openai
!pip install pinecone
!curl curl -fsSL https://ollama.com/install.sh | sh
!pip install -qU langchain-ollama

In [None]:
import subprocess
import time
import urllib.error
import urllib.request

OLLAMA_BASE_URL = "http://localhost:11434"
OLLAMA_STARTUP_TIMEOUT_S = 60

# Launch the Ollama server in the background and keep the handle so the
# process can be inspected or terminated later from the notebook.
ollama_server = subprocess.Popen(["ollama", "serve"])

# Poll the server instead of sleeping for a fixed time: a fixed delay is either
# too short on a slow machine (the next cells then fail) or wasted time.
startup_deadline = time.monotonic() + OLLAMA_STARTUP_TIMEOUT_S
while True:
    try:
        with urllib.request.urlopen(OLLAMA_BASE_URL, timeout=2):
            break
    except urllib.error.HTTPError:
        # Any HTTP status code means something is answering on the port.
        break
    except OSError:
        # Connection refused / timed out: the server is not accepting requests yet.
        if ollama_server.poll() is not None:
            print("Warning: `ollama serve` exited and no server is reachable at " + OLLAMA_BASE_URL)
            break
        if time.monotonic() >= startup_deadline:
            print("Warning: Ollama did not become reachable within the startup timeout.")
            break
        time.sleep(0.5)

In [None]:
# Pull the Q4_K_M GGUF build of EEVE-Korean-Instruct-10.8B through Ollama.
# generate_response() refers to this exact model tag.
!ollama pull hf.co/heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF:Q4_K_M

In [None]:
import sys
# Put the Kaggle input directory on the import path at position 1 (right after
# the first entry); presumably this is where `fastapi_with_reranker` lives —
# TODO confirm.
sys.path.insert(1, "/kaggle/input/rag-code")

In [None]:
from fastapi_with_reranker import *

In [None]:
import json
from tqdm import tqdm
from typing import Optional
from langchain_core.language_models import BaseChatModel
from langchain_core.vectorstores import VectorStore
import datasets
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline

import os

# Step 0: Set up Pinecone index and embedding model
# Credentials are taken from the environment when the variables are set, so
# real keys never need to be written into the notebook. The original
# placeholder strings are kept as fallbacks to preserve previous behaviour.
pinecone_api_key = os.environ.get("PINECONE_API_KEY", 'PINCONE_API_KEY')
hf_api_token = os.environ.get("HUGGINGFACE_API_TOKEN", "HUGGINGFACE_API_TOKEN")
pinecone_namespace = 'web'

# Initialize Pinecone client and open the index queried by retrieve()
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index('iskku-data3')


# Query embeddings are computed remotely through the Hugging Face endpoint.
model_name = "intfloat/multilingual-e5-large-instruct"
hf_embeddings = HuggingFaceEndpointEmbeddings(
    model=model_name,
    task="feature-extraction",
    huggingfacehub_api_token=hf_api_token,
)

# Step 1: Define the reranker model (tokenizer + sequence-classification head)
tokenizer = AutoTokenizer.from_pretrained("Dongjin-kr/ko-reranker")
reranker_model = AutoModelForSequenceClassification.from_pretrained("Dongjin-kr/ko-reranker")

# Step 2: Retriever function to extract top_k matches from Pinecone
def retrieve(query, top_k=5):
    """Embed `query` and return the `top_k` nearest matches from the Pinecone index.

    The search is restricted to `pinecone_namespace` and asks Pinecone to
    include each match's metadata in the response.
    """
    query_vector = hf_embeddings.embed_query(query)
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        namespace=pinecone_namespace,
        include_metadata=True,
    )
    return response['matches']

# Step 3: Reranker function to reorder retrieved contexts
def rerank(query, matches):
    """Score every match against `query` with the reranker and sort best-first.

    Args:
        query: The user question.
        matches: Retrieved matches; each must expose ``match['metadata']['text']``.

    Returns:
        A new list containing the same match objects, ordered by descending
        reranker score. Each match's ``'score'`` entry is overwritten in place
        with the reranker logit. An empty input yields an empty list.
    """
    # Nothing to score; the tokenizer cannot be called on an empty batch.
    if not matches:
        return []

    pairs = [(query, match['metadata']['text']) for match in matches]

    # Tokenize and create input tensors
    tokenized_inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = reranker_model(**tokenized_inputs).logits

    # Flatten to one score per pair. A bare squeeze() would collapse a
    # single-match batch to a 0-dim tensor that cannot be indexed.
    scores = logits.reshape(-1).tolist()

    # Attach scores to matches and sort
    for match, score in zip(matches, scores):
        match['score'] = score

    return sorted(matches, key=lambda m: m['score'], reverse=True)


# Step 4: Define the generator component using ChatOllama
def generate_response(query, context):
    """Generate an answer to `query` grounded in `context` with the local Ollama model.

    Args:
        query: The user question.
        context: Retrieved passages joined into a single string.

    Returns:
        The text content of the model's reply. The reply is also printed.
    """
    # `base_url` is the ChatOllama parameter that selects the server address.
    llm = ChatOllama(
        model='hf.co/heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF:Q4_K_M',
        temperature=0.1,
        base_url='http://localhost:11434',
    )

    # Define the prompt
    prompt = ChatPromptTemplate.from_template('''You are an AI assistant specialized in providing accurate and detailed information. 
    Based on the given context about SungKyunKwan University (성균관대학교), answer the following question thoroughly and concisely. 
    Respond in the same language as the user.

Context: {context}
Question: {query}
Answer:''')

    chain = prompt | llm
    # Pass the actual arguments. The original passed the literal strings
    # "query" and "context", so the model never saw the question or passages.
    answer = chain.invoke(
        {
            "query": query,
            "context": context,
        }
    )
    print(answer.content)
    return answer.content

# Step 5: RAG pipeline combining retriever and generator
def rag_pipeline(query):
    """Answer `query` using retrieval followed by generation (no reranking).

    Returns:
        A tuple ``(response, matches)``: the generated answer text and the
        matches returned by `retrieve`.
    """
    matches = retrieve(query)

    # Every retrieved passage goes into the context, one per line.
    passages = [match['metadata']['text'] for match in matches]
    retrieved_context = "\n".join(passages)

    response = generate_response(query, retrieved_context)
    return response, matches

In [None]:
def rag_pipeline_reranker(query):
    """Answer `query` using retrieval, reranking, and generation.

    Only the three best reranked passages are used as context.

    Returns:
        A tuple ``(response, reranked_matches)``: the generated answer text and
        the full reranked match list (not only the three used as context).
    """
    candidates = retrieve(query)
    reranked_matches = rerank(query, candidates)

    # Build the context from the three highest-scoring passages.
    top_passages = [match['metadata']['text'] for match in reranked_matches[:3]]
    retrieved_context = "\n".join(top_passages)

    response = generate_response(query, retrieved_context)
    return response, reranked_matches

In [None]:
from langchain_core.language_models import BaseChatModel


def run_rag_tests(
    eval_dataset: datasets.Dataset,
    output_file: str,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Args:
        eval_dataset: Examples providing "question", "answer" and "content" fields.
        output_file: Path of the JSON file to write. It is rewritten after every
            example so partial results survive an interruption.
        verbose: When truthy, print the question, generated answer and true answer.
        test_settings: Optional label stored with every result.

    Previous contents of `output_file` are not reloaded; each run starts from an
    empty result list. A question that occurs more than once in the dataset is
    answered only the first time.
    """
    outputs = []
    # Set membership keeps the duplicate check O(1) per example.
    seen_questions = set()

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in seen_questions:
            continue
        seen_questions.add(question)

        answer, relevant_docs = rag_pipeline(question)

        # Convert `ScoredVector` objects to serializable dictionaries
        retrieved_docs = [
            {
                "id": doc.id,  # Document ID
                "score": doc.score,  # Similarity score
                "metadata": doc.metadata,  # Metadata from Pinecone
            }
            for doc in relevant_docs
        ]

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "context": example["content"],
            "generated_answer": answer,
            "retrieved_docs": retrieved_docs,
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)

        # ensure_ascii=False writes non-ASCII text as-is, so pin the encoding
        # instead of depending on the platform default.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(outputs, f, indent=4, ensure_ascii=False)

In [None]:
# Prompt for the judge model. It is used as a template whose placeholders
# {instruction}, {response} and {reference_answer} are filled in by
# evaluate_answers(); the doubled braces {{...}} are escaped literal braces.
# evaluate_answers() splits the model output on the "[RESULT]" marker
# requested here.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

# Import the prompt and message classes from langchain_core, which this
# notebook already uses elsewhere, instead of the legacy
# `langchain.prompts.chat` / `langchain.schema` re-export paths.
from langchain_core.messages import SystemMessage
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)


# Two-message chat prompt: a fixed system instruction followed by the
# evaluation template, whose placeholders are filled per example.
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [None]:
# Judge ("evaluator") model setup.
# NOTE(review): `langchain.chat_models` looks like a legacy import location for
# ChatOpenAI — confirm it still resolves with the langchain version installed
# by the first cell.
from langchain.chat_models import ChatOpenAI

# The OpenAI key is fetched from the Kaggle secret labelled "Open_AI".
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
OPENAI_API_KEY = user_secrets.get_secret("Open_AI")

# The judge is created with temperature=0.
eval_chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=OPENAI_API_KEY)
# Label used by evaluate_answers() to build the eval_score_* / eval_feedback_* keys.
evaluator_name = "GPT4o-mini"


def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing.

    Args:
        answer_path: JSON file holding a list of result dicts with "question",
            "generated_answer" and "true_answer" entries.
        eval_chat_model: Chat model used as judge; must provide ``invoke(messages)``
            returning an object with a ``content`` string.
        evaluator_name: Label used to build the ``eval_score_<name>`` and
            ``eval_feedback_<name>`` keys.
        evaluation_prompt_template: Template accepting ``instruction``,
            ``response`` and ``reference_answer``.

    Entries that already carry a score for this evaluator are skipped, so an
    interrupted run can be resumed. A judge reply without the "[RESULT]" marker
    is reported and left unscored rather than aborting the whole run.
    """
    # Load the generated answers. The original started from an empty list, so
    # the loop below never ran and nothing was ever evaluated.
    try:
        with open(answer_path, "r", encoding="utf-8") as f:
            answers = json.load(f)
    except FileNotFoundError:
        print(f"No answer file found at {answer_path}; nothing to evaluate.")
        return

    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    for experiment in tqdm(answers):
        if score_key in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)

        # Split on the last marker: text before it is the feedback, text after
        # it is the score. rpartition never raises, unlike unpacking split().
        feedback, marker, score = eval_result.content.rpartition("[RESULT]")
        if not marker:
            print(f"Judge output has no [RESULT] marker; skipping: {experiment['question']}")
            continue

        experiment[score_key] = score.strip()
        experiment[feedback_key] = feedback.strip()

        # Checkpoint after every evaluation, in the same format used when the
        # answers were generated (indented, non-ASCII text kept readable).
        with open(answer_path, "w", encoding="utf-8") as f:
            json.dump(answers, f, indent=4, ensure_ascii=False)

In [None]:
import os
from datasets import load_dataset

# Create the output directory; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
os.makedirs("./output", exist_ok=True)

GENERATOR_NAME = "EEVE-Korean-Instruct-10.8B-v1.0-GGUF:Q4_K_M"

# Load the dataset from a local JSON file. A single JSON file is exposed as
# the "train" split; requesting it directly returns the Dataset itself, so
# `eval_dataset` is never bound to the intermediate DatasetDict.
eval_dataset = load_dataset(
    "json",
    data_files="/kaggle/input/eval-data/eval_dataset_final.json",
    split="train",
)

# The settings label is stored with every result and also names the output file.
settings_name = f"chunk_embeddings_rerank:False_reader-model:{GENERATOR_NAME}"
output_file_name = f"./output/rag_{settings_name}.json"

print(f"Running evaluation for {settings_name}:")

print("Loading knowledge base embeddings...")

# Generate answers for every evaluation question (no reranking).
print("Running RAG...")
run_rag_tests(
    eval_dataset=eval_dataset,
    output_file=output_file_name,
    verbose=False,
    test_settings=settings_name,
)

# Grade the generated answers with the judge model.
print("Running evaluation...")
evaluate_answers(
    output_file_name,
    eval_chat_model,
    evaluator_name,
    evaluation_prompt_template,
)