In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from bs4 import BeautifulSoup as Soup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langsmith.evaluation._runner import evaluate
from langsmith.evaluation.integrations import LangChainStringEvaluator
from langchain.evaluation.qa.eval_chain import CotQAEvalChain


from langchain import hub
from langchain import prompts
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI

In [2]:
encode_kwargs = {"normalize_embeddings": True}
embedding_function = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={"device": "cpu"},
    encode_kwargs=encode_kwargs,
)

db_faiss = FAISS.load_local("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/faiss_anime_index_v3", embeddings = embedding_function)
retriever = db_faiss.as_retriever(search_kwargs={"k": 50})

In [3]:
import yaml

# Load API key from config.yaml
with open("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/config.yaml", "r") as file:
    config = yaml.safe_load(file)

api_key = config["api_key"]
import google.generativeai as genai
userdata = {"GOOGLE_API_KEY": api_key}
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [8]:
#### RETRIEVAL and GENERATION ####


# Generation
generator = prompts.ChatPromptTemplate.from_messages(
    [
        ("system", "Respond using the following documents as context:\n{documents}"),
        ("user", "{question}"),
    ]
) | ChatGoogleGenerativeAI(model="gemini-pro", temperature=0,google_api_key = GOOGLE_API_KEY,convert_system_message_to_human=True)


# Chain
rag_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | generator | StrOutputParser()

# Question
rag_chain.invoke("Who is fririen?")

In [7]:
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0,google_api_key = GOOGLE_API_KEY,convert_system_message_to_human=True)

In [8]:
# QA
inputs = [
    "Who is Fririen",
    "What are good pirate anime?",
    "What are some good kid anime?"
]

outputs = [
    "Use RunnablePassthrough. from langchain_core.runnables import RunnableParallel, RunnablePassthrough; from langchain_core.prompts import ChatPromptTemplate; from langchain_openai import ChatOpenAI; prompt = ChatPromptTemplate.from_template('Tell a joke about: {input}'); model = ChatOpenAI(); runnable = ({'input' : RunnablePassthrough()} | prompt | model); runnable.invoke('flowers')",
    "Use StrOutputParser. from langchain_openai import ChatOpenAI; from langchain_core.prompts import ChatPromptTemplate; from langchain_core.output_parsers import StrOutputParser; prompt = ChatPromptTemplate.from_template('Tell me a short joke about {topic}'); model = ChatOpenAI(model='gpt-3.5-turbo') #gpt-4 or other LLMs can be used here; output_parser = StrOutputParser(); chain = prompt | model | output_parser",
    "Use RunnableLambda with itemgetter to extract the relevant key. from operator import itemgetter; from langchain_core.prompts import ChatPromptTemplate; from langchain_core.runnables import RunnableLambda; from langchain_openai import ChatOpenAI; def length_function(text): return len(text); chain = ({'prompt_input': itemgetter('foo') | RunnableLambda(length_function),} | prompt | model); chain.invoke({'foo':'hello world'})"
]

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]

In [9]:
from langsmith import Client
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true' # enables tracing 
os.environ['LANGCHAIN_API_KEY'] = 

In [18]:
# Create dataset
client = Client()
dataset_name = "RAG_ANIME"

# Store
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="QA pairs about ANIME.",
)
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

In [17]:
# Evaluators
qa_evalulator = [LangChainStringEvaluator("cot_qa", config={"llm":ChatGoogleGenerativeAI(model="gemini-pro", 
                                                                                         temperature=0,google_api_key = GOOGLE_API_KEY,convert_system_message_to_human=True)})]
dataset_name = "RAG_ANIME"

def predict(example: dict):
    return {"answer": rag_chain.invoke(example["question"])}
    
experiment_results = evaluate(
    predict,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="rag-qa-oai",
    # Any experiment metadata can be specified here
    metadata={
      "variant": "ANIME context, gemini",
    },
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

View the evaluation results for experiment: 'rag-qa-oai-e84562f5' at:
https://smith.langchain.com/o/ee2bfa72-1f76-5322-9029-9ae994e50470/datasets/ee970bc4-f988-4209-8f64-7a2d348d0771/compare?selectedSessions=aed5058f-6bb3-4ade-b82e-8661c9b4fb51




0it [00:00, ?it/s]

Connection error caused failure to POST https://api.smith.langchain.com/runs  in LangSmith API. Please confirm your internet connection.. ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='api.smith.langchain.com', port=443): Max retries exceeded with url: /runs (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fbcdeaf5fc0>, 'Connection to api.smith.langchain.com timed out. (connect timeout=10.0)'))"))
Error in LangChainTracer.on_chain_start callback: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs  in LangSmith API. Please confirm your internet connection.. ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fbcdeaf5fc0>, \'Connection to api.smith.langchain.com timed out. (connect timeout=10.0)\'))"))')
