In [None]:
# %env LANGCHAIN_TRACING_V2="true"
# %env LANGCHAIN_API_KEY=YOUR_API_KEY

In [None]:
# get openai api key from openai_config.json file
import json
with open('openai_config.json') as f:
    openai_api_key = json.load(f)['openai_api_key']

In [None]:
%env OPENAI_API_KEY = {openai_api_key}

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
# Check if the API KEY is set
llm.invoke("Hello, world!")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
from langchain_community.vectorstores import FAISS
from langchain.document_loaders import CSVLoader

# Loader for the NAICS dataset
loader = CSVLoader(r"..\guidance_for_environmental_impact_factor_mapping_on_aws\assets\datasets\2022_NAICS_Index_File.csv")

In [2]:
documents = loader.load()

In [None]:
# Check if the documents are loaded
documents[0]

Document(metadata={'source': '..\\guidance_for_environmental_impact_factor_mapping_on_aws\\assets\\datasets\\2022_NAICS_Index_File.csv', 'row': 0}, page_content='\ufeffNAICS22: 111110\nINDEX ITEM DESCRIPTION: Soybean farming, field and seed production')

In [None]:
# 첫 번째 실행에서만!!!
from langchain_community.vectorstores import FAISS

# Make FAISS vectorstore
vector_store = FAISS.from_documents(documents, embeddings)

# save db to local
vector_store.save_local('./db/faiss')

In [None]:
# Load the DB from local
# 두 번째 실행부터는 위 셀 실행하지 않고 이 셀만 실행
# vector_store = FAISS.load_local(
#     folder_path="./db/faiss",
#     index_name="index",
#     embeddings=embeddings,
#     allow_dangerous_deserialization=True,
# )

In [None]:
%%capture --no-stderr
%pip install langgraph

In [None]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

In [None]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")

In [None]:
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [None]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
from IPython.display import Image, display

display(Image(graph.get_graph().draw_mermaid_png()))

In [None]:
result = graph.invoke({"question": "What is a possible NAICS titles for the given activity: GLOVES WORK MECHANIC SYNTHETIC LEATHER SZ LARGE"})

print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')