### Part 3. LangChain - VectorDB를 이용한 간단한 RAG 구현
- Objectives: LangChain + 파인콘(Pinecone)으로 간단한 위키 문서를 upsert하고, 이를 기반으로 naive RAG를 구현

In [None]:
!pip install langchain pinecone-client datasets openai tiktoken

In [None]:
import os

import time
# Register the API credentials as environment variables (replace the
# placeholders with real keys), then read them back for the cells below.
os.environ.update({
    'PINECONE_API_KEY': '<YOUR_API_KEY>',
    'OPENAI_API_KEY': '<YOUR_API_KEY>',
})
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [None]:
# Dataset viewer: https://huggingface.co/datasets/lcw99/wikipedia-korean-20221001/viewer/default/train
# Load the data used in this notebook; only the slice 'train[:100]' of the
# train split is requested.
from datasets import load_dataset
dataset = load_dataset("lcw99/wikipedia-korean-20221001", split='train[:100]')

In [None]:
# Peek at the first record; later cells read the 'id', 'url', 'title' and
# 'text' fields of each record.
dataset[0]

In [None]:
# Define how documents are chunked, using LangChain's text splitter.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) — confirm.
# The separators are listed from coarsest (blank line) to finest (empty
# string); presumably they are tried in that order — confirm.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    separators=["\n\n", "\n", " ", ""]
)

In [None]:
# Check how the chunking defined above cuts up the first document by showing
# its first five chunks.
text_splitter.split_text(dataset[0]['text'])[:5]

In [None]:
# Every chunk has to be embedded, so set up the OpenAI embedding model here.
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

# `openai_api_key` is the value read from the environment in the setup cell.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=openai_api_key
)

In [None]:

from pinecone import Pinecone, PodSpec

# Read the Pinecone key from the environment and create the client object
# (`pc`) that the following cells use to manage the index.
api_key = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=api_key)

In [None]:
from time import sleep

# Settings for the Pinecone index that will hold the chunk embeddings.
# NOTE(review): `dimension` presumably has to equal the length of the vectors
# returned by `embed` — confirm against the embedding model.
index_name = 'quickstart'
dimension = 1536
metric = 'dotproduct'
spec = PodSpec('gcp-starter')

# Start from a clean slate: drop an index of the same name before creating it.
if any(info["name"] == index_name for info in pc.list_indexes()):
    pc.delete_index(index_name)

pc.create_index(index_name, dimension=dimension, metric=metric, spec=spec)

# Poll once per second until the new index is reported as ready.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    sleep(1)

# Connect to the index, pause briefly, then print its statistics.
index = pc.Index(index_name)
sleep(1)
index_stats = index.describe_index_stats()
print(index_stats)

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

def process_and_upload_records(dataset, batch_limit=100):
    """Chunk every record of *dataset* and upload the chunks in batches.

    Each record's ``'text'`` is split with the module-level ``text_splitter``.
    Every resulting chunk is paired with a metadata dict holding the chunk
    index (``'chunk'``), the chunk text (``'text'``) and the record's
    ``'id'`` (as a string), ``'source'`` (taken from ``record['url']``) and
    ``'title'``. Full batches are handed to ``upload_data``.

    Args:
        dataset: Iterable of records, each a mapping with the keys ``'id'``,
            ``'url'``, ``'title'`` and ``'text'``.
        batch_limit: Maximum number of chunks passed to ``upload_data`` in a
            single call. Values below 1 behave like 1.
    """
    texts = []
    metadatas = []

    for record in tqdm(dataset):
        # Fields shared by every chunk of this record.
        metadata = {
            'id': str(record['id']),
            'source': record['url'],
            'title': record['title']
        }

        for j, text in enumerate(text_splitter.split_text(record['text'])):
            texts.append(text)
            metadatas.append({"chunk": j, "text": text, **metadata})

            # Flush as soon as the batch is full, so that a single long
            # document can never push a request past `batch_limit` chunks.
            if len(texts) >= batch_limit:
                upload_data(texts, metadatas)
                texts, metadatas = [], []

    # Upload whatever is left over after the last full batch.
    if texts:
        upload_data(texts, metadatas)

def upload_data(texts, metadatas):
    """Embed *texts* and upsert them, with their metadata, into the index.

    A fresh random UUID is generated as the id of every vector. The
    embeddings come from the module-level ``embed`` object and the vectors
    are written through the module-level ``index``.

    Args:
        texts: Sequence of chunk strings to embed.
        metadatas: Sequence of metadata dicts, one per entry of *texts*.

    Raises:
        ValueError: If *texts* and *metadatas* differ in length (``zip``
            would otherwise silently drop the unmatched entries).
    """
    if len(texts) != len(metadatas):
        raise ValueError(
            f"texts and metadatas must have the same length "
            f"(got {len(texts)} and {len(metadatas)})"
        )
    # Nothing to do: avoid sending an empty request to either service.
    if not texts:
        return

    ids = [str(uuid4()) for _ in texts]
    embeds = embed.embed_documents(texts)
    # Materialize the tuples: a list can be measured and re-iterated by the
    # client, whereas a bare zip object is a one-shot iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))

# Chunk, embed and upsert the whole loaded dataset with the default batch size.
process_and_upload_records(dataset)


In [None]:
# Verify that the upload succeeded by inspecting the index statistics.
index.describe_index_stats()

In [None]:
# Now that the source documents have been upserted through the Pinecone API,
# wrap the index in a LangChain vector-store object so LangChain can use it.
# The class is imported under an alias because the client cell already binds
# the name `Pinecone` (from the `pinecone` package); importing the LangChain
# class under the same name would replace the client class for every cell
# executed afterwards.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# Metadata key under which each chunk's raw text was stored at upsert time.
text_field = "text"

vectorstore = PineconeVectorStore(
    index, embed.embed_query, text_field
)

In [None]:
# Plain retrieval: ask the vector store for the k=3 entries most similar to
# the query.
query = "지미 카터가 누구야?"  # "Who is Jimmy Carter?"

vectorstore.similarity_search(
    query,
    k=3
)

In [None]:
# Assemble the full RAG pipeline: a chat LLM plus a retrieval QA chain that
# uses the vector store as its retriever.
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-3.5-turbo',
    temperature=0
)

# NOTE(review): chain_type="stuff" presumably places all retrieved chunks
# into a single prompt — confirm in the LangChain documentation.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [None]:
# User question -> similarity search -> GPT-generated answer.
qa.run(query)

In [None]:
# Build a chain that also reports its sources, as evidence for readers who
# do not trust the bare answer. It reuses the same LLM and retriever setup.
from langchain.chains import RetrievalQAWithSourcesChain

qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [None]:
# Try out the source-reporting RAG chain on the same query.
qa_with_sources(query)