In [None]:
!pip install llama-index pinecone-client datasets llama-index-vector-stores-pinecone

In [None]:
from datasets import load_dataset
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.evaluation import RelevancyEvaluator
from pinecone import Pinecone, PodSpec
import re, os, nest_asyncio
from time import sleep

In [None]:
# Credentials are read from the environment. setdefault() only writes the
# placeholder when the variable is missing, so a real key that was already
# exported before the kernel started is never overwritten by '<YOUR_API_KEY>'.
os.environ.setdefault('PINECONE_API_KEY', '<YOUR_API_KEY>')
os.environ.setdefault('OPENAI_API_KEY', '<YOUR_API_KEY>')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
# Load the practice dataset (first 100 rows of the train split) and convert it
# into a pandas DataFrame.
dataset = load_dataset("lcw99/wikipedia-korean-20221001", split='train[:100]')
# Keep only the id/text/title columns and drop rows whose text is duplicated,
# retaining the first occurrence.
data = (
    dataset
    .to_pandas()[['id', 'text', 'title']]
    .drop_duplicates(subset='text', keep='first')
)

In [None]:
# String preprocessing
def clean_up_text(content: str) -> str:
    """Clean raw text with four regex substitutions applied in a fixed order.

    1. Re-join a word that was split by a hyphen followed by a line break.
    2. Delete literal backslash-n sequences, em-dash runs, literal
       backslash-u escapes and the U+F075 / U+F0B7 glyphs.
    3. Remove whitespace around a hyphen that sits between two word characters.
    4. Collapse every run of whitespace into a single space.

    Args:
        content: The text to clean.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs; the order matters because later rules see
    # the output of earlier ones.
    substitutions = (
        (r'(\w+)-\n(\w+)', r'\1\2'),
        (r'\\n|  —|——————————|—————————|—————|\\u[\dA-Fa-f]{4}|\uf075|\uf0b7', ""),
        (r'(\w)\s*-\s*(\w)', r'\1-\2'),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        content = re.sub(pattern, replacement, content)
    return content

# Turn every DataFrame row into a Document, cleaning the text along the way.
def _to_document(row):
    """Build a Document from one row that has 'id', 'text' and 'title' fields."""
    return Document(
        text=clean_up_text(row['text']),
        doc_id=row['id'],
        extra_info={'title': row['title']},
    )


docs = [_to_document(row) for _, row in data.iterrows()]

In [None]:
# Pinecone index settings
index_name = 'quickstart'
dimension = 1536  # vector dimension requested for the index
metric = 'dotproduct'
spec = PodSpec('gcp-starter')

# Pinecone client, authenticated with the key taken from the environment
api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)


In [None]:
# Create the Pinecone index. An existing index with the same name is deleted
# first, so this cell always starts from a fresh index.
existing_names = [index_info["name"] for index_info in pc.list_indexes()]
if index_name in existing_names:
    pc.delete_index(index_name)
pc.create_index(index_name, dimension=dimension, metric=metric, spec=spec)

# Poll once per second until the index reports itself as ready.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    sleep(1)

index = pc.Index(index_name)
sleep(1)
print(index.describe_index_stats())



In [None]:
# Define the ingestion pipeline: its transformations are the node parser
# followed by the embedding model, and the Pinecone-backed vector store is
# passed as the pipeline's vector_store.
vector_store = PineconeVectorStore(pinecone_index=index)
embed_model = OpenAIEmbedding(
    api_key=os.getenv('OPENAI_API_KEY'),
    model='text-embedding-ada-002',
)
pipeline = IngestionPipeline(
    transformations=[SimpleNodeParser(), embed_model],
    vector_store=vector_store,
)

# Run the pipeline over the prepared documents
pipeline.run(documents=docs)

In [None]:

# Query stage: wrap the already-populated vector store in an index and build
# a retriever on top of it.
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
# The keyword must be spelled `similarity_top_k`; the earlier misspelling
# (`similarty_top_k`) is not a parameter the retriever reads, so the requested
# top-k of 5 never took effect.
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)

In [None]:
# Simple retrieval test
# retrieve() needs the query as its argument; calling it with no argument
# raises a TypeError. Replace the placeholder with a real question.
retrieval_query = '<YOUR_QUERY>'
answer = retriever.retrieve(retrieval_query)
print([i.get_content() for i in answer])

In [None]:
from llama_index.core import ServiceContext
# Build the RAG query engine.
# from_args is a classmethod, so it is called on the class directly instead of
# on a throwaway instance. No `llm` is defined in any visible cell of this
# notebook, so the argument is omitted and from_args falls back to the
# library's default LLM configuration.
query_engine = RetrieverQueryEngine.from_args(retriever=retriever)
# Allow nested event loops inside the notebook kernel before querying.
nest_asyncio.apply()

query = ''  # TODO: fill in the question to ask
llm_response = query_engine.query(query)
llm_response.response

In [None]:
# A question about specific figures.
# `query` is left as an empty placeholder here; the response text is shown as
# the cell output.
query = ''
llm_response = query_engine.query(query)
llm_response.response

In [None]:
# A question whose answer is not stated explicitly, but can be answered with
# first-order reasoning.
# `query` is left as an empty placeholder here; the response text is shown as
# the cell output.
query = ''
llm_response = query_engine.query(query)
llm_response.response

In [None]:
# What happens when the question cannot be answered?
# `query` is left as an empty placeholder here; the response text is shown as
# the cell output.
query = ''
llm_response = query_engine.query(query)
llm_response.response

In [None]:
# Collect the content of every source node attached to the latest response
# and show the list as the cell output.
llm_response_source_nodes = [node.get_content() for node in llm_response.source_nodes]
llm_response_source_nodes

In [None]:
# Run the relevancy evaluator on the latest query/response pair.
evaluator = RelevancyEvaluator()
eval_result = evaluator.evaluate_response(response=llm_response, query=query)

# Gather the source-node contents again so this cell does not depend on the
# listing cell having been run, then print the verdict with the contexts.
llm_response_source_nodes = [chunk.get_content() for chunk in llm_response.source_nodes]
report = f'\nGiven the {len(llm_response_source_nodes)} chunks of content (below), is your LLM\'s response relevant? {eval_result.passing}\n \
        \n ----Contexts----- \n \
        \n{llm_response_source_nodes}'
print(report)