In [None]:
from uuid import NAMESPACE_URL, uuid4, uuid5

import chromadb
import nltk
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores.utils import DistanceStrategy

### 문서 로드 & 전처리

In [None]:
# Fetch the NLTK "punkt" and "averaged_perceptron_tagger" resources.
# NOTE(review): presumably these are needed by the unstructured-based URL
# loader used below — confirm against the installed versions.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

In [None]:
# Documentation release to index; every page URL below is built from it.
version = "v4.49.0"

# Root of the Korean ("ko") Transformers docs for that release.
docs_root = f"https://huggingface.co/docs/transformers/{version}/ko"

# Pages to index, as paths relative to the docs root.
page_paths = (
    "pipeline_tutorial",
    "autoclass_tutorial",
    "preprocessing",
    "training",
    "run_scripts",
    "tokenizer_summary",
    "attention",
    "pad_truncation",
    "pipeline_webserver",
    "tasks_explained",
    "hpo_train",
    "tasks/sequence_classification",
    "tasks/token_classification",
    "tasks/question_answering",
    "tasks/language_modeling",
    "tasks/masked_language_modeling",
    "tasks/translation",
    "tasks/summarization",
)

# Full URL of each page, in the same order as `page_paths`.
urls = [f"{docs_root}/{page_path}" for page_path in page_paths]
# Build a loader over the URL list and load the pages into `docs`.
loader = UnstructuredURLLoader(urls=urls)
docs = loader.load()

In [None]:
# Peek at the first loaded page, broken into pieces at each "→" character.
first_page_text = docs[0].page_content
first_page_text.split("→")

In [None]:
# The tutorial makes use of each page's table of contents (ToC), but
# UnstructuredURLLoader does not bring the ToC along, so it is not used here.

# Keep only the text after the last "to get started" marker and before the
# first "< > Update on GitHub" marker. A marker that is absent leaves that
# side of the text untouched.
for doc in docs:
    after_header = doc.page_content.rpartition("to get started\n\n")[2]
    doc.page_content = after_header.partition("< > Update on GitHub\n\n")[0]

In [None]:
# Embedding model shared by the vector store and the query embedding below.
# NOTE(review): the indexed pages and the query are Korean; presumably this
# model is tuned mainly for English — confirm it is the intended choice.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"

# Ask the encoder to normalize the embeddings it produces.
encode_options = {"normalize_embeddings": True}

embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    encode_kwargs=encode_options,
)

In [None]:
# Name of the Chroma collection that holds the documentation pages.
collection_name = "collection_huggingface_transformer"

# Store settings: which collection to use, how to embed text, and the
# directory passed as the persistence location.
chroma_settings = {
    "collection_name": collection_name,
    "embedding_function": embedding_model,
    "persist_directory": "./chroma.db",
}
vector_store = Chroma(**chroma_settings)

In [None]:
# Give each document a stable ID derived from its source URL (uuid5 is
# deterministic for a given namespace and name). With random uuid4 IDs, every
# re-run of this cell against the persisted store added another full copy of
# all documents; with stable IDs a re-run writes to the same IDs instead.
# Falls back to the page text when a document carries no "source" metadata.
uuids = [
    str(uuid5(NAMESPACE_URL, doc.metadata.get("source") or doc.page_content))
    for doc in docs
]
vector_store.add_documents(documents=docs, ids=uuids)

In [None]:
# Show a small sample of what was stored rather than the full text of every
# page, which would flood the cell output.
vector_store.get(limit=2)

In [None]:
# Dropping the collection is a manual maintenance step, not part of the
# pipeline: once it runs, the similarity search further down has no collection
# to query, so a top-to-bottom run of the notebook cannot succeed.
# Set the flag to True only when the persisted collection should be wiped.
RESET_COLLECTION = False
if RESET_COLLECTION:
    vector_store.delete_collection()

In [None]:
# Sample question (Korean, roughly "What is the attention mechanism?").
user_query = "어텐션 매커니즘이 무엇인가요?"
# Embed the question with the same model the vector store uses.
query_vector = embedding_model.embed_query(user_query)

In [None]:
# Length of the query embedding.
len(query_vector)

In [None]:
# Look up the question in the vector store, asking for two results (k=2).
result = vector_store.similarity_search(query=user_query, k=2)