<a href="https://colab.research.google.com/github/eugenie-kim012/LLMDS4/blob/main/D%2B64%2C_Text_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **문장을 숫자로 바꾸자, Text Embedding**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install langchain langchain-community langchain_openai langchain_text_splitters sentence_transformers pypdfium2

Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.25-py3-none-any.whl.metadata (2.3 kB)
Collecting pypdfium2
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchai

### **OpenAI의 텍스트 임베딩 모델 활용하기 (유료)**

In [3]:
import os
from langchain_openai import OpenAIEmbeddings
os.environ["OPENAI_API_KEY"] = "your API"
#유료
embeddings_model = OpenAIEmbeddings(model = 'text-embedding-3-small')
#임베딩 모델을 가져오는 것
embeddings = embeddings_model.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!"
    ]
)
len(embeddings), len(embeddings[0])

(5, 1536)

In [4]:
from langchain_community.document_loaders import PyPDFium2Loader
from langchain_text_splitters import RecursiveCharacterTextSplitter

#임베딩 모델 API 호출
embeddings_model = OpenAIEmbeddings(model = 'text-embedding-3-small')

#PDF 문서 로드
loader = PyPDFium2Loader("/content/drive/MyDrive/WB/2025JPO/Reading/Riding the Demographic Wave- Pensions and Retirement Income in an Aging World.pdf")
pages = loader.load()

#PDF 문서를 여러 청크로 분할
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

texts = text_splitter.split_documents(pages)

#OpenAI 임베딩 모델로 청크들을 임베딩 변환하기
embeddings = embeddings_model.embed_documents([i.page_content for i in texts])
len(embeddings), len(embeddings[0])



(348, 1536)

**[문장 유사도 계산해보기]**

In [5]:
examples= embeddings_model.embed_documents(
     [
        "안녕하세요",
        "제 이름은 홍두깨입니다.",
        "이름이 무엇인가요?",
        "랭체인은 유용합니다.",
     ]
 )

#예시 질문과 답변 임베딩
embedded_query_q = embeddings_model.embed_query("이 대화에서 언급된 이름은 무엇입니까?")
embedded_query_a = embeddings_model.embed_query("이 대화에서 언급된 이름은 홍길동입니다.")

In [9]:
from numpy import dot
from numpy.linalg import norm
import numpy as np

def cos_sim(A, B):
       return dot(A, B)/(norm(A)*norm(B))

print(cos_sim(embedded_query_q, embedded_query_a)) #벡터 유사도를 cos_sim으로 확인
print(cos_sim(embedded_query_a, examples [2])) #한글도 임베딩이 잘 되고 있는 것을 확인할 수 있었음
print(cos_sim(embedded_query_a, examples [3]))

0.6491181289078308
0.28953510215635
0.14843522304151544


### **오픈소스 임베딩 모델 활용하기**

**[jhgan/ko-sroberta-multitask 임베딩 모델 활용]**

In [10]:
# #Open-source 임베딩 모델 활용을 위한 sentence-transformer 라이브러리 설치
!pip install sentence-transformers



In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

#HuggingfaceEmbedding 함수로 Open source 임베딩 모델 로드 (오픈 소스)
model_name = "jhgan/ko-sroberta-multitask"
ko_embedding= HuggingFaceEmbeddings(
    model_name=model_name
)
#한글도 임베딩을 잘하고 있음
examples = ko_embedding.embed_documents(
     [
        "안녕하세요",
        "제 이름은 홍두깨입니다.",
        "이름이 무엇인가요?",
        "랭체인은 유용합니다.",
     ]
 )

embedded_query_q = ko_embedding.embed_query("이 대화에서 언급된 이름은 무엇입니까?")
embedded_query_a = ko_embedding.embed_query("이 대화에서 언급된 이름은 홍길동입니다.")

print(cos_sim(embedded_query_q, embedded_query_a))
print(cos_sim(embedded_query_q, examples[1]))
print(cos_sim(embedded_query_q, examples[3]))

  ko_embedding= HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.6070005852394463
0.2947341657162066
0.2757840706251745


**[BAAI/bge-small-en 임베딩 모델 활용 코드]**

In [12]:
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "BAAI/bge-small-en"
bge_embedding= HuggingFaceEmbeddings(
    model_name=model_name
)

examples = bge_embedding.embed_documents(
     [
        "안녕하세요",
        "제 이름은 홍두깨입니다.",
        "이름이 무엇인가요?",
        "랭체인은 유용합니다.",
     ]
 )

embedded_query_q = bge_embedding.embed_query("이 대화에서 언급된 이름은 무엇입니까?")
embedded_query_a = bge_embedding.embed_query("이 대화에서 언급된 이름은 홍길동입니다.")

print(cos_sim(embedded_query_q, embedded_query_a))
print(cos_sim(embedded_query_q, examples[1]))
print(cos_sim(embedded_query_q, examples[3]))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.955454149782215
0.9431682731233773
0.8853417113547791


In [20]:
# LangChain 커뮤니티 모듈에서 HuggingFace 기반 임베딩 클래스 불러오기
from langchain_community.embeddings import HuggingFaceEmbeddings

# 사용할 사전학습 임베딩 모델 이름 지정 ("BAAI/bge-small-en"은 영어용 sentence embedding 모델)
model_name = "BAAI/bge-small-en"

# HuggingFaceEmbeddings 객체 생성 (해당 모델을 기반으로)
bge_embedding = HuggingFaceEmbeddings(
    model_name=model_name
)

# 여러 개의 문장을 임베딩 (벡터화)하여 예제 문장들의 임베딩 리스트를 생성
examples = bge_embedding.embed_documents(
     [
        "Hello",
        "My name is Kim.",
        "What is your name?",
        "Learning datascience is quite useful.",
     ]
)

# 질의 문장을 벡터로 임베딩
embedded_query_q = bge_embedding.embed_query("What is the name of the person?")
# 그에 대한 잠재적 응답 문장도 벡터로 임베딩
embedded_query_a = bge_embedding.embed_query("His/Her name is Kim.")

# cosine similarity를 사용하여 두 문장 간의 의미적 유사도를 계산하고 출력
print(cos_sim(embedded_query_q, embedded_query_a))  # 질의와 응답 간 유사도

# 질의와 예시 문장 중 두 번째 문장 ("My name is Kim.") 간 유사도
print(cos_sim(embedded_query_q, examples[1]))

# 질의와 네 번째 문장 ("Learning datascience is quite useful.") 간 유사도
print(cos_sim(embedded_query_q, examples[2]))


0.8847590098227257
0.8334137841388627
0.9212577237761075
