In [None]:
import warnings
import os

# Point the Hugging Face cache root (HF_HOME) at a project-local directory
# and silence every Python warning for the rest of the session.
os.environ["HF_HOME"] = "./cache/"
warnings.filterwarnings(action="ignore")

In [None]:
# Sample corpus shared by the embedding examples below: a mix of Korean and
# English sentences. Long entries are wrapped with implicit string
# concatenation; the resulting strings are unchanged.
texts = [
    "안녕, 만나서 반가워.",
    (
        "LangChain simplifies the process of building applications "
        "with large language models"
    ),
    (
        "랭체인 한국어 튜토리얼은 LangChain의 공식 문서, cookbook 및 다양한 실용 예제를 바탕으로 하여 "
        "사용자가 LangChain을 더 쉽고 효과적으로 활용할 수 있도록 구성되어 있습니다. "
    ),
    "LangChain은 초거대 언어모델로 애플리케이션을 구축하는 과정을 단순화합니다.",
    (
        "Retrieval-Augmented Generation (RAG) is an effective technique "
        "for improving AI responses."
    ),
]

In [None]:
!pip install -qU langchain_huggingface huggingface_hub

In [None]:
import os

from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings

model_name = "intfloat/multilingual-e5-large-instruct"

# Take the Hugging Face API token from the environment rather than pasting it
# into the notebook, so the secret is never saved or shared with the file.
# Set HUGGINGFACEHUB_API_TOKEN before starting the kernel; when it is unset
# the empty string is passed, exactly as the placeholder did before.
hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")

# Client that requests embeddings for `model_name` using the
# "feature-extraction" task.
hf_embeddings = HuggingFaceEndpointEmbeddings(
    model=model_name,
    task="feature-extraction",
    huggingfacehub_api_token=hf_api_token,
)


In [None]:
# Embed every sample text with the HuggingFaceEndpointEmbeddings client
# created above; the %time line magic reports the CPU and wall time of the call.
%time embedded_documents = hf_embeddings.embed_documents(texts)

CPU times: user 13.1 ms, sys: 0 ns, total: 13.1 ms
Wall time: 133 ms


In [None]:
# Summary of the endpoint run: a header line, the model id, and the length
# of the first document vector, one item per line.
print(
    "[HuggingFace Endpoint Embedding]",
    f"Model: \t\t{model_name}",
    f"Dimension: \t{len(embedded_documents[0])}",
    sep="\n",
)

[HuggingFace Endpoint Embedding]
Model: 		intfloat/multilingual-e5-large-instruct
Dimension: 	1024


In [None]:
# Embed a single query string (embed_query, not embed_documents).
embedded_query = hf_embeddings.embed_query("LangChain 에 대해서 알려주세요.")

# Display only the vector length and its first few components; showing the
# whole vector floods the notebook output without adding information.
len(embedded_query), embedded_query[:5]

[0.009226548485457897,
 0.01639796793460846,
 0.0035652639344334602,
 -0.02523578330874443,
 0.02613539807498455,
 -0.01715373992919922,
 -0.024752875789999962,
 0.029846079647541046,
 0.03138943389058113,
 -0.03261665254831314,
 0.02063494734466076,
 0.013859652914106846,
 -0.025687560439109802,
 0.004425381310284138,
 -0.01833072304725647,
 -0.022040527313947678,
 -0.06823819130659103,
 0.006597291678190231,
 -0.02591727115213871,
 -0.009243537671864033,
 0.057264138013124466,
 0.011886654421687126,
 -0.015873637050390244,
 -0.020473109558224678,
 -0.014880276285111904,
 -0.00825903844088316,
 -0.02687610499560833,
 -0.046371039003133774,
 0.0009884691098704934,
 -0.03643620014190674,
 0.0056318906135857105,
 0.009133921936154366,
 -0.01921655237674713,
 -0.04842524230480194,
 -0.01693970523774624,
 0.02952422760426998,
 0.052462898194789886,
 0.0444953516125679,
 -0.02714822068810463,
 0.060879938304424286,
 -0.023127388209104538,
 0.05907386168837547,
 0.02450823411345482,
 0.00476

### Local Model (HuggingFaceEmbeddings)

In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

model_name = "intfloat/multilingual-e5-large-instruct"

# Build the embedding object for `model_name`, asking for the "cpu" device
# and for normalized embeddings at encode time.
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

In [None]:
# Embed every sample text with the HuggingFaceEmbeddings instance configured
# above (device "cpu"); the %time line magic reports the CPU and wall time.
%time embedded_documents = hf_embeddings.embed_documents(texts)

CPU times: user 3.08 s, sys: 51.5 ms, total: 3.14 s
Wall time: 3.58 s


In [None]:
# Report which model produced the embeddings and the length of the first
# document vector. The model id is interpolated from `model_name`; the
# previous format string printed the literal text "[model.name]" instead.
print(f"Model: \t\t{model_name}")
print(f"Dimension: \t{len(embedded_documents[0])}")

Model: 

[model.name]
Dimension: 	1024


In [None]:
import numpy as np

query = "LangChain 에 대해서 알려주세요."

# Embed the query and the documents with the same embedding object so the
# vectors are comparable.
embedded_query = hf_embeddings.embed_query(query)
embedded_documents = hf_embeddings.embed_documents(texts)

# Dot product of the query vector with every document vector. The local model
# above was configured with normalize_embeddings=True, so this should
# correspond to cosine similarity.
similarity_scores = np.array(embedded_query) @ np.array(embedded_documents).T

# Document indices ordered from most to least similar.
sorted_idx = similarity_scores.argsort()[::-1]

print(f"[Query] {query}\n====================================")
for i, idx in enumerate(sorted_idx):
    print(f"[{i}] {texts[idx]}")
    print()

[Query] LangChain 에 대해서 알려주세요.
[0] LangChain은 초거대 언어모델로 애플리케이션을 구축하는 과정을 단순화합니다.

[1] LangChain simplifies the process of building applications with large language models

[2] 랭체인 한국어 튜토리얼은 LangChain의 공식 문서, cookbook 및 다양한 실용 예제를 바탕으로 하여 사용자가 LangChain을 더 쉽고 효과적으로 활용할 수 있도록 구성되어 있습니다. 

[3] 안녕, 만나서 반가워.

[4] Retrieval-Augmented Generation (RAG) is an effective technique for improving AI responses.



### FlagEmbedding

* **Dense Vector:** BGE-M3의 다국어, 다중 작업 능력을 기반으로 함
* **Lexical:** weight를 활용한 sparse embedding으로 정확한 단어 매칭을 수행
* **Multi-Vector (ColBERT):** ColBERT의 multi-vector 접근법으로 문맥을 고려한 세밀한 매칭 수행

In [None]:
!pip install -qU FlagEmbedding

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/161.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m153.6/161.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.8/161.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.9/347.9 kB[0m [31m20.6 MB/

In [None]:
from FlagEmbedding import BGEM3FlagModel

model_name = "BAAI/bge-m3"

# Setting use_fp16=True speeds up computation at the cost of a slight drop in
# performance.
bge_embeddings = BGEM3FlagModel(model_name, use_fp16=True)

# Encode all sample texts and keep only the dense vectors from the result.
# If such a long max_length is not needed, a smaller value speeds up encoding.
bge_outputs = bge_embeddings.encode(texts, batch_size=12, max_length=8192)
bge_embedded = bge_outputs["dense_vecs"]


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

imgs/bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

imgs/.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

imgs/long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

imgs/mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

imgs/others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

onnx/Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

imgs/nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

imgs/miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Inspect the shape of the dense vectors returned by encode(); the recorded
# output is (5, 1024).
bge_embedded.shape

(5, 1024)

### Lexical

In [None]:
# Setting use_fp16=True speeds up computation at the cost of a slight drop in
# performance.
bge_flagmodel = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

# Encode the sample texts with the sparse output requested; the
# "lexical_weights" entry of the result is used for lexical matching below.
bge_encoded = bge_flagmodel.encode(
    texts,
    return_sparse=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Per-sentence lexical weights produced by encode(..., return_sparse=True).
lexical_weights = bge_encoded["lexical_weights"]

# Sentence 0 matched against itself.
lexical_scores1 = bge_flagmodel.compute_lexical_matching_score(
    lexical_weights[0], lexical_weights[0]
)
# Sentence 0 matched against sentence 2. The original label said
# "0 <-> 1", but the code (and its recorded output) compares with index 2.
lexical_scores2 = bge_flagmodel.compute_lexical_matching_score(
    lexical_weights[0], lexical_weights[2]
)
# 0 <-> 0
print(lexical_scores1)
# 0 <-> 2
print(lexical_scores2)

0.3015604605898261
0.0002422393299639225


In [None]:
# Lexical matching score for every ordered pair of encoded sentences.
lexical_weights = bge_encoded["lexical_weights"]

# Derive the count from the encoded data instead of hard-coding 5, so the
# loop stays correct if the list of texts changes.
num_sentences = len(lexical_weights)

for i in range(num_sentences):
    for j in range(num_sentences):
        lexical_scores = bge_flagmodel.compute_lexical_matching_score(
            lexical_weights[i], lexical_weights[j]
        )
        print(f"Sentence {i} <-> Sentence {j}: {lexical_scores}")

Sentence 0 <-> Sentence 0: 0.3015604605898261
Sentence 0 <-> Sentence 1: 0
Sentence 0 <-> Sentence 2: 0.0002422393299639225
Sentence 0 <-> Sentence 3: 0.0005542480503208935
Sentence 0 <-> Sentence 4: 0.004006281029433012
Sentence 1 <-> Sentence 0: 0
Sentence 1 <-> Sentence 1: 0.514524119815178
Sentence 1 <-> Sentence 2: 0.14775124192237854
Sentence 1 <-> Sentence 3: 0.20167488604784012
Sentence 1 <-> Sentence 4: 0
Sentence 2 <-> Sentence 0: 0.0002422393299639225
Sentence 2 <-> Sentence 1: 0.14775124192237854
Sentence 2 <-> Sentence 2: 0.501750381278157
Sentence 2 <-> Sentence 3: 0.14519595867022872
Sentence 2 <-> Sentence 4: 0
Sentence 3 <-> Sentence 0: 0.0005542480503208935
Sentence 3 <-> Sentence 1: 0.20167488604784012
Sentence 3 <-> Sentence 2: 0.14519595867022872
Sentence 3 <-> Sentence 3: 0.5084092195702397
Sentence 3 <-> Sentence 4: 0.00015817038365639746
Sentence 4 <-> Sentence 0: 0.004006281029433012
Sentence 4 <-> Sentence 1: 0
Sentence 4 <-> Sentence 2: 0
Sentence 4 <-> Sente

### ColBERT

* 토큰 수준의 세밀한 매칭이 가능합니다.
* 문맥을 고려한 임베딩을 생성할 수 있습니다.
* 긴 문서에 대해서도 효과적으로 작동합니다.

In [None]:
# Setting use_fp16=True speeds up computation at the cost of a slight drop in
# performance.
bge_flagmodel = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

# Encode the sample texts with the multi-vector output requested; the
# "colbert_vecs" entry of the result is used for ColBERT scoring below.
bge_encoded = bge_flagmodel.encode(
    texts,
    return_colbert_vecs=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Multi-vector representations, one entry per sentence.
colbert_vecs = bge_encoded["colbert_vecs"]
first_sentence_vecs = colbert_vecs[0]

# 0 <-> 0
colbert_scores1 = bge_flagmodel.colbert_score(
    first_sentence_vecs, first_sentence_vecs
)
# 0 <-> 1
colbert_scores2 = bge_flagmodel.colbert_score(
    first_sentence_vecs, colbert_vecs[1]
)

# One score per line: the self-match first, then sentence 0 vs sentence 1.
print(colbert_scores1, colbert_scores2, sep="\n")


tensor(1.)
tensor(0.3748)


In [None]:
# ColBERT score for every ordered pair of encoded sentences.
colbert_vecs = bge_encoded["colbert_vecs"]

# Derive the count from the encoded data instead of hard-coding 5, so the
# loop stays correct if the list of texts changes.
num_sentences = len(colbert_vecs)

for i in range(num_sentences):
    for j in range(num_sentences):
        # Stored under a ColBERT-specific name: the earlier code reused
        # `lexical_scores`, which mislabelled the value and overwrote the
        # lexical result computed in the previous section.
        colbert_pair_score = bge_flagmodel.colbert_score(
            colbert_vecs[i], colbert_vecs[j]
        )
        print(f"Sentence {i} <-> Sentence {j}: {colbert_pair_score}")

Sentence 0 <-> Sentence 0: 1.0
Sentence 0 <-> Sentence 1: 0.3748414218425751
Sentence 0 <-> Sentence 2: 0.2792990803718567
Sentence 0 <-> Sentence 3: 0.34978264570236206
Sentence 0 <-> Sentence 4: 0.3649008870124817
Sentence 1 <-> Sentence 0: 0.34872308373451233
Sentence 1 <-> Sentence 1: 1.0
Sentence 1 <-> Sentence 2: 0.5893551111221313
Sentence 1 <-> Sentence 3: 0.8961040377616882
Sentence 1 <-> Sentence 4: 0.40955448150634766
Sentence 2 <-> Sentence 0: 0.21900951862335205
Sentence 2 <-> Sentence 1: 0.5333906412124634
Sentence 2 <-> Sentence 2: 1.0
Sentence 2 <-> Sentence 3: 0.5514200925827026
Sentence 2 <-> Sentence 4: 0.3236272633075714
Sentence 3 <-> Sentence 0: 0.2688165307044983
Sentence 3 <-> Sentence 1: 0.8746428489685059
Sentence 3 <-> Sentence 2: 0.5845511555671692
Sentence 3 <-> Sentence 3: 1.0
Sentence 3 <-> Sentence 4: 0.38624268770217896
Sentence 4 <-> Sentence 0: 0.3413386940956116
Sentence 4 <-> Sentence 1: 0.4109952449798584
Sentence 4 <-> Sentence 2: 0.34988105297088