### Gemini API 준비

In [None]:
# 패키지 설치
!pip install -q -U google-generativeai

In [None]:
from google.colab import userdata
import google.generativeai as genai

# Read the API key from Colab's user secrets
# (register GOOGLE_API_KEY via the key icon in the left-hand panel)
GOOGLE_API_KEY=userdata.get("GOOGLE_API_KEY")
# Pass the key to the google.generativeai client
genai.configure(api_key=GOOGLE_API_KEY)

### 임베딩 모델 목록 확인

In [None]:
import google.generativeai as genai

# Print the name of every model whose supported generation methods include "embedContent"
embedding_model_names = (
    model_info.name
    for model_info in genai.list_models()
    if "embedContent" in model_info.supported_generation_methods
)
for model_name in embedding_model_names:
    print(model_name)

models/embedding-001
models/text-embedding-004


### text-embedding-004의 사용법

In [None]:
# Embedding helper backed by the Gemini API
def embedding(texts):
    """Embed ``texts`` with the Gemini ``models/text-embedding-004`` model.

    Args:
        texts: Forwarded unchanged as the ``content`` argument of
            ``genai.embed_content``; this notebook passes a list of strings.

    Returns:
        The value stored under the ``"embedding"`` key of the API response.
    """
    response = genai.embed_content(
        content=texts,
        model="models/text-embedding-004",
    )
    return response["embedding"]

In [None]:
# Sample input: a list holding a single sentence
texts = ["This is a Test."]

# Convert the text into embedding vectors and print them
embeds = embedding(texts)
print(embeds)

[[0.02170973, -0.010004892, -0.07831449, 0.00021624475, 0.01636689, -0.0061589633, 0.056820635, 0.03478271, -0.0053757895, 0.035010446, -4.107778e-05, 0.01616381, 0.0466155, -0.019197056, -0.000764824, -0.019785574, 0.026104964, 0.067962535, -0.067852125, -0.031471316, 0.020794151, -0.046117395, 0.0019919856, -0.03520202, -0.026550831, -0.033206023, 0.008064018, 0.00317448, 0.025845883, -0.021457082, 0.012842093, 0.043763965, 0.031452917, -0.004718088, 0.0126908785, -0.011095253, -0.012762025, 0.020994186, 0.02032984, -0.080613144, 0.0061794934, 0.08040165, -0.07673336, -0.007738254, -0.025886036, -0.04122773, 0.038715914, 0.004498921, 0.0033334992, -0.0023913607, 0.04072559, 0.05169514, -0.057483457, -0.0028400922, 0.015980821, -0.00978375, 0.0131146535, -0.046905294, 0.052408155, 0.007512055, 0.04123414, -0.028586876, -0.0052159326, 0.0056034247, 0.013024183, -0.0059801433, 0.014997187, -0.01935876, -0.039094973, -0.029858405, -0.071830705, 0.018266413, 0.0065697925, -0.008557149, -0

In [None]:
# Check the length of the first embedding vector
print(len(embeds[0]))

768


### text-embedding-004를 활용한 이웃 탐색

In [None]:
# 파이스 패키지 설치
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [None]:
# Input (query) text
in_texts = [
    "I'm glad it didn't rain today"
]

# Target texts to search among
target_texts = [
    "What is your favorite food?",
    "Where do you live?",
    "Morning trains are crowded.",
    "It's nice weather today.",
    "The economy is bad lately.",
]

# Convert both the input and the target texts into embedding vectors
in_embeds = embedding(in_texts)
target_embeds = embedding(target_texts)

In [None]:
import numpy as np

# Convert both embedding collections to float32 NumPy arrays
# (they are handed to the FAISS index in the following cells)
in_embeds = np.array(in_embeds, dtype="float32")
target_embeds = np.array(target_embeds, dtype="float32")

In [None]:
import faiss

# Create a FAISS IndexFlatL2 index sized to the length of one embedding vector
embedding_dim = len(in_embeds[0])
index = faiss.IndexFlatL2(embedding_dim)

In [None]:
# Add the target-text embeddings to the index
index.add(target_embeds)

In [None]:
# Run the neighbour search, requesting 1 result per input embedding
distances, indices = index.search(in_embeds, 1)

# Show the returned distances and positions, then the matching target text
print(distances)
print(indices)
nearest_position = indices[0][0]
print(target_texts[nearest_position])

[[0.53067213]]
[[3]]
It's nice weather today.


### bge-m3의 사용법

In [None]:
# bge-m3 패키지 설치
!pip install FlagEmbedding peft

Collecting FlagEmbedding
  Downloading FlagEmbedding-1.2.11.tar.gz (147 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/147.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m143.4/147.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.1/147.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets (from FlagEmbedding)
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting sentence_transformers (from FlagEmbedding)
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->FlagEmbedding)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->FlagEmbedding)
  Downloa

In [None]:
from FlagEmbedding import BGEM3FlagModel

# Initialise the embedding model ("BAAI/bge-m3", with use_fp16 enabled)
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

# Embedding helper backed by the bge-m3 model
def embedding(texts):
    """Embed ``texts`` with the module-level bge-m3 ``model``.

    Note: this definition shadows the Gemini-based ``embedding`` defined
    earlier in the notebook; cells run after this one use bge-m3.

    Args:
        texts: Forwarded unchanged to ``model.encode``; this notebook passes
            a list of strings.

    Returns:
        The value stored under the ``"dense_vecs"`` key of the encode result.
    """
    encoded = model.encode(texts)
    return encoded["dense_vecs"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

imgs/.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

imgs/bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

imgs/miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

imgs/long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

imgs/others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

imgs/nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

imgs/mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

onnx/Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

onnx/special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [None]:
# Sample input: a list holding a single Korean sentence
texts = ["이것은 테스트입니다."]

# Convert the text into embedding vectors and print them
embeds = embedding(texts)
print(embeds)

[[-0.02364   0.01746  -0.0461   ... -0.02083   0.006268  0.005203]]


In [None]:
# Check the length of the first embedding vector
print(len(embeds[0]))

1024


### bge-m3를 활용한 이웃 탐색

In [None]:
# 파이스 패키지 설치
!pip install faiss-cpu



In [None]:
# Input (query) text
in_texts = [
    "오늘은 비가 안와서 다행입니다."
]

# Target texts to search among
target_texts = [
    "좋아하는 음식은 무엇인가요?",
    "어디에 거주하시나요?",
    "출근시간에 지하철은 매우 붐빕니다.",
    "오늘 날씨가 참 좋네요.",
    "최근 경기가 좋지 않습니다."
]

# Create embeddings for both the input and the target texts
in_embeds = embedding(in_texts)
target_embeds = embedding(target_texts)

In [None]:
import numpy as np

# Convert both embedding collections to float32 NumPy arrays
# (they are handed to the FAISS index in the following cells)
in_embeds = np.array(in_embeds, dtype="float32")
target_embeds = np.array(target_embeds, dtype="float32")

In [None]:
import faiss

# Create a FAISS IndexFlatL2 index sized to the length of one embedding vector
embedding_dim = len(in_embeds[0])
index = faiss.IndexFlatL2(embedding_dim)

In [None]:
# Add the target-text embeddings to the index
index.add(target_embeds)

In [None]:
# Run the neighbour search, requesting 1 result per input embedding
distances, indices = index.search(in_embeds, 1)

# Show the returned distances and positions, then the matching target text
print(distances)
print(indices)
nearest_position = indices[0][0]
print(target_texts[nearest_position])

[[0.39318293]]
[[3]]
오늘 날씨가 참 좋네요.
