In [1]:
import numpy as np

In [2]:
def cosine_similarity(vec_a:np.ndarray, vec_b:np.ndarray)-> float:
    dot = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot / (norm_a*norm_b)

# openai embedding

In [10]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [5]:
from openai import OpenAI
openai_client = OpenAI()

In [18]:
def get_openai_embedding(text:str, model='text-embedding-3-small'):
    response = openai_client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding

In [19]:
text_str = '안녕하세요'
emb_vector = get_openai_embedding(text_str)
len(emb_vector)  # 임베딩 벡터의 사이즈

1536

In [23]:
# 임베딩 모델 생성
from langchain_upstage import UpstageEmbeddings
 
embeddings = UpstageEmbeddings(
    api_key=os.getenv("UPSTAGE_API_KEY"),
    model="solar-embedding-1-large"
)

In [24]:
# 업스테이지 모델로 임베딩 함수
def get_upstage_embedding(text: str, is_query: bool = False) -> np.ndarray:
    """
    Upstage 임베딩 벡터를 반환한다.

    Args:
        text (str): 임베딩할 문장
        is_query (bool): True이면 검색 쿼리용 임베딩(embed_query), 
                         False이면 문서 임베딩(embed_documents)

    Returns:
        np.ndarray: 임베딩 벡터 (float 배열)
    """
    if is_query:
        vec = embeddings.embed_query(text)
    else:
        vec = embeddings.embed_documents([text])[0]  # 리스트에서 첫 번째 요소만 추출
    return np.array(vec)

In [11]:
from langchain_upstage import UpstageEmbeddings
 
embeddings = UpstageEmbeddings(
    api_key=os.getenv("UPSTAGE_API_KEY"),
    model="embedding-query"
)
 
doc_result = embeddings.embed_documents(
    ["Sam is a teacher.", "This is another document"]
)
print(doc_result)
 
query_result = embeddings.embed_query("What does Sam do?")
print(query_result)

[[0.0164034403860569, 0.017108500003814697, -0.007777245249599218, 0.0241303239017725, 0.0033562302123755217, -0.006834766827523708, -0.014655179344117641, -0.010676625184714794, -0.015640825033187866, 0.006741238292306662, 0.01836034283041954, 0.00505053298547864, 0.007234061136841774, 0.01180616021156311, 0.02769879251718521, 0.021900031715631485, -0.021453972905874252, -0.0012671297881752253, -0.0029803181532770395, -0.016777554526925087, -0.024159101769328117, -0.00909383688122034, -0.011216212064027786, -0.005913871806114912, -0.010396040044724941, 0.016273939982056618, 0.0027626845985651016, -0.01197163388133049, 0.005136867053806782, 0.02151152864098549, 0.0056260921992361546, 0.014086814597249031, 0.002041436964645982, -0.009367227554321289, 0.010719791986048222, -0.012245024554431438, -0.0035684676840901375, 0.023713042959570885, -0.012453665025532246, 0.012870945036411285, -0.011201823130249977, -0.02001507580280304, -0.014058036729693413, 0.00010937875049421564, -0.016504162

In [16]:
len(query_result)   # 임베딩 벡터의 사이즈

4096

In [17]:
texts = ['king', 'queen', 'slave', '왕']

# openai embedding 모델로 임베딩

In [None]:
openai_embeddings = {txt: get_openai_embedding(txt) for txt in texts}

In [21]:
cosine_similarity(openai_embeddings['queen'], openai_embeddings['king'])

np.float64(0.590601530239691)

# upstage embedding 모델로 임베딩

In [25]:
upstage_embeddings = {txt: get_upstage_embedding(txt) for txt in texts}

In [26]:
cosine_similarity(upstage_embeddings['queen'], upstage_embeddings['king'])

np.float64(0.6445020625549306)

왕 비교

In [28]:
cosine_similarity(openai_embeddings['왕'], openai_embeddings['king'])

np.float64(0.5040406331683572)

In [27]:
cosine_similarity(upstage_embeddings['왕'], upstage_embeddings['king'])

np.float64(0.6963750608733088)

# 올라마 임베딩 > huggingface의 임베딩 모델

젬마2 모델로 임베딩 비교

In [2]:
from langchain_community.chat_models import ChatOllama
llm_ollama = ChatOllama(model='gemma2')
response = llm_ollama.invoke('안녕, 네 소개를 2줄로 작성해줘.')
print(response.content)

  llm_ollama = ChatOllama(model='gemma2')


저는 Google에서 훈련된 대규모 언어 모델입니다.

텍스트 생성, 번역, 요약 등 다양한 작업을 수행하며 사용자의 질문에 답변하고 정보를 제공하는 데 도움이 될 수 있습니다.


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-multilingual-gemma2')
gemma_embedding = model.encode(texts)
gemma_embedding