# 문장 -> 벡터(1차원 숫자 배열 [8.1, 9.1, 2, 5, 4, 3 ....])

- OpenAI API : https://platform.openai.com 에서 발급한 키(OPENAI_API_KEY)를 .env에 등록
- Upstage Console : https://console.upstage.ai/ 에서 발급한 키(UPSTAGE_API_KEY)를 .env에 등록

## 1. 환경변수 load

In [1]:
from dotenv import load_dotenv
# Load environment variables (the API keys kept in .env) before any client is created.
# As the cell's trailing expression, the call's return value is displayed by the notebook.
load_dotenv()

True

# 2. 유사도 계산하는 방법 : https://www.pinecone.io/learn/vector-similarity
1. 유클리드 거리 : 두 벡터 간의 거리가 가까운지
2. 코사인 유사도 : 두 벡터 간의 방향이 유사한지
3. 내적(dot product) : 두 벡터의 내적을 사용하여 크기와 방향을 모두 고려

In [3]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    The result is the dot product of ``vec1`` and ``vec2`` divided by the
    product of their norms as computed by ``np.linalg.norm``.

    Args:
        vec1: First vector (anything ``np.dot`` and ``np.linalg.norm`` accept).
        vec2: Second vector, compatible with ``vec1`` for ``np.dot``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm.
    """
    numerator = np.dot(vec1, vec2)
    length_a = np.linalg.norm(vec1)
    length_b = np.linalg.norm(vec2)
    # Only divide when both vectors have a non-zero length; otherwise the
    # quotient is undefined and 0.0 is reported instead.
    if length_a != 0 and length_b != 0:
        return numerator / (length_a * length_b)
    return 0.0

# 3. OpenAI API의 embedding model 사용

In [4]:
from openai import OpenAI
# Client used by the OpenAI embedding cells in this notebook. No api_key is passed
# here; presumably the SDK reads OPENAI_API_KEY from the environment - confirm.
openai_client = OpenAI()

In [32]:
from openai import OpenAI
# NOTE(review): `client` is a second no-argument OpenAI() instance; the other
# OpenAI cells in this notebook use `openai_client` instead.
client = OpenAI()

# Request an embedding for the single word "king" from text-embedding-3-large.
response = client.embeddings.create(
    input="king",
    model="text-embedding-3-large"
)

# Trailing expression: the notebook displays the embedding of the first result.
response.data[0].embedding

[0.00871916301548481,
 0.023934651166200638,
 -0.001016518217511475,
 0.0020447769202291965,
 0.0014254736015573144,
 0.021226543933153152,
 0.018174054101109505,
 0.007251619827002287,
 -0.013916222378611565,
 0.0453333854675293,
 0.0032031575683504343,
 -0.00498182000592351,
 -0.0572928860783577,
 -0.055320508778095245,
 0.03234073892235756,
 0.008750470355153084,
 -0.06248994544148445,
 -0.005318376235663891,
 -0.05444389581680298,
 0.0010194532806053758,
 0.012452593073248863,
 -0.05143836513161659,
 -0.0017659435980021954,
 0.00826520286500454,
 0.030023977160453796,
 -0.019332434982061386,
 -0.007130302954465151,
 0.0034888393711298704,
 -0.011466404423117638,
 0.0068681021220982075,
 0.005995403043925762,
 0.05967226251959801,
 0.01487893145531416,
 0.03666118532419205,
 -0.01109853945672512,
 0.0015330933965742588,
 0.014792835339903831,
 -0.010221927426755428,
 -0.012687399983406067,
 -0.021962272003293037,
 -0.015732062980532646,
 -0.04069986566901207,
 -0.06825054436922073,


In [33]:
import numpy as np
# Wrap the "king" embedding in a NumPy array, then print its shape and its values.
king_vector = np.array(response.data[0].embedding)
print(king_vector.shape)
print(king_vector)

(3072,)
[ 0.00871916  0.02393465 -0.00101652 ...  0.00806953  0.00886787
 -0.00220914]


In [14]:
# Request an embedding for the word "queen" from text-embedding-3-large.
queen_response = openai_client.embeddings.create(
    input='queen',
    model='text-embedding-3-large'
)

In [15]:
# Wrap the "queen" embedding in a NumPy array, then print its values and its shape.
queen_vector = np.array(queen_response.data[0].embedding)
print(queen_vector)
print(queen_vector.shape)

[-0.01385735  0.0008602  -0.0167823  ...  0.00017693  0.01159847
  0.00638929]
(3072,)


In [34]:
# Cosine similarity between the "king" and "queen" embedding vectors.
king_queen_similarity = cosine_similarity(king_vector, queen_vector)
print(king_queen_similarity)

0.5524397653173301


In [18]:
# Request an embedding for the word "slave" from text-embedding-3-large.
slave_response = openai_client.embeddings.create(
    input="slave",
    model="text-embedding-3-large"
)
# Wrap the embedding in a NumPy array, then print its shape and its values.
slave_vector = np.array(slave_response.data[0].embedding)
print(slave_vector.shape)
print(slave_vector)

(3072,)
[-0.01999537  0.00620363  0.01191717 ...  0.00094749 -0.02679118
 -0.0058524 ]


In [35]:
# Cosine similarity between the "king" and "slave" embedding vectors.
king_slave_similarity = cosine_similarity(king_vector, slave_vector)
print('king과 slave유사도 :', king_slave_similarity)

king과 slave유사도 : 0.293244891303711


In [None]:
# Embedding the Korean equivalents should presumably give similar similarity scores.

In [20]:
# Request an embedding for the Korean word "왕" (king) from text-embedding-3-large.
kor_king_response = openai_client.embeddings.create(
    input="왕",
    model="text-embedding-3-large"
)

In [21]:
# Wrap the "왕" embedding in a NumPy array and print its shape.
kor_king_vector = np.array(kor_king_response.data[0].embedding)
print(kor_king_vector.shape)

(3072,)


In [24]:
# Request an embedding for the Korean word "여왕" (queen) from text-embedding-3-large.
kor_queen_response = openai_client.embeddings.create(
    input="여왕",
    model="text-embedding-3-large"
)

In [25]:
# Wrap the "여왕" embedding in a NumPy array and print its shape.
kor_queen_vector = np.array(kor_queen_response.data[0].embedding)
print(kor_queen_vector.shape)

(3072,)


In [36]:
# Similarity between "왕" (king) and "여왕" (queen); displayed as the cell's trailing expression.
cosine_similarity(kor_king_vector, kor_queen_vector)

np.float64(0.48733449549538954)

In [28]:
# Request an embedding for the Korean word "거지" from text-embedding-3-large.
# NOTE(review): the variable is named kor_slave_*, but the embedded word is
# "거지" (beggar), which is not a translation of the English "slave" used earlier.
kor_slave_response = openai_client.embeddings.create(
    input="거지",
    model="text-embedding-3-large"
)

In [30]:
# Wrap the embedding from kor_slave_response in a NumPy array and print its shape.
kor_slave_vector = np.array(kor_slave_response.data[0].embedding)
print(kor_slave_vector.shape)

(3072,)


In [31]:
# Similarity between "왕" (king) and "거지" (beggar); displayed as the cell's trailing expression.
cosine_similarity(kor_king_vector, kor_slave_vector)

np.float64(0.2552452064791607)

In [38]:
# Similarity between English "king" and Korean "왕"; displayed as the cell's trailing expression.
cosine_similarity(king_vector, kor_king_vector)

np.float64(0.548343839756621)

# 4. upstage의 embedding model 사용

In [40]:
import os
from openai import OpenAI
# Build a client for Upstage by pointing the OpenAI SDK at Upstage's base URL.
upstage_api_key = os.getenv("UPSTAGE_API_KEY")
# Fail fast when the key is missing or empty: with api_key=None the OpenAI client
# falls back to OPENAI_API_KEY, which would send the OpenAI credential to
# Upstage's endpoint instead of reporting the misconfiguration.
if not upstage_api_key:
    raise RuntimeError(
        "UPSTAGE_API_KEY is not set; add it to .env and run load_dotenv() again."
    )
upstage_client = OpenAI(
    api_key=upstage_api_key,
    base_url="https://api.upstage.ai/v1"
)

In [44]:
# Request an embedding for the word "king" from the embedding-query model via upstage_client.
up_king_response = upstage_client.embeddings.create(
    input="king",
    model="embedding-query"
)

In [45]:
# Wrap the Upstage "king" embedding in a NumPy array, then print its shape and its values.
up_king_vector = np.array(up_king_response.data[0].embedding)
print(up_king_vector.shape)
print(up_king_vector)

(4096,)
[-0.01187134 -0.02062988 -0.00674057 ... -0.01081848  0.00247955
  0.01520538]


In [51]:
# Request an embedding for the word "queen" from the embedding-query model via upstage_client.
up_queen_response = upstage_client.embeddings.create(
    input="queen",
    model="embedding-query"
)

In [52]:
# Wrap the Upstage "queen" embedding in a NumPy array, then print its shape and its values.
up_queen_vector = np.array(up_queen_response.data[0].embedding)
print(up_queen_vector.shape)
print(up_queen_vector)

(4096,)
[-0.0016222  -0.00952148 -0.00471878 ...  0.00985718 -0.00732803
  0.0259552 ]


In [53]:
# Similarity between the Upstage "king" and "queen" vectors; displayed as the cell's trailing expression.
cosine_similarity(up_king_vector, up_queen_vector)

np.float64(0.6278139653061549)

In [58]:
# Request an embedding for the Korean word "왕" (king) from the embedding-query model via upstage_client.
up_kor_king_response = upstage_client.embeddings.create(
    input="왕",
    model="embedding-query"
)

In [59]:
# Wrap the Upstage "왕" embedding in a NumPy array, then print its shape and its values.
up_kor_king_vector = np.array(up_kor_king_response.data[0].embedding)
print(up_kor_king_vector.shape)
print(up_kor_king_vector)

(4096,)
[-0.01210022 -0.02249146 -0.01314545 ... -0.00024557  0.00358391
  0.01416779]


In [60]:
# Similarity between the Upstage vectors for English "king" and Korean "왕";
# displayed as the cell's trailing expression.
cosine_similarity(up_king_vector, up_kor_king_vector)

np.float64(0.8522292879902242)

In [63]:
# Request an embedding for "세종" from the embedding-query model via upstage_client.
# NOTE(review): presumably "세종" refers to King Sejong - confirm intent.
up_kor_king_response2 = upstage_client.embeddings.create(
    input="세종",
    model="embedding-query"
)
up_kor_king_vector2 = np.array(up_kor_king_response2.data[0].embedding)
# Similarity between the "왕" and "세종" vectors; displayed as the cell's trailing expression.
cosine_similarity(up_kor_king_vector, up_kor_king_vector2)

np.float64(0.6494713561983843)