# 01_임베딩 3D 시각화

## 0. 기본 Library

In [None]:
import pandas as pd
import numpy as np

## 1. OpenAI 라이브러리 설치 확인

In [None]:
# !pip install "openai<1.0" python-dotenv ipympl  # this notebook uses the pre-1.0 openai.Embedding API

## 2. OpenAI 정보 읽기

In [None]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

# Locate the .env file and load its entries into the process environment.
dotenv_path = find_dotenv(filename='./.env')
load_dotenv(dotenv_path=dotenv_path)

# Configure the module-level openai client for Azure OpenAI.
openai.api_type = "azure"
openai.api_version = os.getenv("OPENAI_API_VERSION", "").strip()

# Validate with explicit exceptions rather than `assert`, so the checks still
# run when Python is started with -O (which strips assert statements).
API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
if not API_KEY:
    raise ValueError("ERROR: Azure OpenAI Key is missing")
openai.api_key = API_KEY

RESOURCE_ENDPOINT = os.getenv("OPENAI_API_BASE", "").strip()
if not RESOURCE_ENDPOINT:
    raise ValueError("ERROR: Azure OpenAI Endpoint is missing")
if "openai.azure.com" not in RESOURCE_ENDPOINT.lower():
    raise ValueError("ERROR: Azure OpenAI Endpoint should be in the form: \n\n\t<your unique endpoint identifier>.openai.azure.com")
openai.api_base = RESOURCE_ENDPOINT

# Name of the embedding model deployment used by every embedding call below.
deployment_id = 'text-embedding-ada-002'

## 3. 함수 정의

In [None]:
def get_embedding(text, deployment_id=deployment_id):
    """
    Request an embedding for ``text`` and return it as a NumPy array.

    ``deployment_id`` selects the deployment passed to
    ``openai.Embedding.create``; it defaults to the module-level
    ``deployment_id`` as it was when this function was defined.
    Only the first entry of the response's ``data`` list is used.
    """
    response = openai.Embedding.create(input=text, deployment_id=deployment_id)
    first_item = response["data"][0]
    return np.array(first_item["embedding"])

def vector_similarity(x, y):
    """
    Score how similar two vectors are using their dot product.

    OpenAI embeddings are normalized to length 1, so for them the dot product
    equals the cosine similarity. No normalization is performed here.
    """
    return np.dot(x, y)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Rank pre-computed document embeddings by their similarity to ``query``.

    The query is embedded with ``get_embedding`` and scored against every
    embedding in ``contexts`` with ``vector_similarity``.

    ``contexts`` must provide ``.items()`` yielding ``(doc_index, embedding)``
    pairs. Returns a list of ``(similarity, doc_index)`` tuples ordered from
    most to least similar.
    """
    query_embedding = get_embedding(query)

    scored = []
    for doc_index, doc_embedding in contexts.items():
        scored.append((vector_similarity(query_embedding, doc_embedding), doc_index))

    # Tuples compare element-wise: the score decides the order and the
    # document index only breaks ties.
    scored.sort(reverse=True)
    return scored

## 4. 데이터 소스 Embedding

In [None]:
# Read the tab-separated dataset, keep the raw frame untouched in `df_orig`,
# and do all further work on the copy `df`.
csv_path = "../data/bbc-news-data.csv"
df_orig = pd.read_csv(csv_path, delimiter='\t')
df = df_orig.copy()
df

In [None]:
# Embed a small slice of the dataset (rows 505-519).
# To embed every row instead, iterate over `df.index`:
# for i in df.index:
embeddings = {}
for i in range(505, 520):
    try:
        # Reuse the shared helper instead of repeating the API call inline.
        embeddings[i] = get_embedding(df.at[i, 'content'])
    except Exception as err:
        # Best effort: report which row failed and continue with the rest.
        print(f"Unexpected {err=}, {type(err)=} (row {i})")

# Assign the whole column in one step. Writing single cells through chained
# indexing (df['embedding'][i] = ...) triggers SettingWithCopyWarning and has
# no effect under pandas Copy-on-Write. Rows without an embedding become NaN.
df['embedding'] = pd.Series(embeddings, dtype=object)

# Keep only rows that actually received an embedding; `.copy()` makes the
# result an independent frame so later in-place edits are unambiguous.
df = df[df['embedding'].apply(lambda x: isinstance(x, (list, np.ndarray)))].copy()

df

## 5. 질의 기반 검색

In [None]:
# Natural-language query; it is embedded and compared against the article embeddings below.
query = 'News about stock market.'

In [None]:
# Rank every embedded article against the query and keep the three best matches.
ranked = order_document_sections_by_query_similarity(query=query, contexts=df['embedding'])
answers = ranked[:3]

print(answers)

# Show the similarity score and the full text of each match.
for score, doc_index in answers:
    print(f'similarity score:   {score}')
    print(df['content'].loc[doc_index], '\n')

## 6. Embedding 시각화

### Question 시각화 데이터 추가 및 검색 결과 데이터 변경

In [None]:
# Tag each retrieved article by appending "/answer" to its category label so
# it forms its own group in the plot.
for _, doc_index in answers:
    df.at[doc_index, 'category'] = df.at[doc_index, 'category'] + "/answer"

# Embed the query text itself.
embedding = openai.Embedding.create(deployment_id=deployment_id, input=query)
query_vector = np.array(embedding['data'][0]['embedding'])

# Append the query as one extra row; ignore_index=True renumbers the rows of
# the combined frame from 0.
new_row = {'category': 'Question', 'content': query, 'embedding': query_vector}
new_df = pd.DataFrame([new_row])
df = pd.concat([df, new_df], ignore_index=True)

### 차원 줄이기

In [None]:
USE_TSNE = False  # Set to True if using TSNE

if USE_TSNE:
    from sklearn.manifold import TSNE

    # Project the embeddings to 3 dimensions with t-SNE.
    tsne = TSNE(
        n_components=3,
        perplexity=15,
        random_state=42,
        init='random',
        learning_rate=200,
    )
    embedding_list = df['embedding'].to_list()
    vis_dims_tsne = tsne.fit_transform(embedding_list)
    vis_dims_tsne.shape

In [None]:
from sklearn.decomposition import PCA

# Project the embeddings onto their first three principal components.
embedding_list = df['embedding'].to_list()
pca = PCA(n_components=3)
vis_dims_pca = pca.fit_transform(embedding_list)

vis_dims_pca.shape
vis_dims_pca

In [None]:
# Store each row's PCA coordinates as a Python list in a new column, so the
# plotting cell can select them per category.
df["vis_dims_pca"] = vis_dims_pca.tolist()

### 차트 시각화

In [None]:
%matplotlib widget 
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
cmap = plt.get_cmap("tab20")

categories = sorted(df['category'].unique())

# Plot each sample category individually
for i, cat in enumerate(categories):
    sub_matrix = np.array(df[df["category"] == cat]["vis_dims_pca"].to_list())
    x=sub_matrix[:, 0]
    y=sub_matrix[:, 1]
    z=sub_matrix[:, 2]
    colors = [cmap(i/len(categories))] * len(sub_matrix)
    _ = ax.scatter(x, y, zs=z, zdir='z', c=colors, label=cat)

_ = ax.set_xlabel('x')
_ = ax.set_ylabel('y')
_ = ax.set_zlabel('z')
_ = ax.legend()