# Import Required Libraries
Import libraries such as pandas for data manipulation and Azure OpenAI for embedding generation.

In [None]:
import pandas as pd
import os
from openai import AzureOpenAI
from sklearn.decomposition import PCA
from dotenv import load_dotenv

# Load Dataset
Load the dataset (a JSON Lines file) into a pandas DataFrame; its 'combined_text' column is the text used in the following steps.

In [None]:
# Read the cleaned advisories into a DataFrame.
# lines=True: the file is parsed as JSON Lines (one JSON object per line).
df = pd.read_json(
    path_or_buf='output_folder/cleaned_advisories.json',
    lines=True,
)

# Preprocess Text Data
Perform any necessary preprocessing on the 'combined_text' column, such as removing null values or cleaning text.

In [None]:
# Text preprocessing
# Columns that must be non-null for a row to be kept.
required_columns = ['combined_text']

# Discard every row where any required column is null.
df = df.dropna(subset=required_columns)

# Hook for further cleaning steps (none are applied at the moment).
print(f"Dataset after preprocessing: {len(df)} rows")

# Generate Two-Dimensional Embeddings
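Generate an embedding for each row's 'combined_text' with Azure OpenAI, reduce the embeddings to two dimensions with PCA, store the 2D vector in a new column for each row, and save the result to a JSON file.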

In [None]:
# Load environment variables from .env file
# (OPENAI_API_KEY and OPENAI_API_BASE are read from the environment below)
load_dotenv()

# Initialize Azure OpenAI client with the key and endpoint from the environment
client = AzureOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("OPENAI_API_BASE")
)

# Deployment name for the embeddings model
deployment_name = "text-embedding-3-small"

# PCA with n_components=2 cannot be fitted on fewer than two samples.
# Check up front so no embedding requests are made for a run that cannot finish.
if len(df) < 2:
    raise ValueError(
        f"Need at least 2 rows to reduce embeddings to 2 dimensions, got {len(df)}."
    )

# Generate embeddings for the text: one request per row, keeping the first
# (and only) embedding returned for each input string.
embeddings = [
    client.embeddings.create(input=text, model=deployment_name).data[0].embedding
    for text in df['combined_text']
]

# Reduce embeddings to 2 dimensions using PCA
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

# Add the 2D embeddings as a new column in the DataFrame
# (.tolist() turns the array into one [x, y] list per row)
df['embeddings_2d'] = embeddings_2d.tolist()

# Save the DataFrame with 2D embeddings to a new JSON Lines file.
# Build the path with os.path.join so the file lands inside output_folder on
# every platform; a hard-coded backslash is only a path separator on Windows.
output_path = os.path.join('output_folder', 'dataset_with_2d_embeddings.json')
df.to_json(output_path, orient='records', lines=True)

print("2D embeddings have been successfully saved to 'dataset_with_2d_embeddings.json'.")

In [None]:
# Sanity check: count the components in the first row's stored vector
first_vector = df['embeddings_2d'].iloc[0]
embedding_length = len(first_vector)
print(f"The embeddings have {embedding_length} dimensions.")