# Import Required Libraries
Import libraries such as pandas for data manipulation and Azure OpenAI for embedding generation.

In [1]:
import pandas as pd
import os
from openai import AzureOpenAI
from sklearn.decomposition import PCA
from dotenv import load_dotenv

# Load Dataset
Load the dataset into a pandas DataFrame and inspect the 'combined_text' column.

In [6]:
# Load the dataset (JSON Lines: one record per line).
# The path is assembled with os.path.join instead of a backslash literal:
# '\c' is an invalid escape sequence (Python emits a warning for it), and a
# hard-coded backslash is only a path separator on Windows.
df = pd.read_json(os.path.join('output_folder', 'cleaned_advisories.json'), lines=True)

# Inspect the first few rows of the dataset
print(df.head())

# Fail fast if the 'combined_text' column is missing
if 'combined_text' not in df.columns:
    raise ValueError("The dataset does not contain a 'combined_text' column.")

               ghsa_id          cve_id  \
0  GHSA-472w-7w45-g3w5            None   
1  GHSA-hmp7-x699-cvhq            None   
2  GHSA-rq77-p4h8-4crw  CVE-2025-24358   
3  GHSA-vw58-ph65-6rxp  CVE-2024-47822   
4  GHSA-7vpp-9cxj-q8gv   CVE-2025-3445   

                                                 url  \
0  https://api.github.com/advisories/GHSA-472w-7w...   
1  https://api.github.com/advisories/GHSA-hmp7-x6...   
2  https://api.github.com/advisories/GHSA-rq77-p4...   
3  https://api.github.com/advisories/GHSA-vw58-ph...   
4  https://api.github.com/advisories/GHSA-7vpp-9c...   

                                            html_url  \
0  https://github.com/advisories/GHSA-472w-7w45-g3w5   
1  https://github.com/advisories/GHSA-hmp7-x699-cvhq   
2  https://github.com/advisories/GHSA-rq77-p4h8-4crw   
3  https://github.com/advisories/GHSA-vw58-ph65-6rxp   
4  https://github.com/advisories/GHSA-7vpp-9cxj-q8gv   

                                             summary  \
0  Pleezer resour

  df = pd.read_json('output_folder\cleaned_advisories.json', lines=True)


# Preprocess Text Data
Perform any necessary preprocessing on the 'combined_text' column, such as removing null values or cleaning text.

In [7]:
# Preprocess Text Data
# Discard every row whose 'combined_text' value is null
required_columns = ['combined_text']
df = df.dropna(subset=required_columns)

# Optional: Add any additional text preprocessing steps here
print(f"Dataset after preprocessing: {len(df)} rows")

Dataset after preprocessing: 30 rows


# Generate Two-Dimensional Embeddings
Generate an embedding for each row's 'combined_text' with Azure OpenAI, reduce the embeddings to two dimensions with PCA, and save the 2D result in a new column for each row.

In [None]:
# Load environment variables from .env file
load_dotenv()

# Initialize the Azure OpenAI client; the API key and endpoint are read from
# the OPENAI_API_KEY and OPENAI_API_BASE environment variables
client = AzureOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("OPENAI_API_BASE")
)

# Deployment name for the embeddings model
deployment_name = "text-embedding-3-small"

# Generate embeddings for the text: one request per row, keeping the first
# (only) embedding returned for each input
embeddings = [
    client.embeddings.create(input=text, model=deployment_name).data[0].embedding
    for text in df['combined_text']
]

# Reduce embeddings to 2 dimensions using PCA
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

# Add the 2D embeddings as a new column in the DataFrame
# (one [x, y] list per row, in row order)
df['embeddings_2d'] = embeddings_2d.tolist()

# Save the DataFrame with 2D embeddings to a new JSON Lines file.
# os.path.join keeps the file inside output_folder on every platform; a
# hard-coded backslash would become part of the file name outside Windows.
output_path = os.path.join('output_folder', 'dataset_with_2d_embeddings.json')
df.to_json(output_path, orient='records', lines=True)

print("2D embeddings have been successfully saved to 'dataset_with_2d_embeddings.json'.")

2D embeddings have been successfully saved to 'dataset_with_2d_embeddings.json'.


In [13]:
# Report how many components the stored embedding of the first row has
first_embedding = df['embeddings_2d'].iloc[0]
embedding_length = len(first_embedding)
print(f"The embeddings have {embedding_length} dimensions.")

The embeddings have 2 dimensions.
