# Import Libraries
Import the libraries this notebook uses: pandas, the standard-library `time` and `os` modules, python-dotenv, and the Azure OpenAI SDK. Imports from the original notebook that are no longer needed have been removed.

In [None]:
# Import Libraries
import pandas as pd
import time
from openai import AzureOpenAI
import os
from dotenv import load_dotenv

# Set Up Azure OpenAI Client
Configure the Azure OpenAI client using environment variables for API keys and endpoints.

In [None]:
# Pull the settings stored in the local .env file into the environment
load_dotenv()

# Build the Azure OpenAI client; the endpoint and key are read from the
# OPENAI_API_BASE and OPENAI_API_KEY environment variables
client = AzureOpenAI(
    azure_endpoint=os.environ.get("OPENAI_API_BASE"),
    api_key=os.environ.get("OPENAI_API_KEY"),
    api_version="2023-05-15",
)

# Load and Prepare Data
Load the clustered RedList data from the local JSON Lines file `clustered_dataset.json` in `output_folder` (or from other accessible storage). Use pandas for data manipulation instead of PySpark.

In [None]:
# Load dataset with clusters (JSON Lines: one record per line).
# The path is built with os.path.join so the separator is right on every OS;
# a literal backslash is only a directory separator on Windows.
df = pd.read_json(os.path.join('output_folder', 'clustered_dataset.json'), lines=True)

In [None]:
# Show how many rows fall into each cluster
cluster_sizes = df.groupby('cluster').size()
print(cluster_sizes.reset_index(name='count'))

# Generate Summaries Using LLM
Define functions to interact with the Azure OpenAI API for generating summaries. Apply these functions to the prepared data using pandas.

In [None]:
# Define the function to process each cluster
def process_cluster(cluster_df):
    """Summarize one cluster of security advisories with the chat-completions API.

    The ``combined_text`` values of the cluster are joined into one user
    message, delimited by the same separator tag that the system prompt tells
    the model to expect, and sent in a single request through the module-level
    ``client``.

    Args:
        cluster_df: DataFrame holding the rows of a single cluster. It must
            provide a ``combined_text`` and a ``cluster`` column and contain
            at least one row.

    Returns:
        A ``(cluster_label, summary)`` tuple: the ``cluster`` value of the
        first row and the message content of the first returned choice.
    """
    # Single source of truth for the delimiter. It is used both to join the
    # advisories and inside the system prompt, so the two can never disagree
    # (previously the data used a different tag than the prompt announced).
    separator_tag = '<Finding-Separator>'
    joined_advisories = f'\n{separator_tag}\n'.join(
        cluster_df['combined_text'].astype(str)
    )

    system_prompt = f'''
                As a seasoned security analyst, you possess a deep understanding of security advisories. Your current task involves analyzing a list of security advisories, each separated by a '{separator_tag}'. These advisories have been pre-clustered using an unsupervised machine learning technique, indicating inherent commonalities. Your mission is to delve into these commonalities, uncover underlying patterns, and identify vulnerabilities. Follow this structured format in your response:
                1. **Cluster Title**: Create a concise title that encapsulates the core issue of the cluster. If acronyms are used, define them upon first mention.
                2. **Cluster Description**: Craft a detailed paragraph exploring the observed similarities among the advisories. Highlight recurring themes, shared vulnerabilities, or common sources of the advisories.
                3. **Suggested Actions**: Provide a list of concrete individual actionable steps that directly address the findings. Include specific links to relevant issues or gaps within the data whenever possible, to make the action more standalone. Ensure clarity, leaving no room for ambiguity during implementation.
                4. **Summary**: Conclude with a paragraph summarizing key points from the analysis. Emphasize the importance of the suggested actions and their potential impact on improving security.
                5. **Data References**: Where applicable, reference specific data points from the input data that support your analysis and suggested actions.

                Your goal is to offer practical, feasible guidance tailored to the specifics of this set of security advisories. Engineers should be able to follow these recommendations without additional interpretation.

                Example of an incorrect suggested action:
                - Patch Management and Security Updates: Develop a streamlined process for managing security updates, especially in environments with complex deployment scenarios. Prioritize the rollout of patches to first-party services before public release.

                Example of a correct suggested action:
                - Develop a streamlined process for managing security updates, especially in environments with complex deployment scenarios.
                - Prioritize the rollout of patches to first-party services before public release.
            '''

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": joined_advisories},
        ],
        temperature=0.2,
        max_tokens=4096,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    cluster_summary = response.choices[0].message.content

    # Return the cluster label and summary for aggregation
    return cluster_df['cluster'].iloc[0], cluster_summary

# Group the original dataframe by 'cluster'
clustergroups = df.groupby(by='cluster')

# Summarize each cluster. Results are collected in a plain list and turned
# into a DataFrame once, instead of growing a DataFrame row by row.
cluster_summaries = []
for _, grp in clustergroups:
    cluster, cluster_summary = process_cluster(grp)
    cluster_summaries.append({'cluster': cluster, 'ClusterSummary': cluster_summary})

# Build the result frame in one step. Naming the columns explicitly keeps the
# expected schema even when there were no clusters to summarize.
cluster_summary_df = pd.DataFrame(cluster_summaries, columns=['cluster', 'ClusterSummary'])

# Export Data
Display the generated summaries, then save them to a local file (or other accessible storage) in JSON format.

In [None]:
# Set pandas' display.max_colwidth option to 100 for more readable output
pd.set_option('display.max_colwidth', 100)

# Print every cluster label followed by its full summary and a divider line
for summary_row in cluster_summary_df.itertuples(index=False):
    print(f"Cluster: {summary_row.cluster}")
    print(f"ClusterSummary:\n{summary_row.ClusterSummary}")
    print('-' * 80)

In [None]:
# Export Data
# Build the path with os.path.join so the separator is right on every OS;
# a literal backslash is only a directory separator on Windows.
output_file_path = os.path.join("output_folder", "cluster_summaries.json")

# Save the DataFrame as a JSON array of records (one object per cluster).
# index=False is not meaningful for the default 'columns' orientation (older
# pandas versions raise ValueError for it); 'records' leaves the index out.
cluster_summary_df.to_json(output_file_path, orient="records")
print(f"Generated summaries saved to {output_file_path}")