In [71]:
from langchain.prompts import PromptTemplate
import json
import pandas as pd
from pprint import pprint
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# from langchain_core.example_selectors import SemanticSimilarityExampleSelector
# from langchain_openai import OpenAIEmbeddings
# from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
import os
import re
# Prefer a key that is already exported in the environment (shell, .env loader,
# secrets manager). The "TOKEN" placeholder is only a fallback so that a real
# secret never has to be pasted into, or committed with, this notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "TOKEN"

In [72]:
def read_sampled_policies(base_path="sampled_policies"):
    """
    Read every sampled policy file, grouped by Policies-as-Code (PaC) tool.

    Each tool is expected to have its own sub-folder directly under
    `base_path`, named exactly as listed in `pac_tools` below. Every file
    found under a tool folder (at any depth) is read as UTF-8 text; bytes
    that cannot be decoded are dropped rather than raising.

    Files are visited in sorted order so that the returned lists are
    identical from run to run and machine to machine (plain `os.walk`
    yields entries in arbitrary filesystem order).

    Parameters:
    - base_path: Directory that contains one sub-folder per PaC tool.

    Returns:
    - A 9-tuple of lists of file contents (str), in this order:
      rego, sentinel, pulumi, cedar, kyverno, custodian, aws_config,
      opagatekeeper, kubewarden.
      A tool whose folder is missing yields an empty list and a printed
      warning; a file that cannot be read is reported and skipped.
    """
    # Expected folder names; the order here is also the order of the returned tuple.
    pac_tools = (
        "Open Policy Agent (OPA)",
        "HashiCorp Sentinel",
        "Pulumi",
        "Cedar Policy Language (CPL)",
        "Kyverno OSS",
        "Cloud Custodian",
        "AWS Config",
        "OpagateKeeper",
        "Kubewarden",
    )

    pac_contents = {tool: [] for tool in pac_tools}

    for tool in pac_tools:
        tool_path = os.path.join(base_path, tool)
        if not os.path.isdir(tool_path):
            print(f"Warning: Folder '{tool}' not found in '{base_path}'")
            continue

        for root, dirs, files in os.walk(tool_path):
            # Sorting `dirs` in place makes os.walk descend in a stable order.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        pac_contents[tool].append(f.read())
                except OSError as e:
                    # Best effort: report the unreadable file and keep going.
                    print(f"Error reading {file_path}: {e}")

    return tuple(pac_contents[tool] for tool in pac_tools)

In [73]:
# Load the per-tool policy corpora from the default "sampled_policies" folder.
(
    rego,
    sentinel,
    pulumi,
    cedar,
    kyverno,
    custodian,
    aws_config,
    opagatekeeper,
    kubewarden,
) = read_sampled_policies()

# Sanity check: how many files were read for the first three tools.
print(*(len(corpus) for corpus in (rego, sentinel, pulumi)))

193 11 14


In [74]:
def extract_valid_json(llm_response):
    """
    Extracts and parses the JSON payload from an LLM response.

    Markdown code fences (```json ... ``` or bare ```) are stripped first.
    If the remaining text is still not valid JSON (for example because the
    model wrapped the object in explanatory prose), the span from the first
    "{" to the last "}" is parsed instead.

    Parameters:
    - llm_response: A LangChain LLM result object (with a `content` attribute)

    Returns:
    - The parsed JSON value (normally a dictionary) if parsing succeeds,
      else a dictionary with the keys "error" (the failure message) and
      "raw_content" (the response text, or None if it could not be read).
      This function never raises.
    """
    # Bound before the try so the error dictionary can always be built, even
    # when reading `.content` itself is what fails.
    content = None
    try:
        content = llm_response.content

        # Remove Markdown code block markers ```json ... ```
        cleaned = re.sub(r"```json\s*|\s*```", "", content.strip(), flags=re.IGNORECASE)

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Fall back to the outermost {...} span when prose surrounds the object.
            start = cleaned.find("{")
            end = cleaned.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start:end + 1])
    except Exception as e:
        return {"error": str(e), "raw_content": content}

In [75]:
def analyze_pac_list(pac_code_list, language, llm, preview_count=2):
    """
    Classify a list of PaC policy code snippets against the governance taxonomy.

    One prompt is built per snippet from a fixed taxonomy template and sent
    through `prompt | llm`; each reply is parsed with `extract_valid_json`.
    Before any model call, the fully formatted prompt of the first
    `preview_count` snippets is printed for inspection.

    Args:
        pac_code_list (list): List of strings (each containing PaC policy code).
        language (str): The name of the PaC language (e.g., "Rego", "Sentinel");
            it is inserted into the prompt text.
        llm: LangChain chat model instance (e.g., ChatOpenAI) to query.
        preview_count (int): How many formatted prompts to print first
            (default 2; 0 disables the preview).

    Returns:
        list: One entry per snippet, in input order. Each entry is whatever
        `extract_valid_json` returned for the reply, or, if the model call
        itself raised, {"error": <message>, "raw_output": <reply or None>}.
    """
    # The interpolated text becomes part of a PromptTemplate, so literal braces
    # in `language` are doubled; otherwise they would be parsed as template fields.
    template_safe_language = str(language).replace("{", "{{").replace("}", "}}")

    # `{{policy_code}}` survives the f-string as `{policy_code}`, the one
    # variable PromptTemplate fills in per snippet.
    base_prompt = f"""


You are an expert in policy analysis, software governance, cloud-native security, and regulatory compliance.

Given the following policy code snippet written in a Policies-as-Code (PaC) language (e.g., {template_safe_language}), analyze it and provide a structured taxonomy entry by addressing seven governance-oriented dimensions: start by identifying the main purpose of the script, then from the main purpose identify sub-purposes from that category and sub-category for the taxonomy. Your task is to assign the policy to one of the predefined taxonomy categories and sub-categories listed below. If no suitable category or sub-category applies, suggest a new, concise one that best captures the intent and logic of the policy.

Use the detailed taxonomy definitions provided to guide your decision-making. The output must be returned in strict JSON format for automated processing.

🧠 Seven Analytical Dimensions:

1. Primary Purpose  
   What is the high-level governance or security domain this policy addresses? Express it in simple, unambiguous sentence.

2. Sub-purpose  
   What are the specific goals or aspects of governance that the policy addresses within the broader purpose? Express it in a simple, unambiguous sentence.

3. Taxonomy Category  
   Choose one from the predefined categories:  
   "Security Governance", "Compliance Governance", "Cost Optimization", "Workflow Automation", "Deployment Governance"  
   → Maximum 4 words.

4. Taxonomy Sub-category  
   Choose the most relevant sub-category from the predefined list which correspond to the taxonomy category.  
   → Maximum 4 words.

5. Policy Implemented  
   Describe the specific rule enforced. Express it in simple, unambiguous, actionable language.

6. Target Resource  
   What is the specific resource or artifact the policy is applied to?
   E.g., "Kubernetes Pod", "Terraform", "Dockerfile", "CI/CD Pipeline Script", "Network", "Clouds environment", "Tokens", "VMs Instances", "YAML", "Kubernetes DaemonSet", "Docker Containers", "API"

7. Rationale  
   Justify your categorization and interpretation of the policy by explaining how the code enforces the rule.

🧾 Taxonomy Reference:
Refer to the following taxonomy to guide your categorization. If the policy does not fit, suggest a new category or sub-category in a short, meaningful phrase.

🛡️ Security Governance
Policies that enforce access restrictions, secure configurations, and threat prevention.
Sub-categories:

Access Control: Who can access what under which roles/contexts.

Configuration Validation: Enforce structural and operational correctness (e.g., valid volume types).

Secrets Management: Secure use and access of tokens, credentials, etc.

Network Management: Traffic rules, firewalls, ingress/egress.

Resource Management: Quotas, usage enforcement, secure provisioning of CPU/memory/etc.

Security Review Compliance: Ensures auditing or security checks before deployment.

Vulnerability Management: Prevent use of outdated/unsafe software versions.

Actions Restrictions: Deny execution of harmful commands.

Workloads Management: Runtime constraints (e.g., deny service token mounting in Pods).

📋 Compliance Governance
Policies that enforce conformance with standards, formats, or legal/commercial obligations.
Sub-categories:

Resource Compliance: Labels, field structure, formatting requirements. 

Service Compliance: services meet specific operational, security, or performance requirement.

Third-party License Compliance: Licensing checks.

Standards Enforcement: enforces adherence of a particular technology to its best practice implementations.

💸 Cost Optimization
Policies that reduce waste or unused resources in cloud environments.
(No sub-categories)

🔁 Workflow Automation
Policies that trigger automatic remediation, auditing, or infrastructure provisioning.
(No sub-categories)

🚀 Deployment Governance
Policies governing how and when deployments occur.
Sub-categories:

Access Control: Who can perform deployment actions.

Key Management: Monitor, rotate, or alert on key usage.

🔍 Task Execution:

Here is the policy code snippet:

{{policy_code}}

Please return only a valid JSON dictionary with the seven fields above populated appropriately.

If the policy logic doesn't fit any of the categories, suggest a new taxonomy category and taxonomy sub-category.

"""
    # Use LangChain's pipe syntax for prompt composition
    prompt = PromptTemplate(
        input_variables=["policy_code"],
        template=base_prompt
    )
    chain = prompt | llm

    # Show the exact text the model will receive for the first few snippets.
    for i, code in enumerate(pac_code_list[:max(0, preview_count)]):
        formatted_prompt = prompt.format(policy_code=code)
        print(f"\n--- Prompt for Example {i+1} ---\n")
        print(formatted_prompt)

    results = []
    for code in pac_code_list:
        # Reset per snippet so a failed call can never report the reply that
        # belonged to the previous snippet.
        output = None
        try:
            output = chain.invoke({"policy_code": code})
            results.append(extract_valid_json(output))
        except Exception as e:
            # Keep going: one failed request must not discard the other results.
            results.append({"error": str(e), "raw_output": output})

    return results

In [76]:
# `rego` is the list of OPA policy sources loaded earlier by read_sampled_policies().
# Classify every one of them with gpt-4o-mini at a low sampling temperature.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
)

results = analyze_pac_list(
    pac_code_list=rego,
    language="rego policy library",
    llm=llm,
)


--- Prompt for Example 1 ---




You are an expert in policy analysis, software governance, cloud-native security, and regulatory compliance.

Given the following policy code snippet written in a Policies-as-Code (PaC) language (e.g., rego policy library), analyze it and provide a structured taxonomy entry by addressing seven governance-oriented dimensions: start by identifying the main purpose of the script, then from the main purpose identify sub-purposes from that category and sub-category for the taxonomy. Your task is to assign the policy to one of the predefined taxonomy categories and sub-categories listed below. If no suitable category or sub-category applies, suggest a new, concise one that best captures the intent and logic of the policy.

Use the detailed taxonomy definitions provided to guide your decision-making. The output must be returned in strict JSON format for automated processing.

üß† Seven Analytical Dimensions:

1. Primary Purpose  
   What is the high-level go

In [77]:
# def save_results_to_excel(json_list, filename="pac_analysis_results.xlsx"):
#     """
#     Save a list of JSON dictionaries to an Excel file.

#     Parameters:
#     - json_list: List[Dict] – the list of JSON objects (e.g., output from the LLM).
#     - filename: str – the name of the Excel file to save (default: "pac_analysis_results.xlsx").

#     Returns:
#     - None (writes the Excel file to disk).
#     """
#     try:
#         df = pd.DataFrame(json_list)
#         df.to_excel(filename, index=False)
#         print(df.head(10))
#         print(f"Saved {len(df)} entries to '{filename}' successfully.")
#     except Exception as e:
#         print(f"Failed to save results: {e}")

def _to_excel_cell(value):
    """Return `value` in a form that can be stored in a single spreadsheet cell."""
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, (dict, list, tuple)):
        # Nested JSON cannot be written as a cell value; keep it as JSON text.
        return json.dumps(value, ensure_ascii=False, default=str)
    return str(value)


def save_results_to_excel_with_code(json_list, pac_code_list, filename="pac_analysis_results.xlsx"):
    """
    Save a list of JSON dictionaries to an Excel file, including their associated policy code snippets.

    Results and snippets are paired by position. Each row holds the fields of
    one result plus a "Code Snippet" column. The input dictionaries are not
    modified. Nested values (lists/dicts) are written as JSON text, and a
    result that is not a dictionary is written as an error row instead of
    aborting the export.

    Parameters:
    - json_list: List[Dict] – the list of JSON objects (e.g., output from the LLM).
    - pac_code_list: List[str] – the corresponding list of code snippets.
    - filename: str – the name of the Excel file to save.

    Returns:
    - None (writes the Excel file to disk). Failures are printed, not raised.
    """
    try:
        results = list(json_list)
        snippets = list(pac_code_list)

        # zip() stops at the shorter input; say so instead of silently dropping rows.
        if len(results) != len(snippets):
            print(
                f"Warning: {len(results)} results but {len(snippets)} code snippets; "
                f"only the first {min(len(results), len(snippets))} pairs will be saved."
            )

        enriched_results = []
        for json_obj, code in zip(results, snippets):
            if isinstance(json_obj, dict):
                # Build a new dict so the caller's result objects stay untouched.
                row = {key: _to_excel_cell(value) for key, value in json_obj.items()}
            else:
                row = {
                    "error": "Result is not a JSON object",
                    "raw_content": _to_excel_cell(json_obj),
                }
            row["Code Snippet"] = code
            enriched_results.append(row)

        df = pd.DataFrame(enriched_results)
        df.to_excel(filename, index=False)
        print(df.head(5))
        print(f"Saved {len(df)} entries to '{filename}' successfully.")
    except Exception as e:
        print(f"Failed to save results: {e}")


In [78]:
# `results` holds one parsed JSON dictionary per entry of `rego`, in the same order.
# (Earlier variant without the code column: save_results_to_excel(results, "OPA_results.xlsx"))
save_results_to_excel_with_code(
    json_list=results,
    pac_code_list=rego,
    filename="rego_results_full.xlsx",
)

                                     Primary Purpose  \
0  The policy addresses the enforcement of access...   
1  The policy enforces a specific condition for d...   
2  The policy addresses the enforcement of allowe...   
3  The policy ensures compliance with recommended...   
4  The policy addresses the validation and transf...   

                                         Sub-purpose      Taxonomy Category  \
0  It aims to deny access based on specific condi...    Security Governance   
1  It checks if a particular data attribute match...  Compliance Governance   
2  It specifically aims to restrict the execution...    Security Governance   
3  It specifically enforces the presence of requi...  Compliance Governance   
4  It ensures that data processing functions oper...    Workflow Automation   

        Taxonomy Sub-category  \
0        Actions Restrictions   
1         Resource Compliance   
2        Actions Restrictions   
3         Resource Compliance   
4  Data Processing Vali