In [None]:
import os
import re
import time
from datetime import datetime

import pandas as pd
from openai import AzureOpenAI
from tabulate import tabulate

In [2]:
# ------------------------------------------------------------------
# Build the three demo input tables (feature library, feature scores,
# risk score) and persist each one as a CSV file.
# ------------------------------------------------------------------

def _save_frame(frame, csv_path):
    """Write ``frame`` to ``csv_path`` without the index, then report it."""
    frame.to_csv(csv_path, index=False)
    print(f"{csv_path} saved.")


# Feature names shared by the library and score tables, in the same order.
_feature_names = ["wirein_ct", "perc_hrg_wire_amt", "degree_centrality"]

# --- 1. Feature library: feature name and its plain-language meaning ---
feature_library_data = {
    "feature_name": list(_feature_names),
    "feature_meaning": [
        "Number of wire inbound transactions",
        "Percentage of wire amount associated with high-risk geographic country",
        "Number of connections an entity has in a network (degree centrality)",
    ],
}
feature_library_df = pd.DataFrame(feature_library_data)
_save_frame(feature_library_df, "feature_library.csv")

# --- 2. Feature score: one score per feature ---
feature_score_data = {
    "feature_name": list(_feature_names),
    "score": [0.52, 0.36, 0.12],
}
feature_score_df = pd.DataFrame(feature_score_data)
_save_frame(feature_score_df, "feature_score.csv")

# --- 3. Risk score: a single-row table ---
risk_score_data = {"risk_score": [0.8]}
risk_score_df = pd.DataFrame(risk_score_data)
_save_frame(risk_score_df, "risk_score.csv")

feature_library.csv saved.
feature_score.csv saved.
risk_score.csv saved.


In [3]:
# ------------------------------------------------------------------
# Load the data: read the three tables back from their CSV files, in
# the order feature library, feature scores, risk score.
# Note that `risk_score_df` rebinds the frame of the same name built
# in the previous cell.
# ------------------------------------------------------------------
feature_library, feature_score, risk_score_df = (
    pd.read_csv(csv_name)
    for csv_name in ("feature_library.csv", "feature_score.csv", "risk_score.csv")
)

In [4]:
# Show the three reloaded tables together; the cell's value is a 3-tuple of DataFrames.
feature_library, feature_score, risk_score_df

(        feature_name                                    feature_meaning
 0          wirein_ct                Number of wire inbound transactions
 1  perc_hrg_wire_amt  Percentage of wire amount associated with high...
 2  degree_centrality  Number of connections an entity has in a netwo...,
         feature_name  score
 0          wirein_ct   0.52
 1  perc_hrg_wire_amt   0.36
 2  degree_centrality   0.12,
    risk_score
 0         0.8)

### Generate Narrative based on Model results

In [7]:
from narrative.prompt_generator import build_feature_contribution_prompt

# Inputs expected to be in scope already: feature_score_df,
# feature_library_df and risk_score_df (passed in that order).
prompt_inputs = (feature_score_df, feature_library_df, risk_score_df)
prompt_text = build_feature_contribution_prompt(*prompt_inputs)

# Header line followed by the prompt itself, each on its own line.
print("=== Prompt Text ===", prompt_text, sep="\n")


=== Prompt Text ===
You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Risk Score: 80%
Top Features and Contributions:
- wirein_ct (Number of wire inbound transactions): 52% contribution
- perc_hrg_wire_amt (Percentage of wire amount associated with high-risk geographic country): 36% contribution
- degree_centrality (Number of connections an entity has in a network (degree centrality)): 12% contribution

Please produce a narrative that:
- Starts with the risk score
- Explains each feature’s contribution in plain language
- Highlights why each feature might indicate a higher risk



### Judge Model

In [None]:
# ========== 3. Call Azure OpenAI API ==========

# Azure connection details are read from the environment so that no secret is
# ever stored in (or committed with) the notebook. Set these before running:
#   AZURE_OPENAI_API_KEY    - API key
#   AZURE_OPENAI_ENDPOINT   - e.g. "https://your-resource-name.openai.azure.com/"
#   AZURE_OPENAI_DEPLOYMENT - deployment name, e.g. "gpt-4"
#   OPENAI_API_VERSION      - optional; defaults to "2024-02-15-preview"
# A missing required variable raises KeyError here, before any request is sent.
api_key = os.environ["AZURE_OPENAI_API_KEY"]
api_base = os.environ["AZURE_OPENAI_ENDPOINT"]
api_version = os.environ.get("OPENAI_API_VERSION", "2024-02-15-preview")
deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT"]

client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=api_base
)


# Measure generation time with a monotonic clock, so the elapsed value cannot
# be distorted by system clock adjustments.
start_time = time.perf_counter()

response = client.chat.completions.create(
    model=deployment_name,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that explains model risk scores."},
        {"role": "user", "content": prompt_text}
    ],
    temperature=0.3,
    max_tokens=500
)

end_time = time.perf_counter()
generation_time = end_time - start_time

# `content` can be None when the model returns no text; fall back to an empty
# string instead of failing on `.strip()`.
generated_text = (response.choices[0].message.content or "").strip()

# ========== 4. Print the Response ==========
print("\n=== Generated Narrative ===")
print(generated_text)
print(f"\nGeneration Time: {generation_time:.2f} seconds")

# ========== 5. Evaluate the Explanation ==========
# The same deployment is asked to grade its own narrative; temperature 0.0
# keeps the grading as repeatable as the service allows.
evaluation_prompt = f"""
Please evaluate the following risk explanation on the following criteria (scale 1-5):
1. Clarity
2. Conciseness
3. Completeness

Provide a short justification for each score.

Generated Explanation:
\"\"\"
{generated_text}
\"\"\"
"""

evaluation_response = client.chat.completions.create(
    model=deployment_name,
    messages=[
        {"role": "system", "content": "You are an evaluation assistant that rates the quality of risk explanations."},
        {"role": "user", "content": evaluation_prompt}
    ],
    temperature=0.0,
    max_tokens=300
)

evaluation_text = (evaluation_response.choices[0].message.content or "").strip()

print("\n=== Evaluation ===")
print(evaluation_text)

In [None]:
# Judge models used to score the narrative, listed as
# (display label, Azure OpenAI deployment name) pairs.
# Replace each deployment name with the one from your own Azure resource.
judge_models = dict([
    ("GPT-4", "gpt-4-deployment"),
    ("GPT-4o", "gpt-4o-deployment"),
    ("GPT-35-Turbo", "gpt-35-turbo-deployment"),
])

# Function to evaluate with each judge model
def evaluate_with_judge(deployment_name, prompt, output, llm_client=None):
    """
    Ask one judge deployment to rate a model-generated answer.

    Args:
        deployment_name: Azure OpenAI deployment that acts as the judge.
        prompt: The prompt that produced the answer being judged.
        output: The model-generated answer to evaluate.
        llm_client: Optional client exposing ``chat.completions.create``.
            When omitted, the notebook-level ``client`` (the ``AzureOpenAI``
            instance created in the generation cell) is used.

    Returns:
        The judge's evaluation text, or an empty string if the judge
        returned no content.
    """
    eval_prompt = f"""
You are an evaluation assistant. Given a prompt and a model-generated answer, please assess the quality of the answer based on:
- Clarity (1-5)
- Conciseness (1-5)
- Completeness (1-5)
Provide a score for each, then write a short summary comment.

Prompt:
{prompt}

Model-Generated Answer:
{output}
"""
    # The legacy `openai.ChatCompletion.create(engine=...)` call is not usable
    # here: `openai` is never imported in this notebook, and the rest of the
    # notebook uses the 1.x client API, where the deployment is passed as `model`.
    active_client = client if llm_client is None else llm_client
    response = active_client.chat.completions.create(
        model=deployment_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that evaluates AI-generated text."},
            {"role": "user", "content": eval_prompt}
        ],
        temperature=0
    )
    # `content` may be None; callers strip and regex-scan the result, so
    # always hand back a string.
    return response.choices[0].message.content or ""

def extract_scores(eval_text):
    """
    Extracts clarity, conciseness, and completeness scores from the evaluation text.

    For each criterion, the first occurrence of its name that is followed by a
    single digit is used (case-insensitive). Between the name and the digit the
    text may contain an echoed scale such as "(1-5)" and any mix of whitespace,
    colons, hyphens and markdown emphasis characters, so all of these parse:
    "Clarity: 4", "Clarity: 4/5", "**Clarity:** 4", "**Clarity**: 4",
    "Clarity (1-5): 4".

    Args:
        eval_text: Free-form evaluation text from a judge model. ``None`` or
            an empty string is treated as "no scores found".

    Returns:
        A tuple ``(clarity, conciseness, completeness)``; each element is an
        int, or ``None`` when that criterion's score was not found.
    """
    text = eval_text or ""

    def _find_score(criterion):
        # One pattern per criterion; only the criterion name differs.
        pattern = (
            rf"{criterion}"
            r"(?:\s*\(\s*1\s*-\s*5\s*\))?"  # optional echoed scale, e.g. " (1-5)"
            r"[\s:*_\-]+"                    # separators, incl. markdown ** and __
            r"(\d)"
        )
        match = re.search(pattern, text, re.IGNORECASE)
        return int(match.group(1)) if match else None

    return tuple(
        _find_score(criterion)
        for criterion in ("Clarity", "Conciseness", "Completeness")
    )

# Evaluate the generated narrative with each judge model and collect one
# result row (scores, full evaluation text, elapsed time) per judge.
results = []
start_time = datetime.now()

# The loop variable is deliberately not named `deployment_name`, so the
# deployment configured for narrative generation is not overwritten.
for judge_name, judge_deployment in judge_models.items():
    eval_start = datetime.now()
    # `generated_text` holds the narrative produced in the generation cell.
    eval_text = evaluate_with_judge(judge_deployment, prompt_text, generated_text)
    eval_end = datetime.now()
    time_taken = (eval_end - eval_start).total_seconds()

    clarity, conciseness, completeness = extract_scores(eval_text)
    results.append({
        "Judge Model": judge_name,
        "Clarity": clarity,
        "Conciseness": conciseness,
        "Completeness": completeness,
        "Evaluation Summary": eval_text.strip(),
        "Time Taken (s)": f"{time_taken:.2f}"
    })

end_time = datetime.now()
total_time = (end_time - start_time).total_seconds()

# Convert results to DataFrame and save
df_results = pd.DataFrame(results)

print("\n=== Evaluation Results ===")
print(df_results.to_markdown(index=False))

df_results.to_csv("evaluation_results_azure_openai.csv", index=False)
print(f"\nTotal evaluation time: {total_time:.2f} seconds")

In [None]:
# Show the judge results again, this time rendered by tabulate in its
# "fancy_grid" format with column names as headers and no index column.
print("\n=== Evaluation Results ===\n")
results_table = tabulate(
    df_results,
    headers="keys",
    tablefmt="fancy_grid",
    showindex=False,
)
print(results_table)

In [6]:
# Two-row example of the (prompt, generated answer) input format, built
# row by row and written to test.csv.
example_rows = [
    (
        "Explain the theory of relativity.",
        "The theory of relativity was developed by Albert Einstein...",
    ),
    (
        "What is the capital of France?",
        "The capital of France is Paris.",
    ),
]
df = pd.DataFrame(example_rows, columns=["prompt", "generated_answer"])

df.to_csv("test.csv", index=False)
df

# Batch evaluation is currently disabled; `evaluate_dataframe` and
# `save_to_csv` are not defined anywhere in the cells shown here.
# df_with_scores = evaluate_dataframe(df, prompt_col="prompt", output_col="generated_answer")
# df_with_scores.to_csv(save_to_csv, index=False)
# print(f"Saved evaluation results to '{save_to_csv}'")

# print(df_with_scores.head())

Unnamed: 0,prompt,generated_answer
0,Explain the theory of relativity.,The theory of relativity was developed by Albe...
1,What is the capital of France?,The capital of France is Paris.


Can I use LLMs to figure out the relationship between features?

Not directly from the raw data — LLMs like GPT-4 are language models, and they do not learn relationships from raw data the way statistical or ML models (like XGBoost) do.

You can prompt an LLM to summarize relationships if you:

- Provide a summary of your feature importances or interactions (e.g. from SHAP values or partial dependence plots).
- Ask the LLM to generate human-readable explanations about possible relationships (e.g. "If feature A and feature B both have high values, how might that influence risk?").

For example:
If you compute SHAP interaction values (which show how pairs of features interact in the XGBoost model), you can prompt the LLM:

“Given that Feature A and Feature B have a strong interaction (SHAP interaction value = 0.25), what might that indicate about risk behavior?”

The LLM can then verbalize or hypothesize the interaction in plain language — but it’s not discovering new interactions itself. That part is still done by your ML tools.




LLMs cannot themselves build a predictive model from raw data.
LLMs can summarize existing knowledge you give them (like feature interactions, partial dependence plots, or SHAP values).



LLMs cannot uncover feature interactions from raw tabular data on their own — that’s what tree-based models or techniques like SHAP do.

You can also ask the LLM why a feature contributes little, for example: “Given that Feature X has a contribution of 0.5% to the risk score, what might explain its low contribution relative to other features?”

The LLM can generate reasons like:

“This feature might have a narrow value range, so it has less discriminative power.”

“This feature might overlap with other highly predictive features, so its unique contribution is reduced.”

Again, the LLM is explaining, not discovering the underlying math.


| **Question**                               | **LLM’s Role**                                         | **Best Tool**                                             |
| ------------------------------------------ | ------------------------------------------------------ | --------------------------------------------------------- |
| Discovering relationships between features | Explain them verbally, given data from XGBoost or SHAP | XGBoost (tree-based splits) + SHAP                        |
| Explaining low contribution features       | Very good at explaining why, given contribution data   | XGBoost + SHAP for quantifying                            |
| Generating human-readable insights         | Excellent, given data                                  | LLM (with model explanations)                             |


✅ Style guidance — Helps the LLM learn to produce text that matches your team's preferred tone and structure

✅ Evaluation — Provides a "gold standard" baseline to compare LLM outputs against