In [None]:
import pandas as pd
import time
from openai import AzureOpenAI
from datetime import datetime
import re
from tabulate import tabulate

In [2]:
# ===============================
# 1. Feature Library
# ===============================

feature_library_data = {
    "feature_name": [
        "wirein_ct",
        "perc_hrg_wire_amt",
        "degree_centrality"
    ],
    "feature_meaning": [
        "Number of wire inbound transactions",
        "Percentage of wire amount associated with high-risk geographic country",
        "Number of connections an entity has in a network (degree centrality)"
    ]
}

feature_library_df = pd.DataFrame(feature_library_data)
feature_library_df.to_csv("feature_library.csv", index=False)
print("feature_library.csv saved.")

# ===============================
# 2. Feature Score
# ===============================

feature_score_data = {
    "feature_name": [
        "wirein_ct",
        "perc_hrg_wire_amt",
        "degree_centrality"
    ],
    "score": [
        0.52,
        0.36,
        0.12
    ]
}

feature_score_df = pd.DataFrame(feature_score_data)
feature_score_df.to_csv("feature_score.csv", index=False)
print("feature_score.csv saved.")

# ===============================
# 3. Risk Score
# ===============================

risk_score_data = {
    "risk_score": [
        0.8
    ]
}

risk_score_df = pd.DataFrame(risk_score_data)
risk_score_df.to_csv("risk_score.csv", index=False)
print("risk_score.csv saved.")

feature_library.csv saved.
feature_score.csv saved.
risk_score.csv saved.


In [3]:
# ===============================
# 1. Load the data
# ===============================

feature_library = pd.read_csv("feature_library.csv")
feature_score = pd.read_csv("feature_score.csv")
risk_score_df = pd.read_csv("risk_score.csv")

In [6]:
feature_library, feature_score, risk_score_df

(        feature_name                                    feature_meaning
 0          wirein_ct                Number of wire inbound transactions
 1  perc_hrg_wire_amt  Percentage of wire amount associated with high...
 2  degree_centrality  Number of connections an entity has in a netwo...,
         feature_name  score
 0          wirein_ct   0.52
 1  perc_hrg_wire_amt   0.36
 2  degree_centrality   0.12,
    risk_score
 0         0.8)

In [17]:
# ========== 2. Prepare the Prompt ==========
# Sort feature scores by descending contribution
feature_score_df = feature_score_df.sort_values(by="score", ascending=False)

# Merge the scores with the feature meanings
merged_df = pd.merge(
    feature_score_df,
    feature_library_df,
    on="feature_name",
    how="left"
)

# Extract the risk score
risk_score = risk_score_df.iloc[0]["risk_score"]

# Format features for the prompt
features_text = ""
for _, row in merged_df.iterrows():
    feature_name = row["feature_name"]
    feature_meaning = row["feature_meaning"]
    score = row["score"]
    features_text += (
        f"- {feature_name} ({feature_meaning}): {score:.0%} contribution\n"
    )

# Build the prompt
prompt_text = f"""
You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Risk Score: {risk_score:.0%}
Top Features and Contributions:
{features_text}

Please produce a narrative that:
- Starts with the risk score
- Explains each feature’s contribution in plain language
- Highlights why each feature might indicate a higher risk
"""

print("=== Prompt Text ===")
print(prompt_text)

=== Prompt Text ===

You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Risk Score: 80%
Top Features and Contributions:
- wirein_ct (Number of wire inbound transactions): 52% contribution
- perc_hrg_wire_amt (Percentage of wire amount associated with high-risk geographic country): 36% contribution
- degree_centrality (Number of connections an entity has in a network (degree centrality)): 12% contribution


Please produce a narrative that:
- Starts with the risk score
- Explains each feature’s contribution in plain language
- Highlights why each feature might indicate a higher risk



In [None]:
# ========== 3. Call Azure OpenAI API ==========

# Fill in your Azure details here:
api_key = "YOUR_AZURE_OPENAI_API_KEY"
api_base = "YOUR_AZURE_OPENAI_ENDPOINT"       # e.g. "https://your-resource-name.openai.azure.com/"
api_version = "2024-02-15-preview"            # Adjust if needed
deployment_name = "YOUR_DEPLOYMENT_NAME"      # e.g. "gpt-4"

client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=api_base
)


# Measure generation time
start_time = time.time()

response = client.chat.completions.create(
    model=deployment_name,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that explains model risk scores."},
        {"role": "user", "content": prompt_text}
    ],
    temperature=0.3,
    max_tokens=500
)

end_time = time.time()
generation_time = end_time - start_time

generated_text = response.choices[0].message.content.strip()

# ========== 4. Print the Response ==========
generated_text = response.choices[0].message.content.strip()
print("\n=== Generated Narrative ===")
print(generated_text)
print(f"\nGeneration Time: {generation_time:.2f} seconds")

# ========== 4. Evaluate the Explanation ==========
evaluation_prompt = f"""
Please evaluate the following risk explanation on the following criteria (scale 1-5):
1. Clarity
2. Conciseness
3. Completeness

Provide a short justification for each score.

Generated Explanation:
\"\"\"
{generated_text}
\"\"\"
"""

evaluation_response = client.chat.completions.create(
    model=deployment_name,
    messages=[
        {"role": "system", "content": "You are an evaluation assistant that rates the quality of risk explanations."},
        {"role": "user", "content": evaluation_prompt}
    ],
    temperature=0.0,
    max_tokens=300
)

evaluation_text = evaluation_response.choices[0].message.content.strip()

print("\n=== Evaluation ===")
print(evaluation_text)

In [None]:
# Define your deployed Azure OpenAI judge models
judge_models = {
    "GPT-4": "gpt-4-deployment",          # Replace with your Azure deployment name
    "GPT-4o": "gpt-4o-deployment",        # Replace with your Azure deployment name
    "GPT-35-Turbo": "gpt-35-turbo-deployment"  # Replace with your Azure deployment name
}

# Function to evaluate with each judge model
def evaluate_with_judge(deployment_name, prompt, output):
    eval_prompt = f"""
You are an evaluation assistant. Given a prompt and a model-generated answer, please assess the quality of the answer based on:
- Clarity (1-5)
- Conciseness (1-5)
- Completeness (1-5)
Provide a score for each, then write a short summary comment.

Prompt:
{prompt}

Model-Generated Answer:
{output}
"""
    response = openai.ChatCompletion.create(
        engine=deployment_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that evaluates AI-generated text."},
            {"role": "user", "content": eval_prompt}
        ],
        temperature=0
    )
    eval_text = response['choices'][0]['message']['content']
    return eval_text

def extract_scores(eval_text):
    """
    Extracts clarity, conciseness, and completeness scores from the evaluation text.
    """
    clarity = conciseness = completeness = None

    clarity_match = re.search(r'Clarity[:\s]+(\d)', eval_text, re.IGNORECASE)
    conciseness_match = re.search(r'Conciseness[:\s]+(\d)', eval_text, re.IGNORECASE)
    completeness_match = re.search(r'Completeness[:\s]+(\d)', eval_text, re.IGNORECASE)

    if clarity_match:
        clarity = int(clarity_match.group(1))
    if conciseness_match:
        conciseness = int(conciseness_match.group(1))
    if completeness_match:
        completeness = int(completeness_match.group(1))

    return clarity, conciseness, completeness

# Evaluate using each judge model
results = []
start_time = datetime.now()

for judge_name, deployment_name in judge_models.items():
    eval_start = datetime.now()
    eval_text = evaluate_with_judge(deployment_name, prompt_text, generated_narrative)
    eval_end = datetime.now()
    time_taken = (eval_end - eval_start).total_seconds()
    # results.append({
    #     "Judge Model": judge_name,
    #     "Evaluation Text": eval_text.strip(),
    #     "Time Taken (s)": time_taken
    # })
    clarity, conciseness, completeness = extract_scores(eval_text)
    results.append({
        "Judge Model": judge_name,
        "Clarity": clarity,
        "Conciseness": conciseness,
        "Completeness": completeness,
        "Evaluation Summary": eval_text.strip(),
        "Time Taken (s)": f"{time_taken:.2f}"
    })

end_time = datetime.now()
total_time = (end_time - start_time).total_seconds()

# Convert results to DataFrame and save
df_results = pd.DataFrame(results)

print("\n=== Evaluation Results ===")
print(df_results.to_markdown(index=False))

df_results.to_csv("evaluation_results_azure_openai.csv", index=False)
print(f"\nTotal evaluation time: {total_time:.2f} seconds")

In [None]:
# Print results as a table using tabulate
print("\n=== Evaluation Results ===\n")
print(tabulate(df_results, headers="keys", tablefmt="fancy_grid", showindex=False))

Can I use LLMs to figure out the relationship between features?

Not directly from the raw data — LLMs like GPT-4 are language models, and they do not learn relationships from raw data the way statistical or ML models (like XGBoost) do.

You can prompt an LLM to summarize relationships if you:

- Provide a summary of your feature importances or interactions (e.g. from SHAP values or partial dependence plots).
- Ask the LLM to generate human-readable explanations about possible relationships (e.g. "If feature A and feature B both have high values, how might that influence risk?").

For example:
If you compute SHAP interaction values (which show how pairs of features interact in the XGBoost model), you can prompt the LLM:

“Given that Feature A and Feature B have a strong interaction (SHAP interaction value = 0.25), what might that indicate about risk behavior?”

The LLM can then verbalize or hypothesize the interaction in plain language — but it’s not discovering new interactions itself. That part is still done by your ML tools.




They cannot themselves build a predictive model from raw data.
They can summarize existing knowledge you give them (like feature interactions, partial dependence plots, or SHAP values).



They cannot uncover feature interactions from raw tabular data on their own — that’s what tree-based models or techniques like SHAP do.

Given that Feature X has a contribution of 0.5% to the risk score, what might explain its low contribution relative to other features?

The LLM can generate reasons like:

“This feature might have a narrow value range, so it has less discriminative power.”

“This feature might overlap with other highly predictive features, so its unique contribution is reduced.”

Again, the LLM is explaining, not discovering the underlying math.


| **Question**                               | **LLM’s Role**                                         | **Best Tool**                                             |
| ------------------------------------------ | ------------------------------------------------------ | --------------------------------------------------------- |
| Discovering relationships between features | Explain them verbally, given data from XGBoost or SHAP | XGBoost (tree-based splits) + SHAP                        |
| Explaining low contribution features       | Very good at explaining why, given contribution data   | XGBoost + SHAP for quantifying                            |
| Generating human-readable insights         | Excellent, given data                                  | LLM (with model explanations)                             |


✅ Style guidance — Helps the LLM learn to produce text that matches your team's preferred tone and structure

✅ Evaluation — Provides a "gold standard" baseline to compare LLM outputs against

In [15]:
import pandas as pd

# Define the data

data = {
    "entity_id": [123],
    "risk_score": [0.80],
    "input": [
        """Risk Score: 80%
Top Features and Contributions:
- Number of wire inbound transactions (Number of inbound wire transactions initiated by the entity): 52% contribution
- Percentage of wire amount associated with HRG country (Proportion of transactions linked to high-risk geographies): 36% contribution
- Number of connections an entity has in a network (Degree centrality in a network analysis): 12% contribution
"""
    ],
    "narrative": [
        """123: 80%
Number of wire inbound transactions is the most significant contributor to the risk score, accounting for 52% of the total risk. A high number of inbound wire transactions can be indicative of unusual or suspicious activity, especially if the volume is significantly higher than typical patterns for similar entities.
Percentage of wire amount associated with HRG country: Contributing 36% to the risk score, this feature highlights the proportion of wire transactions linked to a high-risk geographic (HRG) country.
Number of connections an entity has in a network: This feature contributes 12% to the risk score. Degree centrality measures the number of connections an entity has within a network, which can be indicative of its influence or involvement in complex networks. A high degree centrality may suggest that the entity is well-connected, potentially facilitating or participating in coordinated activities that could pose a risk.
"""
    ]
}


# Create DataFrame
df_human_narratives = pd.DataFrame(data)

# Save to CSV
df_human_narratives.to_csv("human_narratives.csv", index=False)

print("Human-written narrative CSV file created!")


Human-written narrative CSV file created!


In [16]:
import pandas as pd
from openai import AzureOpenAI

# Load human narrative examples
df_human_narratives = pd.read_csv("human_narratives.csv")

# Select the first example
example_input = df_human_narratives.iloc[0]['input']
example_narrative = df_human_narratives.iloc[0]['narrative']

# Load model output files
df_feature_library = pd.read_csv("feature_library.csv")
df_feature_score = pd.read_csv("feature_score.csv")
df_risk_score = pd.read_csv("risk_score.csv")



# Construct the prompt with the human example included
prompt_text = f"""
You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Here is an example of a high-quality explanation from a human:

Input:
{example_input}

Output:
{example_narrative}

Now generate a narrative for the following data:

Input:
Risk Score: {risk_score:.0f}%
Top Features and Contributions:
{top_features_text}

Output:
"""

# # Send to Azure OpenAI
# client = AzureOpenAI(
#     api_key="YOUR_AZURE_OPENAI_API_KEY",
#     api_version="2023-05-15",
#     azure_endpoint="YOUR_AZURE_OPENAI_ENDPOINT"
# )

# response = client.chat.completions.create(
#     model="gpt-35-turbo",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": prompt_text}
#     ]
# )

# generated_narrative = response.choices[0].message.content.strip()

# # Print the generated narrative
# print("=== Generated Narrative ===")
# print(generated_narrative)


KeyError: 'contribution'

In [None]:
judge_prompt = f"""
You are an evaluation assistant. You will compare two narratives explaining a risk score for a financial model. The first narrative is from a human expert, and the second narrative is generated by a language model. Please rate the clarity, completeness, and overall quality of each narrative on a scale from 1 to 5, and then provide an overall judgment of which narrative is better.

Human Narrative:
{human_narrative}

LLM-Generated Narrative:
{llm_narrative}

Please provide:
1. A table comparing clarity, completeness, and quality for each narrative.
2. An overall rating indicating which narrative is better and why.
"""

judge_response = client.chat.completions.create(
    model="gpt-4o",  # or gpt-35-turbo-instruct
    messages=[
        {"role": "system", "content": "You are a fair and thorough evaluator."},
        {"role": "user", "content": judge_prompt}
    ]
)

judge_evaluation = judge_response.choices[0].message.content.strip()
print("=== Judge Evaluation ===")
print(judge_evaluation)


In [None]:
prompt_text = f"""
You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Here is an example of a high-quality narrative:

[Example Narrative]

Risk Score: 80%
Top Features and Contributions:
- wirein_ct (Number of wire inbound transactions): 52% contribution
- perc_hrg_wire_amt (Percentage of wire amount associated with HRG country): 36% contribution
- degree_centrality (Number of connections): 12% contribution

Now generate a narrative for the following data:
Risk Score: {risk_score * 100:.2f}%
Top Features and Contributions:
"""

# Append the top features list (same as before)


# Step 1. Load human-written narrative samples
df_human_samples = pd.read_csv("human_narratives.csv")

# Example: select a random sample or a relevant one
sample_narrative = df_human_samples.iloc[0]["narrative"]

# Step 2. Include it in the prompt
prompt_text = f"""
You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Here is an example of a high-quality narrative:
{sample_narrative}

Now generate a narrative for the following data:
Risk Score: {round(risk_score * 100, 2)}%
Top Features and Contributions:
"""

# Continue as before...


In [None]:
# 📦 1. Import libraries
import pandas as pd
import numpy as np
import shap
import xgboost as xgb
from azure.ai.openai import OpenAIClient
from azure.identity import DefaultAzureCredential
from tabulate import tabulate

# 📦 2. Load your data
# Replace with your actual file paths
df_features = pd.read_csv("your_features.csv")  # includes all 700 features
df_labels = pd.read_csv("your_labels.csv")      # risk labels

X = df_features
y = df_labels['risk_score']

# 📦 3. Train XGBoost
model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1)
model.fit(X, y)

# 📦 4. Predict risk score for an example entity
entity_index = 0  # Choose the row you want to explain
entity_features = X.iloc[[entity_index]]
risk_score = model.predict(entity_features)[0]

# 📦 5. Compute SHAP values
explainer = shap.Explainer(model, X)
shap_values = explainer(entity_features)

# Extract feature importances
contributions = pd.DataFrame({
    "Feature": X.columns,
    "SHAP Value": shap_values.values[0]
})
contributions["Abs SHAP Value"] = np.abs(contributions["SHAP Value"])
contributions = contributions.sort_values("Abs SHAP Value", ascending=False)

# Select top N features for explanation
top_n = 5
top_features = contributions.head(top_n)

# 📦 6. Generate prompt for Azure OpenAI
prompt_text = f"""
You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Risk Score: {round(risk_score * 100, 2)}%

Top Features and Contributions:
"""

# Load feature library (if you have one) to get descriptions
try:
    feature_library = pd.read_csv("feature_library.csv")
    feature_descriptions = feature_library.set_index("Feature Name")["Feature Meaning"].to_dict()
except FileNotFoundError:
    feature_descriptions = {f: f for f in X.columns}  # fallback

for _, row in top_features.iterrows():
    feature = row["Feature"]
    contribution = round(row["Abs SHAP Value"] / top_features["Abs SHAP Value"].sum() * 100, 1)
    description = feature_descriptions.get(feature, "No description available")
    prompt_text += f"- {feature} ({description}): {contribution}% contribution\n"

prompt_text += "\nPlease produce a narrative that:\n- Starts with the risk score\n- Explains each feature’s contribution in plain language\n- Highlights why each feature might indicate a higher risk.\n"

print("\n=== PROMPT TO LLM ===\n")
print(prompt_text)

# 📦 7. Connect to Azure OpenAI and call the model
AZURE_OPENAI_ENDPOINT = "https://your-endpoint.openai.azure.com/"
AZURE_OPENAI_API_KEY = "your-azure-openai-api-key"
DEPLOYMENT_NAME = "your-gpt-deployment"

client = OpenAIClient(
    endpoint=AZURE_OPENAI_ENDPOINT,
    credential=DefaultAzureCredential()
)

response = client.chat.completions.create(
    model=DEPLOYMENT_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful assistant for risk model explanations."},
        {"role": "user", "content": prompt_text}
    ],
    temperature=0.2
)

generated_narrative = response.choices[0].message.content.strip()

# 📦 8. Print results
print("\n=== LLM RESPONSE ===\n")
print(generated_narrative)

# 📦 9. Store evaluation score placeholder (since Azure OpenAI API doesn't have native evaluation)
evaluation_score = "N/A (external manual evaluation recommended)"

# 📦 10. Display summary table
summary_table = pd.DataFrame({
    "Risk Score": [round(risk_score * 100, 2)],
    "Top Features": [", ".join(top_features['Feature'])],
    "Narrative": [generated_narrative],
    "Evaluation Score": [evaluation_score]
})

print("\n=== SUMMARY TABLE ===\n")
print(tabulate(summary_table, headers="keys", tablefmt="fancy_grid", showindex=False))


In [18]:
import pandas as pd
from openai import AzureOpenAI  # or your Azure OpenAI SDK import

# --- Load your dataframes ---
feature_score_df = pd.read_csv("feature_score.csv")
feature_library_df = pd.read_csv("feature_library.csv")
risk_score_df = pd.read_csv("risk_score.csv")

# Load human narrative CSV that contains at least columns 'entity_id', 'risk_score', 'narrative'
df_human_narratives = pd.read_csv("human_narratives.csv")

# Sort feature scores descending by 'score' (assumed between 0 and 1)
feature_score_df = feature_score_df.sort_values(by="score", ascending=False)

# Merge feature scores with meanings
merged_df = pd.merge(
    feature_score_df,
    feature_library_df,
    on="feature_name",
    how="left"
)

# Extract risk score
risk_score = risk_score_df.iloc[0]["risk_score"]

# Format features for prompt
features_text = ""
for _, row in merged_df.iterrows():
    feature_name = row["feature_name"]
    feature_meaning = row["feature_meaning"]
    score = row["score"]
    features_text += f"- {feature_name} ({feature_meaning}): {score:.0%} contribution\n"

# Select human narrative example — here assuming you use the first row
human_example_risk_score = df_human_narratives.iloc[0]["risk_score"]
human_example_narrative = df_human_narratives.iloc[0]["narrative"]

# Construct the prompt including human example for LLM to learn style
prompt_text = f"""
You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Here is an example of a high-quality narrative from a human for reference:

Risk Score: {human_example_risk_score:.0%}
Narrative:
{human_example_narrative.strip()}

Now generate a narrative for the following data:

Risk Score: {risk_score:.0%}
Top Features and Contributions:
{features_text}

Please produce a narrative that:
- Starts with the risk score
- Explains each feature’s contribution in plain language
- Highlights why each feature might indicate a higher risk
"""

print("=== Prompt Text ===")
print(prompt_text)

=== Prompt Text ===

You are a risk model explanation assistant. Given a risk score and a list of features with their descriptions and contributions, generate a clear, concise narrative explaining the risk score.

Here is an example of a high-quality narrative from a human for reference:

Risk Score: 80%
Narrative:
123: 80%
Number of wire inbound transactions is the most significant contributor to the risk score, accounting for 52% of the total risk. A high number of inbound wire transactions can be indicative of unusual or suspicious activity, especially if the volume is significantly higher than typical patterns for similar entities.
Percentage of wire amount associated with HRG country: Contributing 36% to the risk score, this feature highlights the proportion of wire transactions linked to a high-risk geographic (HRG) country.
Number of connections an entity has in a network: This feature contributes 12% to the risk score. Degree centrality measures the number of connections an ent

In [None]:
# --- Azure OpenAI API Setup ---
client = AzureOpenAI(
    api_key="YOUR_AZURE_OPENAI_API_KEY",
    azure_endpoint="YOUR_AZURE_OPENAI_ENDPOINT",
    api_version="2023-05-15"
)

# Generate narrative from LLM
response = client.chat.completions.create(
    model="gpt-35-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant specialized in risk model explanation."},
        {"role": "user", "content": prompt_text}
    ],
    temperature=0.7,
    max_tokens=512
)

llm_narrative = response.choices[0].message.content.strip()

print("\n=== Generated Narrative ===")
print(llm_narrative)


# --- Evaluate the narratives with LLM as judge ---

judge_prompt = f"""
You are a judge AI that compares two narratives that explain a risk score based on given feature contributions. Evaluate their clarity, completeness, and informativeness.

Narrative 1 (Human):
{human_example_narrative.strip()}

Narrative 2 (LLM):
{llm_narrative}

Please provide a short evaluation score for each (1-10), and a brief explanation of which one you prefer and why.
"""

judge_response = client.chat.completions.create(
    model="gpt-35-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that judges narrative explanations."},
        {"role": "user", "content": judge_prompt}
    ],
    temperature=0,
    max_tokens=256
)

evaluation = judge_response.choices[0].message.content.strip()

print("\n=== Evaluation by LLM Judge ===")
print(evaluation)