In [4]:
pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


### OpenAI API Key

In [6]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a .env file into the process environment.
load_dotenv()

# Stop here when the key is missing or empty instead of failing on the first API call.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY in (None, ""):
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

# Shared OpenAI SDK client used by the later cells.
client = OpenAI(api_key=OPENAI_API_KEY)

### Create compliance documents for context

In [7]:
from langchain_core.prompts import PromptTemplate

# Prompt that asks an LLM to synthesise labelled example emails for a single
# compliance category.
#
# Input variables:
#   category - name of the compliance category (also the top-level JSON key)
#   num      - number of examples to generate
#
# The requested output is one JSON object of the form
#   {category: {"description": ..., "examples": [{email, is_violation, explanation}, ...]}}
# which is the structure the loading cell reads ("description" and "examples"
# per category). The output-format section describes exactly that structure so
# the instructions and the sample JSON below it do not contradict each other.
template = PromptTemplate(
    template="""
You are an expert in financial compliance monitoring trained to analyze employee and external email communications. Your task is to help generate realistic training data for a bank's email surveillance AI.

Generate {num} diverse and representative email examples for the compliance category: **{category}**.

---

Each example must include:

1. A **realistic short scenario** or email snippet (3-4 lines max) that resembles an actual corporate email. It should be written in natural, professional language.
2. A flag: `is_violation = true` if the example clearly violates the given compliance category, or `false` if it's compliant or a false positive.
3. An **explanation** that justifies the label—clearly describing the intent or phrasing that makes it a violation or not.

---

Guidance:
- Include a variety of cases: clear violations, borderline/confusing cases, and compliant emails that may seem suspicious but are **not** violations.
- Use **real-world language**, not overly formal or synthetic text.
- Some examples should mimic **false positives**, where emails sound suspicious but are actually compliant.
- Avoid obvious keywords — think like someone trying to conceal intent.
- Each example must stand alone.
- Generate exactly {num} diverse and representative email examples for the category **{category}**. Do not generate more than {num} example(s).

---

Output format:
A single JSON object whose only top-level key is "{category}". Its value is a dictionary with the following keys:
- `description`: detailed definition of the compliance category in the context of financial institutions
- `examples`: a JSON list with one dictionary per example, each containing:
  - `email`: the email line or short message
  - `is_violation`: true or false
  - `explanation`: your reasoning

Output your response as a single JSON object in the following format:

{{
  "{category}": {{
    "description": "definition of what the '{category}' category means in financial compliance.",
    "examples": [
      {{
        "email": "Let's handle this off the record, please.",
        "is_violation": true,
        "explanation": "Suggests avoiding formal communication for secrecy."
      }},
      ...
    ]
  }}
}}
""",
    input_variables=['category', 'num']
)

In [None]:
# Try the template out: request 2 examples for the 'Secrecy' category.
openai_model = ChatOpenAI(temperature=0.5, model="gpt-4.1-nano")

# Fill in the template, send it to the chat model, and keep only the reply text.
prompt = template.invoke({"num": 2, "category": "Secrecy"})
model_reply = openai_model.invoke(prompt)
json_result = model_reply.content

print(json_result)

{
  "Secrecy": {
    "description": "The 'Secrecy' category pertains to communications where employees or external parties attempt to conceal sensitive information, discuss confidential matters informally, or encourage discretion that could undermine transparency and compliance protocols within the financial institution.",
    "examples": [
      {
        "email": "Please keep this between us; I don’t want anyone else to know about the upcoming deal.",
        "is_violation": true,
        "explanation": "The phrase 'keep this between us' indicates an attempt to conceal information, which violates policies requiring transparency and proper disclosure of sensitive financial activities."
      },
      {
        "email": "I just wanted to confirm that the client’s details are only shared with the authorized team, no outside parties involved.",
        "is_violation": false,
        "explanation": "This message emphasizes confidentiality and proper handling of client information, alignin

### Load compliance documents

In [8]:
import json
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
import os

In [9]:
# Load the compliance examples; the next cell iterates this object as
# {category: {"description": ..., "examples": [...]}}.
# The encoding is given explicitly because open() otherwise uses the
# locale-dependent default, which can mis-decode non-ASCII characters such as
# curly quotes in the example emails.
with open("compliance_docs.json", encoding="utf-8") as f:
    raw_data = json.load(f)

In [None]:
# docs is a list of LangChain Document objects.
# Each object holds the text and metadata for one compliance example; the
# category name and description are repeated in every document's text.
docs = []
for category, cat_desc in raw_data.items():
    description = cat_desc.get("description", "")
    for example in cat_desc.get("examples", []):
        email = example.get("email", "")
        is_violation = example.get("is_violation")
        # Default to "" (like `email`) so a missing explanation is not written
        # into the document as the literal text "None".
        explanation = example.get("explanation", "")
        metadata = {"source": "compliance_docs.json"}

        text = f"Category: {category}\nCategory Description: {description}\nEmail: {email}\nViolation: {is_violation}\nExplanation: {explanation}"
        docs.append(Document(page_content=text, metadata=metadata))

# Echo every document that was built, with its metadata.
for doc in docs:
    print(doc.page_content)
    print(doc.metadata)


### Retrieve relevant compliance document snippets


FAISS VectorStore + OpenAI Embeddings

In [None]:
# Sample emails used to exercise the pipeline. The text before the first
# newline reads like a subject line; the rest is the message body.
email_text = (
    "Let's close the deal \n"
    "It would be best to give importance to this deal with XCorp tomorrow. "
    "We might have to oversee some company policies, but it is for the greater good. "
    "Eager to close this deal asap."
)

email_text1 = (
    "Spokesperson Q&A draft \n"
    "Here's the internal draft we'll use unless compliance red flags it. "
    "It reflects our recent positioning on derivatives."
)
email_text2 = (
    "Password Reset Confirmation \n"
    "Your credentials have been reset as requested. "
    "Please change them after your next login. "
    "This is a system-generated message."
)

In [111]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Embedding model used to vectorise the compliance examples.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Index every Document in `docs` in a FAISS vector store.
knowledge_base = FAISS.from_documents(documents=docs, embedding=embedding_model)

# Query for similar docs

def get_relevant_context(email_text, num_docs, num_context=3):
    """Look up the compliance examples most similar to an email.

    The query is passed to ``knowledge_base.similarity_search``, which returns
    the ``num_docs`` closest documents.

    Args:
        email_text: Text of the email to search with.
        num_docs: Number of documents to request from the knowledge base.
        num_context: How many of the top hits to join into ``context``.
            Defaults to 3, the value that used to be hard-coded; negative
            values are treated as 0.

    Returns:
        A tuple ``(all_results, results, context)`` where ``all_results`` is
        the text of every hit joined by blank lines, ``results`` is the list
        of hit documents, and ``context`` is the joined text of the first
        ``num_context`` hits.
    """
    results = knowledge_base.similarity_search(email_text, k=num_docs)

    snippets = [doc.page_content for doc in results]
    all_results = "\n\n".join(snippets)
    context = "\n\n".join(snippets[:max(num_context, 0)])
    return all_results, results, context

# Request six candidate documents for the first sample email and unpack the
# joined text, the raw hits, and the short context string.
num_docs = 6
retrieval = get_relevant_context(email_text, num_docs)
all_results, results, context = retrieval

In [112]:
# Save the joined text of all retrieved snippets to output.txt (UTF-8).
with open('output.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(str(all_results))

In [113]:
import re

# Relevance filter: an LLM judges each retrieved snippet before it is allowed
# into the classification prompt.
#
# contexts_needed - number of context slots to fill (the top-ranked hits)
# max_attempts    - maximum number of candidates judged for any one slot
#                   (the slot's own hit plus replacements)
contexts_needed = 3
max_attempts = 3
final_contexts = []

# Hits ranked below the first `contexts_needed` act as replacements. They are
# kept in one shared queue so that a replacement is judged at most once, is
# never accepted for two different slots, and so that a short `results` list
# simply runs out of replacements instead of raising IndexError.
fallback_pool = [doc.page_content for doc in results[contexts_needed:]]


def _judge_relevance(context_to_check, email_text):
    """Ask the LLM whether a reference snippet is relevant to an email.

    Args:
        context_to_check: Text of the candidate compliance reference snippet.
        email_text: Text of the email being analysed.

    Returns:
        A tuple ``(is_relevant, analysis)``. ``is_relevant`` is True only when
        the reply contains "Relevant: Yes" (case-insensitive); ``analysis`` is
        the reply text ("" if the API returned no content).
    """
    judge_prompt = f"""
You are a senior compliance auditor reviewing training data for a bank compliance LLM.

Please review the following entry for its relevance to the compliance scenario described in the email.

### Compliance Reference Context:
{context_to_check}

---

### Email to Analyze:
{email_text}

Evaluate:
- Give a 'yes' or 'no' answer to show if the document is relevant to the email to analyze.
- Relevance score: (1-5) is the context useful to analyze the email and categorize it?

Output format:
----------
  "Relevant: Yes/No"
  "Relevance Score": 1-5, 
"""
    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0
    )
    analysis = response.choices[0].message.content or ""
    match = re.search(r'Relevant:\s*(Yes|No)', analysis, re.IGNORECASE)
    return bool(match) and match.group(1).lower() == "yes", analysis


for i, doc in enumerate(results[:contexts_needed]):
    context_to_check = doc.page_content
    print(context_to_check)
    relevant_found = False
    attempts_made = 0
    for attempt in range(max_attempts):
        is_relevant, analysis = _judge_relevance(context_to_check, email_text)
        attempts_made = attempt + 1
        print(analysis)
        if is_relevant:
            print(f"Context {i+1}: Relevant context accepted.")
            final_contexts.append(context_to_check)
            relevant_found = True
            break
        print(f"Context {i+1}: Not relevant (attempt {attempt+1}).")
        # Only take a replacement if it will actually be judged on a further attempt.
        if attempts_made == max_attempts or not fallback_pool:
            break
        context_to_check = fallback_pool.pop(0)
        print(context_to_check)

    if not relevant_found:
        print(f"Context {i+1}: No relevant context found after {attempts_made} attempt(s). Leaving slot empty.")
        final_contexts.append('')
    print("\n")

# Empty slots are kept in `final_contexts` for the summary below but left out
# of the joined string so the prompt does not receive stray blank separators.
final_contexts_str = "\n\n".join(ctx for ctx in final_contexts if ctx)

print("Final relevant contexts:")
for idx, ctx in enumerate(final_contexts, 1):
    print(f"Context {idx}:", "Found" if ctx else "None")

Category: Bribery
Category Description: Bribery involves offering, giving, receiving, or soliciting something of value to influence the actions of an individual in a position of power, often to secure unfair advantages or preferential treatment. In financial institutions, such conduct typically aims to sway decisions or approvals improperly. This includes, but is not limited to: Offering, giving, receiving, or soliciting something of value to influence the actions of an individual in a position of power, often to secure unfair advantages or preferential treatment, Offering clients or vendors under-the-table benefits in exchange for contracts or information, Providing gifts or incentives to regulators, auditors, or procurement staff, Accepting “kickbacks” or personal compensation for steering business.
Email: We can close this deal faster if you consider a little extra for the team behind the scenes.
Violation: True
Explanation: Suggests offering something extra in exchange for favorabl

In [114]:
# Show the joined reference snippets that the classification prompt interpolates.
print(final_contexts_str)

Category: Bribery
Category Description: Bribery involves offering, giving, receiving, or soliciting something of value to influence the actions of an individual in a position of power, often to secure unfair advantages or preferential treatment. In financial institutions, such conduct typically aims to sway decisions or approvals improperly. This includes, but is not limited to: Offering, giving, receiving, or soliciting something of value to influence the actions of an individual in a position of power, often to secure unfair advantages or preferential treatment, Offering clients or vendors under-the-table benefits in exchange for contracts or information, Providing gifts or incentives to regulators, auditors, or procurement staff, Accepting “kickbacks” or personal compensation for steering business.
Email: We can close this deal faster if you consider a little extra for the team behind the scenes.
Violation: True
Explanation: Suggests offering something extra in exchange for favorabl

### Build prompt for LLM

In [120]:
# Assemble the final classification prompt: the vetted reference snippets
# (`final_contexts_str`) and the email under review (`email_text`) are
# interpolated into a fixed instruction template that also fixes the reply
# format (Compliant / Category / Reason / Source Lines). The prompt is then
# printed for inspection.
prompt = f"""
You are an expert compliance AI working for a financial institution. Your responsibility is to carefully and objectively assess the following email communication for any signs of regulatory, ethical, or internal policy violations.

You must **strictly rely on the provided compliance reference context** as your decision-making framework. If the email appears suspicious or potentially non-compliant, classify it under the most relevant categories. If it is compliant, respond accordingly. All conclusions should be based on **direct alignment with the context definitions**, avoiding speculation or subjective interpretation.

---

### Compliance Reference Context:
{final_contexts_str}

---

### Email to Analyze:
{email_text}

---

### Your Task:
1. **Review the email** to determine whether it reflects any misconduct, unethical behavior, secrecy, or inappropriate communication practices, **based solely on the context above**.
2. **Only classify as non-compliant if the context clearly supports it.** Use conservative judgment: if the email seems vague or lacks sufficient context to warrant a violation, default to "Compliant".
3. **Do not hallucinate.** Ground all decisions strictly in the given context and the actual email content.
4. **If any violations are found**, categorize them using the labels provided in the context and explain your reasoning clearly and briefly.
5. **If compliant**, explain why it aligns with ethical and compliant communication.

---

### Output Format (Strict):
Compliant: Yes / No  
Category: [If non-compliant, select from: Secrecy, Market Manipulation/Misconduct, Bribery, Change in Communication, Complaints, Employee Ethics, Other. If compliant, write: None]  
Reason: [Brief explanation referencing specific phrases or intent, and how they align with the context]  
Source Lines: [Exact quoted lines from the email that triggered the classification. Use quotation marks.]

---
Did you rely on the context provided to make your classification: [Yes/No] 
If yes, please list the context(s) you used, and how they affected your classification.
"""

print(prompt)


You are an expert compliance AI working for a financial institution. Your responsibility is to carefully and objectively assess the following email communication for any signs of regulatory, ethical, or internal policy violations.

You must **strictly rely on the provided compliance reference context** as your decision-making framework. If the email appears suspicious or potentially non-compliant, classify it under the most relevant categories. If it is compliant, respond accordingly. All conclusions should be based on **direct alignment with the context definitions**, avoiding speculation or subjective interpretation.

---

### Compliance Reference Context:
Category: Bribery
Category Description: Bribery involves offering, giving, receiving, or soliciting something of value to influence the actions of an individual in a position of power, often to secure unfair advantages or preferential treatment. In financial institutions, such conduct typically aims to sway decisions or approvals 

### Get LLM output

In [123]:
# Send the assembled prompt to the classifier model with a short system
# instruction; temperature 0.0 is requested for this call.
chat_messages = [
    {"role": "system", "content": "You are a compliance classification assistant."},
    {"role": "user", "content": prompt},
]
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=chat_messages,
    temperature=0.0,
)

# Keep the text of the first choice, without surrounding whitespace.
output = response.choices[0].message.content.strip()
print(output)
    

Compliant: No  
Category: Employee Ethics  
Reason: The email suggests overlooking company policies for the sake of closing a deal, which indicates a willingness to compromise ethical standards and internal rules. This aligns with the definition of Employee Ethics violations, as it implies dishonesty and a lack of integrity in decision-making.  
Source Lines: "we might have to oversee some company policies, but it is for the greater good."  

Did you rely on the context provided to make your classification: Yes  
I used the Employee Ethics context, which defines violations as actions that compromise the firm's ethical standards, including the implication of bypassing rules for personal or business gain.


In [134]:
# Numeric weight assigned to each compliance category. Labels that are not
# listed here (for example "Other" or "None") fall back to a weight of 0.
CATEGORY_WEIGHTS = {
    "Market Manipulation/Misconduct": 10,
    "Bribery": 9,
    "Secrecy": 8,
    "Change in Communication": 6,
    "Employee Ethics": 5,
    "Complaints": 4
}

# Pull the label out of every "Category: ..." line of the classifier reply.
# The reply text lives in `output`; it is split into lines here so the cell
# does not depend on a variable left over from an earlier kernel session.
categories = [
    reply_line.split('Category:', 1)[1].strip()
    for reply_line in output.splitlines()
    if 'Category:' in reply_line
]

# Get weights for each category in the output
category_weights = [(cat, CATEGORY_WEIGHTS.get(cat, 0)) for cat in categories]

# Print the results
for cat, weight in category_weights:
    print(f"Category: {cat}, Weight: {weight}")


Category: Employee Ethics, Weight: 5
