In [1]:
# ✅ TEST API KEYS
from PyPDF2 import PdfReader
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import evaluate
import torch
import google.generativeai as genai
import pandas as pd
import numpy as np
import os
print("🔐 Testing API Keys...")

os.environ["GEMINI_API_KEY"] = "AIzaSyD6NgaD8zkS65XpWcx4X-6aIZdfryty_u0"

def test_gemini_api():
    try:
        genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content("Say 'Hello from Gemini!'")
        print("✅ Gemini response:", response.text.strip())
    except Exception as e:
        print("❌ Gemini error:", str(e))

test_gemini_api()

# ✅ STEP 3: CONFIGURE REQUEST TYPES
REQUEST_TYPES = {email,
    "Adjustment": [],
    "AU Transfer": [],
    "Closing Notice": ["Reallocation Fees", "Amendment Fees", "Reallocation Principal"],
    "Commitment Change": ["Cashless Roll", "Decrease", "Increase"],
    "Fee Payment": ["Ongoing Fee", "Letter of Credit Fee"],
    "Money Movement-Inbound": ["Principal", "Interest", "Principal + Interest", "Principal+Interest+Fee"],
    "Money Movement - Outbound": ["Timebound", "Foreign Currency"]
}

# ✅ STEP 4: GENERATE SYNTHETIC DATA FIRST (EXPANDED FOR SUB-REQUESTS)
import random

synthetic_data = []

money_inbound_templates = [
    "Effective {date}, the borrower intends to repay USD {amount} via wire transfer to Wells Fargo.",
    "We will remit USD {amount} on {date}. Please credit to account ending with {account}.",
    "USD {amount} will be wired to your account under SOFR on {date}. Reference ID: {ref}.",
    "This is a notification of loan repayment. USD {amount} will be sent on {date} to ABA {aba}.",
    "Please note the principal repayment of USD {amount} effective {date} as per SOFR terms."
]

for i in range(25):
    template = random.choice(money_inbound_templates)
    filled = template.format(
        date=f"March {random.randint(1,28)}, 2024",
        amount=f"{random.randint(1,25)*1000000:,}",
        account=f"{random.randint(1000,9999)}",
        aba=f"{random.randint(100000000,999999999)}",
        ref=f"CUSIP{random.randint(100000,999999)}"
    )
    synthetic_data.append({"email_text": filled, "request_type": "Money Movement-Inbound"})

# Include other request types (already added previously, no need to repeat here)
# Reuse same logic to generate 'synthetic_data'

synthetic_df = pd.DataFrame(synthetic_data)
synthetic_df.to_csv("synthetic_emails.csv", index=False)
print("✅ Synthetic data saved to synthetic_emails.csv")

# ✅ STEP 5: LOAD DATASET AND PREP LABELS
combined_df = pd.concat([
    pd.read_csv("emails_dataset.csv"),
    pd.read_csv("synthetic_emails.csv")
], ignore_index=True)
combined_df.drop_duplicates(subset=["email_text"], inplace=True)
df = combined_df
df["label"] = df["request_type"].astype("category").cat.codes
main_label_mapping = dict(enumerate(df["request_type"].astype("category").cat.categories))
num_labels = len(main_label_mapping)

# ✅ STEP 5: CONVERT TO DATASET AND TOKENIZE
dataset = Dataset.from_pandas(df)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    tokens = tokenizer(example["email_text"], padding="max_length", truncation=True)
    tokens["labels"] = example["label"]
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True)
splits = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = splits["train"]
eval_dataset = splits["test"]

# ✅ STEP 6: DEFINE MODEL
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# ✅ STEP 7: METRICS
eval_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return eval_metric.compute(predictions=predictions, references=labels)

# ✅ STEP 8: TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

# ✅ STEP 9: TRAIN
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# ✅ STEP 10: SAVE MODEL
model.save_pretrained("email_classifier_bert")
tokenizer.save_pretrained("email_classifier_bert")

# ✅ STEP 11: INFERENCE FUNCTION
def predict_email_verbose(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=1).detach().cpu().numpy()[0]
    for i, p in enumerate(probs):
        print(f"{main_label_mapping[i]}: {round(p, 3)}")
    pred_index = np.argmax(probs)
    return {
        "Prediction": main_label_mapping[pred_index],
        "Confidence": round(probs[pred_index], 3)
    }

# ✅ STEP 12: ENSEMBLE CLASSIFIER (WITHOUT HF)
def classify_gemini(text):
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    model = genai.GenerativeModel("gemini-1.5-flash")
    prompt = f"Classify this email into: {list(REQUEST_TYPES.keys())}.\n\nEmail:\n{text}\n\nClassification:"
    response = model.generate_content(prompt)
    return response.text.strip()

def priority_override(text, predicted_label):
    text_lower = text.lower()
    if "repay under sofr" in text_lower or "we will remit" in text_lower or ("aba" in text_lower and "usd" in text_lower):
        return "Money Movement-Inbound"
    return predicted_label

def ensemble_classify(text):
    base = predict_email_verbose(text)
    base_prediction = base["Prediction"]
    base_conf = base["Confidence"]

    try:
        gemini_pred = classify_gemini(text)
    except:
        gemini_pred = None

    votes = [base_prediction]
    if gemini_pred:
        votes.append(gemini_pred)

    vote_counts = {label: votes.count(label) for label in set(votes)}
    majority = max(vote_counts, key=vote_counts.get)

    # 🔐 Confidence rule: if model is too uncertain, prefer Gemini
    if base_conf < 0.5 and gemini_pred:
        final = gemini_pred
        print("⚠️ Low confidence fallback to Gemini")
    else:
        final = majority

    # ✅ Apply keyword-based override
    final = priority_override(text, final)

    return {
        "Final Classification": final,
        "Votes": votes,
        "Confidence": base_conf
    }

# ✅ STEP 13: PDF TEXT EXTRACTION
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text.strip()

# ✅ STEP 14: SYNTHETIC DATA GENERATION
import random

synthetic_data = []

money_inbound_templates = [
    "Effective {date}, the borrower intends to repay USD {amount} via wire transfer to Wells Fargo.",
    "We will remit USD {amount} on {date}. Please credit to account ending with {account}.",
    "USD {amount} will be wired to your account under SOFR on {date}. Reference ID: {ref}.",
    "This is a notification of loan repayment. USD {amount} will be sent on {date} to ABA {aba}.",
    "Please note the principal repayment of USD {amount} effective {date} as per SOFR terms."
]

for i in range(10):
    template = random.choice(money_inbound_templates)
    filled = template.format(
        date=f"March {random.randint(1,28)}, 2024",
        amount=f"{random.randint(1,25)*1000000:,}",
        account=f"{random.randint(1000,9999)}",
        aba=f"{random.randint(100000000,999999999)}",
        ref=f"CUSIP{random.randint(100000,999999)}"
    )
    synthetic_data.append({"email_text": filled, "request_type": "Money Movement-Inbound"})

# Add synthetic samples for other request types
adjustment_templates = [
    "We have identified a discrepancy and made an adjustment of USD {amount} to your account.",
    "An adjustment has been processed due to a previous miscalculation on {date}.",
    "Adjustment entry of USD {amount} recorded for interest correction.",
    "System adjustment performed to rectify overcharge of USD {amount} on {date}.",
    "Reconciliation complete. USD {amount} has been credited back."
]
for i in range(10):
    filled = random.choice(adjustment_templates).format(
        amount=f"{random.randint(1,5)*10000:,}",
        date=f"April {random.randint(1,28)}, 2024"
    )
    synthetic_data.append({"email_text": filled, "request_type": "Adjustment"})

au_transfer_templates = [
    "Please initiate AU transfer of USD {amount} to account {account}.",
    "AU transfer of USD {amount} has been approved for release.",
    "The AU transfer request for USD {amount} is scheduled for {date}.",
    "Transfer funds under AU regulation to the designated beneficiary. Amount: USD {amount}.",
    "Initiate AU transfer referencing transaction ID {ref}."
]
for i in range(10):
    filled = random.choice(au_transfer_templates).format(
        amount=f"{random.randint(1,10)*100000:,}",
        account=f"{random.randint(10000000,99999999)}",
        date=f"May {random.randint(1,28)}, 2024",
        ref=f"AU-{random.randint(10000,99999)}"
    )
    synthetic_data.append({"email_text": filled, "request_type": "AU Transfer"})

closing_notice_templates = [
    "This notice serves as confirmation of the facility closing effective {date}.",
    "Facility has been closed. Final payment of USD {amount} received.",
    "Loan closure completed on {date}. All dues cleared.",
    "As of {date}, the credit line stands closed per agreement.",
    "Final reallocation completed. Facility closed with ref ID {ref}."
]
for i in range(10):
    filled = random.choice(closing_notice_templates).format(
        date=f"June {random.randint(1,28)}, 2024",
        amount=f"{random.randint(5,20)*100000:,}",
        ref=f"CL-{random.randint(1000,9999)}"
    )
    synthetic_data.append({"email_text": filled, "request_type": "Closing Notice"})

commitment_change_templates = [
    "We request a decrease in commitment amount by USD {amount} effective {date}.",
    "An increase of USD {amount} has been approved for your credit facility.",
    "Cashless roll of existing commitment into new term commencing {date}.",
    "Commitment reduction of USD {amount} is scheduled for {date}.",
    "Please reflect updated commitment amount per attached schedule."
]
for i in range(10):
    filled = random.choice(commitment_change_templates).format(
        amount=f"{random.randint(1,15)*100000:,}",
        date=f"July {random.randint(1,28)}, 2024"
    )
    synthetic_data.append({"email_text": filled, "request_type": "Commitment Change"})

fee_payment_templates = [
    "Ongoing fee of USD {amount} due on {date}.",
    "Please process payment of USD {amount} for letter of credit fee.",
    "Fee invoice attached for USD {amount}.",
    "USD {amount} to be charged for maintenance fee.",
    "Scheduled fee of USD {amount} processed on {date}."
]
for i in range(10):
    filled = random.choice(fee_payment_templates).format(
        amount=f"{random.randint(1,10)*10000:,}",
        date=f"August {random.randint(1,28)}, 2024"
    )
    synthetic_data.append({"email_text": filled, "request_type": "Fee Payment"})

money_outbound_templates = [
    "Please initiate outbound payment of USD {amount} to foreign beneficiary.",
    "Outbound wire of USD {amount} scheduled for {date}.",
    "USD {amount} transfer initiated to HSBC London, reference {ref}.",
    "FX payment outbound initiated, value date {date}.",
    "Foreign currency outbound transfer completed: USD {amount}."
]
for i in range(10):
    filled = random.choice(money_outbound_templates).format(
        amount=f"{random.randint(1,25)*100000:,}",
        date=f"September {random.randint(1,28)}, 2024",
        ref=f"OUT-{random.randint(1000,9999)}"
    )
    synthetic_data.append({"email_text": filled, "request_type": "Money Movement - Outbound"})

# Convert to DataFrame and save
synthetic_df = pd.DataFrame(synthetic_data)
synthetic_df.to_csv("synthetic_emails.csv", index=False)
print("✅ Synthetic data saved to synthetic_emails.csv")

# ✅ STEP 15: SUB-REQUEST CLASSIFICATION
sub_request_mapping = {k: v for k, v in REQUEST_TYPES.items() if v}

def classify_sub_request(text, main_class):
    sub_options = sub_request_mapping.get(main_class, [])
    if not sub_options:
        return None
    prompt = f"""Classify this email into one of the following sub-request types: {sub_options}.

Email:
{text}

Sub-Request Classification:"""
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        print("⚠️ Sub-request classification failed:", e)
        return None

# ✅ STEP 16: TEST CLASSIFICATION
pdf_text = extract_text_from_pdf("sample1.pdf")
result = ensemble_classify(pdf_text)
print(result)

# Optional: Sub-request classification if needed
sub = classify_sub_request(pdf_text, result['Final Classification'])
if sub:
    print(f"🔍 Sub-Request Type: {sub}")



SyntaxError: invalid syntax (3282995753.py, line 28)

In [None]:
"""
from transformers import BertForSequenceClassification, BertTokenizerFast
import pandas as pd
import json

# Load your trained model and tokenizer
model = BertForSequenceClassification.from_pretrained("email_classifier_bert")
tokenizer = BertTokenizerFast.from_pretrained("email_classifier_bert")

# Rebuild label mapping from your original dataset
df = pd.read_csv("emails_dataset.csv")  # Make sure this is the dataset you trained on
df["label"] = df["request_type"].astype("category").cat.codes
main_label_mapping = dict(enumerate(df["request_type"].astype("category").cat.categories))

# Create id2label and label2id
id2label = {i: label for i, label in main_label_mapping.items()}
label2id = {label: i for i, label in main_label_mapping.items()}

# Inject into config
model.config.id2label = id2label
model.config.label2id = label2id

# Save to new folder to avoid Windows file-lock issue
new_model_dir = "email_classifier_bert_v2"
model.save_pretrained(new_model_dir, safe_serialization=False)
tokenizer.save_pretrained(new_model_dir)

# Save external label mapping
with open("label_mapping.json", "w") as f:
    json.dump({str(k): v for k, v in id2label.items()}, f)

print(f"✅ Model and label mapping saved to {new_model_dir}")
"""