### Tinker phishing classifier (clean)

This notebook runs **inference** with your fine-tuned Tinker sampler and produces a final DataFrame that includes:
- `category`: the model prediction (`GENUINE`/`PHISHING`/`ASSIGN_TO_AGENT`)
- `agent_notes_pred`: the model-provided agent notes extracted from the assistant text (e.g. `agent_notes: ...`)

**Prereqs**
- `TINKER_API_KEY` is set (this notebook loads `.env` if present).
- You have a completed run log with `checkpoints.jsonl` at `LOG_PATH` (default: `/tmp/tinker-examples/sl_ar_phishing`).

**Outputs**
- `df_tinker`: per-ticket predictions + `agent_notes_pred`
- `df_phishing_messages_tinker`: your evaluation DataFrame with predictions merged in


In [2]:
# Imports + environment

from __future__ import annotations

import asyncio
import json
import re
from enum import StrEnum
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from tqdm.auto import tqdm

load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

### 1) Build the evaluation dataset

This section reproduces the minimal data prep needed to create `lora_train` and `lora_test`. Both frames contain the columns below (plus `messages_full`, `ml_result` and `agent_notes`):
- `ticket_id`
- `messages_player`
- `ticket_tags_with_description`
- `features`
- `ml_features`
- `is_phishing` (label used for evaluation)


In [3]:
def lambda_sort_by_message_order(message_array: str) -> list[dict[str, Any]]:
    """Decode a JSON-encoded message list and return it ordered by `message_order`.

    Args:
        message_array: JSON text holding a list of message objects; each object
            must carry a `message_order` value that `int()` can convert.

    Returns:
        A new list with the decoded messages in ascending numeric order
        (ties keep their original relative order because the sort is stable).
    """

    def order_of(message: dict[str, Any]) -> int:
        # `message_order` may be stored as a string, so compare numerically.
        return int(message["message_order"])

    messages = list(json.loads(message_array))
    messages.sort(key=order_of)
    return messages


In [4]:
# Load raw messages + AR features (paths match the original notebook).

df_messages = pd.read_csv("../data/verification/failed_hs_bots_agents_messages_30d_all_games.csv")
df_ar_results = pd.read_csv("../data/verification/ar_sep_2025.csv")


def _numbered_bodies(messages, keep, separator):
    """Join the bodies of the messages accepted by `keep`, numbered from 1."""
    selected = (message for message in messages if keep(message))
    return separator.join(
        f"{position}. {message['body']}"
        for position, message in enumerate(selected, start=1)
    )


def _is_player_message(message):
    """True for messages whose `role` is `player`."""
    return message.get("role") == "player"


def _is_agent_private_note(message):
    """True for private notes (`add_private_note`) written by an agent."""
    return message.get("message_type") == "add_private_note" and message.get("role") == "agent"


# Parse the message list and build helper fields used downstream.
df_messages["messages_full"] = df_messages["message_map_list"].apply(lambda_sort_by_message_order)

# Player-only transcript: numbered bodies separated by ",\n".
df_messages["messages_player_only"] = df_messages["messages_full"].apply(
    lambda messages: _numbered_bodies(messages, _is_player_message, ",\n")
)

# These are *ticket* agent notes extracted from the raw message map.
# (We keep them, but note: the model-predicted notes will be stored separately as `agent_notes_pred`.)
df_messages["agent_notes_ticket"] = df_messages["messages_full"].apply(
    lambda messages: _numbered_bodies(messages, _is_agent_private_note, "\n")
)

# Outer merge to preserve rows even if one side is missing.
df_models_messages_features = pd.merge(df_messages, df_ar_results, on="ticket_id", how="outer")
df_models_messages_features.shape


(139006, 44)

In [5]:
# Keep only the rows that still have a raw message list after the outer merge,
# i.e. drop rows whose `message_map_list` is missing.
# NOTE(review): the previous comment described this as filtering on AR feature text,
# but the filter is on `message_map_list`; `features_messages` is derived later in
# this notebook from `messages_full`.
df_models_messages_features = df_models_messages_features[
    df_models_messages_features["message_map_list"].notna()
]
df_models_messages_features.shape


(139006, 44)

In [6]:
def get_phishing_attempts_either(row: pd.Series) -> bool:
    """Return the combined phishing label for one ticket row.

    The label is True when either `phishing_attempts_agent` or
    `phishing_attempts_bot` equals 1, and False otherwise (including when a
    flag is missing or NaN).

    Returning a real boolean instead of `None` for the negative case keeps the
    resulting Series boolean, so callers no longer depend on `fillna(False)`
    downcasting an object column (which pandas warns about).

    Args:
        row: A mapping-like row (e.g. a `pd.Series`) exposing `.get`.

    Returns:
        True if either source flags the ticket as phishing, else False.
    """
    agent_flag = row.get("phishing_attempts_agent")
    bot_flag = row.get("phishing_attempts_bot")
    # bool(...) normalises numpy booleans to a plain Python bool.
    return bool(agent_flag == 1 or bot_flag == 1)

# Compute the per-row label, then treat "no label" as a negative.
phishing_flags = df_models_messages_features.apply(get_phishing_attempts_either, axis=1)
df_models_messages_features["phishing_attempts_either"] = phishing_flags.fillna(False)

# Class balance of the combined label.
df_models_messages_features["phishing_attempts_either"].value_counts(dropna=False)


  df_models_messages_features.apply(get_phishing_attempts_either, axis=1).fillna(False)


phishing_attempts_either
False    137138
True       1868
Name: count, dtype: int64

In [7]:
# Parse the AR blob embedded in messages (same helpers as original notebook).

def get_features_from_messages(x: list[dict[str, Any]]) -> str | None:
    """Extract the risk-assessment text that follows the AR marker in a ticket.

    Scans the messages in order and returns the text after
    "Risk assessment results for malicious_account_recovery:" from the first
    message whose body contains that marker.

    Compared with the previous version, a message that merely mentions
    "Risk assessment results" without the full marker, or a message whose
    body is missing/null, no longer aborts the scan; later messages are still
    inspected.

    Args:
        x: Parsed message list for one ticket. Anything that is not a list or
            tuple (e.g. NaN for a ticket without messages) yields None.

    Returns:
        The text following the marker, or None when no message contains it.
    """
    substring = "Risk assessment results for malicious_account_recovery:"
    if not isinstance(x, (list, tuple)):
        return None
    for m in x:
        body = m.get("body") if isinstance(m, dict) else None
        # Test for the exact marker we split on, so the split always has a second part.
        if isinstance(body, str) and substring in body:
            return body.split(substring)[1]
    return None


def get_ml_result_from_messages(x: list[dict[str, Any]]) -> str | None:
    """Extract the `account_recovery_eligibility` value that precedes the rule block.

    Scans the messages in order and returns the upper-case token captured
    after "account_recovery_eligibility:" when it is followed (optionally via
    a check-mark) by "SuspiciousSessionsRule".

    Changes from the previous version:
      * The check-mark between the value and the rule name is matched both as
        the real character (U+2705) and as its cp1252 mojibake form, and it is
        optional. The pattern previously embedded the mojibake bytes literally,
        which made two of them mandatory. Unicode escapes are used so the
        pattern survives re-encoding of this file.
      * A candidate message that does not match, or a message with a
        missing/null body, no longer ends the scan; later messages are tried.

    Args:
        x: Parsed message list for one ticket. Non-list input yields None.

    Returns:
        The captured eligibility token, or None when nothing matches.
    """
    pattern = (
        r"account_recovery_eligibility:\s*([A-Z_]+)"
        # U+2705 is the check mark; U+00E2 U+0153 U+2026 is the same mark misdecoded as cp1252.
        r"(?=\s*(?:\u2705|\u00e2\u0153\u2026?)?\s*SuspiciousSessionsRule)"
    )
    if not isinstance(x, (list, tuple)):
        return None
    for m in x:
        body = m.get("body") if isinstance(m, dict) else None
        if not isinstance(body, str) or "SuspiciousSessionsRule" not in body:
            continue
        m_ = re.search(pattern, body)
        if m_:
            return m_.group(1)
    return None


def get_ml_features_from_messages(x: list[dict[str, Any]]) -> str | None:
    """Extract the `SuspiciousSessionsRule` block from a ticket's messages.

    Scans the messages in order and returns the text starting at
    "SuspiciousSessionsRule" and ending just before the whitespace that
    precedes "Compare Tab:".

    Compared with the previous version, a message that mentions
    "SuspiciousSessionsRule" but has no "Compare Tab:" terminator, or a
    message whose body is missing/null, no longer ends the scan; later
    messages are still tried.

    Args:
        x: Parsed message list for one ticket. Non-list input yields None.

    Returns:
        The matched block, or None when no message contains one.
    """
    pattern = r"SuspiciousSessionsRule[\s\S]*?(?=\s+Compare Tab:)"
    if not isinstance(x, (list, tuple)):
        return None
    for m in x:
        body = m.get("body") if isinstance(m, dict) else None
        if not isinstance(body, str) or "SuspiciousSessionsRule" not in body:
            continue
        m_ = re.search(pattern, body)
        if m_:
            return m_.group(0)
    return None

# Derive the three AR-related text columns from the parsed message list
# (same column order as before: features, ML features, ML result).
for column_name, extractor in (
    ("features_messages", get_features_from_messages),
    ("ml_features_messages", get_ml_features_from_messages),
    ("ml_result_messages", get_ml_result_from_messages),
):
    df_models_messages_features[column_name] = df_models_messages_features["messages_full"].apply(extractor)

# Share of rows for which each extraction produced nothing.
display(
    *(
        df_models_messages_features[name].isna().mean()
        for name in ("features_messages", "ml_features_messages", "ml_result_messages")
    )
)


np.float64(0.317180553357409)

np.float64(0.44865689250823704)

np.float64(0.4779793677970735)

In [8]:
# Drop the rows whose `ml_features_messages` is missing, i.e. tickets for which
# `get_ml_features_from_messages` returned nothing.
df_models_messages_features = df_models_messages_features[
    df_models_messages_features["ml_features_messages"].notna()
]
df_models_messages_features.shape

(76640, 48)

In [9]:
# Build the dataset used for evaluation/inference.

# One record per ticket, renaming the working columns to the names used by the
# fine-tuning / evaluation code.
training_data_list: list[dict[str, Any]] = []
for _, row in df_models_messages_features.iterrows():
    training_data_list.append(
        {
            "messages_full": row.get("messages_full"),
            "ticket_id": row.get("ticket_id"),
            "messages_player": row.get("messages_player_only"),
            "ticket_tags_with_description": row.get("ticket_tags_with_description"),
            "is_phishing": row.get("phishing_attempts_either"),
            "features": row.get("features_messages"),
            "ml_features": row.get("ml_features_messages"),
            "ml_result": row.get("ml_result_messages"),
            # The notes column built earlier in this notebook is `agent_notes_ticket`.
            # If the merged frame has no `agent_notes` column, fall back to it rather
            # than silently storing None for every ticket. An existing `agent_notes`
            # column (e.g. one supplied by the input CSVs) still takes precedence.
            # NOTE(review): confirm which notes column the training target should use.
            "agent_notes": row.get("agent_notes", row.get("agent_notes_ticket")),
        }
    )

lora_data = pd.DataFrame(training_data_list)
lora_data.shape


(76640, 9)

In [10]:
# Balance positives/negatives and split into train/test (for evaluation).

from sklearn.model_selection import train_test_split

# Single seed for the down-sampling, the shuffle and the split so the cell is reproducible.
SPLIT_SEED = 42
FINETUNING_DIR = Path("../data/finetuning")

true_df = lora_data[lora_data["is_phishing"] == True]
false_df = lora_data[lora_data["is_phishing"] == False]

# Down-sample the negatives to the number of positives, then shuffle the combined frame.
false_down = false_df.sample(n=len(true_df), random_state=SPLIT_SEED)
lora_data_balanced = (
    pd.concat([true_df, false_down]).sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
)

# Stratify so train and test keep the same 50/50 class ratio.
lora_train, lora_test = train_test_split(
    lora_data_balanced,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=lora_data_balanced["is_phishing"],
)
assert len(lora_train) + len(lora_test) == len(lora_data_balanced)

display(lora_train["is_phishing"].value_counts(dropna=False))
display(lora_test["is_phishing"].value_counts(dropna=False))

# Create the output directory first; `to_parquet` does not create missing parents.
FINETUNING_DIR.mkdir(parents=True, exist_ok=True)
lora_train.to_parquet(FINETUNING_DIR / "lora_train.parquet")
lora_test.to_parquet(FINETUNING_DIR / "lora_test.parquet")


is_phishing
True     1246
False    1246
Name: count, dtype: int64

is_phishing
True     312
False    312
Name: count, dtype: int64

In [None]:
from __future__ import annotations

from datetime import datetime
from pathlib import Path

# SAFE-BY-DEFAULT: this cell does NOT write any files unless you explicitly opt in.
WRITE_TRAINING_DATA = False
OVERWRITE_OUTPUTS = False
RUN_TAG = 'n_10'

system_prompt = Path("../data/verification/system_prompt.md").read_text()

ft_training_data_list: list[dict[str, str]] = []
for _, d in lora_train.iterrows():
    messages = d["messages_player"]
    features = d["features"]
    ml_features = d["ml_features"]
    is_phishing = d["is_phishing"]

    user_prompt = f"""
Chat log (player messages only):
---------------------------------------------------
\n{messages}\n\n
---------------------------------------------------
Compromised account analysis:
---------------------------------------------------
\n{features}\n\n
---------------------------------------------------

Machine learning features:
---------------------------------------------------
\n{ml_features}\n\n
---------------------------------------------------
"""

    ft_training_data_list.append(
        {
            "instruction": system_prompt,
            "input": user_prompt,
            # Keep the same output format your Tinker model expects.
            "output": f"is_phishing: {is_phishing}, agent_notes: {d.get('agent_notes')}",
        }
    )

print("Prepared examples:", len(ft_training_data_list))

# If you want to write, write to a VERSIONED path by default.
out_path = Path(f"../data/finetuning/training_data_{RUN_TAG}.json")

if WRITE_TRAINING_DATA:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if out_path.exists() and not OVERWRITE_OUTPUTS:
        raise FileExistsError(
            f"Refusing to overwrite existing file: {out_path}. Set OVERWRITE_OUTPUTS=True to overwrite."
        )

    import json

    out_path.write_text(json.dumps(ft_training_data_list, indent=2, ensure_ascii=False), encoding="utf-8")
    print("Wrote:", out_path)
else:
    print("WRITE_TRAINING_DATA is False; not writing any files.")

Prepared examples: 2492
WRITE_TRAINING_DATA is False; not writing any files.
