<a href="https://colab.research.google.com/github/crignot/patent-law-predictive-dashboard/blob/main/patent_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Predictive Patent Application Dashboard — Model Training (Public Version)
# Data source: USPTO Office Action Research Dataset (links in README)
# Note: Raw dataset not included. This notebook shows the preprocessing, feature engineering, and model evaluation/training pipeline.


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
import joblib

In [None]:
# Read the local office-action extract. The raw CSV (office_actions.csv) is
# not shipped with this notebook; NROWS caps how many rows are read from it.
DATA_PATH = "office_actions.csv"
NROWS = 1_000_000

df = pd.read_csv(filepath_or_buffer=DATA_PATH, nrows=NROWS)

print("Raw:", df.shape)
print("\nColumns:\n", df.columns)
df.head(n=3)


In [None]:
# sample + select relevant columns
#
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at len(df). This keeps
# the cell runnable on smaller extracts while leaving the result unchanged
# whenever at least SAMPLE_SIZE rows were loaded.
SAMPLE_SIZE = 500_000

office = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Columns carried forward into cleaning / feature engineering.
keep_cols = [
    'app_id', 'mail_dt', 'art_unit',
    'rejection_101', 'rejection_102', 'rejection_103', 'rejection_112',
    'objection',
    'cite102_gt1', 'cite103_gt3', 'cite103_eq1', 'cite103_max',
    'allowed_claims'
]
office = office[keep_cols].copy()
print("Sampled:", office.shape)
office.head(3)

In [None]:
# Keep last office action per application
#
# 'mail_dt' has not been parsed yet at this point, so sorting the raw column
# orders values as plain strings/objects rather than as dates. Sort on the
# parsed timestamp instead so "last" means chronologically last.
#   - na_position='first': rows whose date cannot be parsed sort to the front,
#     so they never win keep='last' over a row with a real date.
#   - kind='stable': rows sharing the same date keep their incoming order,
#     which makes the dedupe result deterministic.
office = office.sort_values(
    'mail_dt',
    key=lambda s: pd.to_datetime(s, errors='coerce'),
    na_position='first',
    kind='stable',
)
office = office.drop_duplicates('app_id', keep='last')
print("After dedupe:", office.shape, "| unique app_id:", office['app_id'].nunique())


In [None]:
# Cleaning + renaming
# Map the raw snake_case column names to the names used by the rest of the
# notebook; 'allowed_claims' becomes the binary target column.
rename_map = {
    'app_id': 'ApplicationID',
    'mail_dt': 'MailDate',
    'art_unit': 'ArtUnit',
    'rejection_101': 'Rejection101',
    'rejection_102': 'Rejection102',
    'rejection_103': 'Rejection103',
    'rejection_112': 'Rejection112',
    'objection': 'Objection',
    'cite102_gt1': 'Cite102_GT1',
    'cite103_gt3': 'Cite103_GT3',
    'cite103_eq1': 'Cite103_EQ1',
    'cite103_max': 'Cite103_Max',
    'allowed_claims': 'OutcomeBinary',
}

# assign() evaluates its keyword arguments in order, so each lambda sees the
# columns produced by the ones before it (Year reads the parsed MailDate,
# Outcome reads the integer OutcomeBinary).
office = office.rename(columns=rename_map).assign(
    MailDate=lambda d: pd.to_datetime(d['MailDate'], errors='coerce'),
    Year=lambda d: d['MailDate'].dt.year,
    # Missing outcome values are treated as 0 before the integer cast.
    OutcomeBinary=lambda d: d['OutcomeBinary'].fillna(0).astype(int),
    Outcome=lambda d: np.where(d['OutcomeBinary'] > 0, "Allowed", "Not Allowed"),
    # Non-numeric art units become NaN and are dropped just below.
    ArtUnit=lambda d: pd.to_numeric(d['ArtUnit'], errors='coerce'),
)

# Drop rows without a usable art unit or year, then keep art units inside
# the inclusive 1600-3799 range.
office = office.dropna(subset=['ArtUnit', 'Year'])
office = office[office['ArtUnit'].between(1600, 3799)].copy()

print("Cleaned:", office.shape)
office[['ApplicationID', 'Year', 'ArtUnit', 'OutcomeBinary']].head(3)


In [None]:
# Tech Center mapping
# Lookup from the hundreds bucket of an art unit to a human-readable label.
tc_labels = {
    0: "Unknown / Invalid",
    1600: "Biotech & Organic Chemistry",
    1700: "Chemical & Materials Engineering",
    2100: "Computer Architecture & Software",
    2400: "Networking & Communications",
    2600: "Cryptography & Security",
    2800: "Semiconductors & Electrical Systems",
    2900: "Design Patents",
    3600: "Business Methods & Finance",
    3700: "Mechanical Engineering",
    3900: "Administrative / Special Programs",
}

# Round each art unit down to its hundreds bucket (e.g. 2443 -> 2400), then
# translate the bucket to its label; buckets absent from tc_labels fall back
# to "Unknown / Missing".
tech_center_code = office['ArtUnit'].floordiv(100).mul(100)
office['TechCenter'] = tech_center_code.map(tc_labels).fillna("Unknown / Missing")

office[['ArtUnit', 'TechCenter']].head(3)


In [None]:
# Prior art score
def compute_prior_art_score(row) -> int:
    """Return the additive prior-art score for one record.

    ``row`` is anything indexable by column name (a pandas row, a dict, ...).
    Each citation flag that equals 1 contributes a fixed number of points:
    ``Cite103_EQ1`` -> 1, ``Cite103_GT3`` -> 3, ``Cite103_Max`` -> 5.
    Flags holding any other value (0, NaN, ...) contribute nothing, so the
    result lies between 0 and 9.
    """
    flag_points = (
        ("Cite103_EQ1", 1),
        ("Cite103_GT3", 3),
        ("Cite103_Max", 5),
    )
    return sum(points for flag, points in flag_points if row[flag] == 1)

# Vectorized form of compute_prior_art_score: a flag contributes its weight
# only when it equals 1 (NaN and other values compare False and add 0).
# This replaces a row-wise DataFrame.apply(axis=1), which runs one Python
# call per row and is slow on a sample of this size; the column-wise form
# yields the same integer scores and is also well-defined on an empty frame.
office["PriorArtScore"] = (
    1 * office["Cite103_EQ1"].eq(1)
    + 3 * office["Cite103_GT3"].eq(1)
    + 5 * office["Cite103_Max"].eq(1)
).astype(int)
office["PriorArtScore"].value_counts().sort_index()


In [None]:
# Model dataframe
# Identifier, raw date and the text outcome label are not model inputs.
non_feature_cols = ['ApplicationID', 'MailDate', 'Outcome']

# One-hot encode TechCenter as 0/1 integer columns, keeping every level
# (drop_first=False) so each tech center gets its own column.
office_model = pd.get_dummies(
    office.drop(columns=non_feature_cols),
    columns=['TechCenter'],
    drop_first=False,
    dtype=int,
)

print("Model df:", office_model.shape)
office_model.head(3)


In [None]:
# Fixed feature list (for training + Streamlit compatibility)
# The order below is the column order the model is trained on.
MODEL_COLUMNS = [
    "Year",
    "ArtUnit",
    "Rejection101", "Rejection102", "Rejection103", "Rejection112", "Objection",
    "Cite102_GT1", "Cite103_GT3", "Cite103_EQ1", "Cite103_Max",
    "PriorArtScore",
] + [
    f"TechCenter_{label}"
    for label in (
        "Biotech & Organic Chemistry",
        "Business Methods & Finance",
        "Chemical & Materials Engineering",
        "Computer Architecture & Software",
        "Cryptography & Security",
        "Design Patents",
        "Mechanical Engineering",
        "Networking & Communications",
        "Semiconductors & Electrical Systems",
    )
]

# A TechCenter absent from the sample produces no dummy column; add any
# expected-but-missing column as all zeros so the feature matrix is stable.
existing_cols = set(office_model.columns)
for col in MODEL_COLUMNS:
    if col in existing_cols:
        continue
    office_model[col] = 0

# Feature matrix (model columns only, in fixed order) and binary target.
X = office_model.loc[:, MODEL_COLUMNS].astype(int)
y = office_model["OutcomeBinary"].astype(int)

print("Target distribution:\n", y.value_counts(normalize=True).rename("pct"))


In [None]:
# Address class imbalance -> SMOTE
# NOTE(review): X_res / y_res are resampled from the *entire* dataset. Any
# hold-out set carved out of them contains synthetic rows, so metrics
# computed on such a split will look better than they would on real data;
# prefer oversampling the training fold only.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Shapes before and after oversampling, then the resampled class balance.
for _tag, _features, _target in (("Original:", X, y), ("Resampled:", X_res, y_res)):
    print(_tag, _features.shape, _target.shape)
print("Resampled target:\n", y_res.value_counts(normalize=True).rename("pct"))


In [None]:
# Train/Evaluate -> XGBoost
#
# Split the ORIGINAL (un-resampled) X, y first and oversample only the
# training fold. Splitting data that has already been through SMOTE leaks
# information: synthetic rows are interpolated between real neighbours, so
# the test set ends up holding near-copies of training rows (and synthetic
# rows itself), which inflates every reported metric. X_res / y_res from the
# resampling cell are therefore intentionally not used here.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance classes on the training fold only; the test fold keeps the real
# class distribution so the report below reflects real-world performance.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# Hard class predictions and the predicted probability of the positive class.
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, pred))
print("Confusion matrix:\n", confusion_matrix(y_test, pred))


In [None]:
# Slice-based evaluation (example: performance by TechCenter and by PriorArtScore)
# Attach ground truth, predicted class and predicted probability to the
# test features so results can be grouped by any feature column.
eval_df = X_test.copy()
eval_df["y_true"] = y_test.values
eval_df["y_pred"] = pred
eval_df["p_allowed"] = proba

# Reconstruct TechCenter label for slices (from one-hot columns)
tc_cols = [c for c in MODEL_COLUMNS if c.startswith("TechCenter_")]
tc_flags = eval_df[tc_cols]

# idxmax returns the FIRST column when every flag in a row is 0, which would
# silently file rows with no matching TechCenter column under the first
# tech center. Detect those rows and label them explicitly instead.
has_known_tc = tc_flags.gt(0).any(axis=1)
eval_df["TechCenter"] = (
    tc_flags.idxmax(axis=1)
    .str.replace("TechCenter_", "", regex=False)
    .where(has_known_tc, "Unknown / Missing")
)

def slice_report(group_col: str, top_k: int = 8, min_n: int = 200, data=None):
    """Summarize prediction quality for each value of ``group_col``.

    Parameters
    ----------
    group_col : str
        Column to group by.
    top_k : int, default 8
        Maximum number of slices to return (largest slices first).
    min_n : int, default 200
        Slices with fewer rows than this are skipped as too small to report.
    data : pandas.DataFrame, optional
        Frame to evaluate; it must contain ``group_col``, ``y_true``,
        ``y_pred`` and ``p_allowed``. Defaults to the notebook-level
        ``eval_df`` when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per retained slice with columns ``group_col``, ``n`` (row
        count), ``accuracy`` (share of rows where y_true == y_pred) and
        ``avg_pred_prob_allowed`` (mean of ``p_allowed``), sorted by ``n``
        descending and truncated to ``top_k`` rows. Empty if no slice
        reaches ``min_n`` rows.
    """
    # Fall back to the notebook global only when no frame is passed in.
    frame = eval_df if data is None else data

    rows = []
    for g, sub in frame.groupby(group_col):
        if len(sub) < min_n:  # skip tiny slices
            continue
        acc = (sub["y_true"] == sub["y_pred"]).mean()
        rows.append((g, len(sub), acc, sub["p_allowed"].mean()))
    out = pd.DataFrame(rows, columns=[group_col, "n", "accuracy", "avg_pred_prob_allowed"])
    return out.sort_values("n", ascending=False).head(top_k)

# Show one slice table per grouping column, each under its own heading.
for _heading, _group_col in (
    ("By TechCenter:", "TechCenter"),
    ("\nBy PriorArtScore:", "PriorArtScore"),
):
    print(_heading)
    display(slice_report(_group_col))


In [None]:
# Persist the trained model to disk. The Streamlit app loads the artifact by
# this filename, so keep MODEL_OUT unchanged unless the app is updated too.
MODEL_OUT = "final_patent_model.pkl"
joblib.dump(value=model, filename=MODEL_OUT)

# Echo the artifact path and the feature names the fitted model recorded.
print("Saved:", MODEL_OUT)
print("Model features:", list(model.feature_names_in_))
