Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions ai_model/complete_ml_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@
test_connection
)
from operations.db_config import TABLES, DB_CONFIG
from ai_model.sensitive_feature_loader import (
load_excluded_ml_keys,
log_institution_ml_privacy_exclusions,
strip_excluded_features,
)

# Get the project root directory
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
Expand Down Expand Up @@ -230,6 +235,10 @@ def assign_credential_type(row):
enrollment_features + course_features + performance_features
)

_EXCLUDED_ML_KEYS = load_excluded_ml_keys()
log_institution_ml_privacy_exclusions(_EXCLUDED_ML_KEYS)
retention_features = strip_excluded_features(retention_features, _EXCLUDED_ML_KEYS)

print(f"Selected {len(retention_features)} features for modeling (reduced from 31 to prevent overfitting)")

# ============================================================================
Expand Down Expand Up @@ -736,6 +745,8 @@ def assign_alert_level(risk_score):
'Number_of_Credits_Earned_Year_1'
]

gateway_math_features = strip_excluded_features(gateway_math_features, _EXCLUDED_ML_KEYS)

print(f"\nUsing {len(gateway_math_features)} features (excluded gateway math features to prevent leakage)")

# Preprocess with clean feature set
Expand Down Expand Up @@ -847,6 +858,8 @@ def assign_alert_level(risk_score):
'Number_of_Credits_Earned_Year_1'
]

gateway_english_features = strip_excluded_features(gateway_english_features, _EXCLUDED_ML_KEYS)

print(f"\nUsing {len(gateway_english_features)} features (excluded gateway English features to prevent leakage)")

# Preprocess with clean feature set
Expand Down Expand Up @@ -962,6 +975,8 @@ def assign_alert_level(risk_score):
'CompletedGatewayMathYear1', 'CompletedGatewayEnglishYear1'
]

gpa_features = strip_excluded_features(gpa_features, _EXCLUDED_ML_KEYS)

print(f"\nUsing {len(gpa_features)} features (removed GPA-derived features)")
print("Removed: average_grade, GPA_Group_Year_1, course_completion_rate, total_credits_earned")

Expand Down
23 changes: 22 additions & 1 deletion ai_model/complete_ml_pipeline_csv_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,18 @@
from datetime import datetime
import warnings
import os
warnings.filterwarnings('ignore')
import sys

warnings.filterwarnings("ignore")

_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _REPO_ROOT not in sys.path:
sys.path.insert(0, _REPO_ROOT)
from ai_model.sensitive_feature_loader import (
load_excluded_ml_keys,
log_institution_ml_privacy_exclusions,
strip_excluded_features,
)

# Get the project root directory
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -198,6 +209,10 @@ def assign_credential_type(row):
enrollment_features + course_features + performance_features
)

_EXCLUDED_ML_KEYS = load_excluded_ml_keys()
log_institution_ml_privacy_exclusions(_EXCLUDED_ML_KEYS)
retention_features = strip_excluded_features(retention_features, _EXCLUDED_ML_KEYS)

print(f"Selected {len(retention_features)} features for modeling (reduced from 31 to prevent overfitting)")

# ============================================================================
Expand Down Expand Up @@ -676,6 +691,8 @@ def assign_alert_level(risk_score):
'Number_of_Credits_Earned_Year_1'
]

gateway_math_features = strip_excluded_features(gateway_math_features, _EXCLUDED_ML_KEYS)

print(f"\nUsing {len(gateway_math_features)} features (excluded gateway math features to prevent leakage)")

# Preprocess with clean feature set
Expand Down Expand Up @@ -778,6 +795,8 @@ def assign_alert_level(risk_score):
'Number_of_Credits_Earned_Year_1'
]

gateway_english_features = strip_excluded_features(gateway_english_features, _EXCLUDED_ML_KEYS)

print(f"\nUsing {len(gateway_english_features)} features (excluded gateway English features to prevent leakage)")

# Preprocess with clean feature set
Expand Down Expand Up @@ -884,6 +903,8 @@ def assign_alert_level(risk_score):
'CompletedGatewayMathYear1', 'CompletedGatewayEnglishYear1'
]

gpa_features = strip_excluded_features(gpa_features, _EXCLUDED_ML_KEYS)

print(f"\nUsing {len(gpa_features)} features (removed GPA-derived features)")
print("Removed: average_grade, GPA_Group_Year_1, course_completion_rate, total_credits_earned")

Expand Down
73 changes: 73 additions & 0 deletions ai_model/sensitive_feature_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""
Issue #109: load institution ML feature exclusions from Postgres and strip them
from sklearn feature lists before training / inference in complete_ml_pipeline.py
and complete_ml_pipeline_csv_only.py.
"""

from __future__ import annotations

import os
from typing import FrozenSet, List, Sequence

# Institution code passed as the `institution_code` filter when
# load_excluded_ml_keys() queries institution_sensitive_ml_settings.
DEFAULT_INSTITUTION = "bscc"


def log_institution_ml_privacy_exclusions(excluded: FrozenSet[str]) -> None:
    """Print the excluded feature keys in sorted order.

    Nothing is printed when ``excluded`` is empty.
    """
    if not excluded:
        return
    ordered_keys = sorted(excluded)
    print(f"\n(#109) Institution ML privacy: excluding features {ordered_keys}")


def load_excluded_ml_keys() -> FrozenSet[str]:
    """
    Read excluded_ml_feature_keys for the default institution.

    Connection settings are taken from the DB_HOST, DB_USER, DB_PASSWORD,
    DB_NAME and DB_PORT environment variables, each with a fallback default.

    Returns:
        The excluded feature keys as a frozenset of strings. An empty
        frozenset is returned when psycopg2 is not installed, DB_PORT is not
        an integer, the connection or the query fails, no row matches the
        institution, or the stored value is NULL or not a list. Non-string
        entries in the stored list are dropped.

    Note:
        Every failure path yields an empty set, so the caller proceeds with
        no exclusions. Connection, query and configuration failures are
        printed so they are visible in the pipeline output.
    """
    try:
        import psycopg2  # noqa: PLC0415
    except ImportError:
        return frozenset()

    host = os.environ.get("DB_HOST", "127.0.0.1")
    user = os.environ.get("DB_USER", "postgres")
    password = os.environ.get("DB_PASSWORD", "postgres")
    dbname = os.environ.get("DB_NAME", "postgres")

    # A malformed DB_PORT must not crash the caller: this function is invoked
    # at module level by the pipelines and promises an empty set on failure.
    raw_port = os.environ.get("DB_PORT", "54332")
    try:
        port = int(raw_port)
    except ValueError:
        print(f"(#109) Invalid DB_PORT {raw_port!r}; skipping sensitive ML settings")
        return frozenset()

    try:
        conn = psycopg2.connect(
            host=host, user=user, password=password, dbname=dbname, port=port
        )
    except Exception as exc:  # noqa: BLE001
        print(f"(#109) Could not connect for sensitive ML settings: {exc}")
        return frozenset()

    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT excluded_ml_feature_keys
                FROM institution_sensitive_ml_settings
                WHERE institution_code = %s
                LIMIT 1
                """,
                (DEFAULT_INSTITUTION,),
            )
            row = cur.fetchone()
        if not row or row[0] is None:
            return frozenset()
        keys = row[0]
        if not isinstance(keys, list):
            return frozenset()
        # Only string entries are usable as feature names.
        return frozenset(k for k in keys if isinstance(k, str))
    except Exception as exc:  # noqa: BLE001
        print(f"(#109) Could not read institution_sensitive_ml_settings: {exc}")
        return frozenset()
    finally:
        # Always release the connection, including on the early returns above.
        conn.close()


def strip_excluded_features(
    features: Sequence[str], excluded: FrozenSet[str]
) -> List[str]:
    """Return a new list of ``features`` without any name found in ``excluded``.

    The original order is preserved. When ``excluded`` is empty the result is
    a plain copy of ``features``.
    """
    kept = list(features)
    if excluded:
        kept = [name for name in kept if name not in excluded]
    return kept
201 changes: 201 additions & 0 deletions codebenders-dashboard/app/admin/sensitive-ml/page.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
"use client"

import { useCallback, useEffect, useState } from "react"
import Link from "next/link"
import { ArrowLeft, Loader2, Shield } from "lucide-react"
import { Button } from "@/components/ui/button"
import { Switch } from "@/components/ui/switch"
import { Label } from "@/components/ui/label"
import { Input } from "@/components/ui/input"
import type { SensitiveMlFeatureMeta } from "@/lib/sensitive-population"

/**
 * JSON body the page expects from GET and PATCH on
 * /api/admin/sensitive-ml-settings (responses are cast to this type).
 */
type SettingsPayload = {
institutionCode: string
// Keys currently excluded; used to seed the `excluded` set in page state.
excludedMlFeatureKeys: string[]
// Used to seed the low-sample threshold input.
lowSampleThreshold: number
// Rendered through `new Date(...)` as "Last updated"; hidden when null.
updatedAt: string | null
// One toggle row is rendered per catalog entry (label, description, mlKey).
catalog: SensitiveMlFeatureMeta[]
}

// Bounds for the low-sample threshold; also applied to the input's min/max.
const MIN_THRESHOLD = 1
const MAX_THRESHOLD = 50000

/** Read a response body as JSON, yielding null when the body is not valid JSON. */
async function readJsonBody(res: Response) {
  try {
    return await res.json()
  } catch {
    return null
  }
}

/**
 * Convert raw input text into a whole number within
 * [MIN_THRESHOLD, MAX_THRESHOLD]. Empty or non-numeric text maps to
 * MIN_THRESHOLD.
 */
function clampThreshold(raw: string): number {
  const parsed = Math.trunc(Number(raw))
  if (!Number.isFinite(parsed)) return MIN_THRESHOLD
  return Math.min(MAX_THRESHOLD, Math.max(MIN_THRESHOLD, parsed))
}

/**
 * Admin page for ML privacy settings (Issue #109).
 *
 * Loads the settings from /api/admin/sensitive-ml-settings on mount, lets the
 * user toggle which catalog features are excluded from ML and edit the
 * low-sample warning threshold, and saves both with a PATCH to the same
 * endpoint. The response of a successful save replaces the local state.
 */
export default function SensitiveMlSettingsPage() {
  const [data, setData] = useState<SettingsPayload | null>(null)
  const [excluded, setExcluded] = useState<Set<string>>(new Set())
  const [threshold, setThreshold] = useState(30)
  const [loading, setLoading] = useState(true)
  const [saving, setSaving] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const [saveNote, setSaveNote] = useState<string | null>(null)

  // Copy a server payload into the editable pieces of state.
  const hydrateFromPayload = useCallback((payload: SettingsPayload) => {
    setData(payload)
    setExcluded(new Set(payload.excludedMlFeatureKeys))
    setThreshold(payload.lowSampleThreshold)
  }, [])

  const load = useCallback(async () => {
    setLoading(true)
    setError(null)
    try {
      const res = await fetch("/api/admin/sensitive-ml-settings")
      // Tolerate non-JSON bodies (e.g. an HTML error page) so the user sees
      // the fallback message instead of a JSON parse error.
      const json = await readJsonBody(res)
      if (!res.ok || !json) throw new Error(json?.error || "Failed to load")
      hydrateFromPayload(json as SettingsPayload)
    } catch (e) {
      setData(null)
      setError(e instanceof Error ? e.message : "Failed to load")
    } finally {
      setLoading(false)
    }
  }, [hydrateFromPayload])

  useEffect(() => {
    void load()
  }, [load])

  async function save() {
    setSaving(true)
    setSaveNote(null)
    setError(null)
    try {
      const res = await fetch("/api/admin/sensitive-ml-settings", {
        method: "PATCH",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          excludedMlFeatureKeys: [...excluded],
          lowSampleThreshold: threshold,
        }),
      })
      const json = await readJsonBody(res)
      if (!res.ok || !json) throw new Error(json?.error || "Save failed")
      hydrateFromPayload(json as SettingsPayload)
      setSaveNote("Saved. Re-run the Python ML pipeline so new exclusions apply to training and batch scores.")
    } catch (e) {
      setError(e instanceof Error ? e.message : "Save failed")
    } finally {
      setSaving(false)
    }
  }

  // `off` is the switch state: true means the feature is excluded from ML.
  function toggleKey(mlKey: string, off: boolean) {
    setExcluded((prev) => {
      const next = new Set(prev)
      if (off) next.add(mlKey)
      else next.delete(mlKey)
      return next
    })
  }

  return (
    <div className="min-h-screen bg-background">
      <div className="container mx-auto px-4 py-8 max-w-2xl">
        <div className="flex items-center gap-4 mb-6">
          <Link href="/admin/upload">
            <Button variant="ghost" size="sm" className="gap-1">
              <ArrowLeft className="h-4 w-4" />
              Admin
            </Button>
          </Link>
          <div>
            <h1 className="text-2xl font-bold tracking-tight flex items-center gap-2">
              <Shield className="h-6 w-6 text-muted-foreground" />
              ML privacy & sensitive fields
            </h1>
            <p className="text-sm text-muted-foreground mt-0.5">
              Issue #109 — exclude demographic / aid inputs from ML training; low-sample warnings for dashboards and NLQ.
            </p>
          </div>
        </div>

        {loading && (
          <div className="flex items-center gap-2 text-muted-foreground py-12">
            <Loader2 className="h-4 w-4 animate-spin" />
            Loading settings…
          </div>
        )}

        {error && !loading && (
          <div className="border border-destructive/40 bg-destructive/10 text-destructive rounded-md p-4 text-sm mb-4">
            {error}
          </div>
        )}

        {data && !loading && (
          <div className="space-y-8">
            <section className="border rounded-lg p-4 space-y-4">
              <h2 className="font-semibold text-sm">Features excluded from ML</h2>
              <p className="text-xs text-muted-foreground leading-relaxed">
                When enabled, the checked fields are removed from all Bishop ML feature sets before training and
                inference in <code className="text-xs bg-muted px-1 rounded">complete_ml_pipeline.py</code>. They
                remain in the database for reporting if present in source data.
              </p>
              <div className="space-y-4">
                {data.catalog.map((item) => (
                  <div
                    key={item.mlKey}
                    className="flex items-start justify-between gap-4 border-b border-border/60 pb-3 last:border-0 last:pb-0"
                  >
                    <div className="min-w-0">
                      <p className="font-medium text-sm">{item.label}</p>
                      <p className="text-xs text-muted-foreground mt-0.5">{item.description}</p>
                      <p className="text-xs font-mono text-muted-foreground mt-1">{item.mlKey}</p>
                    </div>
                    <div className="flex items-center gap-2 shrink-0">
                      <Label htmlFor={`ex-${item.mlKey}`} className="text-xs text-muted-foreground whitespace-nowrap">
                        Exclude
                      </Label>
                      <Switch
                        id={`ex-${item.mlKey}`}
                        checked={excluded.has(item.mlKey)}
                        onCheckedChange={(v) => toggleKey(item.mlKey, v)}
                      />
                    </div>
                  </div>
                ))}
              </div>
            </section>

            <section className="border rounded-lg p-4 space-y-3">
              <h2 className="font-semibold text-sm">Low-sample warning threshold</h2>
              <p className="text-xs text-muted-foreground">
                Dashboard KPIs and natural-language query results show a caution when the visible student count is below
                this number (after filters).
              </p>
              <Input
                type="number"
                min={MIN_THRESHOLD}
                max={MAX_THRESHOLD}
                className="max-w-[120px]"
                value={threshold}
                // min/max do not constrain typed values held in state, so
                // clamp here to keep the PATCH payload within range.
                onChange={(e) => setThreshold(clampThreshold(e.target.value))}
              />
            </section>

            {saveNote && (
              <div className="text-sm text-emerald-700 dark:text-emerald-400 border border-emerald-200 dark:border-emerald-800 rounded-md p-3 bg-emerald-50/80 dark:bg-emerald-950/30">
                {saveNote}
              </div>
            )}

            <div className="flex items-center gap-2">
              <Button type="button" onClick={() => void save()} disabled={saving}>
                {saving ? (
                  <>
                    <Loader2 className="h-4 w-4 animate-spin mr-2" />
                    Saving…
                  </>
                ) : (
                  "Save settings"
                )}
              </Button>
              {data.updatedAt && (
                <span className="text-xs text-muted-foreground">
                  Last updated {new Date(data.updatedAt).toLocaleString()}
                </span>
              )}
            </div>
          </div>
        )}
      </div>
    </div>
  )
}
Loading
Loading