diff --git a/ai_model/complete_ml_pipeline.py b/ai_model/complete_ml_pipeline.py index acb1158..c2f9300 100644 --- a/ai_model/complete_ml_pipeline.py +++ b/ai_model/complete_ml_pipeline.py @@ -38,6 +38,11 @@ test_connection ) from operations.db_config import TABLES, DB_CONFIG +from ai_model.sensitive_feature_loader import ( + load_excluded_ml_keys, + log_institution_ml_privacy_exclusions, + strip_excluded_features, +) # Get the project root directory PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -230,6 +235,10 @@ def assign_credential_type(row): enrollment_features + course_features + performance_features ) +_EXCLUDED_ML_KEYS = load_excluded_ml_keys() +log_institution_ml_privacy_exclusions(_EXCLUDED_ML_KEYS) +retention_features = strip_excluded_features(retention_features, _EXCLUDED_ML_KEYS) + print(f"Selected {len(retention_features)} features for modeling (reduced from 31 to prevent overfitting)") # ============================================================================ @@ -736,6 +745,8 @@ def assign_alert_level(risk_score): 'Number_of_Credits_Earned_Year_1' ] +gateway_math_features = strip_excluded_features(gateway_math_features, _EXCLUDED_ML_KEYS) + print(f"\nUsing {len(gateway_math_features)} features (excluded gateway math features to prevent leakage)") # Preprocess with clean feature set @@ -847,6 +858,8 @@ def assign_alert_level(risk_score): 'Number_of_Credits_Earned_Year_1' ] +gateway_english_features = strip_excluded_features(gateway_english_features, _EXCLUDED_ML_KEYS) + print(f"\nUsing {len(gateway_english_features)} features (excluded gateway English features to prevent leakage)") # Preprocess with clean feature set @@ -962,6 +975,8 @@ def assign_alert_level(risk_score): 'CompletedGatewayMathYear1', 'CompletedGatewayEnglishYear1' ] +gpa_features = strip_excluded_features(gpa_features, _EXCLUDED_ML_KEYS) + print(f"\nUsing {len(gpa_features)} features (removed GPA-derived features)") print("Removed: average_grade, 
GPA_Group_Year_1, course_completion_rate, total_credits_earned") diff --git a/ai_model/complete_ml_pipeline_csv_only.py b/ai_model/complete_ml_pipeline_csv_only.py index 303f97a..ed38a52 100644 --- a/ai_model/complete_ml_pipeline_csv_only.py +++ b/ai_model/complete_ml_pipeline_csv_only.py @@ -27,7 +27,18 @@ from datetime import datetime import warnings import os -warnings.filterwarnings('ignore') +import sys + +warnings.filterwarnings("ignore") + +_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) +from ai_model.sensitive_feature_loader import ( + load_excluded_ml_keys, + log_institution_ml_privacy_exclusions, + strip_excluded_features, +) # Get the project root directory PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__)) @@ -198,6 +209,10 @@ def assign_credential_type(row): enrollment_features + course_features + performance_features ) +_EXCLUDED_ML_KEYS = load_excluded_ml_keys() +log_institution_ml_privacy_exclusions(_EXCLUDED_ML_KEYS) +retention_features = strip_excluded_features(retention_features, _EXCLUDED_ML_KEYS) + print(f"Selected {len(retention_features)} features for modeling (reduced from 31 to prevent overfitting)") # ============================================================================ @@ -676,6 +691,8 @@ def assign_alert_level(risk_score): 'Number_of_Credits_Earned_Year_1' ] +gateway_math_features = strip_excluded_features(gateway_math_features, _EXCLUDED_ML_KEYS) + print(f"\nUsing {len(gateway_math_features)} features (excluded gateway math features to prevent leakage)") # Preprocess with clean feature set @@ -778,6 +795,8 @@ def assign_alert_level(risk_score): 'Number_of_Credits_Earned_Year_1' ] +gateway_english_features = strip_excluded_features(gateway_english_features, _EXCLUDED_ML_KEYS) + print(f"\nUsing {len(gateway_english_features)} features (excluded gateway English features to prevent leakage)") # Preprocess with clean feature set @@ -884,6 
"""
Issue #109: load institution ML feature exclusions from Postgres and strip them
from sklearn feature lists before training / inference in complete_ml_pipeline.py.
"""

from __future__ import annotations

import os
from typing import FrozenSet, List, Sequence

DEFAULT_INSTITUTION = "bscc"


def log_institution_ml_privacy_exclusions(excluded: FrozenSet[str]) -> None:
    """Print the sorted exclusion list; print nothing when ``excluded`` is empty."""
    if excluded:
        print(f"\n(#109) Institution ML privacy: excluding features {sorted(excluded)}")


def load_excluded_ml_keys(institution: str = DEFAULT_INSTITUTION) -> FrozenSet[str]:
    """
    Read ``excluded_ml_feature_keys`` for one institution.

    Connection settings come from DB_HOST / DB_USER / DB_PASSWORD / DB_NAME /
    DB_PORT, each with a hard-coded fallback used when the variable is unset.

    Args:
        institution: ``institution_code`` to look up; defaults to
            ``DEFAULT_INSTITUTION`` so existing zero-argument callers are
            unaffected.

    Returns:
        The string entries of the stored array as a frozenset. An empty
        frozenset is returned when psycopg2 is not installed, DB_PORT is not
        an integer, the connection or query fails, the row is missing, or the
        column value is NULL or not a list.

    NOTE(review): every failure path is fail-open -- an empty result means
    ``strip_excluded_features`` removes nothing. Each abnormal path therefore
    prints a warning so a run without the configured exclusions is visible in
    the pipeline output.
    """
    try:
        import psycopg2  # noqa: PLC0415
    except ImportError:
        # Previously silent; say so, because the run proceeds with no exclusions.
        print("(#109) psycopg2 is not installed; no institution ML exclusions loaded")
        return frozenset()

    port_text = os.environ.get("DB_PORT", "54332")
    try:
        port = int(port_text)
    except ValueError:
        # A malformed DB_PORT used to raise here, outside any handler, which
        # contradicted the best-effort contract described above.
        print(f"(#109) DB_PORT {port_text!r} is not an integer; no institution ML exclusions loaded")
        return frozenset()

    try:
        conn = psycopg2.connect(
            host=os.environ.get("DB_HOST", "127.0.0.1"),
            user=os.environ.get("DB_USER", "postgres"),
            password=os.environ.get("DB_PASSWORD", "postgres"),
            dbname=os.environ.get("DB_NAME", "postgres"),
            port=port,
        )
    except Exception as exc:  # noqa: BLE001
        print(f"(#109) Could not connect for sensitive ML settings: {exc}")
        return frozenset()

    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT excluded_ml_feature_keys
                FROM institution_sensitive_ml_settings
                WHERE institution_code = %s
                LIMIT 1
                """,
                (institution,),
            )
            row = cur.fetchone()
    except Exception as exc:  # noqa: BLE001
        print(f"(#109) Could not read institution_sensitive_ml_settings: {exc}")
        return frozenset()
    finally:
        # Runs on success and on failure, so the connection is never leaked.
        conn.close()

    if not row or row[0] is None:
        return frozenset()
    keys = row[0]
    if not isinstance(keys, list):
        print(
            "(#109) excluded_ml_feature_keys has unexpected type "
            f"{type(keys).__name__}; no institution ML exclusions loaded"
        )
        return frozenset()
    # Non-string entries are dropped rather than coerced.
    return frozenset(k for k in keys if isinstance(k, str))


def strip_excluded_features(
    features: Sequence[str], excluded: FrozenSet[str]
) -> List[str]:
    """
    Return ``features`` without the entries present in ``excluded``.

    Order and duplicates of the remaining entries are preserved, and a new
    list is always returned, so the caller's sequence is never mutated.
    """
    if not excluded:
        return list(features)
    return [f for f in features if f not in excluded]
SensitiveMlFeatureMeta } from "@/lib/sensitive-population" + +type SettingsPayload = { + institutionCode: string + excludedMlFeatureKeys: string[] + lowSampleThreshold: number + updatedAt: string | null + catalog: SensitiveMlFeatureMeta[] +} + +export default function SensitiveMlSettingsPage() { + const [data, setData] = useState(null) + const [excluded, setExcluded] = useState>(new Set()) + const [threshold, setThreshold] = useState(30) + const [loading, setLoading] = useState(true) + const [saving, setSaving] = useState(false) + const [error, setError] = useState(null) + const [saveNote, setSaveNote] = useState(null) + + const hydrateFromPayload = useCallback((payload: SettingsPayload) => { + setData(payload) + setExcluded(new Set(payload.excludedMlFeatureKeys)) + setThreshold(payload.lowSampleThreshold) + }, []) + + const load = useCallback(async () => { + setLoading(true) + setError(null) + try { + const res = await fetch("/api/admin/sensitive-ml-settings") + const json = await res.json() + if (!res.ok) throw new Error(json.error || "Failed to load") + hydrateFromPayload(json as SettingsPayload) + } catch (e) { + setData(null) + setError(e instanceof Error ? e.message : "Failed to load") + } finally { + setLoading(false) + } + }, [hydrateFromPayload]) + + useEffect(() => { + void load() + }, [load]) + + async function save() { + setSaving(true) + setSaveNote(null) + setError(null) + try { + const res = await fetch("/api/admin/sensitive-ml-settings", { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + excludedMlFeatureKeys: [...excluded], + lowSampleThreshold: threshold, + }), + }) + const json = await res.json() + if (!res.ok) throw new Error(json.error || "Save failed") + hydrateFromPayload(json as SettingsPayload) + setSaveNote("Saved. Re-run the Python ML pipeline so new exclusions apply to training and batch scores.") + } catch (e) { + setError(e instanceof Error ? 
e.message : "Save failed") + } finally { + setSaving(false) + } + } + + function toggleKey(mlKey: string, off: boolean) { + setExcluded((prev) => { + const next = new Set(prev) + if (off) next.add(mlKey) + else next.delete(mlKey) + return next + }) + } + + return ( +
+
+
+ + + +
+

+ + ML privacy & sensitive fields +

+

+ Issue #109 — exclude demographic / aid inputs from ML training; low-sample warnings for dashboards and NLQ. +

+
+
+ + {loading && ( +
+ + Loading settings… +
+ )} + + {error && !loading && ( +
+ {error} +
+ )} + + {data && !loading && ( +
+
+

Features excluded from ML

+

+ When enabled, the checked fields are removed from all Bishop ML feature sets before training and + inference in complete_ml_pipeline.py. They + remain in the database for reporting if present in source data. +

+
+ {data.catalog.map((item) => ( +
+
+

{item.label}

+

{item.description}

+

{item.mlKey}

+
+
+ + toggleKey(item.mlKey, v)} + /> +
+
+ ))} +
+
+ +
+

Low-sample warning threshold

+

+ Dashboard KPIs and natural-language query results show a caution when the visible student count is below + this number (after filters). +

+ setThreshold(Number(e.target.value) || 1)} + /> +
+ + {saveNote && ( +
+ {saveNote} +
+ )} + +
+ + {data.updatedAt && ( + + Last updated {new Date(data.updatedAt).toLocaleString()} + + )} +
+
+ )} +
+
+ ) +} diff --git a/codebenders-dashboard/app/ai-transparency/page.tsx b/codebenders-dashboard/app/ai-transparency/page.tsx index 1ad998c..584d717 100644 --- a/codebenders-dashboard/app/ai-transparency/page.tsx +++ b/codebenders-dashboard/app/ai-transparency/page.tsx @@ -268,6 +268,33 @@ export default function AITransparencyPage() { ) })} + + + Sensitive population safeguards (#109) + + Institutional controls for demographic / aid-related model inputs, contextual warnings, and auditability. + + + +

+ ML feature exclusions. Institution admins can exclude + specific fields (e.g., race, ethnicity, Pell status) from all Bishop ML training and batch inference via{" "} + + ML privacy settings + + . Exclusions are stored in Postgres and read by{" "} + complete_ml_pipeline.py; predictions in the database + should be refreshed after changes. +

+

+ Context warnings. The home dashboard and natural-language + query interface show cautions when the active student count is below a configurable threshold or when SQL + references sensitive columns. Query runs that trigger these checks are recorded in the server-side audit + log with extra CSV columns for compliance review (admin / IR export). +

+
+
+ {/* Footer / contact */}

import { type NextRequest, NextResponse } from "next/server"
import { getPool } from "@/lib/db"
import { canAccess, type Role } from "@/lib/roles"
import { SENSITIVE_ML_FEATURE_CATALOG, normalizeExcludedKeys } from "@/lib/sensitive-population"
import {
  DEFAULT_INSTITUTION,
  fetchSensitiveMlSettings,
  type InstitutionSensitiveMlSettings,
} from "@/lib/sensitive-ml-settings-db"

/** Response shape shared by GET and PATCH: the stored settings plus the static feature catalog. */
function settingsResponseBody(settings: InstitutionSensitiveMlSettings) {
  return {
    institutionCode: settings.institutionCode,
    excludedMlFeatureKeys: settings.excludedMlFeatureKeys,
    lowSampleThreshold: settings.lowSampleThreshold,
    updatedAt: settings.updatedAt,
    catalog: SENSITIVE_ML_FEATURE_CATALOG,
  }
}

/**
 * Returns the caller's role when `canAccess` allows it on this route, otherwise null.
 * The role is read from the `x-user-role` request header (presumably set by upstream
 * auth middleware — TODO confirm).
 */
function requireAdminIr(request: NextRequest): Role | null {
  const role = request.headers.get("x-user-role") as Role | null
  if (!role || !canAccess("/api/admin/sensitive-ml-settings", role)) return null
  return role
}

/** GET: return the current settings to any role permitted on this route. */
export async function GET(request: NextRequest) {
  const role = requireAdminIr(request)
  if (!role) {
    return NextResponse.json({ error: "Forbidden" }, { status: 403 })
  }

  try {
    const pool = getPool()
    const settings = await fetchSensitiveMlSettings(pool)
    return NextResponse.json(settingsResponseBody(settings))
  } catch (error) {
    console.error("sensitive-ml-settings GET:", error)
    return NextResponse.json(
      { error: "Failed to load settings", details: error instanceof Error ? error.message : String(error) },
      { status: 500 }
    )
  }
}

/**
 * PATCH: update the excluded ML feature keys and/or the low-sample threshold.
 *
 * Only the "admin" role may write. The body must be a JSON object carrying at least one of
 * `excludedMlFeatureKeys` (array; entries are filtered by `normalizeExcludedKeys`) and
 * `lowSampleThreshold` (finite number; rounded and clamped to 1..50000). A field that is
 * omitted keeps its currently stored value.
 *
 * Responses: 403 for non-admins, 400 for malformed input, 500 on database errors,
 * otherwise the refreshed settings.
 */
export async function PATCH(request: NextRequest) {
  const role = request.headers.get("x-user-role") as Role | null
  if (!role || role !== "admin") {
    return NextResponse.json(
      { error: "Forbidden — only institution admins may change ML privacy settings" },
      { status: 403 }
    )
  }

  let body: unknown
  try {
    body = await request.json()
  } catch {
    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 })
  }

  // `null`, arrays and primitives are valid JSON but not a settings object; reading a
  // property of `null` would otherwise throw and surface as an unhandled 500.
  if (typeof body !== "object" || body === null || Array.isArray(body)) {
    return NextResponse.json({ error: "Request body must be a JSON object" }, { status: 400 })
  }

  const b = body as Record<string, unknown>
  const excludedRaw = b.excludedMlFeatureKeys
  const thresholdRaw = b.lowSampleThreshold

  // normalizeExcludedKeys() maps any non-array to [], so without this check a malformed
  // value (e.g. a bare string) would silently clear every exclusion.
  if (excludedRaw !== undefined && !Array.isArray(excludedRaw)) {
    return NextResponse.json({ error: "excludedMlFeatureKeys must be an array of strings" }, { status: 400 })
  }

  if (thresholdRaw !== undefined && (typeof thresholdRaw !== "number" || !Number.isFinite(thresholdRaw))) {
    return NextResponse.json({ error: "lowSampleThreshold must be a number" }, { status: 400 })
  }

  const excludedMlFeatureKeys = excludedRaw !== undefined ? normalizeExcludedKeys(excludedRaw) : undefined
  const lowSampleThreshold =
    thresholdRaw !== undefined ? Math.min(50000, Math.max(1, Math.round(thresholdRaw))) : undefined

  if (excludedMlFeatureKeys === undefined && lowSampleThreshold === undefined) {
    return NextResponse.json(
      { error: "Provide excludedMlFeatureKeys and/or lowSampleThreshold" },
      { status: 400 }
    )
  }

  const userEmail = request.headers.get("x-user-email") ?? ""

  try {
    const pool = getPool()
    const current = await fetchSensitiveMlSettings(pool)

    const nextExcluded = excludedMlFeatureKeys ?? current.excludedMlFeatureKeys
    const nextThreshold = lowSampleThreshold ?? current.lowSampleThreshold

    // Upsert instead of a bare UPDATE: when the institution row does not exist an UPDATE
    // touches zero rows and the handler would still answer as if the save had succeeded.
    await pool.query(
      `INSERT INTO institution_sensitive_ml_settings
         (institution_code, excluded_ml_feature_keys, low_sample_threshold, updated_at, updated_by_email)
       VALUES ($1, $2, $3, now(), NULLIF($4, ''))
       ON CONFLICT (institution_code) DO UPDATE
         SET excluded_ml_feature_keys = EXCLUDED.excluded_ml_feature_keys,
             low_sample_threshold = EXCLUDED.low_sample_threshold,
             updated_at = EXCLUDED.updated_at,
             updated_by_email = EXCLUDED.updated_by_email`,
      [DEFAULT_INSTITUTION, nextExcluded, nextThreshold, userEmail || null]
    )

    const settings = await fetchSensitiveMlSettings(pool)
    return NextResponse.json(settingsResponseBody(settings))
  } catch (error) {
    console.error("sensitive-ml-settings PATCH:", error)
    return NextResponse.json(
      { error: "Failed to save settings", details: error instanceof Error ? error.message : String(error) },
      { status: 500 }
    )
  }
}
sensitiveMessages: string[] = [] + + if (totalStudents > 0 && totalStudents < threshold) { + sensitiveMessages.push(buildLowSampleWarningMessage(threshold)) + } + + if (sensitiveSettings.excludedMlFeatureKeys.length > 0) { + sensitiveMessages.push(buildExcludedMlFeaturesKpiMessage(sensitiveSettings.excludedMlFeatureKeys)) + } + return NextResponse.json({ overallRetentionRate: Number(kpis.overall_retention_rate || 0).toFixed(1), avgPredictedRetention: Number(kpis.avg_predicted_retention || 0).toFixed(1), highCriticalRiskCount: Number(kpis.high_critical_risk_count || 0), avgCourseCompletionRate: Number(kpis.avg_course_completion_rate || 0).toFixed(1), - totalStudents: Number(kpis.total_students || 0), + totalStudents, + sensitivePopulation: { + lowSampleWarning: totalStudents > 0 && totalStudents < threshold, + messages: sensitiveMessages, + }, }) } catch (error) { console.error("KPI fetch error:", error) diff --git a/codebenders-dashboard/app/api/query-history/export/route.ts b/codebenders-dashboard/app/api/query-history/export/route.ts index 27b503f..0280c9f 100644 --- a/codebenders-dashboard/app/api/query-history/export/route.ts +++ b/codebenders-dashboard/app/api/query-history/export/route.ts @@ -13,6 +13,12 @@ function escapeCsvField(value: unknown): string { return str } +function csvSensitiveLowSampleCell(value: unknown): string { + if (value === true) return "true" + if (value === false) return "false" + return "" +} + export async function GET(request: NextRequest) { const role = request.headers.get("x-user-role") as Role | null if (!role || !canAccess("/api/query-history/export", role)) { @@ -40,7 +46,9 @@ export async function GET(request: NextRequest) { const lines = raw.split("\n").filter(Boolean) const rows: string[] = [ - ["timestamp", "institution", "prompt", "vizType", "rowCount"].join(","), + ["timestamp", "institution", "prompt", "vizType", "rowCount", "sensitiveSqlColumns", "sensitiveLowSample"].join( + "," + ), ] for (const line of lines) { @@ 
-58,6 +66,8 @@ export async function GET(request: NextRequest) { if (toDate && ts > toDate) continue } + const sensCols = entry.sensitiveSqlColumns + const sensColsStr = Array.isArray(sensCols) ? sensCols.join(";") : "" rows.push( [ escapeCsvField(entry.timestamp), @@ -65,6 +75,8 @@ export async function GET(request: NextRequest) { escapeCsvField(entry.prompt), escapeCsvField(entry.vizType), escapeCsvField(entry.rowCount), + escapeCsvField(sensColsStr), + escapeCsvField(csvSensitiveLowSampleCell(entry.sensitiveLowSample)), ].join(",") ) } diff --git a/codebenders-dashboard/app/api/query-history/route.ts b/codebenders-dashboard/app/api/query-history/route.ts index b12d333..2c1a918 100644 --- a/codebenders-dashboard/app/api/query-history/route.ts +++ b/codebenders-dashboard/app/api/query-history/route.ts @@ -11,6 +11,21 @@ interface QueryHistoryEntry { vizType: string rowCount: number timestamp: string + sensitiveSqlColumns?: string[] + sensitiveLowSample?: boolean +} + +function optionalSensitiveAuditFields(entry: Record): Partial< + Pick +> { + const out: Partial> = {} + if (Array.isArray(entry.sensitiveSqlColumns) && entry.sensitiveSqlColumns.length > 0) { + out.sensitiveSqlColumns = entry.sensitiveSqlColumns as string[] + } + if (typeof entry.sensitiveLowSample === "boolean") { + out.sensitiveLowSample = entry.sensitiveLowSample + } + return out } export async function POST(request: NextRequest) { @@ -44,6 +59,7 @@ export async function POST(request: NextRequest) { vizType: entry.vizType, rowCount: entry.rowCount, timestamp: entry.timestamp, + ...optionalSensitiveAuditFields(entry), } try { diff --git a/codebenders-dashboard/app/api/sensitive-context/route.ts b/codebenders-dashboard/app/api/sensitive-context/route.ts new file mode 100644 index 0000000..411848a --- /dev/null +++ b/codebenders-dashboard/app/api/sensitive-context/route.ts @@ -0,0 +1,27 @@ +import { type NextRequest, NextResponse } from "next/server" +import { getPool } from "@/lib/db" +import { 
canAccess, type Role } from "@/lib/roles" +import { fetchSensitiveMlSettings } from "@/lib/sensitive-ml-settings-db" + +/** Public to all authenticated roles — used for NLQ warnings and dashboard copy (#109). */ +export async function GET(request: NextRequest) { + const role = request.headers.get("x-user-role") as Role | null + if (!role || !canAccess("/api/sensitive-context", role)) { + return NextResponse.json({ error: "Forbidden" }, { status: 403 }) + } + + try { + const pool = getPool() + const settings = await fetchSensitiveMlSettings(pool) + return NextResponse.json({ + lowSampleThreshold: settings.lowSampleThreshold, + excludedMlFeatureKeys: settings.excludedMlFeatureKeys, + }) + } catch (error) { + console.error("sensitive-context GET:", error) + return NextResponse.json( + { error: "Failed to load context", details: error instanceof Error ? error.message : String(error) }, + { status: 500 } + ) + } +} diff --git a/codebenders-dashboard/app/page.tsx b/codebenders-dashboard/app/page.tsx index 409f4e2..073cecf 100644 --- a/codebenders-dashboard/app/page.tsx +++ b/codebenders-dashboard/app/page.tsx @@ -14,7 +14,8 @@ import { SelectTrigger, SelectValue, } from "@/components/ui/select" -import { TrendingUp, Users, AlertTriangle, BookOpen, Search, Table2, X } from "lucide-react" +import { TrendingUp, Users, AlertTriangle, BookOpen, Search, Table2, X, ShieldAlert } from "lucide-react" +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert" import Link from "next/link" import { GlossaryMetricEntryLink } from "@/components/glossary-metric-entry-link" import { useDataLineage } from "@/components/data-lineage-drawer" @@ -27,6 +28,10 @@ interface KPIData { highCriticalRiskCount: number avgCourseCompletionRate: string totalStudents: number + sensitivePopulation?: { + lowSampleWarning: boolean + messages: string[] + } } interface RiskAlertData { @@ -162,6 +167,8 @@ export default function DashboardPage() { // eslint-disable-next-line 
react-hooks/exhaustive-deps }, [cohort, enrollmentType, credentialType]) + const kpiSensitiveMessages = kpis?.sensitivePopulation?.messages + return (

@@ -274,6 +281,18 @@ export default function DashboardPage() {
)} + {kpiSensitiveMessages && kpiSensitiveMessages.length > 0 && ( + + + Sensitive population safeguards + + {kpiSensitiveMessages.map((m, i) => ( +

{m}

+ ))} +
+
+ )} + {/* KPI Cards */}
{ + try { + const res = await fetch("/api/sensitive-context") + if (!res.ok) return DEFAULT_LOW_SAMPLE_THRESHOLD + const data = (await res.json()) as { lowSampleThreshold?: unknown } + return typeof data.lowSampleThreshold === "number" + ? data.lowSampleThreshold + : DEFAULT_LOW_SAMPLE_THRESHOLD + } catch { + return DEFAULT_LOW_SAMPLE_THRESHOLD + } +} + +function sensitiveQueryGuard( + sql: string, + rowCount: number, + lowSampleThreshold: number, +): { sqlCols: string[]; sensitiveLowSample: boolean; warnMsgs: string[] } { + const sqlCols = findSensitiveMlKeysReferencedInSql(sql) + const sensitiveLowSample = + typeof rowCount === "number" && rowCount >= 0 && rowCount < lowSampleThreshold + const warnMsgs: string[] = [] + if (sqlCols.length > 0) warnMsgs.push(buildSensitivePopulationSqlWarningMessage(sqlCols)) + if (sensitiveLowSample) warnMsgs.push(buildLowSampleWarningMessage(lowSampleThreshold)) + return { sqlCols, sensitiveLowSample, warnMsgs } +} const INSTITUTIONS = [ { name: "Bishop State", code: "bscc" }, @@ -35,6 +70,7 @@ export default function QueryPage() { const [summary, setSummary] = useState(null) const [summaryLoading, setSummaryLoading] = useState(false) const [summaryError, setSummaryError] = useState(null) + const [sensitiveWarnings, setSensitiveWarnings] = useState([]) const [history, setHistory] = useState(() => { // Read from localStorage on mount (client-only) if (typeof window === "undefined") return [] @@ -59,6 +95,7 @@ export default function QueryPage() { if (!activePrompt.trim()) return setIsAnalyzing(true) + setSensitiveWarnings([]) try { const enableLLM = process.env.NEXT_PUBLIC_ENABLE_LLM === "1" console.log("enableLLM", enableLLM) @@ -92,6 +129,14 @@ export default function QueryPage() { console.log("query result:", result) setQueryResult(result) + const lowSampleThreshold = await fetchLowSampleThreshold() + const { sqlCols, sensitiveLowSample, warnMsgs } = sensitiveQueryGuard( + plan.sql, + result.rowCount, + lowSampleThreshold, + ) 
+ setSensitiveWarnings(warnMsgs) + // Persist history entry const entry: HistoryEntry = { id: crypto.randomUUID(), @@ -100,6 +145,8 @@ export default function QueryPage() { prompt: activePrompt, rowCount: result.rowCount, vizType: plan.vizType, + ...(sqlCols.length > 0 ? { sensitiveSqlColumns: sqlCols } : {}), + ...(sensitiveLowSample ? { sensitiveLowSample: true } : {}), } // Prepend and cap at 50 entries setHistory(prev => { @@ -339,6 +386,23 @@ export default function QueryPage() {

{summaryError}

)} + {sensitiveWarnings.length > 0 && ( + + + Sensitive population safeguards + + {sensitiveWarnings.map((w, i) => ( +

{w}

+ ))} +

+ This run is written to the server-side query audit log with extra columns when these conditions + apply. Use Export in the Recent Queries sidebar to download the CSV (admin and + IR roles). +

+
+
+ )} +
diff --git a/codebenders-dashboard/components/nav-header.tsx b/codebenders-dashboard/components/nav-header.tsx index 19c0e5d..2679b8f 100644 --- a/codebenders-dashboard/components/nav-header.tsx +++ b/codebenders-dashboard/components/nav-header.tsx @@ -23,6 +23,7 @@ const NAV_LINKS: Array<{ href: string; label: string; roles?: Role[] }> = [ { href: "/discovery/aascu", label: "Discovery", roles: ["admin", "ir", "leadership"] }, { href: AI_TRANSPARENCY_HREF, label: "AI Transparency" }, { href: "/admin/upload", label: "Admin", roles: ["admin", "ir"] }, + { href: "/admin/sensitive-ml", label: "ML privacy", roles: ["admin", "ir"] }, ] export function NavHeader({ email, role }: NavHeaderProps) { diff --git a/codebenders-dashboard/lib/__tests__/sensitive-population.test.ts b/codebenders-dashboard/lib/__tests__/sensitive-population.test.ts new file mode 100644 index 0000000..c733359 --- /dev/null +++ b/codebenders-dashboard/lib/__tests__/sensitive-population.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from "vitest" +import { findSensitiveMlKeysReferencedInSql, normalizeExcludedKeys } from "@/lib/sensitive-population" + +describe("findSensitiveMlKeysReferencedInSql", () => { + it("detects quoted identifiers", () => { + expect(findSensitiveMlKeysReferencedInSql(`SELECT "Race", "Cohort" FROM student_level_with_predictions`)).toEqual([ + "Race", + ]) + }) + + it("detects bare identifiers", () => { + expect(findSensitiveMlKeysReferencedInSql(`SELECT Ethnicity FROM t`)).toEqual(["Ethnicity"]) + }) + + it("returns sorted unique keys", () => { + const sql = `WHERE "Gender" = 'F' AND Pell_Status_First_Year = 1` + expect(findSensitiveMlKeysReferencedInSql(sql)).toEqual(["Gender", "Pell_Status_First_Year"]) + }) +}) + +describe("normalizeExcludedKeys", () => { + it("filters unknown keys", () => { + expect(normalizeExcludedKeys(["Race", "NotAFeature", "Race"])).toEqual(["Race"]) + }) +}) diff --git a/codebenders-dashboard/lib/roles.ts b/codebenders-dashboard/lib/roles.ts 
index 6b4dce4..147b049 100644 --- a/codebenders-dashboard/lib/roles.ts +++ b/codebenders-dashboard/lib/roles.ts @@ -13,6 +13,7 @@ export const ROUTE_PERMISSIONS: Array<{ prefix: string; roles: Role[] }> = [ { prefix: "/api/query-history/export", roles: ["admin", "ir"] }, { prefix: "/admin", roles: ["admin", "ir"] }, { prefix: "/api/admin", roles: ["admin", "ir"] }, + { prefix: "/api/sensitive-context", roles: ALL_ROLES }, { prefix: "/discovery", roles: ["admin", "ir", "leadership"] }, ] diff --git a/codebenders-dashboard/lib/sensitive-ml-settings-db.ts b/codebenders-dashboard/lib/sensitive-ml-settings-db.ts new file mode 100644 index 0000000..01be18b --- /dev/null +++ b/codebenders-dashboard/lib/sensitive-ml-settings-db.ts @@ -0,0 +1,56 @@ +import type { Pool } from "pg" + +export type InstitutionSensitiveMlSettings = { + institutionCode: string + excludedMlFeatureKeys: string[] + lowSampleThreshold: number + updatedAt: string | null +} + +export const DEFAULT_INSTITUTION = "bscc" + +function pgErrorCode(err: unknown): string { + return err && typeof err === "object" && "code" in err ? String((err as { code: unknown }).code) : "" +} + +const FALLBACK: InstitutionSensitiveMlSettings = { + institutionCode: DEFAULT_INSTITUTION, + excludedMlFeatureKeys: [], + lowSampleThreshold: 30, + updatedAt: null, +} + +function rowToSettings(row: Record): InstitutionSensitiveMlSettings { + return { + institutionCode: String(row.institution_code ?? DEFAULT_INSTITUTION), + excludedMlFeatureKeys: Array.isArray(row.excluded_ml_feature_keys) + ? (row.excluded_ml_feature_keys as string[]) + : [], + lowSampleThreshold: Math.max(1, Number(row.low_sample_threshold ?? 30)), + updatedAt: row.updated_at ? new Date(row.updated_at as string).toISOString() : null, + } +} + +/** + * Loads institution ML privacy settings. Uses fallback if the table is missing (local dev). 
+ */ +export async function fetchSensitiveMlSettings(pool: Pool): Promise { + try { + const res = await pool.query( + `SELECT institution_code, excluded_ml_feature_keys, low_sample_threshold, updated_at + FROM institution_sensitive_ml_settings + WHERE institution_code = $1 + LIMIT 1`, + [DEFAULT_INSTITUTION] + ) + const row = res.rows[0] + if (!row) return FALLBACK + return rowToSettings(row as Record) + } catch (err: unknown) { + if (pgErrorCode(err) === "42P01") { + console.warn("institution_sensitive_ml_settings missing — using defaults (#109)") + return FALLBACK + } + throw err + } +} diff --git a/codebenders-dashboard/lib/sensitive-population.ts b/codebenders-dashboard/lib/sensitive-population.ts new file mode 100644 index 0000000..46f1efc --- /dev/null +++ b/codebenders-dashboard/lib/sensitive-population.ts @@ -0,0 +1,105 @@ +/** + * Issue #109 — sensitive-population safeguards: canonical ML feature keys, + * SQL surface detection for NLQ audit + warnings, and validation for admin API. + */ + +export type SensitiveMlFeatureMeta = { + /** Column name used in `complete_ml_pipeline.py` feature lists. */ + mlKey: string + /** Human label for admin UI and transparency copy. */ + label: string + /** Short rationale for IR/admin. */ + description: string +} + +/** Features institutions may exclude from all Bishop ML models (demographic / aid). 
*/ +export const SENSITIVE_ML_FEATURE_CATALOG: SensitiveMlFeatureMeta[] = [ + { + mlKey: "Student_Age", + label: "Student age", + description: "Age band or numeric age used in demographic risk signals.", + }, + { + mlKey: "Race", + label: "Race", + description: "Race category from PDP / institutional records.", + }, + { + mlKey: "Ethnicity", + label: "Ethnicity", + description: "Ethnicity / Hispanic origin indicators.", + }, + { + mlKey: "Gender", + label: "Gender", + description: "Gender or sex field as reported in source data.", + }, + { + mlKey: "First_Gen", + label: "First-generation status", + description: "First-generation college student indicator.", + }, + { + mlKey: "Pell_Status_First_Year", + label: "Pell / aid status (year 1)", + description: "Federal aid / Pell eligibility proxy — can correlate with socioeconomic status.", + }, +] + +const CATALOG_KEYS = new Set(SENSITIVE_ML_FEATURE_CATALOG.map((f) => f.mlKey)) + +const ML_KEY_TO_LABEL: Record = Object.fromEntries( + SENSITIVE_ML_FEATURE_CATALOG.map((f) => [f.mlKey, f.label]) +) + +export function isAllowedSensitiveMlKey(key: string): boolean { + return CATALOG_KEYS.has(key) +} + +export function normalizeExcludedKeys(keys: unknown): string[] { + if (!Array.isArray(keys)) return [] + const out: string[] = [] + for (const k of keys) { + if (typeof k !== "string" || !CATALOG_KEYS.has(k)) continue + if (!out.includes(k)) out.push(k) + } + return out +} + +/** Regex fragments: quoted identifiers and bare identifiers (word boundary). */ +function sqlPatternsForMlKey(mlKey: string): RegExp[] { + const q = mlKey.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return [new RegExp(`"${q}"`, "i"), new RegExp(`\\b${q}\\b`, "i")] +} + +const DETECTOR_ENTRIES = SENSITIVE_ML_FEATURE_CATALOG.map((f) => ({ + mlKey: f.mlKey, + patterns: sqlPatternsForMlKey(f.mlKey), +})) + +/** + * Detect references to sensitive columns in generated or hand-written SQL + * (NLQ / execute-sql). 
Used for contextual warnings and audit (#109 / #67).
+ */
+export function findSensitiveMlKeysReferencedInSql(sql: string): string[] { // returns the matched mlKeys, de-duplicated and sorted
+  if (!sql || !sql.trim()) return [] // empty or whitespace-only SQL references nothing
+  const hit = new Set<string>()
+  for (const { mlKey, patterns } of DETECTOR_ENTRIES) {
+    if (patterns.some((p) => p.test(sql))) hit.add(mlKey)
+  }
+  return [...hit].sort()
+}
+
+export function buildLowSampleWarningMessage(threshold: number): string { // threshold is interpolated as-is into the message
+  return `Low sample size — interpret with care. This view has fewer than ${threshold} students and predictions may be unreliable.`
+}
+
+export function buildSensitivePopulationSqlWarningMessage(columns: string[]): string {
+  const labels = columns.map((k) => ML_KEY_TO_LABEL[k] ?? k).join(", ") // keys without a label fall back to the raw key
+  return `This query references sensitive demographic or aid-related fields (${labels}). Interpret results with institutional context; broad-brush AI summaries can misrepresent under-served groups.`
+}
+
+/** KPI banner when institution excludes ML inputs (#109); lists the raw mlKeys, comma-separated. */
+export function buildExcludedMlFeaturesKpiMessage(excludedKeys: string[]): string {
+  return `Institutional ML privacy settings exclude these inputs from model training and batch inference: ${excludedKeys.join(", ")}. Re-run the ML pipeline after changing exclusions.`
+}
diff --git a/codebenders-dashboard/lib/types.ts b/codebenders-dashboard/lib/types.ts
index 066e6d3..6a4c348 100644
--- a/codebenders-dashboard/lib/types.ts
+++ b/codebenders-dashboard/lib/types.ts
@@ -20,6 +20,10 @@ export interface HistoryEntry {
   prompt: string
   rowCount: number
   vizType: QueryPlan["vizType"]
+  /** Issue #109 — NLQ touched sensitive SQL columns (audit / export). */
+  sensitiveSqlColumns?: string[]
+  /** Issue #109 — result row count below institutional low-sample threshold. 
*/
+  sensitiveLowSample?: boolean
 }
 
 export interface PDPRecord {
diff --git a/supabase/migrations/20260503160000_institution_sensitive_ml_settings.sql b/supabase/migrations/20260503160000_institution_sensitive_ml_settings.sql
new file mode 100644
index 0000000..f74d2af
--- /dev/null
+++ b/supabase/migrations/20260503160000_institution_sensitive_ml_settings.sql
@@ -0,0 +1,18 @@
+-- Issue #109: per-institution sensitive-population / ML feature exclusion settings.
+
+CREATE TABLE IF NOT EXISTS public.institution_sensitive_ml_settings (
+  institution_code text PRIMARY KEY, -- primary key: at most one settings row per institution code
+  excluded_ml_feature_keys text[] NOT NULL DEFAULT ARRAY[]::text[], -- defaults to an empty array
+  low_sample_threshold integer NOT NULL DEFAULT 30
+    CHECK (low_sample_threshold >= 1 AND low_sample_threshold <= 50000), -- accepted range is 1..50000 inclusive
+  updated_at timestamptz NOT NULL DEFAULT now(),
+  updated_by_user_id uuid, -- nullable; no foreign key is declared here
+  updated_by_email text
+);
+
+COMMENT ON TABLE public.institution_sensitive_ml_settings IS
+  'Issue #109: ML features excluded from training/inference and thresholds for low-sample warnings.';
+
+INSERT INTO public.institution_sensitive_ml_settings (institution_code) -- seed row using column defaults; re-running is a no-op via ON CONFLICT DO NOTHING
+VALUES ('bscc')
+ON CONFLICT (institution_code) DO NOTHING;