# OM SAI RAM

# ML FRAMEWORK FOR AUTISM

## Importing libraries

In [2]:
# --------------------------
# Imports
# --------------------------
import os, math, re, warnings, zipfile, subprocess
from pathlib import Path

# Data & Math
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt

# Scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Audio
import librosa

# Suppress warnings for clean output
warnings.filterwarnings("ignore")

# --------------------------
# Directories
# --------------------------
OUTDIR = Path("outputs")
OUTDIR.mkdir(exist_ok=True)


## Importing questionnare

In [4]:
dfq = pd.read_csv("toddler_autism.csv")  # change if filename differs
dfq.columns = [c.strip().replace(" ", "_") for c in dfq.columns]

## questinnare preprocessing

In [6]:
# --------------------------
# Questionnaire Preprocessing
# --------------------------

# Detect label column automatically
label_candidates = [c for c in dfq.columns if "class" in c.lower() or "asd" in c.lower()]
label_col = label_candidates[0] if label_candidates else "Family_ASD_History"

# Function: convert Yes/No/etc. → binary
def to_binary_series(s):
    mapping = {
        "yes": 1, "no": 0, "true": 1, "false": 0,
        "asd": 1, "no_asd": 0, "non-asd": 0,
        "negative": 0, "positive": 1
    }
    return s.astype(str).str.strip().str.lower().map(mapping)

# Clean labels
dfq[label_col] = to_binary_series(dfq[label_col]).fillna(0).astype(int)

# Preprocess features
q_feats = []
for c in dfq.columns:
    if c == label_col:
        continue
    if dfq[c].dtype == object:               # convert categorical to binary if possible
        dfq[c] = to_binary_series(dfq[c])
    dfq[c] = pd.to_numeric(dfq[c], errors="coerce")  # force numeric
    if dfq[c].notna().sum() > 0:
        q_feats.append(c)

# Final X, y
Xq = dfq[q_feats].fillna(dfq[q_feats].median())  # impute missing with median
y  = dfq[label_col].values

print(f"[questionnaire] rows={len(dfq)} label={label_col} features={len(q_feats)}")


[questionnaire] rows=1000 label=Family_ASD_History features=7


## importing audio

In [8]:
# === SPEECH IMPORTING (FEATURE EXTRACTION) ===

def extract_audio_features(path, sr_target=16000, n_mfcc=13):
    try:
        # Load audio
        y, sr = librosa.load(path, sr=sr_target, mono=True)
        if y.size == 0:
            return None

        # Trim silence
        y, _ = librosa.effects.trim(y, top_db=30)

        # MFCC features
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = mfcc.mean(axis=1)
        mfcc_std  = mfcc.std(axis=1)

        # Deltas
        d1 = librosa.feature.delta(mfcc)
        d2 = librosa.feature.delta(mfcc, order=2)

        # Spectral features
        centroid  = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
        rolloff   = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
        bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()

        # Collect into dictionary
        feat = {}
        feat.update({f"mfcc_mean_{i}": v for i, v in enumerate(mfcc_mean)})
        feat.update({f"mfcc_std_{i}":  v for i, v in enumerate(mfcc_std)})
        feat.update({f"delta1_{i}":    v for i, v in enumerate(d1.mean(axis=1))})
        feat.update({f"delta2_{i}":    v for i, v in enumerate(d2.mean(axis=1))})
        feat["centroid"]   = centroid
        feat["rolloff"]    = rolloff
        feat["bandwidth"]  = bandwidth
        feat["audio_file"] = str(path)
        return feat

    except Exception as e:
        print(f"[error] {path}: {e}")
        return None


def build_speech_features_df(audio_root, max_files=500, exts=(".wav", ".flac", ".mp3")):
    files = [f for f in Path(audio_root).rglob("*") if f.suffix.lower() in exts][:max_files]
    rows = []
    for i, f in enumerate(files, 1):
        feat = extract_audio_features(f)
        if feat:
            rows.append(feat)
        if i % 100 == 0:
            print(f"[audio] processed {i}/{len(files)}")

    df = pd.DataFrame(rows)
    df = df.dropna(axis=1, how="all").reset_index(drop=True)  # remove empty cols
    return df


# Build dataset
dfs = build_speech_features_df("data/speech", max_files=500)
print(f"[speech] rows={len(dfs)}, features={dfs.shape[1]-1}")


[audio] processed 100/500
[audio] processed 200/500
[audio] processed 300/500
[audio] processed 400/500
[audio] processed 500/500
[speech] rows=500, features=55


## Speech Preprocessing

In [10]:
# === SPEECH PREPROCESSING ===

# Example function for speech preprocessing
def preprocess_audio(file_path, sr=16000):
    # Load audio
    y, sr = librosa.load(file_path, sr=sr)

    # 1. Noise reduction (simple method: spectral gating or median filter)
    y = librosa.effects.preemphasis(y)

    # 2. Silence removal (trim leading/trailing silence)
    y, _ = librosa.effects.trim(y, top_db=20)

    return y, sr

# Example demo on one file (optional check)
example_file = "data/speech/sample.wav"  # <-- replace with actual file
if os.path.exists(example_file):
    y, sr = preprocess_audio(example_file)
    print(f"[speech preprocessing] Processed {len(y)} samples at {sr} Hz")
else:
    print("[speech preprocessing] Demo skipped (no sample file found).")

[speech preprocessing] Demo skipped (no sample file found).


## aliging speech with questionarre

In [12]:
# === ALIGNING SPEECH WITH QUESTIONNAIRE ===

if not dfs.empty:
    reps = math.ceil(len(dfq) / len(dfs))  # repeat speech features
    dfs_rep = pd.concat([dfs.drop(columns=["audio_file"])] * reps,
                        ignore_index=True).iloc[:len(dfq)]
    Xs = dfs_rep.reset_index(drop=True)
    Xh = pd.concat([dfq[q_feats].reset_index(drop=True), Xs], axis=1)

    print(f"[align] Questionnaire rows={len(dfq)}, Speech rows={len(dfs)}, Hybrid shape={Xh.shape}")
else:
    print("[align] No speech features available → Skipping Speech & Hybrid.")
    Xs = pd.DataFrame(index=dfq.index)  # empty placeholder
    Xh = dfq[q_feats].copy()


[align] Questionnaire rows=1000, Speech rows=500, Hybrid shape=(1000, 62)


## Imputation +SMOTE

In [14]:
# === BLOCK 2: IMPUTATION + SMOTE ===
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

imputer = SimpleImputer(strategy="mean")

# Handle missing values
Xq_imp = imputer.fit_transform(dfq[q_feats])   # Questionnaire
Xs_imp = imputer.fit_transform(Xs)             # Speech
Xh_imp = imputer.fit_transform(Xh)             # Hybrid

# Balance using SMOTE
sm = SMOTE(random_state=42)
Xq_bal, y_bal = sm.fit_resample(Xq_imp, y)
Xs_bal, y_bal = sm.fit_resample(Xs_imp, y)
Xh_bal, y_bal = sm.fit_resample(Xh_imp, y)

print(f"[SMOTE] Questionnaire {Xq_bal.shape} | Speech {Xs_bal.shape} | Hybrid {Xh_bal.shape}")
print(f"[SMOTE] Class distribution: {np.bincount(y_bal)}")


[SMOTE] Questionnaire (1300, 7) | Speech (1300, 55) | Hybrid (1300, 62)
[SMOTE] Class distribution: [650 650]


## spilitting

In [16]:
# === BLOCK 1: TRAIN-TEST SPLIT ===
from sklearn.model_selection import train_test_split

Xq_train, Xq_test, y_train, y_test = train_test_split(
    Xq_bal, y_bal, test_size=0.2, random_state=42, stratify=y_bal
)
Xs_train, Xs_test, _, _ = train_test_split(
    Xs_bal, y_bal, test_size=0.2, random_state=42, stratify=y_bal
)
Xh_train, Xh_test, _, _ = train_test_split(
    Xh_bal, y_bal, test_size=0.2, random_state=42, stratify=y_bal
)

print(f"[Split] Questionnaire train={Xq_train.shape}, test={Xq_test.shape}")
print(f"[Split] Speech train={Xs_train.shape}, test={Xs_test.shape}")
print(f"[Split] Hybrid train={Xh_train.shape}, test={Xh_test.shape}")


[Split] Questionnaire train=(1040, 7), test=(260, 7)
[Split] Speech train=(1040, 55), test=(260, 55)
[Split] Hybrid train=(1040, 62), test=(260, 62)


## Standard Scaler

In [18]:
# === BLOCK 3: STANDARDIZATION ===
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

Xq_train = scaler.fit_transform(Xq_train)
Xq_test  = scaler.transform(Xq_test)

Xs_train = scaler.fit_transform(Xs_train)
Xs_test  = scaler.transform(Xs_test)

Xh_train = scaler.fit_transform(Xh_train)
Xh_test  = scaler.transform(Xh_test)

print("[Scaler] Standardization applied to all feature sets")


[Scaler] Standardization applied to all feature sets


## Define Models

In [20]:
# === BLOCK 4: DEFINE MODELS ===
models = [
    ("LogReg",   lambda: LogisticRegression(max_iter=700, class_weight="balanced")),
    ("SVM",      lambda: SVC(kernel="rbf", probability=True, class_weight="balanced")),
    ("RF",       lambda: RandomForestClassifier(n_estimators=700, class_weight="balanced", random_state=42)),
    ("XGB",      lambda: XGBClassifier(n_estimators=700, eval_metric="logloss", random_state=42)),
    ("CatBoost", lambda: CatBoostClassifier(
        iterations=700, learning_rate=0.05, depth=6,
        auto_class_weights="Balanced", verbose=0, random_state=42
    ))
]

print(f"[Models] {len(models)} models initialized")


[Models] 5 models initialized


## Train and Eval Function

In [22]:
# === BLOCK 5: TRAIN & EVALUATE FUNCTION ===
def train_and_eval(X_train, y_train, X_test, y_test, model, name, prefix):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else None

    # Metrics
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    auc  = roc_auc_score(y_test, y_proba) if y_proba is not None else np.nan

    return {
        "model": name,
        "feature_set": prefix,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "roc_auc": auc
    }


## Training Loop and Results

In [24]:
# === BLOCK 6: TRAINING LOOP & RESULTS ===
results = []

for name, mdl_fn in models:
    for prefix, (Xtr, Xte) in [
        ("Questionnaire", (Xq_train, Xq_test)),
        ("Speech", (Xs_train, Xs_test)),
        ("Hybrid", (Xh_train, Xh_test))
    ]:
        mdl = mdl_fn()
        results.append(train_and_eval(Xtr, y_train, Xte, y_test, mdl, name, prefix))

met = pd.DataFrame(results).sort_values(by="accuracy", ascending=False)
print(met)

# Save results
met.to_csv("outputs/model_results.csv", index=False)


       model    feature_set  accuracy  precision    recall        f1   roc_auc
5        SVM         Hybrid  0.703846   0.712000  0.684615  0.698039  0.742544
14  CatBoost         Hybrid  0.700000   0.728070  0.638462  0.680328  0.783254
11       XGB         Hybrid  0.684615   0.693548  0.661538  0.677165  0.771006
6         RF  Questionnaire  0.680769   0.732673  0.569231  0.640693  0.722219
8         RF         Hybrid  0.669231   0.686441  0.623077  0.653226  0.744822
12  CatBoost  Questionnaire  0.657692   0.681416  0.592308  0.633745  0.732663
4        SVM         Speech  0.653846   0.658730  0.638462  0.648438  0.700740
9        XGB  Questionnaire  0.650000   0.661157  0.615385  0.637450  0.718166
3        SVM  Questionnaire  0.619231   0.639640  0.546154  0.589212  0.684142
7         RF         Speech  0.607692   0.616667  0.569231  0.592000  0.619852
13  CatBoost         Speech  0.603846   0.606299  0.592308  0.599222  0.619852
2     LogReg         Hybrid  0.596154   0.601626  0.