In [1]:
# Lab07_all_questions.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import randint
import warnings
warnings.filterwarnings("ignore")
import time

def load_data(path="/Users/dhrithijuvva/Desktop/Women_ML/extracted_audio_features.csv"):
    """Load the extracted audio-feature table and split it into features and labels.

    The ``path`` argument is now actually used. Previously it was ignored and
    a hard-coded file was always read; that file is kept as the default so a
    bare ``load_data()`` call still reads the same CSV as before.

    Args:
        path: Location of the CSV file. It must contain a ``Label`` column;
            every column whose name starts with ``"MFCC"`` is used as a feature.

    Returns:
        A 4-tuple ``(df, X, y, feature_cols)``:
            df: the full DataFrame as read from disk.
            X: NumPy array holding the ``MFCC*`` columns, in file order.
            y: NumPy array holding the ``Label`` column.
            feature_cols: names of the columns that make up ``X``.

    Raises:
        KeyError: if the file has no ``Label`` column.
    """
    df = pd.read_csv(path)
    # Feature columns are recognised purely by their "MFCC" name prefix.
    feature_cols = [c for c in df.columns if c.startswith("MFCC")]
    X = df[feature_cols].values
    y = df["Label"].values
    return df, X, y, feature_cols

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        ``{"train": {...}, "test": {...}}`` where each inner dict holds
        ``accuracy``, ``precision``, ``recall`` and ``f1``. Precision, recall
        and F1 are weighted averages computed with ``zero_division=0``.
    """
    model.fit(X_train, y_train)

    # Shared settings for the three averaged metrics.
    averaging = {"average": 'weighted', "zero_division": 0}

    def _score(features, targets):
        # Predict once per split and reuse the predictions for every metric.
        predicted = model.predict(features)
        return {
            "accuracy": accuracy_score(targets, predicted),
            "precision": precision_score(targets, predicted, **averaging),
            "recall": recall_score(targets, predicted, **averaging),
            "f1": f1_score(targets, predicted, **averaging),
        }

    return {
        "train": _score(X_train, y_train),
        "test": _score(X_test, y_test),
    }

def run_classifiers_comparison(X_train, y_train, X_test, y_test):
    """Train and score a fixed set of classifiers on the same split.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        Dict keyed by classifier name. Each value is the ``evaluate_model``
        result (``"train"`` / ``"test"`` metric dicts) plus ``"time_s"``, the
        elapsed seconds for the whole ``evaluate_model`` call (fitting,
        predicting on both splits and computing the metrics).
    """
    classifiers = {
        # Perceptron shuffles the training data by default, so it is seeded
        # like the other stochastic models to keep the comparison reproducible.
        "Perceptron": Perceptron(max_iter=1000, random_state=42),
        "SVM_rbf": SVC(kernel='rbf', probability=False),
        "DecisionTree": DecisionTreeClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
        "GaussianNB": GaussianNB(),
        "MLP": MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42)
    }
    results = {}
    for name, clf in classifiers.items():
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock changes.
        started = time.perf_counter()
        res = evaluate_model(clf, X_train, y_train, X_test, y_test)
        res['time_s'] = time.perf_counter() - started
        results[name] = res
    return results

def randomized_search_example(X_train, y_train):
    """Run a small randomized hyper-parameter search for a random forest.

    Ten candidate settings are sampled and each is scored by accuracy with
    3-fold cross-validation, using all available cores (``n_jobs=-1``).

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    # scipy's randint(low, high) samples integers from [low, high).
    search_space = dict(
        n_estimators=randint(10, 200),
        max_depth=randint(2, 20),
        min_samples_split=randint(2, 10),
    )
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=search_space,
        n_iter=10,
        cv=3,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search

if __name__ == "__main__":
    # Script entry point: load the features, reduce the task to two classes,
    # compare the classifiers and run the example hyper-parameter search.
    from sklearn.preprocessing import LabelEncoder

    df, X, y, feature_cols = load_data()

    # Keep only the samples of the first two label values returned by
    # np.unique (i.e. the two smallest in sorted order).
    labels = np.unique(y)
    if len(labels) < 2:
        raise RuntimeError("Need at least two classes for Lab07")
    classes = tuple(labels[:2])
    mask = np.isin(y, classes)
    X2, y2 = X[mask], y[mask]

    # LabelEncoder maps the original labels to integer codes.
    le = LabelEncoder()
    y2_enc = le.fit_transform(y2)

    # Stratified 70/30 split so both classes keep their proportions.
    Xtr, Xte, ytr, yte = train_test_split(
        X2,
        y2_enc,
        test_size=0.3,
        random_state=42,
        stratify=y2_enc,
    )

    # A2/A3: compare classifiers on the same split.
    comp = run_classifiers_comparison(Xtr, ytr, Xte, yte)
    print("Lab07 A3: Classifier comparison (train/test metrics)")
    metric_order = ("accuracy", "precision", "recall", "f1")
    report_lines = (
        ("train", "Train: acc {:.3f}, prec {:.3f}, rec {:.3f}, f1 {:.3f}"),
        ("test", "Test:  acc {:.3f}, prec {:.3f}, rec {:.3f}, f1 {:.3f}"),
    )
    for clf_name, metrics in comp.items():
        print(f"--- {clf_name} (time {metrics['time_s']:.2f}s) ---")
        for split, template in report_lines:
            print(template.format(*(metrics[split][key] for key in metric_order)))

    # A2: RandomizedSearchCV example on the training split only.
    rs = randomized_search_example(Xtr, ytr)
    print("Lab07 A2: RandomizedSearch best params:", rs.best_params_, "best_score:", rs.best_score_)


Lab07 A3: Classifier comparison (train/test metrics)
--- Perceptron (time 0.02s) ---
Train: acc 0.978, prec 0.979, rec 0.978, f1 0.978
Test:  acc 0.979, prec 0.980, rec 0.979, f1 0.979
--- SVM_rbf (time 1.10s) ---
Train: acc 0.982, prec 0.982, rec 0.982, f1 0.982
Test:  acc 0.983, prec 0.983, rec 0.983, f1 0.983
--- DecisionTree (time 0.20s) ---
Train: acc 1.000, prec 1.000, rec 1.000, f1 1.000
Test:  acc 0.992, prec 0.992, rec 0.992, f1 0.992
--- RandomForest (time 2.40s) ---
Train: acc 1.000, prec 1.000, rec 1.000, f1 1.000
Test:  acc 0.997, prec 0.997, rec 0.997, f1 0.997
--- GaussianNB (time 0.01s) ---
Train: acc 0.969, prec 0.969, rec 0.969, f1 0.969
Test:  acc 0.973, prec 0.973, rec 0.973, f1 0.973
--- MLP (time 0.90s) ---
Train: acc 0.999, prec 0.999, rec 0.999, f1 0.999
Test:  acc 0.998, prec 0.998, rec 0.998, f1 0.998
Lab07 A2: RandomizedSearch best params: {'max_depth': 13, 'min_samples_split': 2, 'n_estimators': 58} best_score: 0.9961274494895735
