<a href="https://colab.research.google.com/github/friedelj/ML540/blob/main/ProjectDataStudy1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook studies the Kaggle Autism data outside of SageMaker

In [None]:
pip install pandas numpy matplotlib seaborn scikit-learn tensorflow shap plotly

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import time
import psutil
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve,
    classification_report
)
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load and clean data
# '?' is this notebook's missing-value placeholder. Declaring it as an NA marker
# at load time lets pandas parse columns that contain it as numbers instead of
# leaving the whole column as strings.
df = pd.read_csv(
    "Autistic Spectrum Disorder Screening for Children.csv",
    na_values="?",
)
# Keep only rows that have a value in every column.
df = df.dropna()

In [None]:
# Treat the '?' placeholder as a missing value, then keep only complete rows.
df = df.replace('?', np.nan).dropna()

In [None]:
# Encode target variable (binary): 1 for a "YES" label, 0 for anything else.
# The comparison ignores letter case and surrounding whitespace so variants
# such as "yes" or "Yes " are not silently collapsed into the negative class.
df['Class/ASD'] = (
    df['Class/ASD'].astype(str).str.strip().str.upper().eq('YES').astype(int)
)

In [None]:
# Turn the two-valued categorical columns into 0/1 flags.
# A value becomes 1 when its lower-cased string form is one of the tokens
# below; every other value becomes 0.
binary_columns = ['jundice', 'austim', 'used_app_before', 'gender']
truthy_tokens = ['yes', 'm', 'male', 'true', '1']
for col in binary_columns:
    df[col] = df[col].astype(str).str.lower().isin(truthy_tokens).astype('int64')

In [None]:
# Encode all other categorical features using Label Encoding.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer codes can be mapped back to the original labels later
# (label_encoders[col].inverse_transform(...)). Reusing a single encoder would
# keep only the mapping of the last column.
categorical_cols = ['ethnicity', 'contry_of_res', 'age_desc', 'relation']
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Remove the 'result' column when it exists ('result' might be redundant with
# the features); a frame without that column passes through unchanged.
df = df.drop(columns=['result'], errors='ignore')

In [None]:
# Feature matrix: every column except the label. Target vector: the label column.
X = df.loc[:, df.columns != 'Class/ASD']
y = df.loc[:, 'Class/ASD']

In [None]:
# Show the first ten rows of the encoded frame as a quick sanity check.
df.iloc[:10]

In [None]:
# Feature Engineering - correlation
# Pairwise correlations are only defined for numeric columns. Selecting them
# explicitly keeps this cell working when a column is still stored as text
# (DataFrame.corr() raises on non-numeric data in pandas >= 2.0).
plt.figure(figsize=(12, 10))
sns.heatmap(
    df.select_dtypes(include="number").corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Hold out 20% of the rows, then split that hold-out in half to obtain the
# validation and test sets. Both splits are stratified on the label and use a
# fixed random_state.
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the train, validation and test splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (32 and 16 units)
# followed by a single sigmoid output unit.
model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train
# Fit for 30 epochs with mini-batches of 16, evaluating on the validation split;
# the object returned by fit() is kept in `history`.
start_time = time.time()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30, batch_size=16, verbose=1
)
# Wall-clock seconds spent inside model.fit().
training_time = time.time() - start_time
# NOTE: despite its name, this variable has always held the *training*
# duration, not an inference latency. The name is kept only so that existing
# references to it keep working.
inference_time = training_time

In [None]:
# Report how many samples of each class ended up in every split.
for split_name, split_labels in (
    ("Train", y_train),
    ("Validation", y_val),
    ("Test", y_test),
):
    print(f"{split_name} label distribution:", np.bincount(split_labels))

In [None]:
# Predicted probabilities for the test split, flattened to one dimension,
# then thresholded: strictly above 0.5 is labelled 1, everything else 0.
y_pred_prob = model.predict(X_test).reshape(-1)
y_pred = (y_pred_prob > 0.5).astype(int)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

In [None]:
# ROC AUC needs both classes among the true labels, so show the test label
# counts first and only compute the score when two classes are present.
print("Test label distribution:", np.bincount(y_test))

if np.unique(y_test).size >= 2:
    print(f"AUC: {roc_auc_score(y_test, y_pred_prob):.2f}")
else:
    print("ROC AUC cannot be computed because only one class is present in y_test.")

In [None]:
# Confusion matrix of true vs. predicted test labels, drawn as an annotated
# heatmap with integer cell labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(
    cm,
    cmap="Blues",
    annot=True,
    fmt='d',
)
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Precision/recall trade-off over the decision thresholds on the test split
# (the threshold values themselves are not needed for the plot).
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
plt.plot(recall, precision)
plt.title("Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()

In [None]:
# Feature importance using SHAP: the first 100 (scaled) training rows act as
# the background sample, and attributions are computed for the first 100
# (scaled) test rows.
explainer = shap.DeepExplainer(model, data=X_train[:100])
shap_values = explainer.shap_values(X=X_test[:100])

In [None]:
# SHAP Summary Plot
import shap

# Use a small background dataset for SHAP
background = X_train[:100]
explainer = shap.DeepExplainer(model, background)

# Compute SHAP values for (at most) the first 100 test samples
shap_sample = X_test[:100]
shap_values = explainer.shap_values(shap_sample)

# For this single-output model the result arrives, depending on the installed
# SHAP version, either as a one-element list or as an array with a trailing
# output axis of size 1. Both are reduced to a 2-D (samples x features) matrix,
# which is what the summary plot needs for a single output.
if isinstance(shap_values, list) and len(shap_values) == 1:
    plot_values = shap_values[0]
else:
    # Multi-output or already 2-D results are passed through as they are.
    plot_values = shap_values
if (
    isinstance(plot_values, np.ndarray)
    and plot_values.ndim == 3
    and plot_values.shape[-1] == 1
):
    plot_values = plot_values[..., 0]

# Feature names are passed as a plain list taken from the pre-scaling frame.
shap.summary_plot(plot_values, shap_sample, feature_names=list(X.columns))