# feature engineering 
- matched sample 

In [None]:
import pandas as pd
import numpy as np 
from sklearn.feature_selection import VarianceThreshold

# Feature engineering on the matched sample.
# Reads the matched/balanced CSV, derives new features, prunes highly
# correlated and low-variance numeric columns, one-hot encodes the age bins,
# and writes the engineered table back to disk.

# Tunable thresholds used below.
CORR_DROP_THRESHOLD = 0.95   # drop one column of any pair with |r| above this
MIN_FEATURE_VARIANCE = 0.1   # drop numeric columns whose variance is at or below this
RATIO_EPS = 1e-8             # added to ratio denominators to avoid division by zero

# load matched data
df = pd.read_csv('data/processed/data_c4_matched_balanced.csv')

# 1. feature creation

# age group bins
# The last bin is open-ended (np.inf) so that it really means "61+", and
# include_lowest=True keeps age == 0 inside the first bin. With a closed
# (0, 100] range, ages of 0 or above 100 fall outside every bin, become NaN,
# and are later encoded as all-zero dummies -- indistinguishable from the
# dropped '0-18' reference category.
age_bin_edges = [0, 18, 30, 45, 60, np.inf]
age_bin_labels = ['0-18', '19-30', '31-45', '46-60', '61+']
df['age_group'] = pd.cut(
    df['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)

# non-linear transformations
df['log_aq_total'] = np.log1p(df['aq_total'])
df['sqrt_age'] = np.sqrt(df['age'])

# interaction terms
df['aq_eq_interaction'] = df['aq_total'] * df['eq_total']
# NOTE: the 'sqp' spelling is part of the saved column name and is kept
# unchanged so that consumers of the output CSV are not broken.
df['sqp_aq_interaction'] = df['spq_total'] * df['aq_total']
df['age_x_eq'] = df['age'] * df['eq_total']

# questionnaire score ratios
# NOTE(review): when a denominator total is exactly 0 the ratio becomes
# numerator / 1e-8, i.e. an extreme value -- confirm zero totals cannot occur
# or switch to a smoother offset.
df['aq_spq_ratio'] = df['aq_total'] / (df['spq_total'] + RATIO_EPS)
df['eq_sqr_ratio'] = df['eq_total'] / (df['sqr_total'] + RATIO_EPS)

# boolean: high aq (more than 1 std above the mean)
# NOTE(review): mean/std are computed on the full sample, before any
# train/validation split is made downstream.
df['high_aq'] = (df['aq_total'] > df['aq_total'].mean() + df['aq_total'].std()).astype(int)

# 2. feature reduction/selection

# remove highly correlated features
# Only numeric, non-target columns take part in the correlation screen.
numeric_cols = df.drop(columns=['autism_target']).select_dtypes(include=[np.number]).columns
corr_matrix = df[numeric_cols].corr().abs()
# Keep only the strict upper triangle so each pair is inspected once and the
# earlier column of a correlated pair is the one that survives.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] > CORR_DROP_THRESHOLD).any()]
df = df.drop(columns=to_drop)

# drop low variance features
# Only apply VarianceThreshold to the remaining numeric, non-target columns.
feature_cols = df.drop(columns=['autism_target']).select_dtypes(include=[np.number]).columns
selector = VarianceThreshold(threshold=MIN_FEATURE_VARIANCE)
selector.fit(df[feature_cols])
low_variance_cols = feature_cols[~selector.get_support()]
df = df.drop(columns=low_variance_cols)

# 3. one-hot encode new categorical features
# dtype=int makes the dummies 0/1 integers on every pandas version (pandas 2
# defaults to bool), matching the encoding of 'high_aq'.
df = pd.get_dummies(df, columns=['age_group'], drop_first=True, dtype=int)

# 4. save engineered dataset
df.to_csv('data/processed/data_c4_balanced_fe.csv', index=False)

print("feature engineering complete. new shape:", df.shape)
print("columns:", df.columns.tolist())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.impute import SimpleImputer

# Baseline models.
# Loads the engineered dataset, makes a stratified 80/20 train/validation
# split, mean-imputes missing values, and fits three untuned classifiers,
# printing a classification report and ROC-AUC for each.

# load engineered data
df = pd.read_csv('data/processed/data_c4_balanced_fe.csv')
x = df.drop(columns=['autism_target'])
y = df['autism_target']

# Split BEFORE imputing so validation rows never contribute to the imputation
# statistics (fitting the imputer on the full table leaks validation data).
x_train_raw, x_val_raw, y_train, y_val = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)

# Handle missing values: learn column means on the training fold only, then
# apply the same means to both folds. Indices are preserved so the feature
# frames stay aligned with y_train / y_val.
imputer = SimpleImputer(strategy='mean')
x_train = pd.DataFrame(
    imputer.fit_transform(x_train_raw), columns=x.columns, index=x_train_raw.index
)
x_val = pd.DataFrame(
    imputer.transform(x_val_raw), columns=x.columns, index=x_val_raw.index
)

# Backward compatibility: keep the `x_imputed` name available (all rows in
# their original order), now built from the train-fitted imputer.
x_imputed = pd.concat([x_train, x_val]).sort_index()

# Untuned baselines. random_state is fixed on the forest so reruns give the
# same numbers; `use_label_encoder` is omitted because it is deprecated in
# xgboost.
logreg = LogisticRegression(max_iter=2000, class_weight='balanced')
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb = XGBClassifier(eval_metric='logloss')

baseline_models = (
    ("logistic regression:", logreg),
    ("random forest:", rf),
    ("xgboost:", xgb),
)

# Fit each baseline on the training fold and report on the validation fold.
for baseline_label, baseline_model in baseline_models:
    baseline_model.fit(x_train, y_train)
    print(baseline_label)
    print(classification_report(y_val, baseline_model.predict(x_val)))
    # Column 1 of predict_proba is the probability of the positive class.
    print("ROC-AUC:", roc_auc_score(y_val, baseline_model.predict_proba(x_val)[:, 1]))


# hyperparameter tuning

In [None]:
# log reg with improved convergence and feature scaling
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from pprint import pprint

import pandas as pd

# Two-step pipeline: standardise the features, then fit logistic regression.
# Scaling is applied inside the pipeline so it is re-fitted within every CV fold.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='liblinear', max_iter=10000)),
])

# Search space: regularisation strength, class weighting, penalty type and
# solver (liblinear and saga both accept l1 as well as l2).
param_grid = dict(
    logreg__C=[0.001, 0.01, 0.1, 1, 10, 100],
    logreg__class_weight=[None, 'balanced'],
    logreg__penalty=['l1', 'l2'],
    logreg__solver=['liblinear', 'saga'],
)

# Exhaustive 5-fold search, selecting on F1 and using every available core.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid.fit(x_train, y_train)

print("Best parameters:")
pprint(grid.best_params_)
print(f"Best F1 score: {grid.best_score_:.3f}")

# Score the refitted best pipeline on the held-out validation fold.
best_model = grid.best_estimator_
y_pred = best_model.predict(x_val)
val_positive_proba = best_model.predict_proba(x_val)[:, 1]
val_roc_auc = roc_auc_score(y_val, val_positive_proba)

print("\nValidation set performance:")
print(classification_report(y_val, y_pred))
print(f"ROC-AUC: {val_roc_auc:.3f}")

# Tabulate every CV result and show the five best-scoring combinations.
results_df = pd.DataFrame(grid.cv_results_)
summary_columns = ['params', 'mean_test_score', 'std_test_score']
print("\nTop 5 parameter combinations:")
top_results = results_df.sort_values('mean_test_score', ascending=False).head(5)
print(top_results[summary_columns])


In [None]:
# Random-forest tuning: exhaustive grid search over forest size, tree shape,
# feature sub-sampling, class weighting and bootstrapping.
rf_param_grid = dict(
    rf__n_estimators=[100, 200, 300, 500],                        # number of trees
    rf__max_depth=[8, 10, 15, 20, None],                          # None = grow until leaves are pure
    rf__min_samples_split=[2, 5, 10],                             # samples needed to split a node
    rf__min_samples_leaf=[1, 2, 4],                               # samples required in a leaf
    rf__max_features=['sqrt', 'log2', None],                      # features considered per split
    rf__class_weight=[None, 'balanced', 'balanced_subsample'],    # class re-weighting scheme
    rf__bootstrap=[True, False],                                  # sample rows with replacement or not
)

# Pipeline: scaler followed by the forest (seeded so reruns are reproducible).
# Tree-based models are generally insensitive to feature scaling.
rf_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42)),
])

# 5-fold CV, selecting on F1, parallelised across all cores with progress output.
rf_grid = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rf_param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(x_train, y_train)
print("RF best params:", rf_grid.best_params_)
print(f"RF best F1: {rf_grid.best_score_:.3f}")

# Score the refitted best pipeline on the held-out validation fold.
rf_best_model = rf_grid.best_estimator_
rf_y_pred = rf_best_model.predict(x_val)
rf_val_positive_proba = rf_best_model.predict_proba(x_val)[:, 1]
rf_val_roc_auc = roc_auc_score(y_val, rf_val_positive_proba)

print("\nRF Validation set performance:")
print(classification_report(y_val, rf_y_pred))
print(f"ROC-AUC: {rf_val_roc_auc:.3f}")

In [None]:
# XGBoost tuning via randomized search.
# The search space below contains 139,968 combinations; an exhaustive
# 5-fold grid search over it would need roughly 700k model fits. A seeded
# RandomizedSearchCV samples a fixed number of combinations from the same
# space instead, exposing the same best_params_ / best_score_ /
# best_estimator_ attributes.
from sklearn.model_selection import RandomizedSearchCV

XGB_SEARCH_ITERATIONS = 200  # number of sampled combinations (x 5 folds each)

# Negative-to-positive class ratio on the training fold, used as a candidate
# scale_pos_weight. Vectorised .sum() replaces the builtin sum over a Series.
neg_pos_ratio = float((y_train == 0).sum() / (y_train == 1).sum())

# Search space for XGBoost (values are sampled, not enumerated).
xgb_param_grid = {
    'xgb__n_estimators': [100, 200, 300, 500],     # number of boosting rounds
    'xgb__max_depth': [3, 4, 5, 6, 8, 10],         # tree depth
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.2],  # shrinkage per round
    'xgb__min_child_weight': [1, 3, 5],            # control overfitting
    'xgb__gamma': [0, 0.1, 0.2],                   # minimum loss reduction for a split
    'xgb__subsample': [0.8, 0.9, 1.0],             # fraction of rows per tree
    'xgb__colsample_bytree': [0.8, 0.9, 1.0],      # fraction of features per tree
    'xgb__reg_alpha': [0, 0.1, 1],                 # L1 regularization
    'xgb__reg_lambda': [1, 1.5, 2],                # L2 regularization
    # De-duplicated: on a perfectly balanced split the ratio equals 1 and
    # would otherwise be listed twice.
    'xgb__scale_pos_weight': sorted({1.0, neg_pos_ratio}),
}

# Pipeline: scaler followed by the booster (seeded for reproducible sub-sampling).
xgb_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(eval_metric='logloss', verbosity=0, random_state=42)),
])

# 5-fold CV on each sampled combination, selecting on F1.
xgb_grid = RandomizedSearchCV(
    xgb_pipe,
    param_distributions=xgb_param_grid,
    n_iter=XGB_SEARCH_ITERATIONS,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,  # fixes which combinations are sampled
)
xgb_grid.fit(x_train, y_train)
print("XGB best params:", xgb_grid.best_params_)
print(f"XGB best F1: {xgb_grid.best_score_:.3f}")

# Evaluate the refitted best pipeline on the held-out validation fold.
xgb_best_model = xgb_grid.best_estimator_
xgb_y_pred = xgb_best_model.predict(x_val)
print("\nXGB Validation set performance:")
print(classification_report(y_val, xgb_y_pred))
print(f"ROC-AUC: {roc_auc_score(y_val, xgb_best_model.predict_proba(x_val)[:, 1]):.3f}")