# feature engineering 
- matched sample 

In [None]:
import pandas as pd
import numpy as np 
from sklearn.feature_selection import VarianceThreshold

# load matched data
# The steps below read the columns age, aq_total, eq_total, spq_total and
# sqr_total from this file.
df = pd.read_csv('data/processed/data_c4_matched_balanced.csv')

# 1. feature creation

# age group bins
# Intervals are right-closed: [0, 18], (18, 30], (30, 45], (45, 60], (60, inf).
# include_lowest=True keeps age 0 inside '0-18', and the open-ended last edge
# makes '61+' cover every age above 60, as its label says. With a hard upper
# edge of 100 and an open lower edge, such ages became NaN, and the later
# drop_first one-hot encoding turned them into all-zero rows that are
# indistinguishable from the '0-18' reference group.
# Missing or negative ages still map to NaN.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 30, 45, 60, np.inf],
    labels=['0-18', '19-30', '31-45', '46-60', '61+'],
    include_lowest=True,
)

# non-linear transformations
# log1p computes log(1 + x), so a score of 0 maps to 0 instead of -inf.
df['log_aq_total'] = np.log1p(df['aq_total'])
df['sqrt_age'] = np.sqrt(df['age'])

# interaction terms (element-wise products of two raw columns)
df['aq_eq_interaction'] = df['aq_total'] * df['eq_total']
# NOTE(review): 'sqp' looks like a typo for 'spq'; the column name is kept
# unchanged so that consumers of the saved CSV are not broken.
df['sqp_aq_interaction'] = df['spq_total'] * df['aq_total']
df['age_x_eq'] = df['age'] * df['eq_total']

# questionnaire score ratios
# The 1e-8 offset avoids division by zero; a zero denominator therefore
# yields a very large finite value rather than inf.
df['aq_spq_ratio'] = df['aq_total'] / (df['spq_total'] + 1e-8)
df['eq_sqr_ratio'] = df['eq_total'] / (df['sqr_total'] + 1e-8)

# boolean: high aq (more than one standard deviation above the mean),
# stored as 0/1. Mean and std are computed over the whole loaded sample.
df['high_aq'] = (df['aq_total'] > df['aq_total'].mean() + df['aq_total'].std()).astype(int)

# 2. feature reduction/selection

# correlation filter
# Work on the numeric predictors only (the target is excluded) and look at
# the strict upper triangle of the absolute correlation matrix, so each pair
# is inspected once and a column is only compared with the columns before it.
predictors = df.drop(columns=['autism_target'])
numeric_cols = predictors.select_dtypes(include=[np.number]).columns
corr_matrix = predictors[numeric_cols].corr().abs()
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# A column is dropped when its correlation with any earlier column is > 0.95.
exceeds_limit = (upper > 0.95).any()
to_drop = [name for name, flagged in exceeds_limit.items() if flagged]
df = df.drop(columns=to_drop)

# variance filter
# Re-select the numeric predictors, since the step above may have removed
# some, then drop every column that VarianceThreshold does not keep.
# NOTE(review): the variance is measured on the raw, unscaled columns, so the
# 0.1 cut-off depends on each feature's scale.
remaining = df.drop(columns=['autism_target'])
feature_cols = remaining.select_dtypes(include=[np.number]).columns
selector = VarianceThreshold(threshold=0.1).fit(df[feature_cols])
kept_mask = selector.get_support()
low_variance_cols = feature_cols[~kept_mask]
df = df.drop(columns=low_variance_cols)

# 3. one-hot encode new categorical features
# drop_first=True leaves out the first age_group level, which then serves as
# the reference category for the remaining indicator columns.
df = pd.get_dummies(data=df, columns=['age_group'], drop_first=True)

# 4. save engineered dataset
# index=False keeps the row index out of the CSV.
output_path = 'data/processed/data_c4_balanced_fe.csv'
df.to_csv(output_path, index=False)

# Report the final shape and column list as a quick sanity check.
final_columns = df.columns.to_list()
print("feature engineering complete. new shape:", df.shape)
print("columns:", final_columns)