In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('Data for Postpartum Depression Prediction in Bangladesh/PPD_dataset.csv')

# Clean column names
df.columns = df.columns.str.strip().str.replace("’", "'").str.replace("‘", "'")

# Drop unnecessary columns
df = df.drop(['sr', 'PHQ9 Score', 'EPDS Score'], axis=1)

# Fix: Standardize category text to match exactly in ordinal_categories
df['Total children'] = df['Total children'].str.lower().str.strip()
df['Total children'] = df['Total children'].replace({
    'more than two': 'More than two',  # Capitalize M to match ordinal_categories
    'more than Two': 'More than two'   # just in case
})

# Define columns
numerical_cols = ['Age', 'Number of the latest pregnancy']

binary_cols = [
    'Residence', 'Marital status', 'Family type', 'Pregnancy plan', 'Regular checkups',
    'Fear of pregnancy', 'Mode of delivery', 'Gender of newborn', 'Birth compliancy',
    'Breastfeed', 'Newborn illness', 'Worry about newborn', 'Relax/sleep when newborn is tended',
    'Relax/sleep when the newborn is asleep', 'Angry after latest child birth',
    'Depression before pregnancy', 'Depression during pregnancy', 'Major changes or losses during pregnancy',
    'Abuse', 'Trust and share feelings'
]

ordinal_cols = [
    'Monthly income before latest pregnancy', 'Current monthly income', "Husband's monthly income",
    'Total children', 'Number of household members', 'Pregnancy length', 'Age of newborn',
    'Age of immediate older children', 'Recieved Support', 'Need for Support',
    'Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn',
    'Relationship between father and newborn', 'Feeling about motherhood'
]

categorical_cols = [
    'Education Level', 'Occupation before latest pregnancy', 'Occupation After Your Latest Childbirth',
    "Husband's education level", 'Addiction', 'Disease before pregnancy', 'History of pregnancy loss',
    'Diseases during pregnancy', 'Feeling for regular activities'
]

# Binary mappings (unchanged)
binary_mappings = {
    'Residence': {'City': 0, 'Village': 1},
    'Marital status': {'Married': 0, 'Divorced': 1},
    'Family type': {'Nuclear': 0, 'Joint': 1},
    'Pregnancy plan': {'No': 0, 'Yes': 1},
    'Regular checkups': {'No': 0, 'Yes': 1},
    'Fear of pregnancy': {'No': 0, 'Yes': 1},
    'Mode of delivery': {'Normal Delivery': 0, 'Caesarean Section': 1},
    'Gender of newborn': {'Boy': 0, 'Girl': 1},
    'Birth compliancy': {'No': 0, 'Yes': 1},
    'Breastfeed': {'No': 0, 'Yes': 1},
    'Newborn illness': {'No': 0, 'Yes': 1},
    'Worry about newborn': {'No': 0, 'Yes': 1},
    'Relax/sleep when newborn is tended': {'No': 0, 'Yes': 1},
    'Relax/sleep when the newborn is asleep': {'No': 0, 'Yes': 1},
    'Angry after latest child birth': {'No': 0, 'Yes': 1},
    'Depression before pregnancy': {'Negative': 0, 'Positive': 1},
    'Depression during pregnancy': {'Negative': 0, 'Positive': 1},
    'Major changes or losses during pregnancy': {'No': 0, 'Yes': 1},
    'Abuse': {'No': 0, 'Yes': 1},
    'Trust and share feelings': {'No': 0, 'Yes': 1}
}

# Apply binary mappings
for col, mapping in binary_mappings.items():
    df[col] = df[col].map(mapping)

# Ordinal categories (fixed 'More than two' capitalization)
ordinal_categories = {
    'Monthly income before latest pregnancy': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Current monthly income': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    "Husband's monthly income": ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Total children': ['One', 'Two', 'More than two'],  # Note lowercase 'two' here must match the data
    'Number of household members': ['2 to 5', '6 to 8', '9 or more'],
    'Pregnancy length': ['Less than 5 months', '8 months', '9 months', '10 months'],
    'Age of newborn': ['0 to 6 months', '6 months to 1 year', '1 year to 1.5 year', 'Older than 1.5 year'],
    'Age of immediate older children': ['None', '1yr to 3yr', '4yr to 6yr', '7yr to 12yr', '13yr or more'],
    'Recieved Support': ['None', 'Low', 'Medium', 'High'],
    'Need for Support': ['None', 'Low', 'Medium', 'High'],
    'Relationship with the in-laws': ['Bad', 'Neutral', 'Good'],
    'Relationship with husband': ['Bad', 'Neutral', 'Good'],
    'Relationship with the newborn': ['Bad', 'Neutral', 'Good'],
    'Relationship between father and newborn': ['Bad', 'Neutral', 'Good'],
    'Feeling about motherhood': ['Sad', 'Neutral', 'Happy']
}

# Standardize relationship values
for col in ['Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn', 'Relationship between father and newborn']:
    df[col] = df[col].replace({'Friendly': 'Good', 'Very good': 'Good', 'Poor': 'Bad'})

# Preprocessing pipelines

# Numeric pipeline: impute with mean + scale
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Ordinal pipeline: impute with most frequent + ordinal encode with categories
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[ordinal_categories[col] for col in ordinal_cols],
                              handle_unknown='use_encoded_value',
                              unknown_value=-1))
])

# Categorical pipeline: impute with most frequent + one-hot encode
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

binary_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

transformers.append(('bin', binary_pipeline, binary_cols))


# Final column transformer
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='drop'
)

# Split features and targets
X = df.drop(['PHQ9 Result', 'EPDS Result'], axis=1)
y_phq9 = df['PHQ9 Result']
y_epds = df['EPDS Result']

# Fit and transform
X_preprocessed = preprocessor.fit_transform(X)

print(f"Preprocessed feature matrix shape: {X_preprocessed.shape}")
print("Preprocessing complete. Ready for modeling.")


Preprocessed feature matrix shape: (800, 47)
Preprocessing complete. Ready for modeling.


In [26]:
print(df.columns.tolist())


['Age', 'Residence', 'Education Level', 'Marital status', 'Occupation before latest pregnancy', 'Monthly income before latest pregnancy', 'Occupation After Your Latest Childbirth', 'Current monthly income', "Husband's education level", "Husband's monthly income", 'Addiction', 'Total children', 'Disease before pregnancy', 'History of pregnancy loss', 'Family type', 'Number of household members', 'Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn', 'Relationship between father and newborn', 'Feeling about motherhood', 'Recieved Support', 'Need for Support', 'Major changes or losses during pregnancy', 'Abuse', 'Trust and share feelings', 'Number of the latest pregnancy', 'Pregnancy length', 'Pregnancy plan', 'Regular checkups', 'Fear of pregnancy', 'Diseases during pregnancy', 'Age of newborn', 'Age of immediate older children', 'Mode of delivery', 'Gender of newborn', 'Birth compliancy', 'Breastfeed', 'Newborn illness', 'Worry about newborn', 'Re

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, chi2_contingency
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os
import joblib

# Load the dataset
df = pd.read_csv('Data for Postpartum Depression Prediction in Bangladesh/PPD_dataset.csv')

# Clean column names
df.columns = df.columns.str.strip().str.replace("’", "'").str.replace("‘", "'")

# Drop unnecessary columns
df = df.drop(['sr', 'PHQ9 Score', 'EPDS Score'], axis=1)

# Fix: Standardize category text to match exactly in ordinal_categories
df['Total children'] = df['Total children'].str.lower().str.strip()
df['Total children'] = df['Total children'].replace({
    'more than two': 'More than two',
    'more than Two': 'More than two'
})

# Check for 'Received Support' vs 'Recieved Support'
if 'Received Support' in df.columns and 'Recieved Support' not in df.columns:
    df = df.rename(columns={'Received Support': 'Recieved Support'})

# Define columns
numerical_cols = ['Age', 'Number of the latest pregnancy']
binary_cols = [
    'Residence', 'Marital status', 'Family type', 'Pregnancy plan', 'Regular checkups',
    'Fear of pregnancy', 'Mode of delivery', 'Gender of newborn', 'Birth compliancy',
    'Breastfeed', 'Newborn illness', 'Worry about newborn', 'Relax/sleep when newborn is tended',
    'Relax/sleep when the newborn is asleep', 'Angry after latest child birth',
    'Depression before pregnancy', 'Depression during pregnancy', 'Major changes or losses during pregnancy',
    'Abuse', 'Trust and share feelings'
]
ordinal_cols = [
    'Monthly income before latest pregnancy', 'Current monthly income', "Husband's monthly income",
    'Total children', 'Number of household members', 'Pregnancy length', 'Age of newborn',
    'Age of immediate older children', 'Recieved Support', 'Need for Support',
    'Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn',
    'Relationship between father and newborn', 'Feeling about motherhood'
]
categorical_cols = [
    'Education Level', 'Occupation before latest pregnancy', 'Occupation After Your Latest Childbirth',
    "Husband's education level", 'Addiction', 'Disease before pregnancy', 'History of pregnancy loss',
    'Diseases during pregnancy', 'Feeling for regular activities'
]
target_cols = ['PHQ9 Result', 'EPDS Result']

# 1. Descriptive Statistics
print("=== Descriptive Statistics ===")
print("\nNumerical Columns:")
print(df[numerical_cols].describe())
print("\nCategorical Columns:")
for col in binary_cols + ordinal_cols + categorical_cols:
    print(f"\n{col}:\n{df[col].value_counts()}")
print("\nTarget Columns:")
for col in target_cols:
    print(f"\n{col}:\n{df[col].value_counts()}")

# 2. Statistical Analysis
# Encode targets for correlation analysis
phq9_mapping = {'Minimal': 0, 'Mild': 1, 'Moderate': 2, 'Moderately Severe': 3, 'Severe': 4}
epds_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df['PHQ9_encoded'] = df['PHQ9 Result'].map(phq9_mapping)
df['EPDS_encoded'] = df['EPDS Result'].map(epds_mapping)

# Spearman Correlation for Numerical Features vs. Targets
print("\n=== Spearman Correlation with Targets ===")
significant_features = []
for col in numerical_cols:
    corr_phq9, p_phq9 = spearmanr(df[col], df['PHQ9_encoded'])
    corr_epds, p_epds = spearmanr(df[col], df['EPDS_encoded'])
    print(f"{col} vs PHQ9: correlation={corr_phq9:.3f}, p-value={p_phq9:.3f}")
    print(f"{col} vs EPDS: correlation={corr_epds:.3f}, p-value={p_epds:.3f}")
    if p_phq9 < 0.05 or p_epds < 0.05:
        significant_features.append(col)

# Chi-Square Tests for Categorical Features vs. Targets
print("\n=== Chi-Square Tests with Targets ===")
def chi_square_test(col, target):
    contingency = pd.crosstab(df[col], df[target])
    chi2, p, dof, ex = chi2_contingency(contingency)
    return p

for col in binary_cols + ordinal_cols + categorical_cols:
    p_phq9 = chi_square_test(col, 'PHQ9 Result')
    p_epds = chi_square_test(col, 'EPDS Result')
    print(f"{col} vs PHQ9: p-value={p_phq9:.3f}")
    print(f"{col} vs EPDS: p-value={p_epds:.3f}")
    if p_phq9 < 0.05 or p_epds < 0.05:
        significant_features.append(col)

# Ensure unique significant features
significant_features = list(set(significant_features))
print(f"\nSignificant features: {significant_features}")

# 3. Visualizations
os.makedirs('plots', exist_ok=True)

# Function to sanitize file names
def sanitize_filename(name):
    return name.replace('/', '_').replace('\\', '_').replace(':', '_').replace(' ', '_')

# Histograms for Numerical Columns
for col in numerical_cols:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.savefig(f'plots/hist_{sanitize_filename(col)}.png')
    plt.close()

# Bar Plots for Categorical Columns
for col in binary_cols + ordinal_cols + categorical_cols:
    plt.figure(figsize=(10, 6))
    df[col].value_counts().head(10).plot(kind='bar')
    plt.title(f'Top Categories in {col}')
    plt.xticks(rotation=45)
    plt.savefig(f'plots/bar_{sanitize_filename(col)}.png')
    plt.close()

# Box Plots for Numerical Columns vs. Targets
for col in numerical_cols:
    for target in target_cols:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=target, y=col, data=df)
        plt.title(f'{col} by {target}')
        plt.xticks(rotation=45)
        plt.savefig(f'plots/box_{sanitize_filename(col)}_vs_{sanitize_filename(target)}.png')
        plt.close()

# Stacked Bar Plots for Categorical Columns vs. Targets
for col in binary_cols + ordinal_cols + categorical_cols:
    for target in target_cols:
        plt.figure(figsize=(12, 6))
        ct = pd.crosstab(df[col], df[target], normalize='index')
        ct.plot(kind='bar', stacked=True, ax=plt.gca())
        plt.title(f'{col} vs {target}')
        plt.xticks(rotation=45)
        plt.legend(title=target)
        plt.savefig(f'plots/stacked_{sanitize_filename(col)}_vs_{sanitize_filename(target)}.png')
        plt.close()

# Correlation Heatmap
corr_matrix = df[numerical_cols + ['PHQ9_encoded', 'EPDS_encoded']].corr(method='spearman')
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Spearman Correlation Matrix')
plt.savefig('plots/correlation_heatmap.png')
plt.close()

# 4. Preprocessing
# Binary mappings
binary_mappings = {
    'Residence': {'City': 0, 'Village': 1},
    'Marital status': {'Married': 0, 'Divorced': 1},
    'Family type': {'Nuclear': 0, 'Joint': 1},
    'Pregnancy plan': {'No': 0, 'Yes': 1},
    'Regular checkups': {'No': 0, 'Yes': 1},
    'Fear of pregnancy': {'No': 0, 'Yes': 1},
    'Mode of delivery': {'Normal Delivery': 0, 'Caesarean Section': 1},
    'Gender of newborn': {'Boy': 0, 'Girl': 1},
    'Birth compliancy': {'No': 0, 'Yes': 1},
    'Breastfeed': {'No': 0, 'Yes': 1},
    'Newborn illness': {'No': 0, 'Yes': 1},
    'Worry about newborn': {'No': 0, 'Yes': 1},
    'Relax/sleep when newborn is tended': {'No': 0, 'Yes': 1},
    'Relax/sleep when the newborn is asleep': {'No': 0, 'Yes': 1},
    'Angry after latest child birth': {'No': 0, 'Yes': 1},
    'Depression before pregnancy': {'Negative': 0, 'Positive': 1},
    'Depression during pregnancy': {'Negative': 0, 'Positive': 1},
    'Major changes or losses during pregnancy': {'No': 0, 'Yes': 1},
    'Abuse': {'No': 0, 'Yes': 1},
    'Trust and share feelings': {'No': 0, 'Yes': 1}
}

# Apply binary mappings
for col, mapping in binary_mappings.items():
    df[col] = df[col].map(mapping)

# Ordinal categories
ordinal_categories = {
    'Monthly income before latest pregnancy': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Current monthly income': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    "Husband's monthly income": ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Total children': ['One', 'Two', 'More than two'],
    'Number of household members': ['2 to 5', '6 to 8', '9 or more'],
    'Pregnancy length': ['Less than 5 months', '8 months', '9 months', '10 months'],
    'Age of newborn': ['0 to 6 months', '6 months to 1 year', '1 year to 1.5 year', 'Older than 1.5 year'],
    'Age of immediate older children': ['None', '1yr to 3yr', '4yr to 6yr', '7yr to 12yr', '13yr or more'],
    'Recieved Support': ['None', 'Low', 'Medium', 'High'],
    'Need for Support': ['None', 'Low', 'Medium', 'High'],
    'Relationship with the in-laws': ['Bad', 'Neutral', 'Good'],
    'Relationship with husband': ['Bad', 'Neutral', 'Good'],
    'Relationship with the newborn': ['Bad', 'Neutral', 'Good'],
    'Relationship between father and newborn': ['Bad', 'Neutral', 'Good'],
    'Feeling about motherhood': ['Sad', 'Neutral', 'Happy']
}

# Standardize relationship values
for col in ['Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn', 'Relationship between father and newborn']:
    df[col] = df[col].replace({'Friendly': 'Good', 'Very good': 'Good', 'Poor': 'Bad'})

# Use significant features for preprocessing, fallback to all if none are significant
selected_features = list(set(significant_features)) if significant_features else numerical_cols + binary_cols + ordinal_cols + categorical_cols
print(f"\nSelected features for preprocessing: {selected_features}")

# Update column lists to include only significant features
numerical_cols = [col for col in numerical_cols if col in selected_features]
binary_cols = [col for col in binary_cols if col in selected_features]
ordinal_cols = [col for col in ordinal_cols if col in selected_features]
categorical_cols = [col for col in categorical_cols if col in selected_features]

# Debugging
print("\nPreprocessing Debugging:")
print("Shape of X before preprocessing:", df[selected_features].shape)
print("Columns in X:", df[selected_features].columns.tolist())
print("Numerical columns:", numerical_cols)
print("Binary columns:", binary_cols)
print("Ordinal columns:", ordinal_cols)
print("Categorical columns:", categorical_cols)
if ordinal_cols:
    categories_list = [ordinal_categories[col] for col in ordinal_cols]
    print("Ordinal categories length:", len(categories_list))
    print("Ordinal categories:", categories_list)
else:
    print("No ordinal columns selected.")

# Preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[ordinal_categories[col] for col in ordinal_cols], handle_unknown='use_encoded_value', unknown_value=-1))
]) if ordinal_cols else None
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Build transformers list dynamically
transformers = []
if numerical_cols:
    transformers.append(('num', numeric_pipeline, numerical_cols))
if ordinal_cols:
    transformers.append(('ord', ordinal_pipeline, ordinal_cols))
if categorical_cols:
    transformers.append(('cat', categorical_pipeline, categorical_cols))

# Final column transformer
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='passthrough'
)

# Split features and targets
X = df[selected_features]
y_phq9 = df['PHQ9_encoded']
y_epds = df['EPDS_encoded']

# Fit and transform
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed = pd.DataFrame(
    X_preprocessed,
    columns=preprocessor.get_feature_names_out(),
    index=X.index  # This keeps the row alignment with y
)

# Save preprocessor for future use
joblib.dump(preprocessor, 'preprocessor.pkl')

print(f"\nPreprocessed feature matrix shape: {X_preprocessed.shape}")
print("Analysis and preprocessing complete. All plots saved in 'plots' directory. Ready for modeling.")

=== Descriptive Statistics ===

Numerical Columns:
              Age  Number of the latest pregnancy
count  800.000000                       800.00000
mean    27.713750                         1.63125
std      4.491328                         0.89514
min     13.000000                         1.00000
25%     25.000000                         1.00000
50%     27.000000                         1.00000
75%     31.000000                         2.00000
max     45.000000                         7.00000

Categorical Columns:

Residence:
Residence
City       636
Village    164
Name: count, dtype: int64

Marital status:
Marital status
Married     795
Divorced      5
Name: count, dtype: int64

Family type:
Family type
Nuclear    420
Joint      380
Name: count, dtype: int64

Pregnancy plan:
Pregnancy plan
Yes    567
No     233
Name: count, dtype: int64

Regular checkups:
Regular checkups
Yes    752
No      48
Name: count, dtype: int64

Fear of pregnancy:
Fear of pregnancy
Yes    487
No     313
Nam

In [28]:
# === 5. Modeling Setup ===

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import shap
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Ensure reproducibility
RANDOM_STATE = 42

# Create directories to save results
os.makedirs('models', exist_ok=True)
os.makedirs('results', exist_ok=True)
os.makedirs('shap_plots', exist_ok=True)

# Define target variables
targets = {
    'PHQ9_encoded': y_phq9,
    'EPDS_encoded': y_epds
}

# Define models and their hyperparameters
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'solver': ['lbfgs']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }
    },
    'SVM': {
        'model': SVC(probability=True, random_state=RANDOM_STATE),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {}
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (100, 50)],
            'activation': ['relu', 'tanh'],
            'solver': ['adam']
        }
    }
}

# Function to evaluate and save results
def evaluate_model(model, X_test, y_test, target_name, model_name):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    # Save classification report
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(f'results/{target_name}_{model_name}_classification_report.csv')

    # Save confusion matrix
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} Confusion Matrix for {target_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f'results/{target_name}_{model_name}_confusion_matrix.png')
    plt.close()

    return acc

# Iterate over each target variable
for target_name, y in targets.items():
    print(f"\n=== Modeling for {target_name} ===")

    X_preprocessed_df = pd.DataFrame(
    X_preprocessed,
    columns=preprocessor.get_feature_names_out(),
    index=X.index
    )
    data_model = pd.concat([X_preprocessed_df, y], axis=1)


# Drop NaNs in target
    data_model = data_model.dropna(subset=[target_name])

# Separate clean inputs
    X_clean = data_model.drop(columns=[target_name])
    y_clean = data_model[target_name]

# Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=RANDOM_STATE, stratify=y_clean
    )


# Optionally: drop rows with any NaN values
X_train = X_train.dropna()
y_train = y_train[X_train.index]  # Ensure matching labels

# OR drop columns (less common)
# X_train = X_train.dropna(axis=1)

from sklearn.pipeline import Pipeline

# Loop through models
for model_name, model_info in models.items():
    print(f"\nTraining {model_name} for target: {target_name}")

    # Set up GridSearchCV
    grid = GridSearchCV(
        estimator=model_info['model'],
        param_grid=model_info['params'],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    # Fit model
    grid.fit(X_train, y_train)

    # Save best model
    joblib.dump(grid.best_estimator_, f'models/{target_name}_{model_name}.joblib')

    # Evaluate best model
    accuracy = evaluate_model(grid.best_estimator_, X_test, y_test, target_name, model_name)
    print(f"{model_name} Accuracy for {target_name}: {accuracy:.4f}")

  



=== Modeling for PHQ9_encoded ===

=== Modeling for EPDS_encoded ===

Training LogisticRegression for target: EPDS_encoded
Fitting 5 folds for each of 4 candidates, totalling 20 fits


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values