In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, chi2_contingency
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import shap
import joblib
import os


In [2]:
# Load the dataset
df = pd.read_csv('Data for Postpartum Depression Prediction in Bangladesh/PPD_dataset.csv')

# Clean column names
df.columns = df.columns.str.strip().str.replace("’", "'").str.replace("‘", "'")

# Drop unnecessary columns
df = df.drop(['sr', 'PHQ9 Score', 'EPDS Score'], axis=1)

# Fix: Standardize category text to match exactly in ordinal_categories
df['Total children'] = df['Total children'].str.lower().str.strip()
df['Total children'] = df['Total children'].replace({
    'more than two': 'More than two',  # Capitalize M to match ordinal_categories
    'more than Two': 'More than two'   # just in case
})

# Check for 'Received Support' vs 'Recieved Support'
if 'Received Support' in df.columns and 'Recieved Support' not in df.columns:
    df = df.rename(columns={'Received Support': 'Recieved Support'})

# Define columns
numerical_cols = ['Age', 'Number of the latest pregnancy']
binary_cols = [
    'Residence', 'Marital status', 'Family type', 'Pregnancy plan', 'Regular checkups',
    'Fear of pregnancy', 'Mode of delivery', 'Gender of newborn', 'Birth compliancy',
    'Breastfeed', 'Newborn illness', 'Worry about newborn', 'Relax/sleep when newborn is tended',
    'Relax/sleep when the newborn is asleep', 'Angry after latest child birth',
    'Depression before pregnancy', 'Depression during pregnancy', 'Major changes or losses during pregnancy',
    'Abuse', 'Trust and share feelings'
]
ordinal_cols = [
    'Monthly income before latest pregnancy', 'Current monthly income', "Husband's monthly income",
    'Total children', 'Number of household members', 'Pregnancy length', 'Age of newborn',
    'Age of immediate older children', 'Recieved Support', 'Need for Support',
    'Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn',
    'Relationship between father and newborn', 'Feeling about motherhood'
]
categorical_cols = [
    'Education Level', 'Occupation before latest pregnancy', 'Occupation After Your Latest Childbirth',
    "Husband's education level", 'Addiction', 'Disease before pregnancy', 'History of pregnancy loss',
    'Diseases during pregnancy', 'Feeling for regular activities'
]

# Binary mappings
binary_mappings = {
    'Residence': {'City': 0, 'Village': 1},
    'Marital status': {'Married': 0, 'Divorced': 1},
    'Family type': {'Nuclear': 0, 'Joint': 1},
    'Pregnancy plan': {'No': 0, 'Yes': 1},
    'Regular checkups': {'No': 0, 'Yes': 1},
    'Fear of pregnancy': {'No': 0, 'Yes': 1},
    'Mode of delivery': {'Normal Delivery': 0, 'Caesarean Section': 1},
    'Gender of newborn': {'Boy': 0, 'Girl': 1},
    'Birth compliancy': {'No': 0, 'Yes': 1},
    'Breastfeed': {'No': 0, 'Yes': 1},
    'Newborn illness': {'No': 0, 'Yes': 1},
    'Worry about newborn': {'No': 0, 'Yes': 1},
    'Relax/sleep when newborn is tended': {'No': 0, 'Yes': 1},
    'Relax/sleep when the newborn is asleep': {'No': 0, 'Yes': 1},
    'Angry after latest child birth': {'No': 0, 'Yes': 1},
    'Depression before pregnancy': {'Negative': 0, 'Positive': 1},
    'Depression during pregnancy': {'Negative': 0, 'Positive': 1},
    'Major changes or losses during pregnancy': {'No': 0, 'Yes': 1},
    'Abuse': {'No': 0, 'Yes': 1},
    'Trust and share feelings': {'No': 0, 'Yes': 1}
}

# Apply binary mappings
for col, mapping in binary_mappings.items():
    df[col] = df[col].map(mapping)

# Ordinal categories
ordinal_categories = {
    'Monthly income before latest pregnancy': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Current monthly income': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    "Husband's monthly income": ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Total children': ['One', 'Two', 'More than two'],
    'Number of household members': ['2 to 5', '6 to 8', '9 or more'],
    'Pregnancy length': ['Less than 5 months', '8 months', '9 months', '10 months'],
    'Age of newborn': ['0 to 6 months', '6 months to 1 year', '1 year to 1.5 year', 'Older than 1.5 year'],
    'Age of immediate older children': ['None', '1yr to 3yr', '4yr to 6yr', '7yr to 12yr', '13yr or more'],
    'Recieved Support': ['None', 'Low', 'Medium', 'High'],
    'Need for Support': ['None', 'Low', 'Medium', 'High'],
    'Relationship with the in-laws': ['Bad', 'Neutral', 'Good'],
    'Relationship with husband': ['Bad', 'Neutral', 'Good'],
    'Relationship with the newborn': ['Bad', 'Neutral', 'Good'],
    'Relationship between father and newborn': ['Bad', 'Neutral', 'Good'],
    'Feeling about motherhood': ['Sad', 'Neutral', 'Happy']
}

# Standardize relationship values
for col in ['Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn', 'Relationship between father and newborn']:
    df[col] = df[col].replace({'Friendly': 'Good', 'Very good': 'Good', 'Poor': 'Bad'})

# Use significant features for preprocessing, fallback to all if none are significant
selected_features = list(set(significant_features)) if significant_features else numerical_cols + binary_cols + ordinal_cols + categorical_cols
print(f"\nSelected features for preprocessing: {selected_features}")

# Update column lists to include only significant features
numerical_cols = [col for col in numerical_cols if col in selected_features]
binary_cols = [col for col in binary_cols if col in selected_features]
ordinal_cols = [col for col in ordinal_cols if col in selected_features]
categorical_cols = [col for col in categorical_cols if col in selected_features]

# Debugging
print("\nPreprocessing Debugging:")
print("Shape of X before preprocessing:", df[selected_features].shape)
print("Columns in X:", df[selected_features].columns.tolist())
print("Numerical columns:", numerical_cols)
print("Binary columns:", binary_cols)
print("Ordinal columns:", ordinal_cols)
print("Categorical columns:", categorical_cols)
if ordinal_cols:
    categories_list = [ordinal_categories[col] for col in ordinal_cols]
    print("Ordinal categories length:", len(categories_list))
    print("Ordinal categories:", categories_list)
else:
    print("No ordinal columns selected.")

# Preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[ordinal_categories[col] for col in ordinal_cols], handle_unknown='use_encoded_value', unknown_value=-1))
]) if ordinal_cols else None
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Binary pipeline: impute with most frequent
binary_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Build transformers list dynamically
transformers = []
if numerical_cols:
    transformers.append(('num', numeric_pipeline, numerical_cols))
if ordinal_cols:
    transformers.append(('ord', ordinal_pipeline, ordinal_cols))
if categorical_cols:
    transformers.append(('cat', categorical_pipeline, categorical_cols))
if binary_cols:
    transformers.append(('bin', binary_pipeline, binary_cols))

# Final column transformer
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='drop'
)

# Split features and targets
X = df[selected_features]
y_phq9 = df['PHQ9_encoded']
y_epds = df['EPDS_encoded']

# Fit and transform
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed = pd.DataFrame(
    X_preprocessed,
    columns=preprocessor.get_feature_names_out(),
    index=X.index  # This keeps the row alignment with y
)

# Save preprocessor for future use
joblib.dump(preprocessor, 'preprocessor.pkl')

print(f"\nPreprocessed feature matrix shape: {X_preprocessed.shape}")
print("Analysis and preprocessing complete. All plots saved in 'plots' directory. Ready for modeling.")

# The rest of the notebook continues unchanged with modeling and SMOTE usage


NameError: name 'significant_features' is not defined