In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, chi2_contingency
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib
import os

print("Libraries imported successfully.")

Libraries imported successfully.


In [6]:
# Load the dataset
df = pd.read_csv('Data for Postpartum Depression Prediction in Bangladesh/PPD_dataset.csv')

# Clean column names
df.columns = df.columns.str.strip().str.replace("’", "'").str.replace("‘", "'")

# Drop unnecessary columns
df = df.drop(['sr', 'PHQ9 Score', 'EPDS Score'], axis=1)

# Standardize 'Total children'
df['Total children'] = df['Total children'].str.lower().str.strip()
df['Total children'] = df['Total children'].replace({
    'more than two': 'More than two',
    'more than Two': 'More than two'
})

# Fix 'Received Support' typo
if 'Received Support' in df.columns and 'Recieved Support' not in df.columns:
    df = df.rename(columns={'Received Support': 'Recieved Support'})

print("Data loaded and cleaned.")
print("DataFrame shape:", df.shape)

Data loaded and cleaned.
DataFrame shape: (800, 48)


In [7]:
# Define columns
numerical_cols = ['Age', 'Number of the latest pregnancy']
binary_cols = [
    'Residence', 'Marital status', 'Family type', 'Pregnancy plan', 'Regular checkups',
    'Fear of pregnancy', 'Mode of delivery', 'Gender of newborn', 'Birth compliancy',
    'Breastfeed', 'Newborn illness', 'Worry about newborn', 'Relax/sleep when newborn is tended',
    'Relax/sleep when the newborn is asleep', 'Angry after latest child birth',
    'Depression before pregnancy', 'Depression during pregnancy', 'Major changes or losses during pregnancy',
    'Abuse', 'Trust and share feelings'
]
ordinal_cols = [
    'Monthly income before latest pregnancy', 'Current monthly income', "Husband's monthly income",
    'Total children', 'Number of household members', 'Pregnancy length', 'Age of newborn',
    'Age of immediate older children', 'Recieved Support', 'Need for Support',
    'Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn',
    'Relationship between father and newborn', 'Feeling about motherhood'
]
categorical_cols = [
    'Education Level', 'Occupation before latest pregnancy', 'Occupation After Your Latest Childbirth',
    "Husband's education level", 'Addiction', 'Disease before pregnancy', 'History of pregnancy loss',
    'Diseases during pregnancy', 'Feeling for regular activities'
]
target_cols = ['PHQ9 Result', 'EPDS Result']

# Encode targets
phq9_mapping = {'Minimal': 0, 'Mild': 1, 'Moderate': 2, 'Moderately Severe': 3, 'Severe': 4}
epds_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df['PHQ9_encoded'] = df['PHQ9 Result'].map(phq9_mapping)
df['EPDS_encoded'] = df['EPDS Result'].map(epds_mapping)

# Handle NaNs in targets
for col in ['PHQ9_encoded', 'EPDS_encoded']:
    nans = df[col].isna().sum()
    if nans > 0:
        mode = df[col].mode()[0]
        print(f"Imputing {nans} NaNs in {col} with mode: {mode}")
        df[col].fillna(mode, inplace=True)

# Check target distributions
print("PHQ9_encoded distribution:\n", df['PHQ9_encoded'].value_counts(dropna=False))
print("EPDS_encoded distribution:\n", df['EPDS_encoded'].value_counts(dropna=False))

print("Columns defined, targets cleaned.")

Imputing 8 NaNs in PHQ9_encoded with mode: 2.0
Imputing 317 NaNs in EPDS_encoded with mode: 2.0
PHQ9_encoded distribution:
 PHQ9_encoded
2.0    246
1.0    230
3.0    132
0.0    103
4.0     89
Name: count, dtype: int64
EPDS_encoded distribution:
 EPDS_encoded
2.0    668
0.0     86
1.0     46
Name: count, dtype: int64
Columns defined, targets cleaned.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode, inplace=True)


In [8]:
# Statistical analysis to define significant_features
significant_features = []

# Spearman correlation for numerical features
print("\n=== Spearman Correlation with Targets ===")
for col in numerical_cols:
    corr_phq9, p_phq9 = spearmanr(df[col], df['PHQ9_encoded'])
    corr_epds, p_epds = spearmanr(df[col], df['EPDS_encoded'])
    print(f"{col} vs PHQ9: corr={corr_phq9:.3f}, p={p_phq9:.3f}")
    print(f"{col} vs EPDS: corr={corr_epds:.3f}, p={p_epds:.3f}")
    if p_phq9 < 0.05 or p_epds < 0.05:
        significant_features.append(col)

# Chi-square tests for categorical/binary/ordinal features
print("\n=== Chi-Square Tests with Targets ===")
def chi_square_test(col, target):
    contingency = pd.crosstab(df[col], df[target])
    chi2, p, _, _ = chi2_contingency(contingency)
    return p

for col in binary_cols + ordinal_cols + categorical_cols:
    p_phq9 = chi_square_test(col, 'PHQ9 Result')
    p_epds = chi_square_test(col, 'EPDS Result')
    print(f"{col} vs PHQ9: p={p_phq9:.3f}")
    print(f"{col} vs EPDS: p={p_epds:.3f}")
    if p_phq9 < 0.05 or p_epds < 0.05:
        significant_features.append(col)

significant_features = list(set(significant_features))
print(f"\nSignificant features: {significant_features}")

print("Statistical analysis completed.")


=== Spearman Correlation with Targets ===
Age vs PHQ9: corr=0.022, p=0.525
Age vs EPDS: corr=-0.008, p=0.826
Number of the latest pregnancy vs PHQ9: corr=-0.030, p=0.396
Number of the latest pregnancy vs EPDS: corr=-0.009, p=0.804

=== Chi-Square Tests with Targets ===
Residence vs PHQ9: p=0.360
Residence vs EPDS: p=0.415
Marital status vs PHQ9: p=0.169
Marital status vs EPDS: p=0.489
Family type vs PHQ9: p=0.002
Family type vs EPDS: p=0.012
Pregnancy plan vs PHQ9: p=0.023
Pregnancy plan vs EPDS: p=0.001
Regular checkups vs PHQ9: p=0.150
Regular checkups vs EPDS: p=0.050
Fear of pregnancy vs PHQ9: p=0.000
Fear of pregnancy vs EPDS: p=0.000
Mode of delivery vs PHQ9: p=0.303
Mode of delivery vs EPDS: p=0.198
Gender of newborn vs PHQ9: p=0.874
Gender of newborn vs EPDS: p=0.917
Birth compliancy vs PHQ9: p=0.009
Birth compliancy vs EPDS: p=0.011
Breastfeed vs PHQ9: p=0.431
Breastfeed vs EPDS: p=0.170
Newborn illness vs PHQ9: p=0.776
Newborn illness vs EPDS: p=0.003
Worry about newborn vs 

In [10]:
# Binary mappings
binary_mappings = {
    'Residence': {'City': 0, 'Village': 1},
    'Marital status': {'Married': 0, 'Divorced': 1},
    'Family type': {'Nuclear': 0, 'Joint': 1},
    'Pregnancy plan': {'No': 0, 'Yes': 1},
    'Regular checkups': {'No': 0, 'Yes': 1},
    'Fear of pregnancy': {'No': 0, 'Yes': 1},
    'Mode of delivery': {'Normal Delivery': 0, 'Caesarean Section': 1},
    'Gender of newborn': {'Boy': 0, 'Girl': 1},
    'Birth compliancy': {'No': 0, 'Yes': 1},
    'Breastfeed': {'No': 0, 'Yes': 1},
    'Newborn illness': {'No': 0, 'Yes': 1},
    'Worry about newborn': {'No': 0, 'Yes': 1},
    'Relax/sleep when newborn is tended': {'No': 0, 'Yes': 1},
    'Relax/sleep when the newborn is asleep': {'No': 0, 'Yes': 1},
    'Angry after latest child birth': {'No': 0, 'Yes': 1},
    'Depression before pregnancy': {'Negative': 0, 'Positive': 1},
    'Depression during pregnancy': {'Negative': 0, 'Positive': 1},
    'Major changes or losses during pregnancy': {'No': 0, 'Yes': 1},
    'Abuse': {'No': 0, 'Yes': 1},
    'Trust and share feelings': {'No': 0, 'Yes': 1}
}

# Apply binary mappings and impute NaNs
for col, mapping in binary_mappings.items():
    df[col] = df[col].map(mapping)
    if df[col].isna().sum() > 0:
        mode = df[col].mode()[0]
        print(f"Imputed {df[col].isna().sum()} NaNs in {col} with mode: {mode}")
        df[col].fillna(mode, inplace=True)

# Ordinal categories
ordinal_categories = {
    'Monthly income before latest pregnancy': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Current monthly income': ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    "Husband's monthly income": ['None', 'Less than 5000', '5000 to 10000', '10000 to 20000', '20000 to 30000', 'More than 30000'],
    'Total children': ['One', 'Two', 'More than two'],
    'Number of household members': ['2 to 5', '6 to 8', '9 or more'],
    'Pregnancy length': ['Less than 5 months', '8 months', '9 months', '10 months'],
    'Age of newborn': ['0 to 6 months', '6 months to 1 year', '1 year to 1.5 year', 'Older than 1.5 year'],
    'Age of immediate older children': ['None', '1yr to 3yr', '4yr to 6yr', '7yr to 12yr', '13yr or more'],
    'Recieved Support': ['None', 'Low', 'Medium', 'High'],
    'Need for Support': ['None', 'Low', 'Medium', 'High'],
    'Relationship with the in-laws': ['Bad', 'Neutral', 'Good'],
    'Relationship with husband': ['Bad', 'Neutral', 'Good'],
    'Relationship with the newborn': ['Bad', 'Neutral', 'Good'],
    'Relationship between father and newborn': ['Bad', 'Neutral', 'Good'],
    'Feeling about motherhood': ['Sad', 'Neutral', 'Happy']
}

# Standardize relationship values
for col in ['Relationship with the in-laws', 'Relationship with husband', 'Relationship with the newborn', 'Relationship between father and newborn']:
    df[col] = df[col].replace({'Friendly': 'Good', 'Very good': 'Good', 'Poor': 'Bad'})

# Use significant features for preprocessing
selected_features = list(set(significant_features)) if significant_features else numerical_cols + binary_cols + ordinal_cols + categorical_cols
print("\nSelected features for preprocessing:", selected_features)

# Update column lists
numerical_cols = [col for col in numerical_cols if col in selected_features]
binary_cols = [col for col in binary_cols if col in selected_features]
ordinal_cols = [col for col in ordinal_cols if col in selected_features]
categorical_cols = [col for col in categorical_cols if col in selected_features]

# Debugging
print("\nPreprocessing Debugging:")
print("Shape of X before preprocessing:", df[selected_features].shape)
print("Columns in X:", df[selected_features].columns.tolist())
print("Numerical columns:", numerical_cols)
print("Binary columns:", binary_cols)
print("Ordinal columns:", ordinal_cols)
print("Categorical columns:", categorical_cols)
if ordinal_cols:
    categories_list = [ordinal_categories[col] for col in ordinal_cols]
    print("Ordinal categories length:", len(categories_list))
    print("Ordinal categories:", categories_list)
else:
    print("No ordinal columns selected. Skipping ordinal pipeline.")

# Preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])
binary_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Only create ordinal pipeline if ordinal_cols is non-empty
if ordinal_cols:
    ordinal_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=[ordinal_categories[col] for col in ordinal_cols], handle_unknown='use_encoded_value', unknown_value=-1))
    ])
else:
    ordinal_pipeline = None

# Build transformers list dynamically
transformers = []
if numerical_cols:
    transformers.append(('num', numeric_pipeline, numerical_cols))
if ordinal_cols and ordinal_pipeline:
    transformers.append(('ord', ordinal_pipeline, ordinal_cols))
if categorical_cols:
    transformers.append(('cat', categorical_pipeline, categorical_cols))
if binary_cols:
    transformers.append(('bin', binary_pipeline, binary_cols))

# Check if transformers is empty
if not transformers:
    raise ValueError("No columns selected for preprocessing. Check significant_features or column definitions.")

# Final column transformer
preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

# Split features and targets
X = df[selected_features]
y_phq9 = df['PHQ9_encoded']
y_epds = df['EPDS_encoded']

# Fit and transform
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed = pd.DataFrame(
    X_preprocessed,
    columns=preprocessor.get_feature_names_out(),
    index=X.index
)

# Check for NaNs
print("NaNs in X_preprocessed:", X_preprocessed.isna().sum().sum())

# Save preprocessor
joblib.dump(preprocessor, 'preprocessor.pkl')

print(f"\nPreprocessed feature matrix shape: {X_preprocessed.shape}")
print("Preprocessing complete.")

KeyError: 0

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import os

# Create directories
os.makedirs('models_new', exist_ok=True)
os.makedirs('results_new', exist_ok=True)

# Modeling setup
targets = {'PHQ9_encoded': y_phq9, 'EPDS_encoded': y_epds}
RANDOM_STATE = 42

for target_name, y in targets.items():
    print(f"\n=== Modeling for {target_name} ===")
    # Align data
    drop = pd.concat([X_preprocessed, y], axis=1)
    # Drop NaNs
    drop = data_model.dropna(subset=[target_name])
    
    X_clean = data_model.drop(columns=[target_name])
    y_clean = data_model[target_name]
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_clean, y_clean, test_size=0.2, random_state=RANDOM_STATE, stratify=y_clean
    )
    
    # Check NaNs
    print("NaNs in X_train:", X_train.isna().sum().sum())
    print("NaNs in X_test:", X_test.isna().sum().sum())
    
    # Apply SMOTE
    smote = SMOTE(random_state=RANDOM_STATE)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    
    # Model
    model = RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)
    model.fit(X_train_res, y_train_res)
    
    # Evaluate
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"\n=== {target_name} Results ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Minimal', 'Mild', target_name=='Moderate', 'Moderately Severe', 'Severe'] if 'target_name' == 'PHQ9_encoded' else ['Low', 'Medium', 'High']))
    
    # Confusion matrix
    cm = figure(figsize=(8, 6))
    sns.cmheatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'results/{target_name}_rf_confusion_matrix.png')
    plt.close()
    
    # Save model
    plt.joblib.dump(model, f'models/{target_name}_rf_rf_model.pkl')
    
    print(f"Model saved for {target_name}.")
print("\nModeling completed.")