In [6]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV

def generate_dummy_data(num_samples=10000, seed=42):
    """Generate a synthetic blood-panel dataset for pregnancy prediction.

    Each row is one simulated sample. ``PregnancyStatus`` is picked with
    ``random.choice([0, 1])``; the three hormone columns are drawn from
    status-specific uniform ranges and the six remaining blood markers from
    status-specific normal distributions. ``MonthsPregnant`` is a random
    integer in 1..9 for pregnant rows and 0 otherwise.

    Note: the hCG ranges of the two classes do not overlap (0-5 for not
    pregnant, 5-50000 for pregnant), so the label is recoverable from hCG
    alone.

    Args:
        num_samples: Number of rows to generate.
        seed: Seed applied to both the global ``random`` and ``numpy.random``
            generators before sampling. The default (42) reproduces the
            dataset produced when the seed was hard-coded. Pass ``None`` to
            leave the global generators untouched (non-reproducible output).

    Returns:
        pandas.DataFrame with columns hCG, Progesterone, Estrogen, WBC,
        Hemoglobin, Platelets, Iron, Calcium, BloodGlucose, PregnancyStatus
        and MonthsPregnant, one row per sample.
    """
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    # (column, low, high) for the uniform hormone draws, per status.
    # Tuple order is the draw order, which fixes the numpy random stream.
    hormone_bounds = {
        1: (('hCG', 5, 50000), ('Progesterone', 10, 100), ('Estrogen', 100, 1000)),
        0: (('hCG', 0, 5), ('Progesterone', 0.1, 5), ('Estrogen', 20, 300)),
    }
    # (column, mean, std) for the normal blood-marker draws, per status.
    marker_params = {
        1: (
            ('WBC', 7000, 1500),
            ('Hemoglobin', 13, 1.5),
            ('Platelets', 250000, 50000),
            ('Iron', 60, 15),
            ('Calcium', 9.5, .5),
            ('BloodGlucose', 90, 10),
        ),
        0: (
            ('WBC', 7500, 1500),
            ('Hemoglobin', 14, 1.5),
            ('Platelets', 275000, 50000),
            ('Iron', 70, 15),
            ('Calcium', 9.8, .5),
            ('BloodGlucose', 95, 10),
        ),
    }

    # Insertion order here is the column order of the returned frame.
    columns = {
        name: []
        for name in (
            'hCG', 'Progesterone', 'Estrogen', 'WBC', 'Hemoglobin',
            'Platelets', 'Iron', 'Calcium', 'BloodGlucose',
            'PregnancyStatus', 'MonthsPregnant',
        )
    }

    for _ in range(num_samples):
        status = random.choice([0, 1])  # 0: not pregnant, 1: pregnant
        columns['PregnancyStatus'].append(status)

        for name, low, high in hormone_bounds[status]:
            columns[name].append(np.random.uniform(low, high))

        # Only pregnant rows consume a draw from the `random` stream here.
        months = random.randint(1, 9) if status == 1 else 0
        columns['MonthsPregnant'].append(months)

        for name, mean, std in marker_params[status]:
            columns[name].append(np.random.normal(mean, std))

    return pd.DataFrame(columns)

# Build the synthetic dataset (10000 rows, the generator's default size)
df = generate_dummy_data(num_samples=10000)


In [10]:
# Persist the generated dataset. With pandas' default index=True the row
# index is written as an unnamed first column of the CSV.
df.to_csv("new-dummy-data.csv")
# Shape goes last: a notebook cell only displays its final expression.
df.shape

In [9]:

# Preprocessing
# Mean-impute any missing values, then separate features from the target.
# MonthsPregnant is dropped from the features because it is non-zero only
# for pregnant rows and would give the label away.
df = df.fillna(df.mean())
X = df.drop(["PregnancyStatus", "MonthsPregnant"], axis=1)
y = df["PregnancyStatus"]

# Train-test split
# Done BEFORE scaling so that test rows never influence the scaler statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data: fit on the training rows only, then reuse that fit for the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, still defined in case other cells reference X_scaled.
X_scaled = scaler.transform(X)

# Model Training (Random Forest)
model_rf = RandomForestClassifier(random_state=42)

# Hyperparameter Tuning with Cross-Validation (to prevent overfitting)
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, so it
# only duplicated fits on old versions and makes fits fail on current ones.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# 162 parameter combinations x 5 folds = 810 fits; n_jobs=-1 uses all cores.
grid_search_rf = GridSearchCV(model_rf, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Best model (refit on the full training split by GridSearchCV)
best_model_rf = grid_search_rf.best_estimator_

# Make predictions
y_pred_rf = best_model_rf.predict(X_test)
y_pred_proba_rf = best_model_rf.predict_proba(X_test)[:, 1]  # probability of class 1 (pregnant)

# Evaluation
print("Tuned Random Forest:")
print(classification_report(y_test, y_pred_rf))
print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_proba_rf)}")

KeyboardInterrupt: 