In [None]:
# Show the kernel's current working directory; the relative CSV path used below resolves against it.
pwd

In [None]:
import pandas as pd

In [None]:
# Load the maternal health risk CSV (path is relative to the working directory) into a DataFrame.
maternal_dataset = pd.read_csv("Maternal Health Risk Data Set.csv")

In [None]:
# Preview the first rows to sanity-check the load.
maternal_dataset.head()

In [None]:
# Summarise the columns (dtypes and non-null counts) before any encoding.
maternal_dataset.info()

In [None]:
# Map the RiskLevel to numerical values (0 = low, 1 = mid, 2 = high).
# .map() replaces every label that is not a key of the dict with NaN, so the
# keys must match the stored labels exactly. The labels are normalised
# (trimmed, lower-cased) first and looked up with the same
# 'low risk' / 'mid risk' / 'high risk' spelling used by the other mapping
# cells in this notebook.
risk_mapping = {'low risk': 0, 'mid risk': 1, 'high risk': 2}
maternal_dataset['RiskLevel'] = (
    maternal_dataset['RiskLevel']
    .astype(str)
    .str.strip()
    .str.lower()
    .map(risk_mapping)
)

# Check the result
print(maternal_dataset['RiskLevel'].value_counts())


In [None]:
import pandas as pd

# Start again from the raw CSV so RiskLevel holds its original text labels,
# normalise them (string type, trimmed, lower-case) and list the distinct values.
maternal_dataset = pd.read_csv("Maternal Health Risk Data Set.csv").assign(
    RiskLevel=lambda frame: frame['RiskLevel'].astype(str).str.strip().str.lower()
)
print(maternal_dataset['RiskLevel'].unique())


In [None]:
# Encode the normalised RiskLevel labels as ordinal integers.
risk_mapping = {'low risk': 0, 'mid risk': 1, 'high risk': 2}
risk_labels = maternal_dataset['RiskLevel']

# Mapping a column that is already encoded would turn every value into NaN
# (0/1/2 are not keys of the dict), so the cell is a no-op on a re-run.
if not risk_labels.isin(list(risk_mapping.values())).all():
    encoded_risk = risk_labels.map(risk_mapping)
    # Fail loudly on labels the mapping does not know instead of carrying
    # silent NaNs into the train/test split.
    unexpected_labels = risk_labels[encoded_risk.isna()].unique()
    if len(unexpected_labels) > 0:
        raise ValueError(f"Unexpected RiskLevel labels: {list(unexpected_labels)}")
    maternal_dataset['RiskLevel'] = encoded_risk

In [None]:
# Show how many rows fall into each encoded risk class.
print(maternal_dataset['RiskLevel'].value_counts())

In [None]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Target (y) is the encoded RiskLevel column; features (X) are all remaining columns.
y = maternal_dataset['RiskLevel']
X = maternal_dataset.drop(columns=['RiskLevel'])

In [None]:

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
# (Scaling is handled separately in the next cell.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Inspect the first 5 rows of the scaled training data.
print(X_train_scaled[:5])

In [None]:
# Report the shapes of the feature/target splits to confirm the 80/20 partition.
print("Training set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Initialize the logistic-regression classifier, allowing up to 5000 solver iterations.
log_reg = LogisticRegression(max_iter=5000)

In [None]:
# Train the model
# NOTE(review): this fits on the unscaled X_train; the X_train_scaled array
# computed earlier is not used here — confirm whether that is intended.
log_reg.fit(X_train, y_train)
print("Logistic Regression model trained!")

In [None]:
# Predict the risk class for every held-out test row.
y_pred_logreg = log_reg.predict(X_test)

In [None]:
# Evaluate the logistic-regression predictions against the true test labels:
# overall accuracy, the confusion matrix, and the per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Predict on test data (recomputes y_pred_logreg with the same call as the earlier prediction cell).
y_pred_logreg = log_reg.predict(X_test)

In [None]:
# Render the logistic-regression confusion matrix as a colour-coded grid.
cm = confusion_matrix(y_test, y_pred_logreg)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues')
disp.ax_.set_title("Logistic Regression - Confusion Matrix")
plt.show()

In [None]:
import numpy as np

In [None]:
# Collect the feature names and the fitted logistic-regression coefficients, then print both.
feature_names = X.columns  # make sure X is a DataFrame
# NOTE(review): the model was fit on the unscaled X_train, so coefficient
# magnitudes depend on each feature's units — confirm before comparing them.
coefficients = log_reg.coef_
print(feature_names)
print(coefficients)

In [None]:
# One horizontal bar chart per row of the coefficient matrix (one row per class).
for class_idx, class_coefs in enumerate(coefficients):
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(feature_names, class_coefs)
    ax.set_title(f'Logistic Regression Coefficients for Class {class_idx}')
    ax.set_xlabel('Coefficient Value')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Initialize the Decision Tree model; random_state is fixed so repeated runs build the same tree.
dt_model = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the (unscaled) training features and labels.
dt_model.fit(X_train, y_train)


In [None]:
# Predict the risk class for every held-out test row with the decision tree.
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Evaluate the decision-tree predictions against the true test labels;
# the results are stored here and printed in the next cell.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
confusion_dt = confusion_matrix(y_test, y_pred_dt)
report_dt = classification_report(y_test, y_pred_dt)

In [None]:
# Display the decision-tree metrics computed in the previous cell.
print("Decision Tree Accuracy:", accuracy_dt)
print("Confusion Matrix:\n", confusion_dt)
print("Classification Report:\n", report_dt)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [None]:
# Draw the fitted decision tree.
plt.figure(figsize=(20, 10))  # Adjust size as needed
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
# class_names are listed in ascending order of the encoded target (0, 1, 2).
plot_tree(dt_model, feature_names=list(X.columns), class_names=["Low", "Mid", "High"],
          filled=True, rounded=True, fontsize=10)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Initialize a random forest of 100 trees with a fixed seed for repeatable results.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit the random forest on the (unscaled) training features and labels.
rf_model.fit(X_train, y_train)

In [None]:
# Predict the risk class for every held-out test row with the random forest.
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Evaluate the random-forest predictions against the true test labels and
# print accuracy, the confusion matrix, and the per-class report.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:\n", cm_rf)
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the per-feature importance scores from the fitted random forest,
# together with the training column names they are paired with below.
feature_importances = rf_model.feature_importances_
feature_names = X_train.columns

In [None]:
# Tabulate feature/importance pairs, most important feature first, for plotting.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values('Importance', ascending=False)
)


In [None]:
# Horizontal bar chart of the random-forest feature importances.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=importance_df,
    x='Importance',
    y='Feature',
    hue='Feature',    # colour each bar by its feature
    palette='viridis',
    dodge=False,      # keep bars aligned with their y position
    legend=False,     # hue duplicates the y axis, so no legend is needed
    ax=ax,
)
ax.set_title('Machine Learning Model-Random Forest')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()


In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Create the XGBoost classifier (multiclass log-loss as evaluation metric,
# fixed seed) and train it on the unscaled training split.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

In [None]:
# Predict the risk class for every held-out test row with XGBoost.
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Evaluate the XGBoost predictions against the true test labels and print
# accuracy, the confusion matrix, and the per-class report.
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Get feature importances from the trained XGBoost model, together with the
# training column names they are paired with in the next cell.
importances = xgb_model.feature_importances_
features = X_train.columns

In [None]:
# Tabulate feature/importance pairs, most important feature first, for plotting.
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

In [None]:
# Plot the XGBoost feature importances using seaborn.
plt.figure(figsize=(10, 6))
# seaborn deprecates passing `palette` without `hue`; assign the y variable to
# hue and drop the redundant legend, matching the random-forest importance plot.
sns.barplot(
    x='Importance',
    y='Feature',
    data=importance_df,
    hue='Feature',
    palette='viridis',
    dodge=False,
    legend=False,
)
plt.title('Machine Learning Model - XGBoost')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Initialize the SVM model.
# The RBF kernel works on distances between samples, so features measured on
# larger numeric scales would dominate it. Wrapping the classifier in a
# pipeline standardises the inputs (the scaler is fitted on the training data
# when .fit is called) while the following cells keep calling
# svm_model.fit(X_train, ...) and svm_model.predict(X_test) unchanged.
from sklearn.pipeline import make_pipeline

svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=1, gamma='scale', random_state=42),
)

In [None]:
# Train the SVM on the training features and labels.
svm_model.fit(X_train, y_train)

In [None]:
# Predict the risk class for every held-out test row with the SVM.
y_pred_svm = svm_model.predict(X_test)

In [None]:
# Evaluate the SVM predictions against the true test labels and print
# accuracy, the confusion matrix, and the per-class report.
accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

In [None]:
# Annotated heatmap of the SVM confusion matrix (integer counts in each cell).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred_svm),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set_title('SVM Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Fit PCA for 2D visualization.
# PCA follows the directions of largest variance, so it is run on the
# standardised features (X_train_scaled / X_test_scaled from the scaling cell);
# on the raw columns the features with the largest numeric ranges would
# dominate both components.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Train a linear-kernel SVM on the 2D PCA projection so its decision regions can be drawn.
# NOTE: this rebinds svm_model, replacing the RBF model that was trained on
# the full feature set in the earlier SVM section.
svm_model = SVC(kernel='linear', C=1, random_state=42)
svm_model.fit(X_train_pca, y_train)


In [None]:
# Plot decision boundaries
def plot_svm_decision_boundary(model, X, y, step=0.1):
    """Draw a fitted classifier's decision regions over two-dimensional data.

    The model is evaluated on a regular grid that covers the data range
    (padded by 1 unit on every side); the predicted classes are drawn as
    filled contours with the samples scattered on top.

    Args:
        model: Fitted estimator whose ``predict`` accepts an ``(n, 2)`` array.
        X: 2-D NumPy array; columns 0 and 1 become the plot axes.
        y: Labels for the rows of ``X``, used to colour the scatter points.
        step: Grid spacing along both axes. Defaults to 0.1; increase it when
            the data range is large to keep the grid small.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive number")

    # Pad the plotted area so points on the edge are not clipped.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Classify every grid point, then fold the flat predictions back into the grid shape.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(grid_points).reshape(xx.shape)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette='deep', edgecolor='k', ax=ax)
    ax.set_title("SVM Decision Boundary (PCA 2D Projection)")
    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    ax.legend(title="Risk Level")
    plt.show()

In [None]:
# Visualize the linear SVM's decision regions over the PCA-projected training data.
plot_svm_decision_boundary(svm_model, X_train_pca, y_train)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Model performance data.
# The scores are computed from the predictions made earlier in the notebook
# instead of being typed in by hand, so the comparison chart always reflects
# the models as they were actually trained in this run.
# Precision, recall and F1 are support-weighted averages over the classes.
from sklearn.metrics import precision_recall_fscore_support

model_predictions = {
    'Logistic Regression': y_pred_logreg,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
    'SVM': y_pred_svm,
}

data = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': []}
for model_name, model_pred in model_predictions.items():
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_test, model_pred, average='weighted', zero_division=0
    )
    data['Model'].append(model_name)
    data['Accuracy'].append(round(accuracy_score(y_test, model_pred), 2))
    data['Precision'].append(round(precision, 2))
    data['Recall'].append(round(recall, 2))
    data['F1-Score'].append(round(f1, 2))

df = pd.DataFrame(data)

In [None]:
# Grouped bar chart: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(10, 6))
df.set_index('Model').plot.bar(ax=ax)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
ax.set_ylim(0, 1)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw CSV again and wrap it in a DataFrame
data = pd.read_csv("Maternal Health Risk Data Set.csv")
df = pd.DataFrame(data)

# Keep only the clinical measurements (RiskLevel is left out)
clinical_vars = ['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']
df_clinical = df.loc[:, clinical_vars]

# Pairwise correlations between the clinical measurements
correlation_matrix = df_clinical.corr()
print(correlation_matrix)

# Annotated heatmap on a fixed -1..1 colour scale
ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
ax.set_title('Correlation Matrix of Clinical Measurements')
plt.tight_layout()  # Prevent label cutoff
plt.show()

In [None]:
!pip install pydotplus

In [None]:
!pip install graphviz

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from pathlib import Path
import joblib

# Load data
df = pd.read_csv('Maternal Health Risk Data Set.csv')

# Feature engineering
# Systolic-BP category boundaries shared by training and prediction so both
# always agree. Intervals are closed on the left:
#   < 90 -> Low, 90-119 -> Normal, 120-139 -> Pre-High, >= 140 -> High.
BP_BINS = [float('-inf'), 90, 120, 140, float('inf')]
BP_LABELS = ['Low', 'Normal', 'Pre-High', 'High']


def categorize_systolic_bp(systolic_bp):
    """Bucket systolic blood-pressure readings into the BP_Category labels.

    Args:
        systolic_bp: pandas Series (or other 1-D array-like) of readings.

    Returns:
        The categorical result of ``pd.cut`` with one label per reading.
    """
    return pd.cut(systolic_bp, bins=BP_BINS, labels=BP_LABELS, right=False)


df['BP_Category'] = categorize_systolic_bp(df['SystolicBP'])

# Convert to numerical
# Labels are normalised (trimmed, lower-cased) before the lookup, and any label
# the mapping does not know stops the run instead of becoming a silent NaN target.
risk_mapping = {'low risk': 0, 'mid risk': 1, 'high risk': 2}
df['RiskLevel_encoded'] = (
    df['RiskLevel'].astype(str).str.strip().str.lower().map(risk_mapping)
)
unmapped_rows = df['RiskLevel_encoded'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"Unexpected RiskLevel labels: {list(df.loc[unmapped_rows, 'RiskLevel'].unique())}"
    )

# 2. Define features
numerical_features = ['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']
categorical_features = ['BP_Category']

# Split data - 80% for training, 20% for testing
X = df[numerical_features + categorical_features]
y = df['RiskLevel_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,  # 20% for testing
    random_state=42,
    stratify=y  # Maintain class distribution
)

# 3. Create preprocessing pipeline with proper encoding
# handle_unknown='ignore' lets prediction proceed when a BP category did not
# occur in the training split (it is encoded as all zeros) instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# 4. Train and save the models (using only X_train, y_train).
# Only the XGBoost pipeline is active; the random-forest entry is disabled.
models = {
    # 'random_forest': Pipeline([
    #     ('preprocessor', preprocessor),
    #     ('classifier', RandomForestClassifier(random_state=42))
    # ]),
    'xgboost': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(random_state=42, eval_metric='mlogloss'))
    ])
}

# joblib.dump does not create missing directories, so make sure the target exists.
deployment_dir = Path('deployment')
deployment_dir.mkdir(parents=True, exist_ok=True)

for name, model in models.items():
    model.fit(X_train, y_train)  # Only train on 80% of the data
    save_path = deployment_dir / f'{name}_pipeline.pkl'
    joblib.dump(model, save_path)

    # Evaluate on the held-out test set
    test_score = model.score(X_test, y_test)
    print(f"{name} test accuracy: {test_score:.2f}")


# 5. Prediction function
def predict_risk_level(age, systolic_bp, diastolic_bp, bs, body_temp, heart_rate, model_type='xgboost'):
    """Predict the risk label for one patient with a saved pipeline.

    Args:
        age, systolic_bp, diastolic_bp, bs, body_temp, heart_rate: The six
            numerical inputs, in the order of ``numerical_features``.
        model_type: Name of the saved pipeline to load from
            ``deployment/<model_type>_pipeline.pkl``. Only ``'xgboost'`` is
            written by this cell; ``'random_forest'`` works only if that
            pipeline file already exists.

    Returns:
        One of ``'low risk'``, ``'mid risk'`` or ``'high risk'``.

    Raises:
        FileNotFoundError: If no pipeline has been saved for ``model_type``.
    """
    # Load the selected pipeline.
    # joblib.load unpickles the file, so only load pipelines written by this notebook.
    save_path = Path('deployment') / f'{model_type}_pipeline.pkl'
    if not save_path.exists():
        raise FileNotFoundError(f"No saved pipeline for model_type={model_type!r}: {save_path}")
    pipeline = joblib.load(save_path)

    # Create input DataFrame with the numerical features
    input_df = pd.DataFrame([[age, systolic_bp, diastolic_bp, bs, body_temp, heart_rate]],
                            columns=numerical_features)

    # Add BP category with exactly the same bucketing used for training
    input_df['BP_Category'] = categorize_systolic_bp(input_df['SystolicBP'])

    # Predict and translate the encoded class back to its label
    prediction = int(pipeline.predict(input_df)[0])
    return ['low risk', 'mid risk', 'high risk'][prediction]


# Example usage
print("XGBoost prediction:", predict_risk_level(35,120,60,6.1,98.0,76))


The following cell tests the deployed prediction API by sending a sample request to the locally running server.

In [None]:
import requests

# Send one example record to the locally running prediction API and print its JSON reply.
url = 'http://127.0.0.1:5000/predict-api'
data = {'Age': 25,'SystolicBP':130,'DiastolicBP':80,'BS':15,'BodyTemp':98.0,'HeartRate':86}  # example input
# The timeout keeps the cell from hanging when the server is not running, and
# raise_for_status reports HTTP errors directly instead of failing inside .json().
response = requests.post(url, json=data, timeout=10)
response.raise_for_status()
print(response.json())
