## Load Data and Initial Setup

In [None]:
import pandas as pd

# Read the dataset CSV into the working DataFrame used by the rest of the notebook.
csv_path = '/content/fetal_health.csv'
df = pd.read_csv(csv_path)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

print("Libraries imported successfully.")

# Quick look at the loaded data: leading rows, column summary, then summary statistics.
print("\nFirst 5 rows of the DataFrame:", df.head(), sep="\n")

print("\nConcise summary of the DataFrame:")
df.info()  # info() writes its report to stdout itself

print("\nDescriptive statistics of the DataFrame:", df.describe(), sep="\n")

## Data Preprocessing



In [None]:
# Split the dataset into the feature matrix (every column except the target)
# and the target column.
X = df.drop('fetal_health', axis=1)
y = df['fetal_health']

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

print("\nChecking for missing values in X:")
print(X.isnull().sum().sum())

print("\nChecking for missing values in y:")
print(y.isnull().sum())

# Outlier detection and treatment using IQR
# NOTE(review): the IQR bounds and medians here, and the scaler below, are
# computed on the full dataset before the train/test split, so test rows
# influence the preprocessing. Removing that leakage means fitting both steps
# on the training split only, which also requires changing the split cell.
zero_iqr_columns = []
for column in X.columns:
    if X[column].dtype in ['float64', 'int64']:
        Q1 = X[column].quantile(0.25)
        Q3 = X[column].quantile(0.75)
        IQR = Q3 - Q1

        # When Q1 == Q3 both bounds collapse onto one value, so every other
        # value in the column would count as an outlier and the whole feature
        # would be overwritten with a constant. Leave such columns untouched.
        if IQR == 0:
            zero_iqr_columns.append(column)
            continue

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Replace outliers with the median
        median_val = X[column].median()
        is_outlier = (X[column] < lower_bound) | (X[column] > upper_bound)
        X[column] = np.where(is_outlier, median_val, X[column])
print("\nOutliers treated using IQR method. Values outside 1.5*IQR bounds replaced with column median.")
if zero_iqr_columns:
    print("Columns left unchanged because their IQR is zero:", zero_iqr_columns)

# Feature scaling
scaler = StandardScaler()
# Keep X's index and column names so X_scaled stays row-aligned with y.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

print("\nFeatures scaled using StandardScaler. First 5 rows of X_scaled:")
print(X_scaled.head())


## Exploratory Data Analysis (EDA)



In [None]:
# Store the target column as integers in df before exploring it.
df['fetal_health'] = df['fetal_health'].astype(int)
print("Converted 'fetal_health' column to integer type in df.")
print(df['fetal_health'].value_counts())

# Pairwise correlations between all columns of df, target included.
correlation_matrix = df.corr()

# Render the annotated heatmap on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix of Fetal Health Dataset Features')
plt.show()

heatmap_notes = "Interpretation: The correlation matrix heatmap shows the pairwise correlation coefficients between all features in the dataset. Positive values indicate a positive correlation, negative values indicate a negative correlation, and values close to zero suggest a weak linear relationship. The 'fetal_health' row/column is particularly important, as it reveals which features are most strongly correlated with the target variable. For example, 'abnormal_short_term_variability' and 'prolongued_decelerations' might show strong positive correlations with 'fetal_health' (higher values indicating worse health), while 'accelerations' might show a strong negative correlation (higher values indicating better health). This visualization helps in understanding feature dependencies and selecting relevant features for modeling."
print(heatmap_notes)

In [None]:
# One entry per regression plot:
# (feature column, human-readable label, interpretation printed under the plot).
lmplot_specs = [
    (
        'baseline value',
        'Baseline Value',
        "Interpretation: The lmplot for 'baseline value' and 'fetal_health' shows a slight positive linear trend. As the baseline value increases, there's a tendency for the fetal health outcome to shift towards higher categories (i.e., less healthy). However, there is significant overlap and spread, indicating that 'baseline value' alone is not a strong predictor, but higher baseline values are more frequently associated with abnormal fetal health.",
    ),
    (
        'accelerations',
        'Accelerations',
        "Interpretation: The lmplot for 'accelerations' and 'fetal_health' shows a negative linear trend. Higher acceleration values are generally associated with lower fetal health outcomes (i.e., healthier babies). This suggests that a higher number of accelerations might be indicative of a healthier fetus. However, the data points are very spread out, especially for lower acceleration values, and the relationship isn't very strong.",
    ),
    (
        'uterine_contractions',
        'Uterine Contractions',
        "Interpretation: The lmplot for 'uterine_contractions' and 'fetal_health' shows a very weak, almost negligible, positive linear trend. While there's a slight tendency for fetal health outcomes to increase with more uterine contractions, the relationship is not clear and highly scattered. This feature does not appear to be a strong linear indicator of fetal health.",
    ),
]

# sns.lmplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it: such a call would only leave an empty extra
# figure in the cell output.
for feature, label, interpretation in lmplot_specs:
    # lmplot for the current feature vs 'fetal_health'
    sns.lmplot(x=feature, y='fetal_health', data=df, aspect=1.5)
    plt.title(f'Linear Relationship between {label} and Fetal Health')
    plt.xlabel(label)
    plt.ylabel('Fetal Health Outcome')
    plt.grid(True)
    plt.show()

    print(interpretation)

**Reasoning**:
The swarm plots below show how `mean_value_of_short_term_variability` and `histogram_min` are distributed within each `fetal_health` class; an interpretation is printed under each plot.



In [None]:
plt.figure(figsize=(15, 6))

# Swarmplot for 'mean_value_of_short_term_variability' vs 'fetal_health'
# Passing `palette` without `hue` is deprecated in seaborn 0.13+, so the x
# variable is also assigned to `hue` and the redundant legend is turned off.
sns.swarmplot(x='fetal_health', y='mean_value_of_short_term_variability', data=df,
              hue='fetal_health', palette='viridis', legend=False)
plt.title('Distribution of Mean Value of Short Term Variability by Fetal Health')
plt.xlabel('Fetal Health Outcome (1: Normal, 2: Suspect, 3: Pathological)')
plt.ylabel('Mean Value of Short Term Variability')
plt.grid(True)
plt.show()

print("Interpretation: The swarmplot for 'mean_value_of_short_term_variability' against 'fetal_health' shows distinct distributions for each health category. For Fetal Health 1 (Normal), the values are concentrated at higher short-term variability, indicating good health. As fetal health deteriorates (Fetal Health 2: Suspect, Fetal Health 3: Pathological), the mean value of short-term variability tends to decrease and become more tightly clustered at lower values. This suggests that lower short-term variability is associated with poorer fetal health outcomes.")

# Swarmplot for 'histogram_min' vs 'fetal_health'
plt.figure(figsize=(15, 6))
sns.swarmplot(x='fetal_health', y='histogram_min', data=df,
              hue='fetal_health', palette='magma', legend=False)
plt.title('Distribution of Histogram Min by Fetal Health')
plt.xlabel('Fetal Health Outcome (1: Normal, 2: Suspect, 3: Pathological)')
plt.ylabel('Histogram Min')
plt.grid(True)
plt.show()

print("Interpretation: The swarmplot for 'histogram_min' against 'fetal_health' reveals that Fetal Health 1 (Normal) has a wider spread of histogram_min values, with many data points at lower heart rates. For Fetal Health 2 (Suspect) and Fetal Health 3 (Pathological), the distribution shifts towards higher histogram_min values, and the spread narrows, indicating less variability and potentially higher baseline fetal heart rates during periods of distress. This suggests that higher histogram_min values are more common in less healthy fetuses.")

In [None]:
# One entry per swarm plot:
# (feature column, palette, plot title, y-axis label, interpretation text).
swarm_specs = (
    (
        'mean_value_of_short_term_variability',
        'viridis',
        'Distribution of Mean Value of Short Term Variability by Fetal Health',
        'Mean Value of Short Term Variability',
        "Interpretation: The swarmplot for 'mean_value_of_short_term_variability' against 'fetal_health' shows distinct distributions for each health category. For Fetal Health 1 (Normal), the values are concentrated at higher short-term variability, indicating good health. As fetal health deteriorates (Fetal Health 2: Suspect, Fetal Health 3: Pathological), the mean value of short-term variability tends to decrease and become more tightly clustered at lower values. This suggests that lower short-term variability is associated with poorer fetal health outcomes. The user warning regarding overlapping points indicates the density of the data, which can make individual points hard to distinguish.",
    ),
    (
        'histogram_min',
        'magma',
        'Distribution of Histogram Min by Fetal Health',
        'Histogram Min',
        "Interpretation: The swarmplot for 'histogram_min' against 'fetal_health' reveals that Fetal Health 1 (Normal) has a wider spread of histogram_min values, with many data points at lower heart rates. For Fetal Health 2 (Suspect) and Fetal Health 3 (Pathological), the distribution shifts towards higher histogram_min values, and the spread narrows, indicating less variability and potentially higher baseline fetal heart rates during periods of distress. This suggests that higher histogram_min values are more common in less healthy fetuses. Similar to the previous plot, the user warning about unplaced points highlights the concentration of data at certain values.",
    ),
)

for swarm_column, swarm_palette, swarm_title, swarm_ylabel, swarm_notes in swarm_specs:
    # Each plot gets its own figure; the x variable doubles as hue so the
    # palette colours the three classes, with the legend suppressed.
    plt.figure(figsize=(15, 6))
    sns.swarmplot(x='fetal_health', y=swarm_column, data=df, palette=swarm_palette, hue='fetal_health', legend=False)
    plt.title(swarm_title)
    plt.xlabel('Fetal Health Outcome (1: Normal, 2: Suspect, 3: Pathological)')
    plt.ylabel(swarm_ylabel)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    print(swarm_notes)

In [None]:
plt.figure(figsize=(15, 6))

# Boxplot for 'histogram_variance' vs 'fetal_health'
# Passing `palette` without `hue` is deprecated in seaborn 0.13+, so the x
# variable is also assigned to `hue` and the redundant legend is turned off.
sns.boxplot(x='fetal_health', y='histogram_variance', data=df,
            hue='fetal_health', palette='viridis', legend=False)
plt.title('Distribution of Histogram Variance by Fetal Health')
plt.xlabel('Fetal Health Outcome (1: Normal, 2: Suspect, 3: Pathological)')
plt.ylabel('Histogram Variance')
plt.grid(True)
plt.tight_layout()
plt.show()

print("Interpretation: The boxplot for 'histogram_variance' against 'fetal_health' indicates clear differences across health categories. For Fetal Health 1 (Normal), the variance is generally lower, with a tighter interquartile range and fewer extreme outliers. As fetal health deteriorates (Fetal Health 2 and 3), the median histogram variance tends to increase, and the spread (IQR) becomes larger, suggesting greater variability in heart rate. There are also more noticeable outliers with higher variance in the less healthy categories, indicating episodes of significant heart rate fluctuations.")

# Boxplot for 'histogram_mode' vs 'fetal_health'
plt.figure(figsize=(15, 6))
sns.boxplot(x='fetal_health', y='histogram_mode', data=df,
            hue='fetal_health', palette='magma', legend=False)
plt.title('Distribution of Histogram Mode by Fetal Health')
plt.xlabel('Fetal Health Outcome (1: Normal, 2: Suspect, 3: Pathological)')
plt.ylabel('Histogram Mode')
plt.grid(True)
plt.tight_layout()
plt.show()

print("Interpretation: The boxplot for 'histogram_mode' against 'fetal_health' shows a general trend where the mode of the heart rate histogram decreases as fetal health deteriorates. Fetal Health 1 (Normal) has a higher median mode, indicating a more stable and higher typical heart rate. Fetal Health 2 (Suspect) and Fetal Health 3 (Pathological) show progressively lower median modes, suggesting a shift towards lower central heart rate values. The spread of the data also changes, with Fetal Health 3 having a somewhat wider IQR and more outliers at lower modes, indicating greater heart rate instability in pathological cases.")

In [None]:
# One entry per box plot:
# (feature column, palette, plot title, y-axis label, interpretation text).
box_specs = (
    (
        'histogram_variance',
        'viridis',
        'Distribution of Histogram Variance by Fetal Health',
        'Histogram Variance',
        "Interpretation: The boxplot for 'histogram_variance' against 'fetal_health' indicates clear differences across health categories. For Fetal Health 1 (Normal), the variance is generally lower, with a tighter interquartile range and fewer extreme outliers. As fetal health deteriorates (Fetal Health 2 and 3), the median histogram variance tends to increase, and the spread (IQR) becomes larger, suggesting greater variability in heart rate. There are also more noticeable outliers with higher variance in the less healthy categories, indicating episodes of significant heart rate fluctuations.",
    ),
    (
        'histogram_mode',
        'magma',
        'Distribution of Histogram Mode by Fetal Health',
        'Histogram Mode',
        "Interpretation: The boxplot for 'histogram_mode' against 'fetal_health' shows a general trend where the mode of the heart rate histogram decreases as fetal health deteriorates. Fetal Health 1 (Normal) has a higher median mode, indicating a more stable and higher typical heart rate. Fetal Health 2 (Suspect) and Fetal Health 3 (Pathological) show progressively lower median modes, suggesting a shift towards lower central heart rate values. The spread of the data also changes, with Fetal Health 3 having a somewhat wider IQR and more outliers at lower modes, indicating greater heart rate instability in pathological cases.",
    ),
)

for box_column, box_palette, box_title, box_ylabel, box_notes in box_specs:
    # Each plot gets its own figure; the x variable doubles as hue so the
    # palette colours the three classes, with the legend suppressed.
    plt.figure(figsize=(15, 6))
    sns.boxplot(x='fetal_health', y=box_column, data=df, palette=box_palette, hue='fetal_health', legend=False)
    plt.title(box_title)
    plt.xlabel('Fetal Health Outcome (1: Normal, 2: Suspect, 3: Pathological)')
    plt.ylabel(box_ylabel)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    print(box_notes)

## Model Training




In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# 1. Hold out 20% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("Data split into training and testing sets successfully.")
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_data.shape}")

# 2. Candidate classifiers, each created with a fixed random_state.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Per-model metrics, keyed by the display name used in `models`.
results = {}
class_tick_labels = ['Normal (1)', 'Suspect (2)', 'Pathological (3)']

print("\nTraining models and evaluating performance...")
for name, model in models.items():
    print(f"\n--- {name} ---")

    # 3-4. Fit on the training split, then predict the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # 5. Score the predictions and keep the metrics for later comparison.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    results[name] = dict(accuracy=accuracy, report=report, confusion_matrix=cm)

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:", report, sep="\n")
    print("Confusion Matrix:", cm, sep="\n")

    # Confusion matrix heatmap: rows are true labels, columns are predictions.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_tick_labels,
                yticklabels=class_tick_labels)
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

## Hyperparameter Tuning and Best Model Selection




In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search spaces, one per model family tuned below.

# Logistic Regression: regularisation strength and solver.
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)

# SVM: regularisation strength, kernel and kernel coefficient.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# Random Forest: number of trees, tree depth and minimum leaf size.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_leaf=[1, 2, 4],
)

# Gradient Boosting: number of stages, learning rate and tree depth.
param_grid_gb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5],
)

print("GridSearchCV imported and parameter grids defined for Logistic Regression, SVM, Random Forest, and Gradient Boosting.")

In [None]:
# Best estimator and best mean cross-validation accuracy per model name.
tuned_models, best_scores = {}, {}

# Each model to tune, paired with the grid that will be searched for it.
models_to_tune = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=42), param_grid_lr),
    'SVM': (SVC(random_state=42), param_grid_svm),
    'Random Forest': (RandomForestClassifier(random_state=42), param_grid_rf),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=42), param_grid_gb)
}

print("Starting hyperparameter tuning for each model...")

for name, (model, param_grid) in models_to_tune.items():
    print(f"\n--- Tuning {name} ---")

    # 5-fold grid search scored on accuracy, fitted on the training split only.
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {name}: {grid_search.best_score_:.4f}")

    tuned_models[name], best_scores[name] = grid_search.best_estimator_, grid_search.best_score_

print("\n--- Hyperparameter Tuning Complete ---")

# The winner is the model with the highest cross-validation accuracy
# (the first one encountered wins a tie).
best_model_name, _ = max(best_scores.items(), key=lambda entry: entry[1])
best_model = tuned_models[best_model_name]

print(f"\nOverall best performing model: {best_model_name}")
print(f"Best cross-validation accuracy: {best_scores[best_model_name]:.4f}")
print("Selected best model and stored it as 'best_model' for deployment.")

## Confusion Matrix for Best Model

In [None]:
# Predict the held-out test split with the tuned model selected above.
y_pred_best_model = best_model.predict(X_test)

# Rows of the matrix are true classes, columns are predicted classes.
cm_best_model = confusion_matrix(y_test, y_pred_best_model)

# Draw the matrix as an annotated heatmap on an explicit axes object.
best_cm_tick_labels = ['Normal (1)', 'Suspect (2)', 'Pathological (3)']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_best_model, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=best_cm_tick_labels,
            yticklabels=best_cm_tick_labels,
            ax=ax)
ax.set_title(f'Confusion Matrix - {best_model_name} (Best Tuned Model)')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

print(f"Interpretation: The confusion matrix for the best-tuned {best_model_name} model visually represents its performance on the test set. Each row corresponds to the true class, while each column represents the predicted class. The diagonal elements show the number of correctly classified instances for each class. Off-diagonal elements indicate misclassifications. For example, a high number in the cell (True: Suspect, Predicted: Normal) would indicate that the model frequently misclassifies Suspect cases as Normal. This matrix provides a detailed breakdown of the model's predictive accuracy and its ability to distinguish between the three fetal health outcomes.")

## Save Best Model




In [None]:
import joblib

# Persist the selected estimator to disk; app.py loads it from this file name.
model_filename = 'best_fetal_health_model.joblib'
joblib.dump(best_model, filename=model_filename)

print(f"Best model '{best_model_name}' saved to {model_filename}")

In [None]:
import subprocess
import os

# 1. Install pyngrok
!pip install pyngrok
print("pyngrok installed successfully.")

In [None]:
import os
from getpass import getpass

# The ngrok authtoken is a credential and must not be stored in the notebook.
# Reuse a token already present in the environment; otherwise prompt for it
# (getpass does not echo the value, so it does not end up in the cell output).
# NOTE(review): the token that was previously hardcoded in this cell should be
# treated as leaked and revoked in the ngrok dashboard.
if not os.environ.get('NGROK_AUTH_TOKEN'):
    os.environ['NGROK_AUTH_TOKEN'] = getpass('Enter your ngrok authtoken: ')
print("NGROK_AUTH_TOKEN environment variable updated.")

In [None]:
import subprocess
import os

# Install Streamlit
!pip install streamlit
print("Streamlit installed successfully.")

## Save Fitted Scaler




In [None]:
import joblib

# Persist the fitted StandardScaler; app.py loads it from this file name to
# transform user input before prediction.
scaler_filename = 'scaler.joblib'
joblib.dump(scaler, filename=scaler_filename)

print(f"Fitted scaler saved to {scaler_filename}")

## Streamlit Dashboard Development


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
import numpy as np

# Load the pre-trained model and scaler
model = joblib.load('best_fetal_health_model.joblib')
scaler = joblib.load('scaler.joblib') # Load the saved scaler

st.set_page_config(page_title="Fetal Health Prediction", layout="wide")
st.title("Fetal Health Classification")
st.markdown("Predict the health outcome of a fetus based on CTG (Cardiotocography) parameters.")

# Define the feature names in the correct order as per the training data
# Since X is not directly available in app.py, we need to infer column names from the model or load them.
# For consistency, we can assume the order of features is the same as the original df columns excluding the target.
# A more robust solution would be to save feature names alongside the model/scaler.
# For now, let's re-define them based on the context of the notebook before this app.py generation.
feature_names = ['baseline value', 'accelerations', 'fetal_movement', 'uterine_contractions',
                 'light_decelerations', 'severe_decelerations', 'prolongued_decelerations',
                 'abnormal_short_term_variability', 'mean_value_of_short_term_variability',
                 'percentage_of_time_with_abnormal_long_term_variability',
                 'mean_value_of_long_term_variability', 'histogram_width', 'histogram_min',
                 'histogram_max', 'histogram_number_of_peaks', 'histogram_number_of_zeroes',
                 'histogram_mode', 'histogram_mean', 'histogram_median', 'histogram_variance',
                 'histogram_tendency']

# --- User Input Section ---
st.sidebar.header("Input Fetal CTG Parameters")

# Dictionary to store user inputs
user_inputs = {}

# Create sliders for each feature (using dummy min/max/mean for initial app.py generation)
# In a real app, these values would ideally come from the training data statistics.
# For now, approximate ranges will be used, as the actual `df` is not available in app.py scope.

# Using estimated ranges for the sliders. In a production app, these would be explicitly saved/loaded.
# For 'fetal_movement', 'severe_decelerations', 'light_decelerations', 'prolongued_decelerations', 'accelerations', 'uterine_contractions'
# Assuming typical small float ranges.
# For 'histogram_number_of_peaks', 'histogram_number_of_zeroes'
# Assuming typical integer ranges.

# Example approximations (these would ideally be loaded from a config or saved stats):
feature_ranges = {
    'baseline value': (100.0, 180.0, 133.0),
    'accelerations': (0.0, 0.02, 0.003),
    'fetal_movement': (0.0, 0.1, 0.009),
    'uterine_contractions': (0.0, 0.02, 0.004),
    'light_decelerations': (0.0, 0.02, 0.002),
    'severe_decelerations': (0.0, 0.001, 0.0),
    'prolongued_decelerations': (0.0, 0.005, 0.0001),
    'abnormal_short_term_variability': (10.0, 90.0, 47.0),
    'mean_value_of_short_term_variability': (0.0, 7.0, 1.3),
    'percentage_of_time_with_abnormal_long_term_variability': (0.0, 100.0, 10.0),
    'mean_value_of_long_term_variability': (0.0, 50.0, 4.0),
    'histogram_width': (0.0, 180.0, 70.0),
    'histogram_min': (50.0, 160.0, 93.0),
    'histogram_max': (120.0, 240.0, 164.0),
    'histogram_number_of_peaks': (0.0, 20.0, 4.0),
    'histogram_number_of_zeroes': (0.0, 10.0, 0.3),
    'histogram_mode': (60.0, 190.0, 137.0),
    'histogram_mean': (70.0, 190.0, 135.0),
    'histogram_median': (70.0, 190.0, 138.0),
    'histogram_variance': (0.0, 300.0, 19.0),
    'histogram_tendency': (-1.0, 1.0, 0.0)
}

for feature in feature_names:
    min_val, max_val, mean_val = feature_ranges.get(feature, (0.0, 1.0, 0.5)) # Default to 0-1 if not found

    if feature in ['fetal_movement', 'severe_decelerations', 'light_decelerations', 'prolongued_decelerations', 'accelerations', 'uterine_contractions']:
        user_inputs[feature] = st.sidebar.slider(
            f"{feature.replace('_', ' ').title()}",
            min_val, max_val, float(mean_val), step=0.001, format='%.3f'
        )
    elif feature in ['histogram_number_of_peaks', 'histogram_number_of_zeroes']:
        user_inputs[feature] = st.sidebar.slider(
            f"{feature.replace('_', ' ').title()}",
            min_val, max_val, float(round(mean_val)), step=1.0
        )
    elif feature == 'histogram_tendency': # This feature has -1, 0, 1 values
         user_inputs[feature] = st.sidebar.slider(
            f"{feature.replace('_', ' ').title()}",
            -1.0, 1.0, 0.0, step=1.0
        )
    else:
        user_inputs[feature] = st.sidebar.slider(
            f"{feature.replace('_', ' ').title()}",
            min_val, max_val, mean_val, step=(max_val - min_val) / 100.0
        )

# --- Prediction Section ---
if st.sidebar.button('Predict Fetal Health'):
    # Convert user inputs to a DataFrame
    input_df = pd.DataFrame([user_inputs])

    # Scale the input data
    scaled_input = scaler.transform(input_df)

    # Make prediction
    prediction = model.predict(scaled_input)
    prediction_proba = model.predict_proba(scaled_input)

    # Map numerical prediction to meaningful labels
    fetal_health_map = {
        1: 'Normal (1)',
        2: 'Suspect (2)',
        3: 'Pathological (3)'
    }
    predicted_class = fetal_health_map.get(int(prediction[0]), 'Unknown')

    st.subheader("Prediction Result")
    st.success(f"The predicted fetal health outcome is: **{predicted_class}**")

    st.subheader("Prediction Probabilities")
    proba_df = pd.DataFrame(prediction_proba, columns=['Normal', 'Suspect', 'Pathological'])
    st.write(proba_df)

st.header("About the Model and Data")
st.markdown(
    "This application uses a Gradient Boosting Classifier model, which was trained on the `fetal_health.csv` dataset. "
    "The dataset contains 21 features derived from Cardiotocogram (CTG) examinations, which are used to predict "
    "one of three fetal health outcomes: Normal (1), Suspect (2), or Pathological (3)."
)
st.markdown(
    "The data underwent preprocessing steps including outlier treatment using the IQR method (replacing outliers with the median) "
    "and feature scaling using `StandardScaler`. The model was optimized through hyperparameter tuning "
    "using GridSearchCV, selecting the Gradient Boosting Classifier as the best-performing model based on cross-validation accuracy."
)


##  Streamlit App Deployment




In [None]:
from pyngrok import ngrok, conf
import socket
import time
import subprocess
import os

STREAMLIT_PORT = 8501      # port the app is started on and the tunnel points to
STARTUP_TIMEOUT_S = 60     # how long to wait for the server to accept connections

# Terminate any existing ngrok tunnels
ngrok.kill()

# Stop a Streamlit server left over from a previous run of this cell so the
# port is free again.
_previous_process = globals().get('streamlit_process')
if _previous_process is not None and _previous_process.poll() is None:
    _previous_process.terminate()
    try:
        _previous_process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        _previous_process.kill()

# Set the ngrok authtoken explicitly
ngrok.set_auth_token(os.environ['NGROK_AUTH_TOKEN'])
print("ngrok authtoken set.")

# 3. Run the Streamlit application in the background
# Output goes to a log file instead of subprocess.PIPE: nothing in this notebook
# reads the pipes, so a server that writes enough output would eventually fill
# the pipe buffer and block. The child keeps its own handle to the file after
# the `with` block closes ours.
with open('streamlit.log', 'w') as streamlit_log:
    streamlit_process = subprocess.Popen(
        ['streamlit', 'run', 'app.py',
         '--server.port', str(STREAMLIT_PORT),
         '--server.headless', 'true'],
        stdout=streamlit_log,
        stderr=subprocess.STDOUT)

print("Streamlit app started in the background. Waiting for it to become available...")

# Poll the port rather than sleeping for a fixed time: proceed as soon as the
# server accepts connections, and fail loudly if it exits or never comes up.
_deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if streamlit_process.poll() is not None:
        raise RuntimeError(
            f"Streamlit exited with code {streamlit_process.returncode} before becoming available; "
            "see streamlit.log for details."
        )
    try:
        socket.create_connection(('localhost', STREAMLIT_PORT), timeout=1).close()
        break
    except OSError:
        if time.monotonic() >= _deadline:
            streamlit_process.terminate()
            raise TimeoutError(
                f"Streamlit did not start listening on port {STREAMLIT_PORT} within "
                f"{STARTUP_TIMEOUT_S} seconds; see streamlit.log for details."
            ) from None
        time.sleep(0.5)

# 4. Use pyngrok to establish a public URL for the Streamlit application
# The ngrok.connect() method returns a Tunnel object
public_url = ngrok.connect(addr=str(STREAMLIT_PORT), proto="http")

# 5. Print the public URL
print(f"Streamlit App URL: {public_url}")
print("You can access the Streamlit app using the URL above. If you want to stop the app and ngrok tunnel, run ngrok.kill() and terminate the kernel.")