In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Data preprocessing
data['bmi'].fillna(data['bmi'].mean(), inplace=True)
data['smoking_status'].fillna(data['smoking_status'].mode()[0], inplace=True)

# Select features and target variable
X = data.drop(['id', 'stroke'], axis=1)
y = data['stroke']

# Define numerical and categorical features
numeric_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Preprocessing pipelines for both numerical and categorical data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine numerical and categorical transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder='passthrough')  # This includes any remaining columns not specified

# Full preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
X_preprocessed = preprocessing_pipeline.fit_transform(X)

# Implementing Oversampling using SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define and train the Logistic Regression model
logistic_regression = LogisticRegression(class_weight='balanced', random_state=42)
logistic_regression.fit(X_train, y_train)

# Save the Logistic Regression model
joblib.dump(logistic_regression, 'logistic_regression_model.pkl')

# Define evaluation metrics
metrics = ['precision', 'recall', 'roc_auc']

# Evaluate the Logistic Regression model
print("Model: Logistic Regression")
y_pred = logistic_regression.predict(X_test)
y_proba = logistic_regression.predict_proba(X_test)[:, 1]

for metric in metrics:
    if metric == 'roc_auc':
        score = roc_auc_score(y_test, y_proba)
    else:
        precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred)
        if metric == 'precision':
            score = precision[1]
        elif metric == 'recall':
            score = recall[1]
    print(f"{metric.capitalize()}: {score:.2f}")

# Function to predict stroke risk using the trained Logistic Regression model
def predict_stroke_risk(input_data):
    # Ensure input_data is a DataFrame with the same columns as the original dataset
    if not isinstance(input_data, pd.DataFrame):
        raise ValueError("Input data should be a pandas DataFrame.")

    # Preprocess input data
    input_data['bmi'].fillna(data['bmi'].mean(), inplace=True)
    input_data['smoking_status'].fillna(data['smoking_status'].mode()[0], inplace=True)

    # Preprocess the input data using the same preprocessing pipeline
    X_input_preprocessed = preprocessing_pipeline.transform(input_data)

    # Get predicted probabilities using the trained Logistic Regression model
    logistic_regression_prob = logistic_regression.predict_proba(X_input_preprocessed)[0][1]

    return {
        'Logistic Regression Probability': logistic_regression_prob
    }

sample_data = pd.DataFrame({
    'gender': ['Male'],                  # Gender: Male
    'age': [65.0],                      # Age: 65 years
    'hypertension': [1],                # Hypertension: Yes (1)
    'heart_disease': [1],               # Heart Disease: Yes (1)
    'ever_married': ['Yes'],            # Ever Married: Yes
    'work_type': ['Self-employed'],     # Work Type: Self-employed
    'Residence_type': ['Urban'],         # Residence Type: Urban
    'avg_glucose_level': [150.0],       # Average Glucose Level: 150 mg/dL
    'bmi': [30.0],                      # BMI: 30.0
    'smoking_status': ['Smokes']        # Smoking Status: Smokes
})


# Use the prediction function to get predicted probabilities for the sample data
prediction_result = predict_stroke_risk(sample_data)

# Convert probabilities to percentages
for model, prob in prediction_result.items():
    risk_percentage = prob * 100
    print(f"{model}: {risk_percentage:.2f}%")


Model: Logistic Regression
Precision: 0.76
Recall: 0.82
Roc_auc: 0.84
Logistic Regression Probability: 70.21%


In [4]:
pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-4.11.0-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-decision-forests>=1.5.0 (from tensorflowjs)
  Downloading tensorflow_decision_forests-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<3,>=2.13.0 (from tensorflowjs)
  Downloading tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting wurlitzer (from tensorflow-decision-forests>=1.5.0->tensorflowjs)
  Downloading wurlitzer-3.

In [7]:
import tensorflow as tf
import numpy as np
import tensorflowjs as tfjs

# Define your logistic regression model using TensorFlow/Keras
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy')

# Train the model on your training data (X_train, y_train)
model.fit(X_train, y_train, epochs=10)

# Save the trained model
model.save('tensorflow_logistic_regression')

# Convert the TensorFlow/Keras model to TensorFlow.js format
tfjs.converters.save_keras_model(model, 'tfjs_logistic_regression')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(
