Train the model

Dataset: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import joblib

# load the dataset
df = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

# convert categorical features to numeric using label encoding
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['ever_married'] = le.fit_transform(df['ever_married'])
df['work_type'] = le.fit_transform(df['work_type'])
df['Residence_type'] = le.fit_transform(df['Residence_type'])
df['smoking_status'] = le.fit_transform(df['smoking_status'])

# Handle missing values
imputer = SimpleImputer(strategy='median')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# drop the id column as it's not useful for prediction
df.drop('id', axis=1, inplace=True)

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('stroke', axis=1), df['stroke'], test_size=0.2, random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# train a random forest classifier
rf = RandomForestClassifier(random_state=42)

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Fit the model using the best hyperparameters and predict the stroke probability for the test set
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)

# evaluate the model on the test set
accuracy = best_rf.score(X_test, y_test)
print('RandomForestClassifier Model accuracy:', accuracy)

# Predict the stroke classes for the test set
y_pred = best_rf.predict(X_test)

# Compute and print the precision and recall
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# save the trained model to a file
joblib.dump(best_rf, 'stroke_prediction_model_best_rf.joblib')

  warn(


Best Hyperparameters: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy Score: 0.9545010818397561


  warn(


RandomForestClassifier Model accuracy: 0.9393346379647749
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      1.00      0.97       960
         1.0       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['stroke_prediction_model_best_rf.joblib']

Find the max probability in the dataset (for use in the score calculation)

In [2]:
# predict stroke probability for all instances in the dataset
probas = best_rf.predict_proba(df.drop('stroke', axis=1))

# find the maximum probability and its corresponding index
max_proba = np.max(probas[:, 1])
max_index = np.argmax(probas[:, 1])

print(max_proba, max_index)

print(df.loc[max_index])

0.7764451659451659 171
gender                0.00
age                  79.00
hypertension          1.00
heart_disease         1.00
ever_married          0.00
work_type             3.00
Residence_type        0.00
avg_glucose_level    60.94
bmi                  28.10
smoking_status        2.00
stroke                1.00
Name: 171, dtype: float64


Test the model

In [3]:
import joblib

model = joblib.load('stroke_prediction_model_best_rf.joblib')

# test input data
input_data = {
    'gender': 'Male', 
    'age': 79, 
    'hypertension': 1, 
    'heart_disease': 1, 
    'ever_married': 'No', 
    'work_type': 'Never_worked', 
    'Residence_type': 'Urban', 
    'avg_glucose_level': 60, 
    'bmi': 28.10,
    'smoking_status': 'smokes'
    }

# input mapping
label_encode_dict = {
    'gender': {'Male': 0, 'Female': 1, 'Other': 2},
    'ever_married': {'No': 0, 'Yes': 1},
    'work_type': {'children': 0, 'Govt_job': 1, 'Never_worked': 2, 'Private': 3, 'Self-employed': 4},
    'Residence_type': {'Rural': 0, 'Urban': 1},
    'smoking_status': {'Unknown': 0, 'never smoked': 1, 'formerly smoked': 2, 'smokes': 3}
}

# map the inputs
for feature in label_encode_dict:
    if input_data[feature] in label_encode_dict[feature]:
        input_data[feature] = label_encode_dict[feature][input_data[feature]]
    else:
        input_data[feature] = np.nan

input_values = np.array([input_data['gender'], input_data['age'], input_data['hypertension'], input_data['heart_disease'],
                        input_data['ever_married'], input_data['work_type'], input_data['Residence_type'],
                        input_data['avg_glucose_level'], input_data['bmi'], input_data['smoking_status']])
    
# make the prediction using the loaded model
prediction = model.predict_proba([input_values])[0][1]
# prediction_other = model.predict([input_values])[0]

print(prediction)

0.5680033930857874




Figure out how to generate a score out of 100

In [5]:
def prob_to_risk_one(probability):
    odds = probability / (1 - probability)
    risk = 50 * (1 + np.log(odds))
    return risk

print(prob_to_risk_one(prediction))

def prob_to_risk_two(probability, max_probability):
    risk = probability * 100 / max_probability
    return risk

print(prob_to_risk_two(prediction, max_proba))


63.6854829298938
73.1543472737521
