In [None]:
#Auto-save / Auto-reload
# %autosave takes the interval, in seconds, between automatic notebook saves.
%autosave 1

# Reload imported modules before each cell executes so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

# Imports

In [None]:
import os

# for EDA
import requests
import wget
import pylab
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt # https://scikit-plot.readthedocs.io/en/stable/Quickstart.htm
from IPython.display import display

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import ComplementNB

# tuning and metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_curve, precision_recall_curve, auc, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mutual_info_score

# Deployment
import bentoml

# still need?
from sklearn.metrics import mean_squared_error as rmse
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score


In [None]:
# Load the two prepared variants of the stroke dataset (paths are relative to the working directory).
df = pd.read_csv('./data/healthcare-dataset-stroke-data-df.csv')
df2 = pd.read_csv('./data/healthcare-dataset-stroke-data-df2.csv')

# Modeling

The modeling process is to first test possible viable models, then select the best models, and create a baseline to compare the tuning process against.

**Linear Regression** is a poor choice for classification problems and is better suited to continuous-output problems. Although the label is binary, the model could produce values outside that range.

Research into ML models and the general consensus among ML engineers suggest the following models are best suited to our label, which is binary (stroke 1 or 0). There are numerous articles and research papers, but this [article](https://towardsdatascience.com/pros-and-cons-of-various-classification-ml-algorithms-3b5bfb3c87d6) and this [article](https://towardsdatascience.com/top-10-binary-classification-algorithms-a-beginners-guide-feeacbd7a3e2) sum up the reasoning.

### Data Prep

#### DF

In [None]:
#@title Split: df_train / df_val

# Hold out 20% of df as validation, then 25% of the remainder as test.
df_full_train, df_val = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_test = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Replace the shuffled row labels with a fresh 0..n-1 index on every subset.
df_train, df_val, df_test = (
    subset.reset_index(drop=True) for subset in (df_train, df_val, df_test)
)

# pop() returns the target column and removes it from the feature frame in one step.
y_train = df_train.pop('stroke').values
y_val = df_val.pop('stroke').values
y_test = df_test.pop('stroke').values

In [None]:
#@title x_train

# The vectorizer learns its feature layout from the training rows only.
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
x_train = dv.fit_transform(train_dict)


In [None]:
# Sanity check: the feature matrix and the label vector should have the same number of rows.
x_train.shape, y_train.shape

In [None]:
#@title x_val

val_dict = df_val.to_dict(orient='records')
# transform only: reuse the feature layout already fitted on the training rows
x_val = dv.transform(val_dict)

In [None]:
#@title x_test

test_dict = df_test.to_dict(orient='records')
# transform only: reuse the feature layout already fitted on the training rows
x_test = dv.transform(test_dict)

#### DF2

In [None]:
#@title Split: df2_train / df2_val / df2_test

# Same scheme as for df: 20% validation, then 25% of the remainder as test.
df2_full_train, df2_val = train_test_split(df2, test_size=0.2, random_state=1)
df2_train, df2_test = train_test_split(df2_full_train, test_size=0.25, random_state=1)

# Replace the shuffled row labels with a fresh 0..n-1 index on every subset.
df2_train, df2_val, df2_test = (
    subset.reset_index(drop=True) for subset in (df2_train, df2_val, df2_test)
)

# pop() returns the target column and removes it from the feature frame in one step.
y_train2 = df2_train.pop('stroke').values
y_val2 = df2_val.pop('stroke').values
y_test2 = df2_test.pop('stroke').values

In [None]:
#@title x_val2

# At this point `dv` is still the vectorizer fitted on the df training rows, so
# transforming df2 with it would encode x_val2 in the wrong feature space.
# Fit a vectorizer on the df2 training rows before encoding the df2 validation rows.
dv = DictVectorizer(sparse=False)
dv.fit(df2_train.to_dict(orient='records'))

val2_dict = df2_val.to_dict(orient='records')
x_val2 = dv.transform(val2_dict)

In [None]:
#@title x_train2

# Rebind `dv` to a vectorizer that learns its feature layout from the df2 training rows.
dv = DictVectorizer(sparse=False)
train2_dict = df2_train.to_dict(orient='records')
x_train2 = dv.fit_transform(train2_dict)


In [None]:
#@title x_test2

test2_dict = df2_test.to_dict(orient='records')
# transform only: uses whichever vectorizer `dv` is currently bound to
x_test2 = dv.transform(test2_dict)

### Model Comparison

In [None]:
#@title def model_run(models)

def model_run(models,x_train,y_train,x_val,y_val):
    """Fit each model, then report accuracy, a confusion matrix and ROC curves.

    Args:
        models: iterable of (name, estimator) pairs. Every estimator is fitted
            in place and must provide predict() and predict_proba().
        x_train, y_train: features and labels used for fitting.
        x_val, y_val: features and labels used for evaluation.

    Returns:
        None. Results are printed and plotted.
    """
    for name, model in models:
        model.fit(x_train, y_train)

        # Predict on the validation set once; both the accuracy and the
        # confusion matrix are computed from the same predictions.
        val_pred = model.predict(x_val)

        print(" ")
        print("Model: ",name)
        print('Validation Accuracy: ', accuracy_score(y_val, val_pred))
        print('Training Accuracy: ', accuracy_score(y_train, model.predict(x_train)))
        print(" ")

        plt.figure()
        cf_matrix = confusion_matrix(y_val, val_pred)
        sns.heatmap(cf_matrix, annot = True, fmt = 'g', cmap = sns.cubehelix_palette(as_cmap=True))
        plt.title('Confusion Matrix: {}'.format(name))
        # Tick positions 0.5 / 1.5 are the centres of the two heatmap cells on each axis.
        plt.xticks([0.5, 1.5], ['Predicted Non-Stroke', 'Predicted Stroke'])
        plt.yticks([0.5, 1.5], ['Actual Non-Stroke', 'Actual Stroke'])
        plt.show()

        print(" ")

        skplt.metrics.plot_roc(y_val, model.predict_proba(x_val))
        plt.title('ROC Curves: {}'.format(name))
        plt.show()

In [None]:
#@title Model Selection
# Candidate classifiers as (display name, estimator) pairs, in the order they are reported.
start_models = [
    # max_iter=4000 due to convergence error
    ('Logistic Regression', LogisticRegression(max_iter=4000, random_state=10)),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=10)),
    ('Random Forest', RandomForestClassifier(random_state=10)),
    ('Linear Discriminant Analyzer', LinearDiscriminantAnalysis()),
    ('Ada Boost', AdaBoostClassifier(random_state=1)),
    ('KNN', KNeighborsClassifier()),
    # probability=True is required so SVC exposes predict_proba for the ROC plot
    ('Support Vector Machine', SVC(probability=True, random_state=10)),
    ('XG Boost', XGBClassifier(random_state=10)),
    ('Cat Boost', CatBoostClassifier(logging_level='Silent')),
    ('Naive Bayes', ComplementNB()),
]

Logistic Regression iterations were increased to resolve a [convergence error](https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter)

#### DF all models

In [None]:
#@title Run all models
# Every candidate on the unbalanced df split; the estimators in start_models are fitted in place.
model_run(start_models,x_train,y_train,x_val,y_val)

#### DF2 all models

In [None]:
#@title Run all models 2
# Same candidates on the unbalanced df2 split; the estimators in start_models are re-fitted in place.
model_run(start_models,x_train2,y_train2,x_val2,y_val2)

Poor overall score, likely due to data imbalance. Actual stroke prediction is 82-89% accurate, with SVM having the best performance.

### SMOTE

Oversampling with SMOTE

In [None]:
#@title def training_smote(x,y) - to address the stroke imbalance

# ONLY APPLY SMOTE TO TRAIN!
def training_smote(x,y):
    """Oversample the minority class of (x, y) with SMOTE.

    Returns the resampled features and the resampled labels as a pair.
    """
    oversampler = SMOTE(random_state=42, sampling_strategy='minority')
    x_resampled, y_resampled = oversampler.fit_resample(x, y)
    return x_resampled, y_resampled

In [None]:
# Balance the df training split only; validation and test data are left untouched.
x_train_smt, y_train_smt = training_smote(x_train, y_train)

In [None]:
# Shape of the df training features after resampling.
x_train_smt.shape

In [None]:
# Balance the df2 training split only; validation and test data are left untouched.
x_train2_smt, y_train2_smt = training_smote(x_train2, y_train2)

In [None]:
# Row count of the df2 training features before resampling (x_train2, not x_train2_smt).
len(x_train2)

SMOTE is not able to generate data

### All Models

#### SMOTE DF all models

In [None]:
#@title Run all models
# Train on the SMOTE-balanced df data, evaluate on the untouched df validation split.
model_run(start_models,x_train_smt,y_train_smt,x_val,y_val)

#### SMOTE DF2 all models

In [None]:
#@title Run all models 2
# Train on the SMOTE-balanced df2 data, evaluate on the untouched df2 validation split.
model_run(start_models,x_train2_smt,y_train2_smt,x_val2,y_val2)

## Best Performing Models

While the score may not be the highest, these models were the best at predicting a stroke. The trade off is lower non-stroke accuracy. In the case of healthcare, we would rather over capture non-stroke than miss actual strokes.

The df2 (more generalized data) and the SMOTE data increased accuracy.

Naive Bayes had the best stroke-positive prediction, but its non-stroke prediction was so poor that I am not sure it is usable in production.

In [None]:
#@title Selected Models
# Shortlist carried into tuning, as (display name, estimator) pairs.
# max_iter=4000 matches every other LogisticRegression in this notebook; the lower
# limit was raised earlier because lbfgs failed to converge.
best_models = [('Logistic Regression', LogisticRegression(max_iter=4000, random_state=10)),
          ('Linear Discriminant Analyzer', LinearDiscriminantAnalysis()),
          ('Support Vector Machine', SVC(probability = True,random_state=10)),
          ('Naive Bayes', ComplementNB())]

In [None]:
# Shortlisted models trained on SMOTE-balanced df2 data and evaluated on the df2 validation split.
model_run(best_models, x_train2_smt,y_train2_smt,x_val2,y_val2)

<a name="tuning"></a>
# Tuning

The tuning process consists of adjusting and measuring model parameters, and, if needed, revisiting feature engineering.

**Best Performing Model**


Comparison of the models in the variable `best_models`:

* **Logistic Regression**, `LogisticRegression()`
  * Validation Accuracy:     ~0.729
  * Training Accuracy:       ~0.781
  * Stroke Positive Missed: 8/52
* **Linear Discriminant Analyzer**, `LinearDiscriminantAnalysis()`
  * Validation Accuracy:     ~0.714
  * Training Accuracy:       ~0.782
  * Stroke Positive Missed: 5/55
* **Support Vector Machine**, `SVC(probability = True)`
  * Validation Accuracy:     ~0.669
  * Training Accuracy:       ~0.763
  * Stroke Positive Missed: 5/55
* **Naive Bayes**, `ComplementNB()`
  * Validation Accuracy:     ~0.497
  * Training Accuracy:       ~0.733
  * Stroke Positive Missed: 2/58

<a name="feat_imp"></a>
## Feature Importance - [article](https://machinelearningmastery.com/calculate-feature-importance-with-python/)

## Feature Importance Model

In [None]:
# Estimators passed to the recursive feature elimination loop.
my_list = [LogisticRegression(max_iter=4000, random_state=10),LinearDiscriminantAnalysis()]

In [None]:
#@title Feature Rank
# RFECV ranks the columns of x_train2_smt, which are the DictVectorizer output
# features (one per numeric column / category value), not the raw DataFrame columns.
rfe_feature_names = dv.get_feature_names_out()

for model in my_list:
    rfe = RFECV(estimator=model,step=1,cv=5)
    rfe.fit(x_train2_smt,y_train2_smt)
    print(" ")
    print(model)
    print(" ")
    # support_ and ranking_ are aligned with the feature columns, so report each
    # feature's own entry instead of repeating the entry at index 1.
    for col, selected, rank in zip(rfe_feature_names, rfe.support_, rfe.ranking_):
        print(f'{col} selected= {selected} rank= {rank}')

None of the features show a higher rank than the others.

In [None]:
#@title Get Feature Names

# Column names of the matrices produced by the vectorizer `dv` is currently bound to.
feat_names = dv.get_feature_names_out()

In [None]:
#@title def feature_importance(model)
def feature_importance(model):
    """Fit `model` on the SMOTE-balanced df2 training data and print its coefficients.

    The estimator must expose `coef_` after fitting; the first row of `coef_` is
    printed, one line per entry, labelled with the matching name in `feat_names`.
    """
    model.fit(x_train2_smt, y_train2_smt)
    coefficients = model.coef_[0]
    for position, score in enumerate(coefficients):
        print(f'Feature: {feat_names[position]}',' ',f'Score: {score}\n')

In [None]:
#@title Logistic Feature coefficients

# Same LogisticRegression settings as the baseline candidate.
feature_importance(LogisticRegression(max_iter=4000, random_state=10))

In [None]:
#@title Linear Discriminant Feature coefficients

feature_importance(LinearDiscriminantAnalysis())

# plot feature importance
# NOTE: disabled sketch. `importance` is local to feature_importance() and this
# notebook imports matplotlib.pyplot as `plt`, so these lines cannot run as written.
#pyplot.bar([x for x in range(len(importance))], importance)
#pyplot.show()

**NOTE:** Reading too much into coefficients can be dangerous, as the relationships can be complex.

Heart_disease, diabetes, and hypertension were all positively correlated with stroke. This is consistent with studies showing that these diseases greatly increase your likelihood of having a stroke. [Cardiovascular risk factors](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4419105/) / [Diabetes and Stroke: Epidemiology, Pathophysiology, Pharmaceuticals and Outcomes](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5298897/#:~:text=Diabetes%20is%20a%20well%2Destablished,stroke%20with%20uncontrolled%20glucose%20levels.)

The second-highest positive coefficient is for work_type=children, despite research showing that stroke in children is [rare](https://www.hopkinsmedicine.org/health/conditions-and-diseases/stroke/pediatric-stroke).

The obese feature may be confusing the models, as we are seeing inconsistent accounting for it, despite obesity being known to increase the likelihood of stroke. This may be a feature that is only positive when paired with other features. [Obesity and Stroke: Does the Paradox Apply for Stroke?](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7946563/).

The negative coefficient for smokes (in logistic regression) is also interesting, as we know smoking leads to hypertension and heart disease. It is likely only positive when combined with hypertension and/or heart disease.

In [None]:
# Stroke / non-stroke counts among rows whose work_type is 'children'.
df2[df2.work_type == 'children'].stroke.value_counts()

It may be better to remove the children type and limit age to 18+; creating an adult only model. It is reasonable that children would need a model specific to them.

## Feature Importance Data

In [None]:
#@title Sample Stroke Rate
# NOTE(review): the rate is taken from df, while the group breakdown that uses it is
# computed on df2 - confirm both frames contain the same patients.
stroke_rate = df.stroke.mean() # works because it is binary
round(stroke_rate,2)

In [None]:
# Column names, dtypes and non-null counts of df2.
df2.info()

In [None]:
# Columns of df2 treated as categorical in the breakdown and mutual-information cells.
categorical = ['gender','hypertension','heart_disease','ever_married','work_type','residence_type','smoking_status','obese','diabetes']

In [None]:
#@title Feature Importance Breakdown

# For every categorical column, show each group's stroke rate and size next to
# how it compares with the overall stroke rate.
for c in categorical:
    print(c)
    df_group = df2.groupby(c)['stroke'].agg(['mean', 'count'])
    df_group = df_group.assign(
        diff=df_group['mean'] - stroke_rate,  # difference between group and overall stroke rate
        risk=df_group['mean'] / stroke_rate,  # group risk relative to overall risk
    )
    display(df_group)
    print('\n')

The most interesting thing here is that each group's stroke risk relative to the overall stroke risk is consistent with expectations.

In [None]:
def mutual_stroke(series):
    """Return the mutual information score between `series` and the df2 stroke column."""
    stroke_labels = df2['stroke']
    return mutual_info_score(series, stroke_labels)

In [None]:
# One mutual-information score per categorical column, largest first.
mi = df2[categorical].apply(mutual_stroke)
mi.sort_values(ascending=False)

We can see that the categories are independent; this is expected.

In [None]:
#@title Feature Stroke Correlation
# iloc[1:] skips the first entry of the descending sort, which is stroke's correlation with itself.
df2.corr(numeric_only = True)['stroke'].sort_values(ascending=False).iloc[1:] # if using different dataframes, use df1.corrwith(df2)

**Note:** obese is a bool value and should not show in this list.

## Feature Importance Testing




**Hypothesis:** better prediction can be obtained by excluding children value.

In [None]:
# Work on a copy so that filtering out rows does not modify df2.
df2_adult = df2.copy()

In [None]:
# The hypothesis under test excludes children, so remove the rows whose work_type
# is 'children' (the value queried on df2 in the feature-coefficient section).
child = df2_adult[df2_adult['work_type'] == 'children'].index
# Rebinding instead of inplace=True keeps this cell safe to re-run.
df2_adult = df2_adult.drop(child)

In [None]:
# Build the under-18 mask from the frame being filtered. A mask from df applied to
# df2 mixes two frames, and it can select labels already removed from df2_adult,
# which makes drop() raise a KeyError.
age18 = df2_adult[df2_adult['age'] < 18].index
# Rebinding instead of inplace=True keeps this cell safe to re-run.
df2_adult = df2_adult.drop(age18)

In [None]:
# Age distribution of the rows that remain after filtering.
df2_adult.age.plot.hist(bins=50)

In [None]:
#@title Split: x_train2_adult / x_val2_adult / x_test2_adult

# Same scheme as the other splits: 20% validation, then 25% of the remainder as test.
df2_full_train_adult, df2_val_adult = train_test_split(df2_adult, test_size=0.2, random_state=1)
df2_train_adult, df2_test_adult = train_test_split(df2_full_train_adult, test_size=0.25, random_state=1)

df2_train_adult = df2_train_adult.reset_index(drop=True)
df2_val_adult = df2_val_adult.reset_index(drop=True)
df2_test_adult = df2_test_adult.reset_index(drop=True)

y_train2_adult = df2_train_adult.stroke.values
y_val2_adult = df2_val_adult.stroke.values
y_test2_adult = df2_test_adult.stroke.values

del df2_train_adult['stroke']
del df2_val_adult['stroke']
del df2_test_adult['stroke']

# Use a dedicated vectorizer for this experiment. Rebinding the shared `dv` here
# would replace the vectorizer that matches the df2 models, and that `dv` is the
# object later stored with the deployed model.
dv_adult = DictVectorizer(sparse=False)

# x_train2_adult
df2_train2_dict_adult = df2_train_adult.to_dict(orient='records')
x_train2_adult = dv_adult.fit_transform(df2_train2_dict_adult)

# x_val2_adult
df2_val2_dict_adult = df2_val_adult.to_dict(orient='records')
x_val2_adult = dv_adult.transform(df2_val2_dict_adult)

# x_test2_adult
df2_test2_dict_adult = df2_test_adult.to_dict(orient='records')
x_test2_adult = dv_adult.transform(df2_test2_dict_adult)

In [None]:
# Balance the adult-only training split; validation and test data are left untouched.
x_train2_adult_smt, y_train2_adult_smt = training_smote(x_train2_adult, y_train2_adult)

In [None]:
# best_models is defined in the "Best Performing Models" section
model_run(best_models,x_train2_adult_smt, y_train2_adult_smt,x_val2_adult,y_val2_adult)

All models did worse. Removing children had a negative effect.

<a name="modTune"></a>
## Model Tuning

### LogisticRegression Tuning
[Article](https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/)

In [None]:
# Untuned estimator handed to the grid search.
log_model = LogisticRegression(max_iter=4000, random_state=10)

In [None]:
# define LogisticRegression and parameters
model = log_model
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01,0.001]

# define grid search: every solver/penalty/C combination, scored by accuracy over
# 10-fold stratified cross-validation repeated 3 times
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=10)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train2, y_train2) # defined in the modeling section

# summarize results: best combination first, then every combination tried
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

As noted previously, we need positive stroke prediction first and model statistics second. Using the above results gives us a guideline to test against, but it is limited in that it is looking at model performance and not taking into account positive stroke prediction.

In [None]:
# Final model comparison
# One C=0.1 / l2 model per solver, followed by the untuned default for reference.
log_model_opt = [
    (solver_name, LogisticRegression(max_iter=4000, random_state=10, C=0.1, penalty='l2', solver=solver_name))
    for solver_name in ('liblinear', 'newton-cg', 'lbfgs')
] + [('Default', LogisticRegression(max_iter=4000, random_state=10))]

In [None]:
# Compare the logistic regression variants on SMOTE-balanced df2 data against the df2 validation split.
model_run(log_model_opt,x_train2_smt,y_train2_smt,x_val2,y_val2)

**Logistic Regression**, `LogisticRegression(max_iter=4000, random_state=10,C=0.1,penalty='l2',solver='liblinear')` is the best performing.

In [None]:
#@title Optimized Log Model

# Rebinds log_model to the liblinear / C=0.1 configuration.
log_model = LogisticRegression(max_iter=4000, random_state=10,C=0.1,penalty='l2',solver='liblinear')

### LinearDiscriminantAnalysis Tuning
[Article](https://machinelearningmastery.com/linear-discriminant-analysis-for-machine-learning/)

In [None]:
# One LinearDiscriminantAnalysis per solver; only the 'eigen' variant enables shrinkage.
lda_models = [('svd', LinearDiscriminantAnalysis(solver='svd')),
          ('lsqr', LinearDiscriminantAnalysis(solver='lsqr')),
          ('eigen', LinearDiscriminantAnalysis(shrinkage='auto',solver='eigen'))]

In [None]:
# Compare the LDA solvers on SMOTE-balanced df2 data against the df2 validation split.
model_run(lda_models,x_train2_smt,y_train2_smt,x_val2,y_val2)

The model `LinearDiscriminantAnalysis(solver='svd')` outperformed the Logistic Regression models.

In [None]:
# LDA configuration kept from the solver comparison.
lda_model = LinearDiscriminantAnalysis(solver='svd')

### SVM Tuning

#### SVM hyperparameter tuning using [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [None]:
# baseline
# train the model on the SMOTE-balanced df2 training set
model =  SVC(probability = True,random_state=10)
model.fit(x_train2_smt, y_train2_smt)

# print prediction results
# Evaluate on the df2 validation split: the model was fitted on df2 features, so
# the df matrices (x_val / y_val) are in a different feature space.
predictions = model.predict(x_val2)
print(classification_report(y_val2, predictions))

In [None]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# refit=True retrains the best parameter combination on the full data passed to fit()
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
# Search on the df2 training split, as the logistic regression search does. The
# later cells use this estimator on df2 matrices, so fitting on the df matrices
# (x_train / y_train) would leave it in the wrong feature space.
grid.fit(x_train2, y_train2)

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
# (best_estimator_ is the refitted SVC carrying the winning parameters)
print(grid.best_estimator_)

In [None]:
# Score the tuned model on the df2 validation split, the same data the baseline
# SVC report uses; a report on the resampled training rows would not be comparable
# and would hide overfitting.
grid_predictions = grid.predict(x_val2)

# print classification report
print(classification_report(y_val2, grid_predictions))

In [None]:
# Tuned RBF SVC versus the default SVC.
# NOTE(review): C=0.1 and gamma=1 are hard-coded, presumably copied from the grid
# search output - confirm they still match grid.best_params_.
svc_models =[('rbf',SVC(kernel='rbf',probability = True,random_state=10,C=0.1,gamma=1)),('Default',SVC(probability = True,random_state=10))]
model_run(svc_models, x_train2_smt,y_train2_smt,x_val2,y_val2)

Tuning led to overfitting and poor predictive accuracy for stroke-positive observations. The default SVC is the better of the two.

Best overall model however is `lda_model = LinearDiscriminantAnalysis(solver='svd')`

### Final Model selection

In [None]:
# Single (name, estimator) pair in the list format model_run and model_test expect.
final_model = [('lda_model', LinearDiscriminantAnalysis(solver='svd'))]

In [None]:
#@title def model_test(models)

def model_test(models, x_test=None, y_test=None):
    """Report test accuracy, a confusion matrix and ROC curves for fitted models.

    Args:
        models: iterable of (name, estimator) pairs. The estimators must already
            be fitted (for example by model_run); this function never calls fit().
        x_test: test features. Defaults to the notebook's df2 test matrix, x_test2.
        y_test: test labels. Defaults to the notebook's df2 test labels, y_test2.

    Returns:
        None. Results are printed and plotted.
    """
    if x_test is None:
        x_test = x_test2
    if y_test is None:
        y_test = y_test2

    for name, model in models:
        # Predict once; both the accuracy and the confusion matrix use these predictions.
        test_pred = model.predict(x_test)

        print(" ")
        print("Model: ",name)
        print('Testing Accuracy: ', accuracy_score(y_test, test_pred))
        print(" ")

        plt.figure()
        cf_matrix = confusion_matrix(y_test, test_pred)
        plt.title('Confusion Matrix: {}'.format(name))
        sns.heatmap(cf_matrix, annot = True, fmt = 'g', cmap = sns.cubehelix_palette(as_cmap=True))
        # Same axis labelling as model_run so validation and test plots read alike.
        plt.xticks([0.5, 1.5], ['Predicted Non-Stroke', 'Predicted Stroke'])
        plt.yticks([0.5, 1.5], ['Actual Non-Stroke', 'Actual Stroke'])
        plt.show()

        print(" ")

        skplt.metrics.plot_roc(y_test, model.predict_proba(x_test))
        plt.title('ROC Curves: {}'.format(name))
        plt.show()

In [None]:
# Fits the final model in place (model_test relies on it being fitted) and reports validation results.
model_run(final_model,x_train2_smt,y_train2_smt,x_val2,y_val2)

In [None]:
# Report the already-fitted final model on the held-out df2 test split.
model_test(final_model)

<a name="deploy"></a>
# Deployment

**This is the code for creating the bento, cannot access service on Google Colab.**

## Data prep

## Model

In [None]:
df2.iloc[6] # row 6 of df2 is a stroke-positive patient

**JSON format**

```json
{
"gender": "Male",
"age": 74.0,
"hypertension": 1,
"heart_disease": 1,
"ever_married": "Yes",
"work_type": "Private",
"residence_type": "Rural",
"smoking_status": "never smoked",
"obese": false,
"diabetes": false
}
```

In [None]:
# Fresh instance of the selected configuration; rebinds the notebook-level `model`.
model = LinearDiscriminantAnalysis(solver='svd')

In [None]:
# Fit on the SMOTE-balanced df2 training data, then get class probabilities for the df2 test rows.
model.fit(x_train2_smt, y_train2_smt)
y_pred = model.predict_proba(x_test2)

In [None]:
# Sanity check: one row of probabilities per test row.
len(x_test2), len(y_pred)

In [None]:
# Row 6 of df2 is not row 6 of x_test2: the train/test split shuffles and re-indexes
# the rows. Look up a stroke-positive test patient by label instead.
stroke_pos_idx = np.flatnonzero(y_test2 == 1)[0]
y_pred[stroke_pos_idx, 1] # column 1 is the predicted probability of stroke

<a name="bento"></a>
## Bentoml

In [None]:
# Export df2 to the working directory; training.py reads this file.
df2.to_csv('clean_stroke_data.csv', index=False)

In [None]:
%%writefile training.py

# Script to train and save model

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer
from imblearn.over_sampling import SMOTE
import bentoml


df = pd.read_csv('clean_stroke_data.csv') # csv must be in same directory

# data split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.stroke.values
y_test = df_test.stroke.values

del df_train['stroke']
del df_test['stroke']

# x_train
train_dict = df_train.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(train_dict)

# x_test
test_dict = df_test.to_dict(orient='records')
x_test = dv.transform(test_dict)

# ONLY APPLY SMOTE TO TRAIN!
def training_smote(x,y):
    smt = SMOTE(random_state=1)
    x_train, y_train = smt.fit_resample(x, y)
    return x_train, y_train

# SMOTE
x_train_smt, y_train_smt = training_smote(x_train, y_train)
len(x_train), len(y_train)

# Train model
model = LinearDiscriminantAnalysis(solver='svd')

model.fit(x_trai_smt, y_train_smt)

bentoml.sklearn.save_model('stroke_prediction', model,
                           custom_objects={
                               'dicVectorizer': dv
                           },
                           signatures = {"predict_proba": {"batchable": False}}
                           )

**Save Model**

In [None]:
# save model from notebook
# NOTE(review): `dv` must be the vectorizer whose features match the data `model`
# was fitted on - confirm no cell between training and this one has rebound `dv`.
bentoml.sklearn.save_model('stroke_prediction', model,
                           custom_objects={
                               'dicVectorizer': dv
                           },
                           signatures = {"predict_proba": {"batchable": False}}
                           )

**service.py file, service that runs.**

[Risk Rates](https://www.betterhealth.vic.gov.au/health/conditionsandtreatments/heart-disease-and-stroke-your-risk-score)

Calculate a percentage score (or absolute risk), which puts you into one of three categories of risk:

* **High risk** – a score over 15% means you are at high risk. If you have a score over 15%, you have at least a 1 in 7 chance of having a heart attack or stroke in the next five years, if nothing is changed.
* **Moderate risk** – a score of between 10% and 15%, you have (as a minimum), a 1 in 10 chance of having a heart attack or stroke in the next five years, if nothing is changed.
* **Low risk** – a score under 10%, you have a less than 1 in 10 chance of having a heart attack or stroke in the next five years, if nothing is changed.

In [None]:
%%writefile service.py

import bentoml
from bentoml.io import JSON
import numpy as np

model_ref = bentoml.sklearn.get('stroke_prediction:latest')
dv = model_ref.custom_objects['dicVectorizer']

model_runner = model_ref.to_runner()

svc = bentoml.Service('stroke_prediction', runners=[model_runner])

@svc.api(input= JSON.from_sample({
      "gender": "Male",
      "age": 74.0,
      "hypertension": 1,
      "heart_disease": 1,
      "ever_married": Yes,
      "work_type": "Private",
      "residence_type": "Rural",
      "smoking_status": "never smoked",
      "obese": False,
      "diabetes": False
        }), output= JSON())

def classify(application_data):
    vector = dv.transform(application_data)
    prediction = model_runner.predict_proba.run(vector)
    # np.format_float_positional returns a string
    round_predict = float(np.format_float_positional(prediction[:,1], precision=2))

    if round_predict < 0.10:
        return 'Stroke Risk: LOW'
    elif (round_predict => 0.10) & (round_predict =< 0.15):
        return 'Stroke Risk: MODERATE'
    else:
        return 'Stroke Risk: HIGH'

**The bentofile.yaml file needed to build the bento**

In [None]:
%%writefile bentofile.yaml

# Entry point: the `svc` object defined in service.py
service: "service.py:svc"
labels:
  owner: Gregory Morris
  project: MLZoomcamp midterm project
# Files copied into the bento
include:
- "*.py"
python:
    packages:  # Additional pip packages required by the service
    - scikit-learn
    - numpy
    - bentoml

**Build the bento from the yaml file**

**bentoml commands:**
* build bento: `bentoml build`

* docker container: `bentoml containerize [bento name:code given after 'bentoml build']`

* *make sure docker service is running if setting up locally. Then run this command line:*

    * run docker container: `docker run -it --rm -p 3000:3000 [bento name:code given after 'bentoml build'] serve --production`

## AWS
See README.md for AWS instructions.

## Production App Access
See README.md for App access instructions.

# Next steps

* Review outliers and test against model
* Consider AWS Lambda as alternative to ECS
* Investigate gradio and streamlit as front end options
* Consider ensemble methods to improve predictions
