### STROKE PREDICTION:

In [1]:
! pip install -U mlfoundry

Collecting mlfoundry
  Downloading mlfoundry-0.3.22-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.8/97.8 kB[0m [31m348.2 kB/s[0m eta [36m0:00:00[0m
Collecting whylogs<0.7.0,>=0.6.15
  Downloading whylogs-0.6.30-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pipreqs<0.5.0,>=0.4.11
  Downloading pipreqs-0.4.11-py2.py3-none-any.whl (32 kB)
Collecting scikit-learn<0.25.0,>=0.24.2
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.3/22.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting fastparquet<0.8.0,>=0.7.2
  Downloading fastparquet-0.7.2-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollectin

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score, accuracy_score


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv


#### Initialize MLFoundry client

In [3]:
import getpass
import urllib.parse
import mlfoundry as mlf

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
MLF_API_KEY = user_secrets.get_secret("mlfoundry")


# MLF_API_KEY = os.environ.get('MLF_API_KEY')
# if not MLF_API_KEY:
#     print("Please get your API key from https://app.truefoundry.com/settings")
#     MLF_API_KEY = getpass.getpass("Paste your API key and hit enter:")
    

In [4]:
client = mlf.get_client(api_key=MLF_API_KEY)

#### Load the data:

In [5]:
stroke_data = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


#### Exploratory Data Analysis:

In [6]:
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
stroke_data.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [8]:
na_values = stroke_data.isna().sum()
na_values.sort_values()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
bmi                  201
dtype: int64

#### Data Preprocessing:

In [9]:
stroke_data = stroke_data.set_index('id')

X = stroke_data.drop(columns =['stroke'])
y = stroke_data['stroke']

In [10]:
# get a list of all numeric columns in the dataset
numeric_cols = stroke_data.select_dtypes(exclude = "object").drop(columns = ['stroke']).columns.tolist()
print(numeric_cols)

# build a numeric transformer pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


# get a list of all categorical columns in the dataset
categoric_cols = stroke_data.select_dtypes(include = "object").columns.tolist()
print(categoric_cols)

# build a categorical transformer pipeline
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


# Apply the column transformer and make a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categoric_cols)])

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


### Training a logistic regression model

In [11]:
# Start MLFoundry Run 1
run = client.create_run(project_name='Stroke-Prediction-new')
print('RUN 1 ID:', run.run_id)

#Set tags for the run
run.set_tags({'framework': 'sklearn', 'task': 'Classification', 'model': 'Logistic Regression'})


# Append classifier to preprocessing pipeline.
LR_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

LR_clf.fit(X_train, y_train)

y_pred = LR_clf.predict(X_test)

# run.log_params(LR_clf.get_params())
run.log_model(LR_clf, framework=mlf.ModelFramework.SKLEARN)

# Log the metrics

metrics = {
    'test/precision': precision_score(y_test, y_pred),
    'test/recall': recall_score(y_test, y_pred),
    'test/accuracy': accuracy_score(y_test, y_pred),
    'test/roc_auc': roc_auc_score(y_test, y_pred),
    'test/f1_score': f1_score(y_test, y_pred)
}
print('Logistic_Regr_metrics:', metrics)
run.log_metrics(metrics)

# Log test dataset:

run.log_dataset(
    dataset_name='test',
    features=X_test,
    predictions=y_pred,
    actuals=y_test,
)

run.end()

[mlfoundry] 2022-06-08T07:19:04+0000 INFO Run is created with name 'leave-political-thing' and id 'c64167f7c6fc49c5a675afdd71ef8cf8'
RUN 1 ID: c64167f7c6fc49c5a675afdd71ef8cf8
[mlfoundry] 2022-06-08T07:19:06+0000 INFO Tags set successfully




[mlfoundry] 2022-06-08T07:19:55+0000 INFO Model logged successfully
Logistic_Regr_metrics: {'test/precision': 0.5, 'test/recall': 0.013157894736842105, 'test/accuracy': 0.9504240052185258, 'test/roc_auc': 0.5062357764693133, 'test/f1_score': 0.025641025641025637}
[mlfoundry] 2022-06-08T07:19:57+0000 INFO Metrics logged successfully
[mlfoundry] 2022-06-08T07:19:57+0000 INFO Logging Dataset, this might take a while ...
[mlfoundry] 2022-06-08T07:20:04+0000 INFO failed to log features as FileFormat.PARQUET due to need more than 2 values to unpack, trying with FileFormat.CSV
[mlfoundry] 2022-06-08T07:20:04+0000 INFO failed to log predictions as FileFormat.PARQUET due to need more than 2 values to unpack, trying with FileFormat.CSV
[mlfoundry] 2022-06-08T07:20:04+0000 INFO failed to log actuals as FileFormat.PARQUET due to need more than 2 values to unpack, trying with FileFormat.CSV
[mlfoundry] 2022-06-08T07:20:31+0000 INFO Dataset logged successfully
[mlfoundry] 2022-06-08T07:20:33+0000 IN

#### Training a decision tree classifier

In [12]:
# Start MLFoundry Run 2
run = client.create_run(project_name='Stroke-Prediction-new')
print('RUN 2 ID:', run.run_id)

#Set tags for the run
run.set_tags({'framework': 'sklearn', 'task': 'Classification', 'model': 'Decision Tree'})


# Append classifier to preprocessing pipeline.
DT_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

DT_clf.fit(X_train, y_train)

y_pred = DT_clf.predict(X_test)

# run.log_params(DT_clf.get_params())
run.log_model(DT_clf, framework=mlf.ModelFramework.SKLEARN)

# Log the metrics

DT_metrics = {
    'test/precision': precision_score(y_test, y_pred),
    'test/recall': recall_score(y_test, y_pred),
    'test/accuracy': accuracy_score(y_test, y_pred),
    'test/roc_auc': roc_auc_score(y_test, y_pred),
    'test/f1_score': f1_score(y_test, y_pred)
}
print('Decision_Tree_metrics:', DT_metrics)
run.log_metrics(DT_metrics)

# Log test dataset:

run.log_dataset(
    dataset_name='test',
    features=X_test,
    predictions=y_pred,
    actuals=y_test,
)

run.end()

[mlfoundry] 2022-06-08T07:20:39+0000 INFO Run is created with name 'appear-bad-office' and id 'c1c6014681384399b4f05df7e455af4e'
RUN 2 ID: c1c6014681384399b4f05df7e455af4e
[mlfoundry] 2022-06-08T07:20:41+0000 INFO Tags set successfully
[mlfoundry] 2022-06-08T07:21:10+0000 INFO Model logged successfully
Decision_Tree_metrics: {'test/precision': 0.15584415584415584, 'test/recall': 0.15789473684210525, 'test/accuracy': 0.9158512720156555, 'test/roc_auc': 0.5566412599790486, 'test/f1_score': 0.1568627450980392}
[mlfoundry] 2022-06-08T07:21:12+0000 INFO Metrics logged successfully
[mlfoundry] 2022-06-08T07:21:12+0000 INFO Logging Dataset, this might take a while ...
[mlfoundry] 2022-06-08T07:21:19+0000 INFO failed to log features as FileFormat.PARQUET due to need more than 2 values to unpack, trying with FileFormat.CSV
[mlfoundry] 2022-06-08T07:21:19+0000 INFO failed to log predictions as FileFormat.PARQUET due to need more than 2 values to unpack, trying with FileFormat.CSV
[mlfoundry] 202

#### Training a RandomForest Classifier

In [13]:
# Start MLFoundry Run 3
run = client.create_run(project_name='Stroke-Prediction-new')
print('RUN 3 ID:', run.run_id)

#Set tags for the run
run.set_tags({'framework': 'sklearn', 'task': 'Classification', 'model': 'RandomForest'})


# Append classifier to preprocessing pipeline.
RF_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

RF_clf.fit(X_train, y_train)

y_pred = RF_clf.predict(X_test)

# run.log_params(RF_clf.get_params())
run.log_model(RF_clf, framework=mlf.ModelFramework.SKLEARN)

# Log the metrics

RF_metrics = {
    'test/precision': precision_score(y_test, y_pred),
    'test/recall': recall_score(y_test, y_pred),
    'test/accuracy': accuracy_score(y_test, y_pred),
    'test/roc_auc': roc_auc_score(y_test, y_pred),
    'test/f1_score': f1_score(y_test, y_pred)
}
print('Decision_Tree_metrics:', RF_metrics)
run.log_metrics(RF_metrics)

# Log test dataset:

run.log_dataset(
    dataset_name='test',
    features=X_test,
    predictions=y_pred,
    actuals=y_test,
)

run.end()

[mlfoundry] 2022-06-08T07:21:55+0000 INFO Run is created with name 'must-new-family' and id '389637056ec54f8d9e73814e9140f853'
RUN 3 ID: 389637056ec54f8d9e73814e9140f853
[mlfoundry] 2022-06-08T07:21:56+0000 INFO Tags set successfully
[mlfoundry] 2022-06-08T07:22:52+0000 INFO Model logged successfully
Decision_Tree_metrics: {'test/precision': 0.5, 'test/recall': 0.013157894736842105, 'test/accuracy': 0.9504240052185258, 'test/roc_auc': 0.5062357764693133, 'test/f1_score': 0.025641025641025637}
[mlfoundry] 2022-06-08T07:22:54+0000 INFO Metrics logged successfully
[mlfoundry] 2022-06-08T07:22:54+0000 INFO Logging Dataset, this might take a while ...
[mlfoundry] 2022-06-08T07:23:01+0000 INFO failed to log features as FileFormat.PARQUET due to need more than 2 values to unpack, trying with FileFormat.CSV
[mlfoundry] 2022-06-08T07:23:01+0000 INFO failed to log predictions as FileFormat.PARQUET due to need more than 2 values to unpack, trying with FileFormat.CSV
[mlfoundry] 2022-06-08T07:23:0