In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import mlflow
import mlflow.sklearn

from pathlib import Path

## Data

The data in this notebook was used in a hackathon: https://www.machinehack.com/course/plugin-hackathon-cardiac-arrest-risk-prediction/

In [3]:
!ls Cardiac_Arrest_Participants_Data

Cardiac_Arrest_Participants_Data  Sample_Submission.xlsx  Test.csv  Train.csv


In [4]:
# Directory that holds the hackathon files (Train.csv, Test.csv, Sample_Submission.xlsx).
path = Path('Cardiac_Arrest_Participants_Data')

## Read data

In [5]:
# Load the training data (Train.csv) from the data directory.
df = pd.read_csv(path/'Train.csv')

In [6]:
# Encode the UnderRisk labels as integers ('yes' -> 1, 'no' -> 0).
# Note: this modifies `df` in place.
df.replace({'UnderRisk': {'yes': 1, 'no': 0}}, inplace=True)

In [7]:
df

Unnamed: 0,Gender,Chain_smoker,Consumes_other_tobacco_products,HighBP,Obese,Diabetes,Metabolic_syndrome,Use_of_stimulant_drugs,Family_history,History_of_preeclampsia,CABG_history,Respiratory_illness,UnderRisk
0,1,1,1,0,1,0,0,0,1,0,0,0,0
1,1,0,1,0,1,0,0,0,1,0,0,0,0
2,1,0,1,0,1,0,0,0,1,0,0,0,0
3,1,0,1,0,1,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,2,0,1,0,1,0,0,0,1,0,0,0,0
885,1,0,1,0,1,0,0,0,1,0,0,0,0
886,1,0,0,1,1,0,0,0,1,0,0,0,1
887,2,0,1,0,0,0,0,1,1,0,0,0,0


In [8]:
# Load Test.csv from the data directory.
# NOTE(review): test_df is not referenced again in the visible part of this notebook.
test_df = pd.read_csv(path/'Test.csv')

## Connect to MLflow server

In [9]:
# Presumably the command used to start the local tracking server this cell connects to:
#   mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 0.0.0.0 --port 5000
remote_server_uri = "http://localhost:5000" # set to your server URI
mlflow.set_tracking_uri(remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env

In [25]:
# Sanity check: display the tracking URI that is currently in effect.
mlflow.tracking.get_tracking_uri()

'http://localhost:5000'

In [10]:
# Make 'Cardiac' the active experiment for the runs started below.
mlflow.set_experiment('Cardiac')

## Training

In [11]:
def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Args:
        actual: True target values.
        pred: Predicted target values.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and the R^2 score, in that order.
    """
    squared_error = mean_squared_error(actual, pred)
    scores = (
        np.sqrt(squared_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )
    return scores

In [12]:
def prepare_split(data, target="UnderRisk", test_size=0.1, random_state=None):
    """Split ``data`` into train/test feature and target frames.

    Args:
        data: DataFrame containing the ``target`` column; every other column
            is kept as a feature.
        target: Name of the label column. Defaults to ``"UnderRisk"``.
        test_size: Forwarded to ``train_test_split``. The default of 0.1
            holds out 10% of the rows for evaluation.
        random_state: Forwarded to ``train_test_split``. With the default
            ``None`` the shuffle draws from NumPy's global random state, so
            callers that need a repeatable split must seed it beforehand
            (or pass an explicit value here).

    Returns:
        ``(train_x, train_y, test_x, test_y)``. The two ``*_y`` values are
        single-column DataFrames (selected with ``[[target]]``), not Series.
    """
    # Hold out `test_size` of the rows (10% by default) as the evaluation set.
    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # Features are every column except the label.
    train_x = train.drop(columns=[target])
    test_x = test.drop(columns=[target])

    # Keep the label as a one-column frame to preserve the original return shape.
    train_y = train[[target]]
    test_y = test[[target]]
    return train_x, train_y, test_x, test_y

In [22]:
def train_decision_tree(train_x, train_y, test_x, test_y, decision_tree_params,
                        artifact_file=None):
    """Fit a DecisionTreeClassifier, evaluate it and log the results to MLflow.

    Args:
        train_x: Training features.
        train_y: Training labels (a 1-D sequence or a single-column frame).
        test_x: Evaluation features.
        test_y: Evaluation labels.
        decision_tree_params: Dict with the keys ``'max_depth'`` and
            ``'min_samples_split'``.
        artifact_file: Path of a file to attach to the run as an artifact.
            When ``None`` (default) the notebook-level ``data_path`` global
            is used, which must be defined before this function is called.

    Side effects:
        Prints the metrics and logs params, metrics, the artifact file and
        the fitted model (under ``"tree_model"``) through the ``mlflow``
        fluent API.
    """
    max_depth = decision_tree_params['max_depth']
    min_samples_split = decision_tree_params['min_samples_split']

    if artifact_file is None:
        # Backward-compatible fallback to the notebook global.
        artifact_file = data_path

    # Execute DecisionTreeClassifier
    tree = DecisionTreeClassifier(
        random_state=0,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )
    # Flatten the label to 1-D: callers in this notebook pass a single-column
    # DataFrame, while scikit-learn classifiers expect a label vector.
    tree.fit(train_x, np.ravel(train_y))

    # Evaluate Metrics
    predictions = tree.predict(test_x)
    rmse, mae, r2 = eval_metrics(test_y, predictions)

    # Print out metrics
    print(f"DecisionTreeClassifier model (max_depth={max_depth}, min_samples_split={min_samples_split}):")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log parameters, metrics, and model to MLflow
    mlflow.log_params({"max_depth": max_depth, "min_samples_split": min_samples_split})
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    print('data_path', artifact_file)
    mlflow.log_artifact(artifact_file)
    print("Save to: {}".format(mlflow.get_artifact_uri()))

    mlflow.sklearn.log_model(tree, "tree_model")
    
def train_knn(train_x, train_y, test_x, test_y, knn_params, artifact_file=None):
    """Fit a KNeighborsClassifier, evaluate it and log the results to MLflow.

    Args:
        train_x: Training features.
        train_y: Training labels (a 1-D sequence or a single-column frame).
        test_x: Evaluation features.
        test_y: Evaluation labels.
        knn_params: Dict with the keys ``'n_neighbors'`` and ``'leaf_size'``.
        artifact_file: Path of a file to attach to the run as an artifact.
            When ``None`` (default) the notebook-level ``data_path`` global
            is used, which must be defined before this function is called.

    Side effects:
        Prints the metrics and logs params, metrics, the artifact file and
        the fitted model (under ``"knn_model"``) through the ``mlflow``
        fluent API.
    """
    n_neighbors = knn_params['n_neighbors']
    leaf_size = knn_params['leaf_size']

    if artifact_file is None:
        # Backward-compatible fallback to the notebook global.
        artifact_file = data_path

    # Execute KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, leaf_size=leaf_size)
    # Flatten the label to 1-D: callers in this notebook pass a single-column
    # DataFrame, and scikit-learn expects a label vector here (a column
    # vector otherwise triggers a data-conversion warning).
    knn.fit(train_x, np.ravel(train_y))

    # Evaluate Metrics
    predictions = knn.predict(test_x)
    rmse, mae, r2 = eval_metrics(test_y, predictions)

    # Print out metrics
    print(f"KNeighborsClassifier model (n_neighbors={n_neighbors}, leaf_size={leaf_size}):")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log parameters, metrics, and model to MLflow
    mlflow.log_params({"n_neighbors": n_neighbors, "leaf_size": leaf_size})
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    # NOTE(review): the 'test123' artifact sub-directory looks like a leftover
    # placeholder; it is kept unchanged so the artifact layout stays the same.
    mlflow.log_artifact(artifact_file, 'test123')
    print("Save to: {}".format(mlflow.get_artifact_uri()))

    mlflow.sklearn.log_model(knn, "knn_model")

In [23]:
# CSV file that train_decision_tree / train_knn attach to each MLflow run as an artifact.
data_path = 'Cardiac_Arrest_Participants_Data/Train.csv'

def train(data, decision_tree_params, knn_params):
    """Train and log a decision tree and a k-NN model on the same split.

    Args:
        data: DataFrame passed to ``prepare_split`` to obtain the
            train/test features and labels.
        decision_tree_params: Hyper-parameters forwarded to
            ``train_decision_tree``.
        knn_params: Hyper-parameters forwarded to ``train_knn``.

    Side effects:
        Reseeds NumPy's global random generator with 777 and starts two
        MLflow runs, one per model, so each model's params, metrics and
        artifacts are recorded separately.
    """
    # Silence warnings only while training instead of permanently changing
    # the process-wide warning filters for the rest of the notebook.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Fixed seed so repeated calls work from the same random state.
        np.random.seed(777)

        train_x, train_y, test_x, test_y = prepare_split(data)

        # One MLflow run per model; the run names make them easy to tell
        # apart in the tracking UI.
        with mlflow.start_run(run_name="decision_tree"):
            train_decision_tree(train_x, train_y, test_x, test_y, decision_tree_params)

        with mlflow.start_run(run_name="knn"):
            train_knn(train_x, train_y, test_x, test_y, knn_params)

In [24]:
# Train both models: tree with max_depth=3 / min_samples_split=2, k-NN with n_neighbors=3.
train(df, {'max_depth':3, 'min_samples_split':2}, {'n_neighbors': 3, 'leaf_size': 30})

DecisionTreeClassifier model (max_depth=3, min_samples_split=2):
  RMSE: 0.4740454631399772
  MAE: 0.2247191011235955
  R2: -0.2898550724637683
data_path Cardiac_Arrest_Participants_Data/Train.csv
Save to: mlruns/1/4465b107bcb843a3b07f100c67aebd51/artifacts
KNeighborsClassifier model (n_neighbors=3, leaf_size=30):
  RMSE: 0.4620423639315076
  MAE: 0.21348314606741572
  R2: -0.22536231884057978
Save to: mlruns/1/08294761de5e44ac80490f106bdfde24/artifacts


In [21]:
# Train both models: tree with max_depth=4 / min_samples_split=2, k-NN with n_neighbors=4.
train(df, {'max_depth':4, 'min_samples_split':2}, {'n_neighbors': 4, 'leaf_size': 30})

DecisionTreeClassifier model (max_depth=4, min_samples_split=2):
  RMSE: 0.4740454631399772
  MAE: 0.2247191011235955
  R2: -0.2898550724637683
data_path Cardiac_Arrest_Participants_Data/Train.csv
Save to: mlruns/1/2e289137a9b945508cf929d77c3107b5/artifacts
KNeighborsClassifier model (n_neighbors=4, leaf_size=30):
  RMSE: 0.4620423639315076
  MAE: 0.21348314606741572
  R2: -0.22536231884057978
Save to: mlruns/1/d3c44f85e5774d31bb9184bf537d2f2e/artifacts


In [17]:
# Train both models: tree with max_depth=4 / min_samples_split=3, k-NN with n_neighbors=5.
train(df, {'max_depth':4, 'min_samples_split':3}, {'n_neighbors': 5, 'leaf_size': 30})

DecisionTreeClassifier model (max_depth=4, min_samples_split=3):
  RMSE: 0.4740454631399772
  MAE: 0.2247191011235955
  R2: -0.2898550724637683
Save to: mlruns/1/6614589b4b014052899e185fbd11d2f4/artifacts
KNeighborsClassifier model (n_neighbors=5, leaf_size=30):
  RMSE: 0.4620423639315076
  MAE: 0.21348314606741572
  R2: -0.22536231884057978
Save to: mlruns/1/b2642a2dce0447cda178bce6a8dac71a/artifacts
