# Libraries

In [1]:
# data prep
import numpy as np
import pandas as pd

# scikit-learn 
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# mlflow
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# Loading data

https://archive.ics.uci.edu/ml/datasets/Heart+Disease

Features related to laboratory and physiological testing include:

* Age of patient
* Sex of patient
* CP — chest pain type
* Trestbps — resting blood pressure
* Chol — serum cholesterol
* FBS — fasting blood sugar
* Restecg — resting electrocardiogram results
* Thalach — maximum heart rate
* Exang — exercise induced angina
* Oldpeak — ST depression induced by exercise
* Slope — slope of the peak exercise ST segment
* CA — number of major vessels colored by fluoroscopy
* Thal — type of defect
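* Num — diagnosis of heart disease (this is the label, not an input feature: 0 is treated as "no disease" and any value above 0 as "disease" when the binary `target` column is built below)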

based on: https://towardsdatascience.com/machine-learning-model-development-and-deployment-with-mlflow-and-scikit-learn-pipelines-f658c39e4d58

In [2]:
# https link
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'

# The raw file ships without a header row, so the column names are supplied here.
COLUMN_NAMES = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
                'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

# reading data
# The file marks missing values with '?'. Without na_values those cells stay as
# strings, the affected columns are loaded as object dtype, and the imputers in
# the pipeline never see them as missing. Parsing '?' as NaN keeps every column
# numeric and lets the imputers do their job.
df = pd.read_csv(URL, header=None, names=COLUMN_NAMES, na_values='?')

# set up target variable: any diagnosis code above 0 is collapsed to 1 (disease present)
df['target'] = np.where(df['num'] > 0, 1, 0)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0


# Model

Split our data up into train, validation, and test datasets

In [3]:
# Fixed seed so the three splits -- and every metric computed from them -- are
# identical on each Restart & Run All instead of changing from run to run.
RANDOM_STATE = 42

# Train and Test: hold out 20% of all rows as the test set
train_val, test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
# Validation: carve 20% of the remaining rows out of the training data
train, val = train_test_split(train_val, test_size=0.2, random_state=RANDOM_STATE)

print(len(train), 'Train Examples')
print(len(test), 'Test Examples')
print(len(val), 'Validation Examples')


193 Train Examples
61 Test Examples
49 Validation Examples


We are now ready to import our custom class and define the pipeline.

In [4]:
from custom_class import NewFeatureTransformer

# ---------------------------------------------------------------------------
# Preprocessing: one small sub-pipeline per group of columns
# ---------------------------------------------------------------------------

# Continuous measurements: fill gaps with the column median, then standardize.
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Multi-level codes: fill gaps with the constant 0, then one-hot encode.
# Categories unseen during fit are ignored at predict time instead of raising.
categorical_features = ['cp', 'restecg', 'ca', 'thal', 'slope']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# 0/1 flags: only gap filling (constant 0); values are otherwise passed as-is.
binary_features = ['sex', 'fbs', 'exang']
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
])

# Columns handed to the custom transformer imported from custom_class.py.
new_features_input = ['thalach', 'trestbps']
new_transformer = Pipeline([
    ('new', NewFeatureTransformer()),
])

# Route each column group to its sub-pipeline. Columns that are not listed
# (including 'num' and 'target') are not forwarded to the classifier.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('binary', binary_transformer, binary_features),
    ('new', new_transformer, new_features_input),
])

# ---------------------------------------------------------------------------
# Full model: preprocessing followed by a logistic regression classifier
# ---------------------------------------------------------------------------
clf = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ],
    verbose=True,
)

# Fit on the training split; the label is read from the 'target' column.
clf.fit(train, train['target'].values)

# Class predictions for the validation split (used for the metrics logged later).
y_pred = clf.predict(val)


[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.8s


In [5]:
class ModelOut(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that serves the positive-class probability of a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba`` (the scikit-learn pipeline
        built in this notebook).
    """

    def __init__(self, model):
        self.model = model

    def predict(self, context, model_input):
        """Return the probability of class 1 for every row of ``model_input``.

        Parameters
        ----------
        context : object
            Supplied by MLflow when the model is served; not used here.
        model_input : pandas.DataFrame
            Input rows. Column names are matched case-insensitively by
            lower-casing them before they reach the wrapped model.

        Returns
        -------
        numpy.ndarray
            Second column of ``predict_proba``, i.e. the probability of class 1.
        """
        # rename() returns a new frame, so the caller's DataFrame keeps its
        # original column names (assigning to .columns would change them in
        # place). str() guards against non-string labels such as integers.
        normalized_input = model_input.rename(columns=lambda name: str(name).lower())
        return self.model.predict_proba(normalized_input)[:, 1]

In [6]:
# Conda environment specification stored alongside the logged model.
# Every pip package is pinned to an exact version.
mlflow_conda = dict(
    channels=['defaults'],
    name='conda',
    dependencies=[
        'python=3.8',
        'pip',
        {
            'pip': [
                'mlflow==1.11.0',
                'scikit-learn==0.24.2',
                'cloudpickle==1.5.0',
                'pandas==1.3.1',
                'numpy==1.21.1',
            ],
        },
    ],
)

In [7]:
# True labels of the validation split, shared by all three metrics below.
y_val = val['target'].values

with mlflow.start_run():
    # log metrics (computed from the validation-set class predictions in y_pred)
    mlflow.log_metric("accuracy", accuracy_score(y_val, y_pred))
    # Key was previously misspelled "precison", which made the metric hard to
    # find and compare in the MLflow UI.
    mlflow.log_metric("precision", precision_score(y_val, y_pred))
    mlflow.log_metric("recall", recall_score(y_val, y_pred))

    # log model: the fitted pipeline wrapped in ModelOut, shipped together with
    # custom_class.py so the custom transformer can be imported at load time
    mlflow.pyfunc.log_model(artifact_path="model",
                            python_model=ModelOut(model=clf),
                            code_path=['custom_class.py'],
                            conda_env=mlflow_conda)

    # NOTE(review): this signature is inferred after the model has been logged
    # and is never passed to log_model, so it is not stored with the model.
    # It is also built from `val` (which still carries 'num' and 'target') and
    # from class labels, while ModelOut.predict returns probabilities. Decide
    # whether to attach a corrected signature or drop this line -- TODO confirm.
    signature = infer_signature(val, y_pred)

    # print out the active run ID
    run = mlflow.active_run()
    print("Active run_id: {}".format(run.info.run_id))

Active run_id: 3dc5d9cf4f894a05ae970ba92ca7ffb9


  inputs = _infer_schema(model_input)
