# ML Ops: Individual Assignment - ML Flow

June 2021

## 1. Load Libraries and Dataset

In [122]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import shutil  
import os

In [123]:
#Delete last runs directory
try:
    shutil.rmtree('./mlruns')
except FileNotFoundError:
    print("WARNING: Can't find folder mlruns")

In [124]:
# Load Data
# Download Dataset from: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009
path = "C:/Users/Tini/OneDrive/04_Electives/04_MLOps/02_Ind Assignment/winequality-red.csv"
df = pd.read_csv(path,sep = ",")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [125]:
# Split into test & train set

x = df.drop("quality", axis=1)
y = df.quality


x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.20,random_state=42)

In [126]:
# Scale the X data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## 2. Set up Model

In [127]:
def calculate_errors(y,y_pred):
    accuracy = accuracy_score(y,y_pred)
    precision = precision_score(y, y_pred, average = "macro", labels=np.unique(y_pred))
    recall = recall_score(y,y_pred, average = "macro",labels=np.unique(y_pred))
    y,y_pred = np.array(y), np.array(y_pred)
    return accuracy,precision,recall 

In [128]:
def train_knn_classifier(n,p,exp = None):
    with mlflow.start_run(experiment_id=exp): #start mlflow run
        knn = KNeighborsClassifier(n_neighbors= n, p=p, metric = "minkowski", weights = "distance") #, weights=w)
        knn.fit(x_train,y_train)
        y_pred = knn.predict(x_test)
        
        #calculate errors
        accuracy, precision, recall = calculate_errors(y_test,y_pred)
        errors = accuracy,precision,recall #,f1_score
        print("accuracy:{0:.3f}, precision:{1:.2f}, recall:{2:.2f}".format(accuracy,precision,recall))
        
        #log metris and parmeters
        mlflow.log_metrics({"Accuracy":accuracy,"Precision":precision, "Recall":recall})
        mlflow.log_params({"n_neighbors":n, "p": p})
        
        #register model
        mlflow.sklearn.log_model(knn, "model")
        
        #save error plot
        plt.figure()
        plt.bar(['accuracy','precision','recall'],errors,color=['blue','red','green']);
        plt.title("Errors")
        plt.savefig("errors.png")
        plt.close()
        mlflow.log_artifact("errors.png")

## 3. Train and Predict - Multi-classes

In [129]:
metric_list = ['euclidean', 'manhattan', 'chebyshev','minkowski', 'wminkowski', 'seuclidean','mahalanobis']
weight_list = ['uniform', 'distance']

In [130]:
 for n in range(1,41):
        for p in range(1,3):
                    train_knn_classifier(n,p)

accuracy:0.625, precision:0.32, recall:0.32
accuracy:0.622, precision:0.34, recall:0.34
accuracy:0.625, precision:0.32, recall:0.32
accuracy:0.622, precision:0.34, recall:0.34
accuracy:0.647, precision:0.42, recall:0.42
accuracy:0.625, precision:0.31, recall:0.32
accuracy:0.656, precision:0.39, recall:0.40
accuracy:0.603, precision:0.35, recall:0.33
accuracy:0.653, precision:0.39, recall:0.40
accuracy:0.625, precision:0.37, recall:0.39
accuracy:0.637, precision:0.37, recall:0.39
accuracy:0.644, precision:0.48, recall:0.41
accuracy:0.656, precision:0.39, recall:0.40
accuracy:0.653, precision:0.39, recall:0.40
accuracy:0.669, precision:0.39, recall:0.41
accuracy:0.653, precision:0.48, recall:0.50
accuracy:0.653, precision:0.64, recall:0.67
accuracy:0.669, precision:0.50, recall:0.52
accuracy:0.669, precision:0.50, recall:0.52
accuracy:0.678, precision:0.50, recall:0.52
accuracy:0.656, precision:0.48, recall:0.51
accuracy:0.666, precision:0.50, recall:0.51
accuracy:0.675, precision:0.50, 

Our classifier predicts with an accuracy of 62-67%. Nothing to really win big prizes so in a different setting we would focus next on improving the classifier, e.g. through feature engineering, up/downsampling of unbalanced classes, cross-validation, using other classifier algorithms etc. 

In [131]:
# Find the best model
df_runs = mlflow.search_runs()
best_run_id = df_runs.loc[df_runs['metrics.Accuracy'].idxmax()]['run_id']
print("Minimum error run_id: ",best_run_id)

Minimum error run_id:  753355a4c0f541e9b6a86373406deaf7


In [132]:
# The number of neighbors that maximizes accuracy
print("Number of neighbors that maximizes accuracy:")
df_runs[df_runs["run_id"]== best_run_id][["metrics.Accuracy","params.n_neighbors"]]

Number of neighbors that maximizes accuracy:


Unnamed: 0,metrics.Accuracy,params.n_neighbors
51,0.68125,15


Alternatively, we can use the $mlflow ui command in the cmd-line to analzye the different model runs in the ML Flow Tool. 

## 4. Train for binary classification - Good or bad wine

In [138]:
#If quality value is less than or eqaul to 6 then it will be in class 0
#If quality value is greater than 6  then it will be in class 1
df = df.copy()
df['quality'] = np.where(df['quality'] > 6, 1, 0)
df['quality'].value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [139]:
# Split into test & train set

x = df.drop("quality", axis=1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.20,random_state=42)

In [140]:
# Scale the X data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [141]:
 for n in range(1,40):
        for p in np.arange(1,2):
                    train_knn_classifier(n,p)

accuracy:0.887, precision:0.77, recall:0.79
accuracy:0.887, precision:0.77, recall:0.79
accuracy:0.903, precision:0.81, recall:0.81
accuracy:0.894, precision:0.79, recall:0.79
accuracy:0.916, precision:0.84, recall:0.81
accuracy:0.903, precision:0.81, recall:0.78
accuracy:0.912, precision:0.84, recall:0.79
accuracy:0.906, precision:0.83, recall:0.78
accuracy:0.922, precision:0.86, recall:0.81
accuracy:0.912, precision:0.84, recall:0.79
accuracy:0.922, precision:0.86, recall:0.80
accuracy:0.909, precision:0.83, recall:0.80
accuracy:0.909, precision:0.83, recall:0.78
accuracy:0.903, precision:0.82, recall:0.78
accuracy:0.916, precision:0.85, recall:0.79
accuracy:0.906, precision:0.83, recall:0.77
accuracy:0.909, precision:0.83, recall:0.78
accuracy:0.912, precision:0.85, recall:0.77
accuracy:0.906, precision:0.83, recall:0.77
accuracy:0.906, precision:0.83, recall:0.76
accuracy:0.903, precision:0.83, recall:0.75
accuracy:0.900, precision:0.82, recall:0.74
accuracy:0.894, precision:0.81, 

In [142]:
# Find the best model
df_runs = mlflow.search_runs()
best_run_id = df_runs.loc[df_runs['metrics.Accuracy'].idxmax()]['run_id']
print("Minimum error run_id: ",best_run_id)

Minimum error run_id:  9da3e8b49dff4de9bf628c6c11068792


In [143]:
# The number of neighbors that maximizes accuracy
print("Number of neighbors that maximizes accuracy:")
df_runs[df_runs["run_id"]== best_run_id][["metrics.Accuracy","params.n_neighbors"]]

Number of neighbors that maximizes accuracy:


Unnamed: 0,metrics.Accuracy,params.n_neighbors
28,0.921875,11


In [121]:
print(range(1,3))

range(1, 3)
