In [1]:
import os 
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import requests
import zipfile

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score, roc_auc_score, precision_score, average_precision_score

from datetime import datetime



* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# URL of the zipped dataset; the path contains a commit hash of the data repository
DATA_LINK = "https://raw.githubusercontent.com/danielstankw/data-mlops/5d593d3e7ea9c19bb154a3eb5a8527ce88543a97/archive.zip"
# NOTE(review): USE_MLFLOW is not read anywhere in the visible cells — MLflow logging always runs
USE_MLFLOW = False

In [3]:
# Address of the MLflow tracking server (localhost, port 5000)
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
# Client object bound to that tracking server
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [4]:
def download_data(url: str) -> pd.DataFrame:
    """
    Download a zip archive, extract it into the local 'data' folder and load a CSV from it.

    Parameters:
    - url (str): The URL of the zip archive to be downloaded.

    Returns:
    - pd.DataFrame: Contents of the alphabetically first '.csv' file found in the
      'data' folder after extraction.

    Raises:
    - requests.HTTPError: If the server answers with an error status code.
    - ValueError: If the 'data' folder contains no '.csv' file after extraction.
    """

    data_dir = 'data'

    # Create 'data' directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Download the file; fail loudly on HTTP errors instead of writing an
    # error page to disk and crashing later inside zipfile.
    zip_path = os.path.join(data_dir, 'archive.zip')
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(zip_path, 'wb') as file:
        file.write(response.content)

    # Unzip into the 'data' directory; the temporary archive is removed even
    # when extraction fails, so a broken download does not linger on disk.
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        os.remove(zip_path)

    # os.listdir returns entries in arbitrary order; sort so that the chosen
    # file is deterministic when several CSVs are present.
    csv_files = sorted(name for name in os.listdir(data_dir) if name.endswith('.csv'))
    if not csv_files:
        raise ValueError("No CSV files found in the downloaded data.")

    # Read the first CSV file into a DataFrame
    return pd.read_csv(os.path.join(data_dir, csv_files[0]))

def preprocess_data(df: pd.DataFrame) -> tuple:
    """
    Preprocess the given data: drop duplicates, drop 'Other' gender rows,
    recategorize smoking history and separate the target column.

    The input frame is left untouched; all work happens on a copy.

    Parameters:
    - df (pd.DataFrame): The input data to be preprocessed. Must contain the
      columns 'gender', 'smoking_history' and 'diabetes'.

    Returns:
    - pd.DataFrame: The preprocessed feature data (without the 'diabetes' column).
    - pd.Series: The 'diabetes' target column, aligned on the same index.
    """

    # Coarser smoking categories; values missing from the mapping pass through unchanged.
    smoking_map = {
        'never': 'non_smoker',
        'No Info': 'no_info',
        'current': 'current',
        'ever': 'past_smoker',
        'former': 'past_smoker',
        'not current': 'past_smoker'
    }

    # Handling duplicates
    cleaned = df.drop_duplicates()

    # Drop rows where gender is 'Other'. The explicit copy makes the filtered
    # frame independent, so the column assignment below does not write to a
    # slice (which raises SettingWithCopyWarning in pandas).
    cleaned = cleaned[cleaned['gender'] != 'Other'].copy()

    # Recategorize the 'smoking_history' column
    cleaned['smoking_history'] = cleaned['smoking_history'].map(
        lambda status: smoking_map.get(status, status)
    )

    # Separate the target column without mutating in place
    target = 'diabetes'
    y = cleaned[target].copy()
    features = cleaned.drop(columns=target)

    return features, y

def split_data(df: pd.DataFrame, y: pd.Series, verbose: bool = False) -> tuple:
    """
    Carve the data into stratified train / validation / test partitions.

    20% of the rows are held out first, then that hold-out is halved into
    validation and test. Both cuts are stratified on the target and use
    random_state=42.

    Parameters:
    - df (pd.DataFrame): The input data.
    - y (pd.Series): The target values.
    - verbose (bool): If True, print the percentage of diabetic values in each set.

    Returns:
    - tuple: (df_train, df_val, df_test, y_train, y_val, y_test).
    """

    # First cut: training set vs. hold-out
    df_train, df_holdout, y_train, y_holdout = train_test_split(
        df, y, test_size=0.2, random_state=42, stratify=y
    )

    # Second cut: hold-out -> validation and test
    df_val, df_test, y_val, y_test = train_test_split(
        df_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
    )

    if verbose:
        partitions = (('train', y_train), ('validation', y_val), ('test', y_test))
        for set_name, labels in partitions:
            share = np.round(labels.sum() / len(labels) * 100, 2)
            print(f'[%] of diabetic values in the {set_name} set: {share}')

    return df_train, df_val, df_test, y_train, y_val, y_test

def calculate_metrics(y_true: pd.Series, y_pred: pd.Series, y_score=None) -> tuple:
    """
    Calculate various metrics to evaluate the performance of a classification model.

    Parameters:
    - y_true (pd.Series): The true target values.
    - y_pred (pd.Series): The predicted class labels.
    - y_score (array-like, optional): Continuous scores (e.g. predicted
      probabilities of the positive class). When given, they are used for the
      ranking metrics (AUC and average precision). When omitted, y_pred is used
      for those metrics as well, which matches the previous behaviour.

    Returns:
    - tuple: AUC, F1 score, precision, recall, average precision, and accuracy.
    """

    # ROC-AUC and average precision are ranking metrics: computed on hard 0/1
    # labels they collapse to a single operating point, so prefer real scores
    # whenever the caller can provide them.
    ranking_input = y_pred if y_score is None else y_score

    auc = roc_auc_score(y_true, ranking_input)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    avg_precision = average_precision_score(y_true, ranking_input)
    accuracy = accuracy_score(y_true, y_pred)

    return auc, f1, precision, recall, avg_precision, accuracy


def optimize_params(train, y_train, dvalid, y_val, n_trials=3):
    """
    Optimize hyperparameters for an XGBoost classifier using Optuna.

    Every trial is recorded as its own MLflow run (parameters and validation F1).

    Parameters:
    - train: Training data passed to xgb.train as the training set
      (the notebook passes an xgb.DMatrix).
    - y_train: Training labels. Not used by this function; kept so existing
      calls keep working.
    - dvalid: Validation data used for early stopping and for prediction
      (the notebook passes an xgb.DMatrix).
    - y_val: Validation labels used to compute the F1 score.
    - n_trials (int): Number of trials for hyperparameter optimization.

    Returns:
    - optuna.study.Study: Study object with optimization results.
    """

    def objective(trial):
        """
        Objective function for Optuna optimization.

        Parameters:
        - trial: Optuna trial object.

        Returns:
        - float: Validation F1 score for the sampled hyperparameters.
        """

        with mlflow.start_run():
            mlflow.set_tag("model", "xgboost")

            constant_params = {
                "objective": "binary:logistic",
                "eval_metric": "logloss"}

            hyper_params = {
                "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
                "max_depth": trial.suggest_int("max_depth", 1, 10),
                "subsample": trial.suggest_float("subsample", 0.05, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 20)
            }

            combined_params = {**constant_params, **hyper_params}

            mlflow.log_params(combined_params)

            # Train on the `train` argument. The previous code read a global
            # named `dtrain`, so the function only worked when the notebook
            # happened to define that variable.
            classifier = xgb.train(
                combined_params,
                train,
                evals=[(dvalid, "validation")],
                verbose_eval=1,
                early_stopping_rounds=10,
                num_boost_round=1000
            )

            # output are the probabilities, we need to convert to the binary classes
            y_pred_proba = classifier.predict(dvalid)
            y_pred = (y_pred_proba > 0.5).astype(int)

            f1_metric = f1_score(y_true=y_val, y_pred=y_pred)

            mlflow.log_metric('f1', f1_metric)

        return f1_metric

    # To execute the optimization, we create a study object and pass
    # the objective function to the optimize method.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    return study




def train_optimized_model(study, dtrain, y_train, dvalid, y_val):
    """
    Train an XGBoost classifier using the best parameters from a given study
    and log parameters, validation F1 and the model to MLflow.

    Parameters:
    - study: Optuna study object with optimization results (its best_params are used).
    - dtrain: Training data passed to xgb.train (the notebook passes an xgb.DMatrix).
    - y_train: Training labels. Not used by this function; kept so existing
      calls keep working.
    - dvalid: Validation data used for early stopping and for prediction.
    - y_val: Validation labels used to compute the F1 score.

    Returns:
    - None. Results are printed and logged to the active MLflow experiment.
    """

    with mlflow.start_run():

        # Same fixed settings as in the hyperparameter search, so early
        # stopping monitors the same metric in both places.
        constant_params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss"}

        # Combine the tuned hyperparameters with the constant ones
        combined_params = {**study.best_params, **constant_params}
        mlflow.log_params(combined_params)

        classifier = xgb.train(
            combined_params,
            dtrain,
            evals=[(dvalid, "validation")],
            verbose_eval=1,
            early_stopping_rounds=10,
            num_boost_round=1000
        )

        # Booster.predict returns probabilities; threshold them at 0.5
        y_pred_proba = classifier.predict(dvalid)
        y_pred = (y_pred_proba > 0.5).astype(int)

        f1_metric = f1_score(y_val, y_pred)
        print(f'F1 Score: {f1_metric}')

        mlflow.log_metric("f1", f1_metric)
        # xgb.train returns a native Booster, not a scikit-learn estimator, so
        # it has to be logged with the xgboost flavor; the sklearn flavor would
        # store an object whose predict() cannot be served through pyfunc.
        mlflow.xgboost.log_model(classifier, artifact_path="models_mlflow")

        print(f"Default artifacts URI: '{mlflow.get_artifact_uri()}'")

        

In [5]:
# Fetch the raw dataset and keep it around untouched as df_original
df_original = download_data(url=DATA_LINK)

# Clean a copy of the raw frame and separate the target column
df, y = preprocess_data(df_original.copy())

# Train / validation / test partitions
(df_train, df_val, df_test,
 y_train, y_val, y_test) = split_data(df, y)

In [6]:
# Row counts of the train, validation and test partitions
tuple(len(part) for part in (df_train, df_val, df_test))

(76902, 9613, 9613)

In [7]:
# Point the MLflow API at the tracking server and echo the URI back as a sanity check
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(mlflow.get_tracking_uri())
# Select the experiment; if it doesn't exist yet, MLflow creates it
mlflow.set_experiment(experiment_name='final-experiment')

2023/09/19 13:53:48 INFO mlflow.tracking.fluent: Experiment with name 'final-experiment' does not exist. Creating a new experiment.


http://127.0.0.1:5000


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1695131628721, experiment_id='1', last_update_time=1695131628721, lifecycle_stage='active', name='final-experiment', tags={}>

In [8]:
import pickle

In [9]:
# Feature columns grouped by dtype: object/category columns first, the rest after
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(df.select_dtypes(exclude=['object', 'category']).columns)
all_features = [*categorical_cols, *numerical_cols]

# Row-wise dict records are the input format DictVectorizer works on
train_dicts = df_train[all_features].to_dict(orient='records')
val_dicts = df_val[all_features].to_dict(orient='records')

# One-hot encode categorical features: fit on the training records only,
# then apply the fitted vectorizer to the validation records
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Persist the fitted vectorizer next to the model artifacts
os.makedirs("model", exist_ok=True)
with open("model/preprocessor.b", "wb") as f:
    pickle.dump(dv, f)

# Wrap features and labels in XGBoost's DMatrix container
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)

  if is_sparse(data):


In [10]:
# Run the Optuna hyperparameter search with 2 trials
study = optimize_params(
    train=dtrain,
    y_train=y_train,
    dvalid=dvalid,
    y_val=y_val,
    n_trials=2,
)

[I 2023-09-19 13:53:52,291] A new study created in memory with name: no-name-6231814d-1721-400b-bb0c-99ce62971be6


[0]	validation-logloss:0.32097
[1]	validation-logloss:0.32050
[2]	validation-logloss:0.32003
[3]	validation-logloss:0.31956
[4]	validation-logloss:0.31945
[5]	validation-logloss:0.31899
[6]	validation-logloss:0.31869
[7]	validation-logloss:0.31823
[8]	validation-logloss:0.31777
[9]	validation-logloss:0.31743
[10]	validation-logloss:0.31713
[11]	validation-logloss:0.31680
[12]	validation-logloss:0.31651
[13]	validation-logloss:0.31622
[14]	validation-logloss:0.31593
[15]	validation-logloss:0.31583
[16]	validation-logloss:0.31549
[17]	validation-logloss:0.31504
[18]	validation-logloss:0.31460
[19]	validation-logloss:0.31416
[20]	validation-logloss:0.31388
[21]	validation-logloss:0.31359
[22]	validation-logloss:0.31316
[23]	validation-logloss:0.31272
[24]	validation-logloss:0.31262
[25]	validation-logloss:0.31218
[26]	validation-logloss:0.31186
[27]	validation-logloss:0.31143
[28]	validation-logloss:0.31133
[29]	validation-logloss:0.31090
[30]	validation-logloss:0.31063
[31]	validation-lo

[I 2023-09-19 13:53:57,211] Trial 0 finished with value: 0.8106591865357644 and parameters: {'learning_rate': 0.0011215073243063425, 'max_depth': 2, 'subsample': 0.1418528255481938, 'colsample_bytree': 0.7738921113929147, 'min_child_weight': 18}. Best is trial 0 with value: 0.8106591865357644.


[0]	validation-logloss:0.32131
[1]	validation-logloss:0.32125
[2]	validation-logloss:0.32080
[3]	validation-logloss:0.32069
[4]	validation-logloss:0.32060
[5]	validation-logloss:0.32017
[6]	validation-logloss:0.32006
[7]	validation-logloss:0.31965
[8]	validation-logloss:0.31922
[9]	validation-logloss:0.31909
[10]	validation-logloss:0.31871
[11]	validation-logloss:0.31856
[12]	validation-logloss:0.31820
[13]	validation-logloss:0.31811
[14]	validation-logloss:0.31773
[15]	validation-logloss:0.31763
[16]	validation-logloss:0.31722
[17]	validation-logloss:0.31667
[18]	validation-logloss:0.31630
[19]	validation-logloss:0.31618
[20]	validation-logloss:0.31607
[21]	validation-logloss:0.31598
[22]	validation-logloss:0.31556
[23]	validation-logloss:0.31520
[24]	validation-logloss:0.31506
[25]	validation-logloss:0.31500
[26]	validation-logloss:0.31488
[27]	validation-logloss:0.31446
[28]	validation-logloss:0.31433
[29]	validation-logloss:0.31421
[30]	validation-logloss:0.31411
[31]	validation-lo

[I 2023-09-19 13:54:04,186] Trial 1 finished with value: 0.36293436293436293 and parameters: {'learning_rate': 0.0012934440819739415, 'max_depth': 6, 'subsample': 0.6560258060443955, 'colsample_bytree': 0.3961406840266234, 'min_child_weight': 5}. Best is trial 0 with value: 0.8106591865357644.


In [11]:
# Hyperparameters of the best trial recorded in the study
study.best_params

{'learning_rate': 0.0011215073243063425,
 'max_depth': 2,
 'subsample': 0.1418528255481938,
 'colsample_bytree': 0.7738921113929147,
 'min_child_weight': 18}

In [12]:
# Retrain with the study's best hyperparameters and log the resulting run to MLflow
train_optimized_model(
    study=study,
    dtrain=dtrain,
    y_train=y_train,
    dvalid=dvalid,
    y_val=y_val,
)

[0]	validation-logloss:0.32097
[1]	validation-logloss:0.32050
[2]	validation-logloss:0.32003
[3]	validation-logloss:0.31956
[4]	validation-logloss:0.31945
[5]	validation-logloss:0.31899
[6]	validation-logloss:0.31869
[7]	validation-logloss:0.31823
[8]	validation-logloss:0.31777
[9]	validation-logloss:0.31743
[10]	validation-logloss:0.31713
[11]	validation-logloss:0.31680
[12]	validation-logloss:0.31651
[13]	validation-logloss:0.31622
[14]	validation-logloss:0.31593
[15]	validation-logloss:0.31583
[16]	validation-logloss:0.31549
[17]	validation-logloss:0.31504
[18]	validation-logloss:0.31460
[19]	validation-logloss:0.31416
[20]	validation-logloss:0.31388
[21]	validation-logloss:0.31359
[22]	validation-logloss:0.31316
[23]	validation-logloss:0.31272
[24]	validation-logloss:0.31262
[25]	validation-logloss:0.31218
[26]	validation-logloss:0.31186
[27]	validation-logloss:0.31143
[28]	validation-logloss:0.31133
[29]	validation-logloss:0.31090
[30]	validation-logloss:0.31063
[31]	validation-lo

In [14]:
# Resolve the model URI from the most recent MLflow run of this session rather
# than a hard-coded run id: a literal id only exists on the tracking server
# that created it, so the cell would fail on any fresh re-run.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError("No MLflow run found in this session; run the training cell first.")
logged_model = f"runs:/{last_run.info.run_id}/models_mlflow"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]