install this for lightgbm

```bash
sudo apt-get update
sudo apt-get install libgomp1
```

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
def load_data():
    # load the data
    file_url = 'https://archive.ics.uci.edu/static/public/222/data.csv'
    df = pd.read_csv(file_url)
    return df

In [3]:
def prepare_data(df):
    ## data preparation
    # convert target var to numerical
    df.y = df.y.map({'yes':1,'no':0})

    # fill na
    df.fillna('unknown', inplace=True)

    # drop duration
    df.drop('duration', axis=1, inplace=True)

    # split the data into train/val/test with 80%/20%
    X_train, X_test = train_test_split(df, test_size=np.round(len(df)*.2).astype(int), random_state=42)
    
    y_train = X_train.y.values
    y_test = X_test.y.values

    del X_train['y']
    del X_test['y']

    return X_train, X_test, y_train, y_test

In [4]:
def transform_data(df=pd.DataFrame()):
    num = ['age', 'balance', 'day_of_week', 'campaign', 'pdays', 'previous']
    cat = ['job', 'housing', 'contact', 'month', 'poutcome']
    features = num + cat
    df_transformed =  df[features].copy()

    # mapping pdays based on conditions
    df_transformed['pdays'] = df_transformed['pdays'].apply(lambda x: 
                                                            'never' if x == -1 
                                                            else ('plus 12 months' if x > 365 
                                                                  else ('plus 6 months' if 180 <= x <= 365 
                                                                        else '6 months'
                                                                       )
                                                                 )
                                                           )
    
    # mapping previous based on conditions
    df_transformed['previous'] = df_transformed['previous'].apply(lambda x: 
                                                                  'never' if x == 0 
                                                                  else ('more than 5' if x > 5 
                                                                        else 'less than 5'
                                                                       )
                                                                 )
    
    # mapping campaing based on conditions
    df_transformed['campaign'] = df_transformed['campaign'].apply(lambda x: 
                                                                  'once' if x == 1 
                                                                  else 'more than once'
                                                                 )

    # Consolidate categories of categorical features with many categories
    seasons = {
        'fall': ['sep','oct','nov'],
        'winter': ['dec','jan','feb'],
        'spring': ['mar','apr','may'],
        'summer': ['jun','jul','aug']
    }

    df_transformed['season'] = [season[0] for mon in df_transformed['month'] for season in list(seasons.items()) if mon in season[1]]
    
    job_category = {
        'cat_1': ['blue-collar','entrepreneur','housemaid'],
        'cat_2': ['retired','student','unemployed'],
        'cat_3': ['technician', 'admin.', 'management', 'services','unknown', 'self-employed']
    }

    df_transformed['job_category'] = [category[0] for job in df_transformed['job'] for category in list(job_category.items()) if job in category[1]]
    df_transformed = df_transformed.drop(['month','job'], axis=1).copy()
    df_transformed['contact'] = ['no' if contact == 'unknown' else 'yes' for contact in df_transformed['contact']]

    df_transformed['poutcome'] = [outcome if outcome in ['success', 'failure'] else 'other' for outcome in df_transformed['poutcome']]

    return df_transformed.to_dict(orient='records')

In [5]:
def model_tracking(MLFLOW_TRACKING_URI, experiment_name, models_list, data):
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(experiment_name)
    
    model_classes = models_list
    
    mlflow.sklearn.autolog()
    mlflow.xgboost.autolog()
    mlflow.lightgbm.autolog()

    X_train, y_train, X_val, y_val = data
    
    for model_class in model_classes:
        with mlflow.start_run():
            
            mlmodel = model_class()
            mlmodel.fit(X_train, y_train)
        
            y_pred = mlmodel.predict(X_val)
            accuracy = accuracy_score(y_val, y_pred)
            mlflow.log_metric("accuracy", accuracy)
            print(f"{model_class} achieved an accuracy of {accuracy}")

In [6]:
# search runs
def search_experiments(experiment_name):
    experiment = client.get_experiment_by_name(experiment_name)

    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        filter_string="metrics.accuracy > 0.885",
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=5,
        order_by=["metrics.accuracy ASC"]
    )


    for run in runs:
        print(f"run id: {run.info.run_id}, accracy: {run.data.metrics['accuracy']:.4f}")

In [7]:
# resgistering a Model Registry
def register_model(run_id, model_name):
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri=model_uri, name=model_name)

In [8]:
def transition_stage(version, stage, model_name):
    # stage
    model_version = version
    new_stage = stage
    client.transition_model_version_stage(
        name=model_name,
        version=model_version,
        stage=new_stage,
        archive_existing_versions=False
    )

    # updating model version
    from datetime import datetime

    date = datetime.today().date()
    client.update_model_version(
        name=model_name,
        version=model_version,
        description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
    )

In [9]:
# compare models
def test_model(name, stage_or_version, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage_or_version}")
    y_pred = model.predict(X_test)
    return {"accuracy": accuracy_score(y_test, y_pred)}

main

In [10]:
df = load_data()

In [11]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


In [12]:
df.y.value_counts(normalize=True)*100

y
no     88.30152
yes    11.69848
Name: proportion, dtype: float64

In [13]:
# prepare data
X_train, X_val, y_train, y_val = prepare_data(df)

In [14]:
pd.Series(y_train).value_counts(normalize=True)*100

0    88.393376
1    11.606624
Name: proportion, dtype: float64

In [15]:
pd.Series(y_val).value_counts(normalize=True)*100

0    87.934085
1    12.065915
Name: proportion, dtype: float64

In [16]:
# transform data 
data_transformer = transform_data
X_train, X_val = data_transformer(X_train), data_transformer(X_val)

In [17]:
dv = DictVectorizer()
dv.fit(X_train)

X_train = dv.transform(X_train)
X_val = dv.transform(X_val)

model tracking

start mlflow

```bash
mlflow ui --backend-store-uri sqlite:///mlflow.db
```

In [11]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
experiment_name = "bank-marketing-5"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name)

model_classes = [
    RandomForestClassifier,
    XGBClassifier,
    lgb.LGBMClassifier,
    LogisticRegression,
    SVC,
    MLPClassifier
]

data = (X_train, y_train, X_val, y_val)

model_tracking(MLFLOW_TRACKING_URI, experiment_name, model_classes, data)



<class 'sklearn.ensemble._forest.RandomForestClassifier'> achieved an accuracy of 0.8837646538376466




<class 'xgboost.sklearn.XGBClassifier'> achieved an accuracy of 0.8864189338641894
[LightGBM] [Info] Number of positive: 4198, number of negative: 31971
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116066 -> initscore=-2.030221
[LightGBM] [Info] Start training from score -2.030221




<class 'lightgbm.sklearn.LGBMClassifier'> achieved an accuracy of 0.8909533289095333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<class 'sklearn.linear_model._logistic.LogisticRegression'> achieved an accuracy of 0.8853129838531298




<class 'sklearn.svm._classes.SVC'> achieved an accuracy of 0.8793408537934085
<class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'> achieved an accuracy of 0.7986065029860651


model registry

In [12]:
# initiate mlflow client
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

client.search_experiments(view_type=ViewType.ALL)

[<Experiment: artifact_location='/home/minasonbol/mlops-zoomcamp/capstone/mlruns/6', creation_time=1720229445519, experiment_id='6', last_update_time=1720229445519, lifecycle_stage='active', name='bank-marketing-5', tags={}>,
 <Experiment: artifact_location='/home/minasonbol/mlops-zoomcamp/capstone/mlruns/5', creation_time=1720228890385, experiment_id='5', last_update_time=1720228890385, lifecycle_stage='active', name='bank-marketing-4', tags={}>,
 <Experiment: artifact_location='/home/minasonbol/mlops-zoomcamp/capstone/mlruns/4', creation_time=1720210388823, experiment_id='4', last_update_time=1720210388823, lifecycle_stage='active', name='bank-marketing-3', tags={}>,
 <Experiment: artifact_location='/home/minasonbol/mlops-zoomcamp/capstone/mlruns/3', creation_time=1720208940813, experiment_id='3', last_update_time=1720208940813, lifecycle_stage='active', name='bank-marketing-2', tags={}>,
 <Experiment: artifact_location='/home/minasonbol/mlops-zoomcamp/capstone/mlruns/2', creation_ti

In [14]:
# search runs
search_experiments(experiment_name)

run id: c11900639edf45afbeca3ec36b3988a2, accracy: 0.8853
run id: facd28b22bfb4d0997b95842e5b3d339, accracy: 0.8853
run id: 717aeb979f0046b9905346c14cad3bd4, accracy: 0.8864
run id: 9ce9a73d85f9471da1c467a00a9da29e, accracy: 0.8864
run id: d4444bbdecb44d9387aa8e84120854bb, accracy: 0.8910


In [17]:
model_name = 'bank_marketing'

run_id = "facd28b22bfb4d0997b95842e5b3d339"
register_model(run_id, model_name)

run_id = "717aeb979f0046b9905346c14cad3bd4"
register_model(run_id, model_name)

run_id = "9ce9a73d85f9471da1c467a00a9da29e"
register_model(run_id, model_name)

run_id = "d4444bbdecb44d9387aa8e84120854bb"
register_model(run_id, model_name)

Successfully registered model 'bank_marketing'.
Created version '1' of model 'bank_marketing'.
Registered model 'bank_marketing' already exists. Creating a new version of this model...
Created version '2' of model 'bank_marketing'.
Registered model 'bank_marketing' already exists. Creating a new version of this model...
Created version '3' of model 'bank_marketing'.
Registered model 'bank_marketing' already exists. Creating a new version of this model...
Created version '4' of model 'bank_marketing'.


In [24]:
version = 1
stage = "Staging"
transition_stage(version, stage, model_name)
%time test_model(model_name, stage, X_val, y_val)

CPU times: user 26.7 ms, sys: 0 ns, total: 26.7 ms
Wall time: 26.7 ms


  client.transition_model_version_stage(
  latest = client.get_latest_versions(name, None if stage is None else [stage])


{'accuracy': 0.8853129838531298}

In [25]:
version = 2
stage = "Staging"
transition_stage(version, stage, model_name)
%time test_model(model_name, stage, X_val, y_val)

CPU times: user 98 ms, sys: 0 ns, total: 98 ms
Wall time: 42.9 ms


  client.transition_model_version_stage(
  latest = client.get_latest_versions(name, None if stage is None else [stage])


{'accuracy': 0.8864189338641894}

In [26]:
version = 3
stage = "Staging"
transition_stage(version, stage, model_name)
%time test_model(model_name, stage, X_val, y_val)

CPU times: user 96.7 ms, sys: 3.26 ms, total: 100 ms
Wall time: 48.5 ms


  client.transition_model_version_stage(
  latest = client.get_latest_versions(name, None if stage is None else [stage])


{'accuracy': 0.8864189338641894}

In [27]:
version = 4
stage = "Staging"
transition_stage(version, stage, model_name)
%time test_model(model_name, stage, X_val, y_val)

CPU times: user 59.1 ms, sys: 3.85 ms, total: 63 ms
Wall time: 47.2 ms


  client.transition_model_version_stage(
  latest = client.get_latest_versions(name, None if stage is None else [stage])


{'accuracy': 0.8909533289095333}

In [28]:
version = 1
stage = "Production"
transition_stage(version, stage, model_name)

  client.transition_model_version_stage(


In [29]:
version = 4
stage = "Production"
transition_stage(version, stage, model_name)

  client.transition_model_version_stage(
