In [1]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Rich console with named styles ("info", "warning", "error") for coloured messages.
custom_theme = Theme(
    dict(
        info="#76FF7B",
        warning="#FBDDFE",
        error="#FF0000",
    )
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: raised so that wide/long frames are not truncated.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Silence all warnings for the rest of the session.
warnings.simplefilter("ignore")

# Seed NumPy's global random generator for reproducibility.
np.random.seed(seed=0)

# Black code formatter: loads the `lab_black` IPython extension.
# NOTE(review): labelled optional, but this line presumably fails when
# `lab_black` is not installed — confirm, or guard it.
%load_ext lab_black

# Auto-reload imported modules before running code, so edits to local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from utils import get_data_summary, get_value_counts, Preparedata

### Load Data

In [3]:
# Path of the raw Titanic CSV (relative to the current working directory).
fp: str = "../data/titanic_data.csv"

# Read the file eagerly into a Polars frame and preview the first rows.
data_original: pl.DataFrame = pl.read_csv(fp)
data_original.head()

pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
i64,i64,str,str,f64,i64,i64,str,f64,str,str,str,i64,str
1,1,"""Allen, Miss. E…","""female""",29.0,0,0,"""24160""",211.3375,"""B5""","""S""","""2""",,"""St Louis, MO"""
1,1,"""Allison, Maste…","""male""",0.9167,1,2,"""113781""",151.55,"""C22""","""S""","""11""",,"""Montreal, PQ /…"
1,0,"""Allison, Miss.…","""female""",2.0,1,2,"""113781""",151.55,"""C22""","""S""",,,"""Montreal, PQ /…"
1,0,"""Allison, Mr. H…","""male""",30.0,1,2,"""113781""",151.55,"""C22""","""S""",,135.0,"""Montreal, PQ /…"
1,0,"""Allison, Mrs. …","""female""",25.0,1,2,"""113781""",151.55,"""C22""","""S""",,,"""Montreal, PQ /…"


In [4]:
# Config
TARGET: str = "survived"  # label column: used for stratification and selected as y
TEST_SIZE: float = 0.2  # forwarded as `test_size` to `train_test_split`
RANDOM_STATE: int = 123  # forwarded as `random_state` to `train_test_split`

In [5]:
# Summary statistics for every column of the raw data (helper from the local `utils` module).
get_data_summary(data=data_original, features=data_original.columns)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
count,1309.0,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1308.0,295,1307,486.0,121.0,745
unique,,,1307,2,,,,929,,181,3,27.0,,369
top,,,"Connolly, Miss. Kate",male,,,,CA. 2343,,F,S,13.0,,"New York, NY"
freq,,,2,843,,,,11,,8,914,39.0,,64
mean,2.294882,0.381971,,,29.881135,0.498854,0.385027,,33.295479,,,,160.809917,
std,0.837836,0.486055,,,14.4135,1.041658,0.86556,,51.758668,,,,97.696922,
min,1.0,0.0,,,0.1667,0.0,0.0,,0.0,,,,1.0,
25%,2.0,0.0,,,21.0,0.0,0.0,,7.8958,,,,72.0,
50%,3.0,0.0,,,28.0,0.0,0.0,,14.4542,,,,155.0,
75%,3.0,1.0,,,39.0,1.0,0.0,,31.275,,,,256.0,


In [6]:
# Drop columns with too many NaNs or unnecessary columns
cols_to_drop: list[str] = ["cabin", "boat", "body", "home.dest"]
# The names are passed positionally: the `columns=` keyword is not accepted by
# newer Polars releases, while a positional list works on old and new ones.
data: pl.DataFrame = data_original.drop(cols_to_drop)

# Re-run the summary on the reduced frame.
get_data_summary(data=data, features=data.columns)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
count,1309.0,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1308.0,1307
unique,,,1307,2,,,,929,,3
top,,,"Connolly, Miss. Kate",male,,,,CA. 2343,,S
freq,,,2,843,,,,11,,914
mean,2.294882,0.381971,,,29.881135,0.498854,0.385027,,33.295479,
std,0.837836,0.486055,,,14.4135,1.041658,0.86556,,51.758668,
min,1.0,0.0,,,0.1667,0.0,0.0,,0.0,
25%,2.0,0.0,,,21.0,0.0,0.0,,7.8958,
50%,3.0,0.0,,,28.0,0.0,0.0,,14.4542,
75%,3.0,1.0,,,39.0,1.0,0.0,,31.275,


In [7]:
# Preview the first 10 rows of the reduced frame.
data.head(10)

pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
i64,i64,str,str,f64,i64,i64,str,f64,str
1,1,"""Allen, Miss. E…","""female""",29.0,0,0,"""24160""",211.3375,"""S"""
1,1,"""Allison, Maste…","""male""",0.9167,1,2,"""113781""",151.55,"""S"""
1,0,"""Allison, Miss.…","""female""",2.0,1,2,"""113781""",151.55,"""S"""
1,0,"""Allison, Mr. H…","""male""",30.0,1,2,"""113781""",151.55,"""S"""
1,0,"""Allison, Mrs. …","""female""",25.0,1,2,"""113781""",151.55,"""S"""
1,1,"""Anderson, Mr. …","""male""",48.0,0,0,"""19952""",26.55,"""S"""
1,1,"""Andrews, Miss.…","""female""",63.0,1,0,"""13502""",77.9583,"""S"""
1,0,"""Andrews, Mr. T…","""male""",39.0,0,0,"""112050""",0.0,"""S"""
1,1,"""Appleton, Mrs.…","""female""",53.0,2,0,"""11769""",51.4792,"""S"""
1,0,"""Artagaveytia, …","""male""",71.0,0,0,"""PC 17609""",49.5042,"""C"""


In [8]:
# Add a row counter column named "id", starting at 0, in front of the existing
# columns. The result is a new frame; `data` is not modified.
data_with_index = data.with_row_index("id", offset=0)
data_with_index

id,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
u32,i64,i64,str,str,f64,i64,i64,str,f64,str
0,1,1,"""Allen, Miss. E…","""female""",29.0,0,0,"""24160""",211.3375,"""S"""
1,1,1,"""Allison, Maste…","""male""",0.9167,1,2,"""113781""",151.55,"""S"""
2,1,0,"""Allison, Miss.…","""female""",2.0,1,2,"""113781""",151.55,"""S"""
3,1,0,"""Allison, Mr. H…","""male""",30.0,1,2,"""113781""",151.55,"""S"""
4,1,0,"""Allison, Mrs. …","""female""",25.0,1,2,"""113781""",151.55,"""S"""
…,…,…,…,…,…,…,…,…,…,…
1304,3,0,"""Zabour, Miss. …","""female""",14.5,1,0,"""2665""",14.4542,"""C"""
1305,3,0,"""Zabour, Miss. …","""female""",,1,0,"""2665""",14.4542,"""C"""
1306,3,0,"""Zakarian, Mr. …","""male""",26.5,0,0,"""2656""",7.225,"""C"""
1307,3,0,"""Zakarian, Mr. …","""male""",27.0,0,0,"""2670""",7.225,"""C"""


In [9]:
# Integer-encode string columns via Categorical physical codes (similar to pd.factorize).
# Sex: {"female": 0, "male": 1}, Embarked: {"S": 0, "C": 1, "Q": 2},
# The encoded frame is only displayed here; it is not assigned to a variable.
sex_codes = pl.col("sex").cast(pl.Categorical).to_physical()
embarked_codes = pl.col("embarked").cast(pl.Categorical).to_physical()
# Flag tickets that start with a non-digit character, then encode that flag.
ticket_codes = (
    pl.col("ticket").str.contains(r"^\D").cast(pl.Categorical).to_physical()
)

data.lazy().with_columns(
    sex_int=sex_codes,
    embarked_int=embarked_codes,
    ticket_int=ticket_codes,
).collect()

pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,sex_int,embarked_int,ticket_int
i64,i64,str,str,f64,i64,i64,str,f64,str,u32,u32,u32
1,1,"""Allen, Miss. E…","""female""",29.0,0,0,"""24160""",211.3375,"""S""",0,0,0
1,1,"""Allison, Maste…","""male""",0.9167,1,2,"""113781""",151.55,"""S""",1,0,0
1,0,"""Allison, Miss.…","""female""",2.0,1,2,"""113781""",151.55,"""S""",0,0,0
1,0,"""Allison, Mr. H…","""male""",30.0,1,2,"""113781""",151.55,"""S""",1,0,0
1,0,"""Allison, Mrs. …","""female""",25.0,1,2,"""113781""",151.55,"""S""",0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…
3,0,"""Zabour, Miss. …","""female""",14.5,1,0,"""2665""",14.4542,"""C""",0,1,0
3,0,"""Zabour, Miss. …","""female""",,1,0,"""2665""",14.4542,"""C""",0,1,0
3,0,"""Zakarian, Mr. …","""male""",26.5,0,0,"""2656""",7.225,"""C""",1,1,0
3,0,"""Zakarian, Mr. …","""male""",27.0,0,0,"""2670""",7.225,"""C""",1,1,0


In [10]:
# Split the data
# The raw frame (all 14 columns, target included) is split, so `X_train` and
# `X_test` still contain the `survived` column.
from sklearn.model_selection import train_test_split

X_train: pl.DataFrame
X_test: pl.DataFrame

# Stratify on the target so both parts keep a similar class balance.
X_train, X_test = train_test_split(
    data_original,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=data_original.select(TARGET),
)

# Show (rows, columns) of each part.
X_train.shape, X_test.shape

((1047, 14), (262, 14))

In [11]:
# Preview the training split (rows are shuffled by the split).
X_train.head()

pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
i64,i64,str,str,f64,i64,i64,str,f64,str,str,str,i64,str
2,1,"""Hart, Miss. Ev…","""female""",7.0,0,2,"""F.C.C. 13529""",26.25,,"""S""","""14""",,"""Ilford, Essex …"
2,0,"""Peruschitz, Re…","""male""",41.0,0,0,"""237393""",13.0,,"""S""",,,
1,1,"""Lurette, Miss.…","""female""",58.0,0,0,"""PC 17569""",146.5208,"""B80""","""C""",,,
3,0,"""Shellard, Mr. …","""male""",,0,0,"""C.A. 6212""",15.1,,"""S""",,,
1,1,"""Carter, Mrs. W…","""female""",36.0,1,2,"""113760""",120.0,"""B96""","""S""","""4""",,"""Bryn Mawr, PA"""


In [12]:
# Build the label frames in name-sorted order.
# NOTE(review): this presumably mirrors a name sort applied to the features by
# `Preparedata` — confirm against `utils`. Passenger names are NOT unique in
# this data (the summaries report fewer unique names than rows), so tied rows
# keep their original relative order (`maintain_order=True`); without it the
# order of labels within a tie is not guaranteed.
# `drop` receives the column positionally because the `columns=` keyword is
# not accepted by newer Polars releases.
y_train: pd.DataFrame = (
    X_train.select(["name", TARGET])
    .sort("name", maintain_order=True)
    .drop("name")
    .to_pandas()
)
y_test: pd.DataFrame = (
    X_test.select(["name", TARGET])
    .sort("name", maintain_order=True)
    .drop("name")
    .to_pandas()
)

y_train.head()

Unnamed: 0,survived
0,0
1,0
2,0
3,1
4,1


In [13]:
# Report the class balance of the target in each split.
for split_label, split_target in (("Train", y_train), ("Test", y_test)):
    console.print(
        f"{split_label}: {get_value_counts(data=split_target, feature=TARGET)}"
    )

In [14]:
# Feature groups used by the preprocessing pipeline (scaling / imputation / encoding).
# NOTE(review): "ticket" is a string column in the raw data; it is listed as
# numeric presumably because `Preparedata` replaces it with a 0/1 flag —
# confirm in `utils`.
num_vars: list[str] = ["pclass", "age", "sibsp", "parch", "ticket", "fare"]
cat_vars: list[str] = ["sex", "embarked"]

# Summary statistics of the training split only.
get_data_summary(data=X_train, features=X_train.columns)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
count,1047.0,1047.0,1047,1047,834.0,1047.0,1047.0,1047,1046.0,233,1045,384.0,98.0,592
unique,,,1046,2,,,,780,,158,3,25.0,,320
top,,,"Kelly, Mr. James",male,,,,CA. 2343,,F,S,13.0,,"New York, NY"
freq,,,2,681,,,,10,,7,736,31.0,,54
mean,2.292264,0.382044,,,29.857414,0.504298,0.382044,,33.05311,,,,156.744898,
std,0.839152,0.486119,,,14.498647,1.077197,0.884962,,49.204992,,,,102.279515,
min,1.0,0.0,,,0.1667,0.0,0.0,,0.0,,,,1.0,
25%,2.0,0.0,,,21.0,0.0,0.0,,7.8958,,,,61.25,
50%,3.0,0.0,,,28.0,0.0,0.0,,14.4542,,,,150.5,
75%,3.0,1.0,,,39.0,1.0,0.0,,31.359375,,,,259.75,


In [15]:
# Try the custom preparation step on its own; the transformed frame is only
# displayed, not stored.
prep_columns: list[str] = [*cat_vars, "ticket"]
processor = Preparedata(variables=prep_columns)
processor.fit_transform(X=X_train)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1020,3,0,"Abbing, Mr. Anthony",male,42.0,0,0,1,7.5500,,s,,,
120,3,0,"Abbott, Master. Eugene Joseph",male,13.0,0,2,1,20.2500,,s,,,"East Providence, RI"
543,3,0,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,1,20.2500,,s,,190.0,"East Providence, RI"
98,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,1,20.2500,,s,A,,"East Providence, RI"
318,3,1,"Abelseth, Miss. Karen Marie",female,16.0,0,0,0,7.6500,,s,16,,"Norway Los Angeles, CA"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,3,0,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,0,9.5000,,s,,,
923,2,0,"del Carlo, Mr. Sebastiano",male,29.0,1,0,1,27.7208,,c,,295.0,"Lucca, Italy / California"
495,3,0,"van Billiard, Master. James William",male,,1,1,1,14.5000,,s,,,
43,3,0,"van Billiard, Master. Walter John",male,11.5,1,1,1,14.5000,,s,,1.0,


In [16]:
from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# Ask scikit-learn transformers to return pandas DataFrames (global setting).
set_config(transform_output="pandas")

# Final encoding step: min-max scale the numeric columns and one-hot encode the
# categorical ones. Columns not listed (e.g. `survived`, `name`) are dropped
# because of `remainder="drop"`.
col_transf: ColumnTransformer = ColumnTransformer(
    transformers=[
        ("num_vars", MinMaxScaler(clip=True), num_vars),
        (
            "cat_vars",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            cat_vars,
        ),
    ],
    remainder="drop",
)
# Full preprocessing pipeline; steps run in the order listed:
#   1. drop the unused columns,
#   2. apply the custom `Preparedata` step to sex / embarked / ticket,
#   3. impute the numeric columns,
#   4. impute the categorical columns with the most frequent value,
#   5. scale / one-hot encode via `col_transf`.
# NOTE(review): `MeanMedianImputer` gets no `imputation_method`, so the library
# default applies — presumably the median, matching the step name; confirm.
preprocessor: Pipeline = Pipeline(
    steps=[
        ("drop_features", DropFeatures(features_to_drop=cols_to_drop)),
        ("preprocess", Preparedata(variables=["sex", "embarked", "ticket"])),
        ("median_imputer", MeanMedianImputer(variables=num_vars)),
        (
            "cat_imputer",
            CategoricalImputer(
                imputation_method="frequent", variables=cat_vars, ignore_format=True
            ),
        ),
        ("col_transf", col_transf),
    ]
)
preprocessor

In [17]:
# Fit the pipeline on the training split only, then apply the fitted pipeline
# to the test split. Both Polars frames are converted to pandas first.
X_train_tr: pd.DataFrame = preprocessor.fit_transform(X=X_train.to_pandas())
X_test_tr: pd.DataFrame = preprocessor.transform(X=X_test.to_pandas())

X_train_tr.head()

Unnamed: 0,num_vars__pclass,num_vars__age,num_vars__sibsp,num_vars__parch,num_vars__ticket,num_vars__fare,cat_vars__sex_female,cat_vars__sex_male,cat_vars__embarked_c,cat_vars__embarked_q,cat_vars__embarked_s
1020,1.0,0.524008,0.0,0.0,1.0,0.014737,0.0,1.0,0.0,0.0,1.0
120,1.0,0.160751,0.0,0.222222,1.0,0.039525,0.0,1.0,0.0,0.0,1.0
543,1.0,0.19833,0.125,0.111111,1.0,0.039525,0.0,1.0,0.0,0.0,1.0
98,1.0,0.436325,0.125,0.111111,1.0,0.039525,1.0,0.0,0.0,0.0,1.0
318,1.0,0.19833,0.0,0.0,0.0,0.014932,1.0,0.0,0.0,0.0,1.0


In [18]:
# Sanity check: features and labels must have the same number of rows.
# This only compares row counts; it does not verify row-by-row alignment.
assert len(X_train_tr) == len(y_train), "The size is NOT equal!"
assert len(X_test_tr) == len(y_test), "The size is NOT equal!"

In [19]:
from mlflow_example import Experiment, Estimator, TrainingData, run_experiment
import mlflow
from sklearn.linear_model import LogisticRegression

In [25]:
import os
from google.cloud import storage


# Point the Google Cloud client libraries at the service-account key file.
# `setdefault` keeps a value that is already present in the environment (e.g.
# injected by Docker or CI) and only falls back to the repository-relative
# default path when the variable is unset.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "../service_account/ml-project-419115-8de0a00b8e5f.json",
)

In [22]:
# Bundle the transformed features and the label series for the experiment runner.
training_data: TrainingData = TrainingData(
    X_train=X_train_tr,
    X_validate=X_test_tr,
    y_train=y_train[TARGET],
    y_validate=y_test[TARGET],
)

# Experiment metadata and the MLflow tracking server to log to.
experiment: Experiment = Experiment(
    experiment_name="test experiment",
    experiment_type="classification",
    run_name="test",
    model_name="log-model2",
    # tracking_uri="http://127.0.0.1:5252", # This doesn't work on docker (locally).
    tracking_uri="http://localhost:5251",
)

# Reuse the notebook-wide seed from the config cell instead of repeating the
# literal 123, so the split and the model stay in sync when the seed changes.
log_model: LogisticRegression = LogisticRegression(random_state=RANDOM_STATE)
estimator: Estimator = Estimator(
    preprocessor=preprocessor,
    model=log_model,
)

### Start The MLflow Tracking Server

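- Using `localhost` and `SQLite`. Note that the command below serves on port 5252, while the notebook's `tracking_uri` points to port 5251; make sure the port you start the server on matches the one the notebook uses.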

```sh
mlflow server -h "127.0.0.1" -p 5252 --backend-store-uri sqlite:///mlruns.db
```

In [26]:
# Run the experiment. A failure is reported instead of aborting the notebook,
# but the full traceback is rendered as well so the cause is not hidden behind
# a one-line message.
try:
    run_experiment(
        experiment=experiment,
        estimator=estimator,
        training_data=training_data,
    )
except Exception as e:
    print(f"Error running experiment: {e}")
    # Render the traceback of the exception being handled via the rich console.
    console.print_exception()

INFO:richLogger: Training 'log-model2' 
5it [01:02, 12.49s/it]


INFO:richLogger:Mean AUC [Validation]: 0.8438


INFO:richLogger:Training finished successfully
Successfully registered model 'preprocessor-model'.
2024/04/07 01:11:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: preprocessor-model, version 1
Created version '1' of model 'preprocessor-model'.
Successfully registered model 'log-model2'.
2024/04/07 01:12:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: log-model2, version 1
Created version '1' of model 'log-model2'.


INFO:richLogger: Training 'log-model2' Done! 


In [24]:
from pathlib import Path
from random import random

import mlflow


def main():
    """
    Run a small MLflow smoke test against the tracking server.

    Selects the "playground" experiment, starts a run, logs one parameter and
    three values of the metric "foo", then writes a temporary text file and
    logs it as an artifact. The temporary file is always removed, even when
    logging the artifact fails.
    """
    mlflow.set_tracking_uri("http://localhost:5251")
    experiment_name = "playground"

    # `set_experiment` creates the experiment when it does not exist yet, so a
    # separate `create_experiment` call with a swallowed exception is not needed.
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run():
        mlflow.log_param("test", 13)

        mlflow.log_metric("foo", random())
        mlflow.log_metric("foo", random() + 1)
        mlflow.log_metric("foo", random() + 2)

        tmp_txt_path = Path("tmp.txt")
        try:
            tmp_txt_path.write_text("Everything is working!")
            mlflow.log_artifact(str(tmp_txt_path))
        finally:
            # Clean up even when the upload raises, so no stray file is left
            # in the working directory.
            tmp_txt_path.unlink(missing_ok=True)


if __name__ == "__main__":
    main()



InvalidOperation: Anonymous credentials cannot be refreshed.

In [None]:
from typing import Literal
from mlflow import MlflowClient

# Client for the MLflow model registry (created with default settings).
client = MlflowClient()

validation_tag: Literal["pending", "passed"] = "passed"
model_version: int = 1

# Add tag
for tag_key, tag_value in (
    ("model.validation_status", validation_tag),
    ("model.data_scientist", "Neidu"),
):
    client.set_registered_model_tag(experiment.model_name, tag_key, tag_value)