In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from catboost import CatBoostClassifier
import joblib
from clearml import Dataset, Task

from mlops_ods.config import compose_config

In [10]:
# Build the project configuration; later cells read the feature column lists
# from cfg.features and the CatBoost hyperparameters from cfg.model.
cfg = compose_config()

In [None]:
# Register this run as the "catboost" task in the "Mlops-test" ClearML project.
# NOTE(review): output_uri=True presumably routes output models/artifacts to
# ClearML's default storage -- confirm against the Task.init documentation.
task = Task.init(
    project_name="Mlops-test", task_name="catboost", output_uri=True
)

# Fetch a local copy of the "Raw data" dataset registered in the same project.
frame_path = Dataset.get(
    dataset_name="Raw data", dataset_project="Mlops-test"
).get_local_copy()
task.set_progress(0)

# Build the CSV path with os.path.join instead of manual "/" concatenation so
# the separator is always correct for the platform and for whatever form
# (with or without trailing separator) get_local_copy() returns.
data = pd.read_csv(
    os.path.join(frame_path, "2015-street-tree-census-tree-data.csv")
)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [None]:
# Feature column lists come from the project config; the model is trained on
# the numerical columns followed by the categorical ones.
num_cols = cfg.features.numerical
cat_cols = cfg.features.categorical
total_cols = num_cols + cat_cols


def _prepare_split(frame, feature_cols):
    """Clean one data split and extract its model features.

    Rows without a "health" label are removed, then ``drop_columns`` and
    ``preprocess_data`` are applied to the remaining rows.

    Args:
        frame: One split (train or test) of the raw census DataFrame.
        feature_cols: Columns to select as model inputs.

    Returns:
        A ``(labelled, features)`` tuple: the cleaned split (still containing
        the "health" target) and its ``feature_cols`` subset.
    """
    # Take an explicit copy of the filtered rows: the helpers below are called
    # for their side effects, and mutating a filtered selection directly is
    # what triggers pandas' SettingWithCopyWarning.
    labelled = frame.loc[frame["health"].notna()].copy()
    # NOTE(review): drop_columns and preprocess_data are neither imported nor
    # defined in the visible cells -- confirm they are in scope on a fresh
    # kernel. Their return values are discarded, so they are assumed to modify
    # the frame in place -- TODO confirm.
    drop_columns(labelled)
    preprocess_data(labelled)
    return labelled, labelled[feature_cols]


train, train_features = _prepare_split(train, total_cols)
test, test_features = _prepare_split(test, total_cols)

In [None]:
# Attach both prepared feature frames to the ClearML task as named artifacts.
for artifact_name, artifact_frame in (
    ("train_features_cb", train_features),
    ("test_features_cb", test_features),
):
    task.upload_artifact(name=artifact_name, artifact_object=artifact_frame)

In [None]:
# Hyperparameters are read from the project config; cat_cols (set in the
# feature-preparation cell) is passed as CatBoost's cat_features.
model_params = {
    "iterations": cfg.model.iterations,
    "verbose": cfg.model.verbose,
    "random_seed": cfg.model.random_seed,
    "cat_features": cat_cols,
}
# Register the hyperparameters with the ClearML task.
task.connect(model_params)

# Train on the prepared training features with "health" as the target.
model = CatBoostClassifier(**model_params)
model.fit(train_features, train["health"])

# joblib.dump does not create missing parent directories, so make sure the
# output directory exists; otherwise a fresh checkout fails here after the
# (expensive) training step has already completed.
model_path = "models/model_cb.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path, compress=True)

# Evaluate on the held-out split.
predicts = model.predict(test_features)
report = classification_report(test["health"], predicts, output_dict=True)
confusion = confusion_matrix(test["health"], predicts)

logger = task.get_logger()
# "accuracy" is popped first because it is reported on its own; the entries
# left in the report are iterated below as name -> {metric: value} mappings.
logger.report_single_value("accuracy", report.pop("accuracy"))
for class_name, metrics in report.items():
    for metric, value in metrics.items():
        logger.report_single_value(f"{class_name}_{metric}", value)
# Plot title corrected from the misspelled "conflusion matrix".
logger.report_confusion_matrix("confusion matrix", "ignored", matrix=confusion)

In [None]:
# Close the ClearML task that was opened with Task.init at the top of the
# notebook, now that training, artifact upload and metric reporting are done.
task.close()