In [80]:
import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas import Series, DataFrame

from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV, cross_val_score, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from mlflow.client import MlflowClient

In [6]:
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://truenas.local:9000"
os.environ["MLFLOW_TRACKING_URI"] = "http://192.168.1.14:5000"

EXPERIMENT_NAME = "Weather Forecast Model Experiment"

In [8]:
mlflow.set_experiment(
    experiment_name=EXPERIMENT_NAME
)
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

## Data loading

In [21]:
train_dataset = pd.read_csv("../data/FULL_DATA_SET", index_col=0)
X, y = train_dataset.iloc[:, :train_dataset.shape[1] - 1], train_dataset.iloc[:, train_dataset.shape[1] - 1]

print(f"X shape: {X.shape}\ny shape: {y.shape}")

X shape: (2593, 19)
y shape: (2593,)


In [50]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = None, None, None, None

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
print(X_train.shape)

(2074, 19)


In [41]:
y.value_counts()

target
rain                 1218
partly-cloudy-day     709
clear-day             522
snow                   88
cloudy                 48
fog                     5
wind                    3
Name: count, dtype: int64

# Experiments with models
*Pipeline with preprocessing and ensemble*

In [54]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
])

params = {
    'scaler': [StandardScaler(), MinMaxScaler(), Normalizer()],
    'model': [RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier()]
}

classes = y.unique()
print(classes)

['snow' 'rain' 'partly-cloudy-day' 'cloudy' 'clear-day' 'wind' 'fog']


In [52]:
mlflow.sklearn.autolog()

grid_search = GridSearchCV(pipeline, param_grid=params)
grid_search.fit(X_train, y_train)

print("Лучший масштабатор:", grid_search.best_params_['scaler'])
print("Лучшая модель:", grid_search.best_params_['model'])
print("Лучшее значение R^2:", grid_search.best_score_)

2023/07/10 15:11:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '789ec4bebcd74fa0a67dd075d6348a8c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/07/10 15:13:11 INFO mlflow.sklearn.utils: Logging the 5 best runs, 4 runs will be omitted.


Лучший масштабатор: StandardScaler()
Лучшая модель: GradientBoostingClassifier()
Лучшее значение R^2: 0.9855351842151213


In [53]:
model = grid_search.best_estimator_

print(f"Model: {model}\nScore: {grid_search.best_score_}")

Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('model', GradientBoostingClassifier())])
Score: 0.9855351842151213


In [56]:
y_pred = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
score_ = model.score(X_test, y_test)
cm

array([[104,   0,   0,   0,   0,   0],
       [  0,  10,   0,   0,   0,   0],
       [  0,   0,   0,   1,   0,   0],
       [  1,   0,   0, 141,   0,   0],
       [  0,   0,   0,   0, 243,   1],
       [  0,   0,   0,   1,   9,   8]])

In [57]:
print(score_)

0.9749518304431599


# Model Tuning
**GradientBoosting**

In [73]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingClassifier())
])

params = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [3e-2, 3e-3, 3e-1],
    'model__max_depth': [2, 3, 4]
}

In [74]:
mlflow.sklearn.autolog()

grid_search = GridSearchCV(pipeline, param_grid=params, verbose=2)
grid_search.fit(X_train, y_train)

2023/07/10 17:15:40 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '09aa479315e14825a5875688d4ef96f0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=100; total time=   2.6s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=100; total time=   2.5s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=100; total time=   2.9s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=100; total time=   2.5s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=100; total time=   2.5s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=200; total time=   5.0s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=200; total time=   5.2s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=200; total time=   5.1s
[CV] END model__learning_rate=0.03, model__max_depth=2, model__n_estimators=200; total time=   5.1s
[CV] END model__learning_rate=0.03, mod

2023/07/10 17:22:30 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.


In [76]:
print("Лучшая модель:", grid_search.best_params_)
print("Лучшее значение:", grid_search.best_score_)
boost_model = grid_search.best_estimator_

Лучшая модель: {'model__learning_rate': 0.03, 'model__max_depth': 2, 'model__n_estimators': 200}
Лучшее значение: 0.9865013677900005


# Register Model in Registry

In [79]:
model_uri = "runs:/09aa479315e14825a5875688d4ef96f0/best_estimator"
mv = mlflow.register_model(model_uri, name="KrasnodarWeatherForecastModel")
print(f"Name: {mv.name}\nVersion: {mv.version}")

Successfully registered model 'KrasnodarWeatherForecastModel'.
2023/07/10 17:38:57 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: KrasnodarWeatherForecastModel, version 1


Name: KrasnodarWeatherForecastModel
Version: 1


Created version '1' of model 'KrasnodarWeatherForecastModel'.


In [81]:
client = MlflowClient()
client.transition_model_version_stage(
    name="KrasnodarWeatherForecastModel",
    version=1,
    stage="Staging"
)

<ModelVersion: aliases=[], creation_timestamp=1688999937321, current_stage='Staging', description='', last_updated_timestamp=1689002423997, name='KrasnodarWeatherForecastModel', run_id='09aa479315e14825a5875688d4ef96f0', run_link='', source='s3://mlflow-bucket/1/09aa479315e14825a5875688d4ef96f0/artifacts/best_estimator', status='READY', status_message='', tags={}, user_id='', version='1'>