In [None]:
from ibm_watsonx_ai.helpers import DataConnection
from ibm_watsonx_ai.helpers import ContainerLocation

# Training input: one data asset, referenced by its asset id.
training_data_asset = DataConnection(data_asset_id='ea2f5fd3-ef0c-449a-8584-9ce205006ccb')
training_data_references = [training_data_asset]

# Container paths for the run's output directory, the model archive and the training status file.
results_location = ContainerLocation(
    path='auto_ml/58706bb2-a401-4e92-9ff7-d4333cd677b7/wml_data/1c48396f-7855-4e99-9e86-f26b20abd395/data/automl',
    model_location='auto_ml/58706bb2-a401-4e92-9ff7-d4333cd677b7/wml_data/1c48396f-7855-4e99-9e86-f26b20abd395/data/automl/model.zip',
    training_status='auto_ml/58706bb2-a401-4e92-9ff7-d4333cd677b7/wml_data/1c48396f-7855-4e99-9e86-f26b20abd395/training-status.json',
)
training_result_reference = DataConnection(location=results_location)

The following cell contains input parameters provided to run the AutoAI experiment in Watson Studio.

In [None]:
# Input parameters of the AutoAI experiment, keyed by parameter name.
experiment_metadata = {
    # Prediction task
    'prediction_type': 'multiclass',
    'prediction_column': 'PMGSY_SCHEME',
    'holdout_size': 0.1,
    'scoring': 'accuracy',
    'csv_separator': ',',
    'random_state': 33,
    'max_number_of_estimators': 2,
    # Data connections defined in the first cell
    'training_data_references': training_data_references,
    'training_result_reference': training_result_reference,
    # Service endpoint and project
    'deployment_url': 'https://au-syd.ml.cloud.ibm.com',
    'project_id': 'de664104-b212-48ac-87e4-e28afe315fa4',
    'drop_duplicates': True,
    'include_batched_ensemble_estimators': [
        'BatchedTreeEnsembleClassifier(ExtraTreesClassifier)',
        'BatchedTreeEnsembleClassifier(LGBMClassifier)',
        'BatchedTreeEnsembleClassifier(RandomForestClassifier)',
        'BatchedTreeEnsembleClassifier(SnapBoostingMachineClassifier)',
        'BatchedTreeEnsembleClassifier(SnapRandomForestClassifier)',
        'BatchedTreeEnsembleClassifier(XGBClassifier)',
    ],
    'classes': ['PM-JANMAN', 'PMGSY-I', 'PMGSY-II', 'PMGSY-III', 'RCPLWEA'],
    'feature_selector_mode': 'auto',
}

Set n_jobs parameter to the number of available CPUs

In [None]:
import os, ast
import warnings


def detect_cpu_number(environ=None, default=4):
    """Return the number of CPUs to use for ``n_jobs``.

    The value is read from the ``num_cpu`` entry of the ``RUNTIME_HARDWARE_SPEC``
    environment variable, which is parsed as a Python literal.

    Parameters
    ----------
    environ : mapping, optional
        Environment to inspect; defaults to ``os.environ``.
    default : int, optional
        Value returned when the variable is absent or cannot be interpreted.

    Returns
    -------
    int
        ``default`` when the variable is absent or unreadable, otherwise the
        parsed CPU count, never lower than 1.
    """
    if environ is None:
        environ = os.environ
    if 'RUNTIME_HARDWARE_SPEC' not in environ:
        return default
    try:
        cpu_number = int(ast.literal_eval(environ['RUNTIME_HARDWARE_SPEC'])['num_cpu'])
    except (ValueError, SyntaxError, TypeError, KeyError) as error:
        # A malformed spec should not abort the notebook; fall back to the default.
        warnings.warn(
            f"Could not read 'num_cpu' from RUNTIME_HARDWARE_SPEC ({error!r}); using {default}."
        )
        return default
    # int() truncates a fractional value such as 0.5 to 0, which is not a usable worker count.
    return max(1, cpu_number)


CPU_NUMBER = detect_cpu_number()

watsonx.ai connection
This cell defines the credentials required to work with the watsonx.ai Runtime.

In [None]:
import getpass

# Ask for the API key through getpass rather than storing it in the notebook.
API_KEY_PROMPT = "Please enter your api key (press enter): "
api_key = getpass.getpass(API_KEY_PROMPT)

In [None]:
from ibm_watsonx_ai import Credentials

# Combine the API key with the service URL recorded in the experiment metadata.
service_url = experiment_metadata['deployment_url']
credentials = Credentials(url=service_url, api_key=api_key)

In [None]:
from ibm_watsonx_ai import APIClient

client = APIClient(credentials)

# Use the project as default scope unless the metadata names a space.
if 'space_id' not in experiment_metadata:
    client.set.default_project(experiment_metadata['project_id'])
else:
    client.set.default_space(experiment_metadata['space_id'])

# Hand the configured client to the training data connection.
training_data_references[0].set_client(client)

Incremental learning

Get pipeline
Download and save a pipeline model object from the AutoAI training job (lale pipeline type is used for inspection and partial_fit capabilities).

In [None]:
from ibm_watsonx_ai.experiment import AutoAI

# Obtain the optimizer of the AutoAI run described by the experiment metadata.
autoai_experiment = AutoAI(credentials, project_id=experiment_metadata['project_id'])
pipeline_optimizer = autoai_experiment.runs.get_optimizer(metadata=experiment_metadata)

# Fetch the selected pipeline as a lale object.
pipeline_model = pipeline_optimizer.get_pipeline(astype='lale', pipeline_name='Pipeline_10')

Data loader
Create DataLoader iterator to retrieve training dataset in batches. DataLoader is Torch compatible (torch.utils.data), returning Pandas DataFrames.

Note: If reading data results in an error, provide the data as an iterable reader (e.g. the read_csv() method from Pandas with chunks). Initial data pre-processing may also be necessary, e.g. DataFrame.dropna(), DataFrame.drop_duplicates(), DataFrame.sample().

reader_full_data = pd.read_csv(DATA_PATH, chunksize=CHUNK_SIZE)
Batch size in rows.

In [None]:
# Batch size in rows; passed to ExperimentIterableDataset as number_of_batch_rows.
number_of_batch_rows = 2189


In [None]:
from ibm_watsonx_ai.data_loaders import experiment as data_loaders
from ibm_watsonx_ai.data_loaders.datasets import experiment as datasets

# Options for the iterable dataset built on the first training data reference.
dataset_options = {
    'connection': training_data_references[0],
    'experiment_metadata': experiment_metadata,
    'number_of_batch_rows': number_of_batch_rows,
    'enable_sampling': False,
}
dataset = datasets.ExperimentIterableDataset(**dataset_options)

# Loader that is iterated batch by batch in the training loop.
data_loader = data_loaders.ExperimentDataLoader(dataset=dataset)

Continue model training
In this cell, the pipeline is incrementally fitted using data batches (via partial_fit calls).

Note: If needed, you can evaluate the pipeline on custom holdout data: provide X_test and y_test and call the scorer on them.

Define scorer from the optimization metric
This cell constructs the scorer from the optimization metric stored in the experiment metadata.

In [None]:
from sklearn.metrics import get_scorer

# Resolve the scorer from the metric name stored under 'scoring'.
metric_name = experiment_metadata['scoring']
scorer = get_scorer(metric_name)

Tuning the incremental learner
For the best training performance set:

n_jobs - to available number of CPUs.

In [None]:
# The last pipeline step wraps the base ensemble whose n_jobs is set to CPU_NUMBER.
final_step = pipeline_model.steps[-1][1]
base_ensemble = final_step.impl.base_ensemble
base_ensemble.set_params(n_jobs=CPU_NUMBER)

Set up a learning curve plot

In [None]:
import matplotlib.pyplot as plt
from ibm_watsonx_ai.utils.autoai.incremental import plot_learning_curve
import time

# Accumulators filled once per batch by the training loop and passed to plot_learning_curve.
partial_fit_scores, fit_times = [], []

Fit pipeline model in batches
Tip: If the data passed to partial_fit is highly imbalanced (>1:10), please consider applying the sample_weight parameter:

from sklearn.utils.class_weight import compute_sample_weight

pipeline_model.partial_fit(X_train, y_train, freeze_trained_prefix=True,
                                             sample_weight=compute_sample_weight('balanced', y_train))
Note: If you have a holdout/test set please provide it for better pipeline evaluation and replace X_test and y_test in the following cell.

from pandas import read_csv
test_df = read_csv('DATA_PATH')

X_test = test_df.drop([experiment_metadata['prediction_column']], axis=1).values
y_test = test_df[experiment_metadata['prediction_column']].values
If a holdout set is not provided, 30% of the first training batch is used as the holdout set.

Filter warnings for incremental training.

In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split

fig, axes = plt.subplots(1, 3, figsize=(18, 4))

target_column = experiment_metadata['prediction_column']

for i, batch_df in enumerate(data_loader):
    # Drop rows without a label; a new frame is built instead of mutating the loader's batch.
    batch_df = batch_df.dropna(subset=[target_column])
    X_train = batch_df.drop(columns=[target_column]).values
    y_train = batch_df[target_column].values
    if i == 0:
        # Reserve 30% of the first batch as the holdout set used to score every batch.
        # Seeding with the experiment's random_state makes the split, and therefore
        # the learning curve, reproducible across runs.
        X_train, X_test, y_train, y_test = train_test_split(
            X_train,
            y_train,
            test_size=0.3,
            random_state=experiment_metadata.get('random_state'),
        )
    # perf_counter is monotonic, so the measured duration is immune to wall-clock adjustments.
    start_time = time.perf_counter()
    pipeline_model = pipeline_model.partial_fit(X_train, y_train, freeze_trained_prefix=True)
    fit_times.append(time.perf_counter() - start_time)
    partial_fit_scores.append(scorer(pipeline_model, X_test, y_test))
    # Redraw the learning curve after every batch.
    plot_learning_curve(fig=fig, axes=axes, scores=partial_fit_scores, fit_times=fit_times)

Test pipeline model
Test the fitted pipeline (predict).

In [None]:
# Predict on the first ten rows of the holdout features.
holdout_sample = X_test[:10]
pipeline_model.predict(holdout_sample)



Store the model
In this section you will learn how to store the incrementally trained model.

In [None]:
# Metadata for the stored model; only the name is set.
model_name = 'P10 - Pretrained AutoAI pipeline'
model_metadata = {client.repository.ModelMetaNames.NAME: model_name}

# Store the incrementally trained pipeline together with the experiment metadata.
stored_model_details = client.repository.store_model(
    model=pipeline_model,
    meta_props=model_metadata,
    experiment_metadata=experiment_metadata,
)

Inspect the stored model details.

In [None]:
# Show the value returned by client.repository.store_model as the cell output.
stored_model_details