In [None]:
import logging
import os
import tempfile

import numpy as np
import pandas as pd

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core.run import Run
from azureml.data.datapath import DataPath
from azureml.train.automl import AutoMLConfig
from sklearn.metrics import classification_report

In [None]:
print ( 'This notebook was created using version 1.39.0 of the Azure ML SDK' )
print ( 'You are currently using version' , azureml.core.VERSION , 'of the Azure ML SDK' )

In [None]:
ws = Workspace.from_config()

# Choose an experiment name.
experiment_name = 'automl-nlp-text'

experiment = Experiment ( ws , experiment_name )

output = {}
output [ 'Subscription ID' ] = ws.subscription_id
output [ 'Workspace Name' ] = ws.name
output [ 'Resource Group' ] = ws.resource_group
output [ 'Location' ] = ws.location
output [ 'Experiment Name' ] = experiment.name

print ( output )

In [None]:
train = pd.read_csv ( 'embold_train.csv' )
train.head ()

In [None]:
train [ 'X' ] = ( train [ 'title' ].map ( str ) + ' ' + train [ 'body' ] ).apply ( lambda row : row.strip () ).apply ( lambda row : row [ : min ( len ( row ) , 128 ) ] )
train = train.rename ( columns = { 'label' : 'y' } )
train = train.drop ( [ 'title' , 'body' ] , axis = 1 ) 
train.head ()


In [None]:
l = train [ 'X' ].map ( lambda b : len ( b ) )
l.describe ().T

In [None]:
data_train = train.loc [ : 20000 ]
data_val = train.loc [ 20000 : 30000 ]
data_test = train.loc [ 30000 : 40000]

data_dir = 'automl-nlp-data'  # Local directory to store data
blobstore_datadir = data_dir  # Blob store directory to store data in
if not os.path.isdir ( data_dir ) :
    os.mkdir ( data_dir )

train_data_fname = data_dir + '/train_data.csv'
val_data_fname = data_dir + '/val_data.csv'
test_data_fname = data_dir + '/test_data.csv'

data_train.to_csv ( train_data_fname , index = False )
data_val.to_csv ( val_data_fname , index = False )
data_test.to_csv ( test_data_fname , index = False )

datastore = ws.get_default_datastore ()
target = DataPath (
    datastore = datastore , path_on_datastore = blobstore_datadir , name = 'automl_nlp_data'
)
Dataset.File.upload_directory(
    src_dir = data_dir , target = target , overwrite = True , show_progress = True
)

In [None]:
train_dataset = Dataset.Tabular.from_delimited_files (
    path = [ ( datastore , blobstore_datadir + '/train_data.csv' ) ]
)

val_dataset = Dataset.Tabular.from_delimited_files (
    path = [ ( datastore , blobstore_datadir + '/val_data.csv' ) ]
)

test_dataset = Dataset.Tabular.from_delimited_files (
    path = [ ( datastore , blobstore_datadir + '/test_data.csv' ) ]
)

train_dataset = train_dataset.register (
    workspace = ws ,
    name = 'automl_nlp_data_train' ,
    description = 'automl_nlp_data_train' ,
    create_new_version = True ,
)

val_dataset = val_dataset.register (
    workspace = ws ,
    name = 'automl_nlp_data_val' ,
    description = 'automl_nlp_data_val' ,
    create_new_version = True ,
)

test_dataset = test_dataset.register (
    workspace = ws ,
    name = 'automl_nlp_data_test' ,
    description = 'automl_nlp_data_test' ,
    create_new_version = True ,
)

In [None]:
compute_target = ComputeTarget ( workspace = ws , name = 'gpu-cluster' )

In [None]:
automl_settings = {
    'verbosity' : logging.INFO ,
}

automl_config = AutoMLConfig (
    task = 'text-classification' ,
    debug_log= 'automl_errors.log' ,
    compute_target = compute_target ,
    training_data = train_dataset ,
    validation_data = val_dataset ,
    label_column_name = 'y' ,
    enable_dnn = True ,
    **automl_settings
)

In [None]:
automl_run = experiment.submit ( automl_config , show_output = False )
_ = automl_run.wait_for_completion ( show_output = False )

In [26]:
validation_metrics = automl_run.get_metrics ()
validation_metrics

{'norm_macro_recall': 0.5447819099968475,
 'recall_score_micro': 0.7883211678832117,
 'matthews_correlation': 0.6309329324673372,
 'balanced_accuracy': 0.6965212733312317,
 'AUC_macro': 0.8813928235523464,
 'AUC_weighted': 0.8967333894421733,
 'average_precision_score_micro': 0.8491594331191559,
 'average_precision_score_weighted': 0.8403986060804449,
 'recall_score_macro': 0.6965212733312317,
 'precision_score_macro': 0.7205938149882555,
 'f1_score_weighted': 0.7854261062053336,
 'recall_score_weighted': 0.7883211678832117,
 'accuracy': 0.7883211678832117,
 'f1_score_micro': 0.7883211678832118,
 'weighted_accuracy': 0.8160791096550282,
 'AUC_micro': 0.9139501408323322,
 'f1_score_macro': 0.7067028288713043,
 'log_loss': 0.6738255217939387,
 'average_precision_score_macro': 0.7470504065502511,
 'precision_score_weighted': 0.7837336125451774,
 'precision_score_micro': 0.7883211678832117}

In [None]:
best_run , best_model = automl_run.get_output () #downloading best_model might require gpu compute instance and other installs in update_env.yml
best_run

In [30]:
training_run_id = best_run.id
training_run = Run(experiment, training_run_id)

In [32]:
# Inference script run arguments
arguments = [
    '--run_id' ,
    training_run_id ,
    '--experiment_name' ,
    experiment.name ,
    '--input_dataset_id',
    test_dataset.as_named_input ( 'automl_nlp_data_test' ) 
]
scoring_args = arguments

with tempfile.TemporaryDirectory() as tmpdir :
    # Download required files from training run into temp folder.
    entry_script_name = 'score_script.py'
    output_path = os.path.join ( tmpdir , entry_script_name )
    training_run.download_file (
        'outputs/' + entry_script_name , os.path.join ( tmpdir , entry_script_name )
    )

    script_run_config = ScriptRunConfig (
        source_directory = tmpdir ,
        script=entry_script_name ,
        compute_target = compute_target ,
        environment = training_run.get_environment() ,
        arguments = scoring_args
    )

    scoring_run = experiment.submit ( script_run_config )

In [34]:
print ( scoring_run )
_ = scoring_run.wait_for_completion ( show_output = False )

Run(Experiment: automl-nlp-text,
Id: automl-nlp-text_1650565910_02726c34,
Type: azureml.scriptrun,
Status: Queued)


In [None]:
output_prediction_file = './preds_multiclass.csv'
scoring_run.download_file (
    'outputs/predictions.csv' , output_file_path = output_prediction_file
)

test_set_predictions_df = pd.read_csv ( 'preds_multiclass.csv' )

test_data_df = test_dataset.to_pandas_dataframe ()

print (
    classification_report (
        test_data_df [ 'y' ] , test_set_predictions_df [ 'y' ]
    )
)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81      4434
           1       0.81      0.84      0.82      4642
           2       0.55      0.44      0.49       925

    accuracy                           0.79     10001
   macro avg       0.72      0.70      0.71     10001
weighted avg       0.78      0.79      0.79     10001

