In [1]:
from google.cloud import aiplatform
from datetime import datetime
import kfp
from kfp.v2 import compiler
#import kfp.v2.dsl as dsl
#import google_cloud_pipeline_components as gcc_aip
from google_cloud_pipeline_components.v1.dataset import TabularDatasetCreateOp
from google_cloud_pipeline_components.v1.automl.training_job import AutoMLTabularTrainingJobRunOp
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp

from google.cloud import bigquery
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value
import json
import numpy as np

  from kfp.v2 import compiler


In [2]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'gcp-ccai-auto-ml-contactcenter'

In [3]:
PROJECT_ID = 'gcp-ccai-auto-ml-contactcenter'
REGION= "europe-west3"
REPO_NAME = "repo-demo3"
SERVICE_ACCOUNT = "944308723981-compute@developer.gserviceaccount.com"
BUCKET = "ccai-storage"
PIPELINE_NAME = "automl_pipeline"
YAML_NAME = f"{PIPELINE_NAME}.yml"
PIPELINE_ROOT = f"gs://{BUCKET}/pipeline_root/"
DISPLAY_NAME = PIPELINE_NAME.replace("_", "-")
BQ_SOURCE = "bq://gcp-ccai-auto-ml-contactcenter.dataset_nlp.step_final_bq"
NOTEBOOK = "automl"
DATANAME = "datasetnlp"
BQ_NAME = "finaldf5"
PIPELINE_ROOT

'gs://ccai-storage/pipeline_root/'

In [4]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

URI = f"gs://{BUCKET}/{DATANAME}/models/{NOTEBOOK}"
DIR = f"temp/{NOTEBOOK}"

In [76]:
# Resources
DEPLOY_COMPUTE = 'n1-standard-4'

# Model Training
VAR_TARGET = 'topic'
VAR_OMIT = (
    'uri,url,title,body,time,dateTime,dateTimePub,lang,isDuplicate,dataType,sentiment,' +
    'eventUri,image,authors,sharesFacebook,' +
    'sourceLocationLabel,categoryLabels,' +
    'categoryWeights,alexaGlobalRank,' +
    'alexaCountryRank,date_column,year,year_month,' +
    'num_documents,' +
    'PERSON,OTHER,ORGANIZATION,' +
    'EVENT,LOCATION,WORK_OF_ART,CONSUMER_GOOD,NUMBER,DATE,' +
    'NUMBER_mean_salience,' +
    'DATE_mean_salience,PRICE,ADDRESS,' +
    'ADDRESS_mean_salience,PHONE_NUMBER,PHONE_NUMBER_mean_salience'
)

COLUMN_TOPICK = "shares_scaled, body_pre"

In [47]:
aiplatform.init(project=PROJECT_ID, location=REGION)
bq = bigquery.Client()

In [48]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

URI = f"gs://{BUCKET}/{DATASET}/models/{NOTEBOOK}"
DIR = f"temp/{NOTEBOOK}"
URI


'gs://ccai-storage/datasetnlp/models/automl'

In [49]:
SERVICE_ACCOUNT = !gcloud config list --format='value(core.account)' 
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]
SERVICE_ACCOUNT

'944308723981-compute@developer.gserviceaccount.com'

In [50]:
!gcloud projects get-iam-policy $PROJECT_ID --filter="bindings.members:$SERVICE_ACCOUNT" --format='table(bindings.role)' --flatten="bindings[].members"

ROLE
organizations/329273198709/roles/Ruolopersonalizzato
organizations/329273198709/roles/Ruolopersonalizzato517
roles/aiplatform.user
roles/artifactregistry.admin
roles/artifactregistry.createOnPushWriter
roles/artifactregistry.writer
roles/bigquery.admin
roles/cloudtranslate.editor
roles/contactcenterinsights.editor
roles/dataflow.admin
roles/dataflow.developer
roles/dataflow.worker
roles/datastore.user
roles/dialogflow.client
roles/pubsub.editor
roles/securesourcemanager.repoCreator
roles/securesourcemanager.repoWriter
roles/source.writer
roles/storage.objectAdmin
roles/viewer


In [10]:
!rm -rf {DIR}
!mkdir -p {DIR}

In [11]:
!rm -rf {URI}
!mkdir -p {URI}

In [12]:
REGION

'europe-west3'

In [22]:

@kfp.dsl.pipeline(
    name = f'kfp-{NOTEBOOK}-{DATASET}-{TIMESTAMP}',
    pipeline_root = URI+'/'+str(TIMESTAMP)+'/kfp/'
)
def pipeline(
    project: str,
    dataset: str,
    display_name: str,
    deploy_machine: str,
    bq_source: str,
    var_target: str,
    var_omit: str,
    features: dict,
    labels: dict 
):
    
       # dataset
    dataset = TabularDatasetCreateOp(
        project = project,
        display_name = display_name,
        bq_source = bq_source,
        labels = labels,
        location = REGION
    )
    
    # training
    model = AutoMLTabularTrainingJobRunOp(
        project = project,
        display_name = display_name,
        optimization_prediction_type = "classification",
        optimization_objective="minimize-log-loss",
        budget_milli_node_hours = 1000,
        disable_early_stopping=False,
        column_specs = features,
        dataset = dataset.outputs['dataset'],
        target_column = var_target,
        predefined_split_column_name = 'split',
        labels = labels,
        location = REGION
    )
    
    # Endpoint: Creation
    endpoint = EndpointCreateOp(
        project = project,
        display_name = display_name,
        labels = labels,
        location = REGION
    )
    
    # Endpoint: Deployment of Model
    deployment = ModelDeployOp(
        model = model.outputs["model"],
        endpoint = endpoint.outputs["endpoint"],
        dedicated_resources_min_replica_count = 1,
        dedicated_resources_max_replica_count = 1,
        traffic_split = {"0": 100},
        dedicated_resources_machine_type= deploy_machine
    )

In [23]:
compiler.Compiler().compile(
    pipeline_func = pipeline,
    package_path = f"{DIR}/{NOTEBOOK}.json"
)

In [24]:
!gsutil cp {DIR}/{NOTEBOOK}.json {URI}/{TIMESTAMP}/kfp/

Copying file://temp/automl/automl.json [Content-Type=application/json]...
/ [1 files][ 47.0 KiB/ 47.0 KiB]                                                
Operation completed over 1 objects/47.0 KiB.                                     


In [51]:
# get feature names
query = f"SELECT * FROM {DATASET}.INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{BQ_NAME}'"
schema = bq.query(query).to_dataframe()
OMIT = VAR_OMIT.split(",") + [VAR_TARGET, 'split']
features = schema[~schema.column_name.isin(OMIT)].column_name.tolist()
features = dict.fromkeys(features, 'auto')

In [52]:
features

{'relevance': 'auto',
 'sourceTitle': 'auto',
 'importanceRank': 'auto',
 'month': 'auto',
 'shares_scaled': 'auto',
 'body_pre': 'auto',
 'score': 'auto',
 'magnitude': 'auto',
 'Toxic': 'auto',
 'Insult': 'auto',
 'Profanity': 'auto',
 'Derogatory': 'auto',
 'Sexual': 'auto',
 'Death_Harm__Tragedy': 'auto',
 'Violent': 'auto',
 'Firearms__Weapons': 'auto',
 'Public_Safety': 'auto',
 'Health': 'auto',
 'Religion__Belief': 'auto',
 'Illicit_Drugs': 'auto',
 'War__Conflict': 'auto',
 'Politics': 'auto',
 'Finance': 'auto',
 'Legal': 'auto',
 'PERSON_mean_salience': 'auto',
 'OTHER_mean_salience': 'auto',
 'ORGANIZATION_mean_salience': 'auto',
 'EVENT_mean_salience': 'auto',
 'LOCATION_mean_salience': 'auto',
 'WORK_OF_ART_mean_salience': 'auto',
 'CONSUMER_GOOD_mean_salience': 'auto',
 'PRICE_mean_salience': 'auto'}

In [27]:
pipeline = aiplatform.PipelineJob(
    display_name = f'{NOTEBOOK}_{DATASET}_{TIMESTAMP}',
    template_path = f"{URI}/{TIMESTAMP}/kfp/{NOTEBOOK}.json",
    parameter_values = {
        "project" : PROJECT_ID,
        "dataset" : DATASET,
        "display_name" : f'{NOTEBOOK}_{DATASET}_{TIMESTAMP}',
        "deploy_machine" : DEPLOY_COMPUTE,
        "bq_source" : f'bq://{PROJECT_ID}.{DATASET}.{BQ_NAME}',
        "var_target" : VAR_TARGET,
        "var_omit" : VAR_OMIT,
        "features" : features,
        "labels" : {'notebook': NOTEBOOK}       
    },
    labels = {'notebook': NOTEBOOK},
    enable_caching=True
)

In [28]:
response = pipeline.run(
    service_account = SERVICE_ACCOUNT
)

Creating PipelineJob
PipelineJob created. Resource name: projects/944308723981/locations/europe-west3/pipelineJobs/kfp-automl-datasetnlp-20240324222952-20240324231852
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/944308723981/locations/europe-west3/pipelineJobs/kfp-automl-datasetnlp-20240324222952-20240324231852')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/kfp-automl-datasetnlp-20240324222952-20240324231852?project=944308723981
PipelineJob projects/944308723981/locations/europe-west3/pipelineJobs/kfp-automl-datasetnlp-20240324222952-20240324231852 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/944308723981/locations/europe-west3/pipelineJobs/kfp-automl-datasetnlp-20240324222952-20240324231852 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/944308723981/locations/europe-west3/pipelineJobs/kfp-automl-datasetnlp-20240324222952-202403

In [30]:
print(f"Review the Pipeline as it runs here:\nhttps://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/{pipeline.resource_name.split('/')[-1]}?project={PROJECT_ID}")

Review the Pipeline as it runs here:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/kfp-automl-datasetnlp-20240324222952-20240324231852?project=gcp-ccai-auto-ml-contactcenter


aiplatform.get_pipeline_df(pipeline = f'kfp-{NOTEBOOK}-{DATASET}-{TIMESTAMP}')

In [32]:
models = aiplatform.Model.list(filter=f'labels.notebook={NOTEBOOK}')

In [33]:
model = models[0]
model.resource_name

'projects/944308723981/locations/europe-west3/models/8349999164586721280'

In [34]:
evaluation = model.get_model_evaluation().to_dict() # get first evaluation

In [35]:
evaluation.keys()

dict_keys(['name', 'metricsSchemaUri', 'metrics', 'createTime', 'sliceDimensions', 'modelExplanation'])

In [36]:
evaluation['metrics'].keys()

dict_keys(['auPrc', 'auRoc', 'logLoss', 'confusionMatrix', 'confidenceMetrics'])

In [37]:
evaluation['metrics']['logLoss']

0.88903844

In [38]:
evaluation['metrics']['confidenceMetrics'][3]

{'falseNegativeCount': '2',
 'recallAt1': 0.6858038,
 'falsePositiveRateAt1': 0.044885177,
 'truePositiveCount': '956',
 'recall': 0.9979123,
 'trueNegativeCount': '2577',
 'confidenceThreshold': 0.01,
 'falsePositiveRate': 0.6157173,
 'precisionAt1': 0.6858038,
 'confusionMatrix': {'rows': [[3.0, 1.0, 0.0, 5.0, 1.0, 1.0, 13.0, 7.0, 0.0],
   [2.0, 55.0, 0.0, 0.0, 1.0, 0.0, 6.0, 27.0, 0.0],
   [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 6.0, 0.0],
   [1.0, 1.0, 0.0, 236.0, 0.0, 2.0, 30.0, 19.0, 0.0],
   [0.0, 2.0, 1.0, 2.0, 34.0, 0.0, 7.0, 10.0, 0.0],
   [0.0, 0.0, 2.0, 0.0, 0.0, 7.0, 0.0, 1.0, 0.0],
   [1.0, 2.0, 1.0, 42.0, 0.0, 1.0, 108.0, 30.0, 0.0],
   [1.0, 10.0, 2.0, 25.0, 9.0, 3.0, 24.0, 214.0, 0.0],
   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
  'annotationSpecs': [{'displayName': 'Technology', 'id': '0'},
   {'displayName': 'Sports', 'id': '1'},
   {'displayName': 'Science', 'id': '2'},
   {'displayName': 'Politics', 'id': '3'},
   {'displayName': 'Health', 'id': '4'},
   {'displ

In [39]:
for i in range(len(evaluation['metrics']['confusionMatrix']['annotationSpecs'])):
    print('True Label = ', evaluation['metrics']['confusionMatrix']['annotationSpecs'][i]['displayName'], ' has Predicted labels = ', evaluation['metrics']['confusionMatrix']['rows'][i])


True Label =  Technology  has Predicted labels =  [3.0, 1.0, 0.0, 5.0, 1.0, 1.0, 13.0, 7.0, 0.0]
True Label =  Sports  has Predicted labels =  [2.0, 55.0, 0.0, 0.0, 1.0, 0.0, 6.0, 27.0, 0.0]
True Label =  Science  has Predicted labels =  [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 6.0, 0.0]
True Label =  Politics  has Predicted labels =  [1.0, 1.0, 0.0, 236.0, 0.0, 2.0, 30.0, 19.0, 0.0]
True Label =  Health  has Predicted labels =  [0.0, 2.0, 1.0, 2.0, 34.0, 0.0, 7.0, 10.0, 0.0]
True Label =  Environment  has Predicted labels =  [0.0, 0.0, 2.0, 0.0, 0.0, 7.0, 0.0, 1.0, 0.0]
True Label =  Business  has Predicted labels =  [1.0, 2.0, 1.0, 42.0, 0.0, 1.0, 108.0, 30.0, 0.0]
True Label =  Arts and Entertainment  has Predicted labels =  [1.0, 10.0, 2.0, 25.0, 9.0, 3.0, 24.0, 214.0, 0.0]
True Label =  DROPPED  has Predicted labels =  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [40]:
model_client = aiplatform.gapic.ModelServiceClient(
    client_options = {
        'api_endpoint' : f'{REGION}-aiplatform.googleapis.com'
    }
)

In [41]:
slices = model_client.list_model_evaluation_slices(parent = evaluation['name'])

In [42]:
for slice in slices:
    print('Label = ', slice.slice_.value, 'has logLoss = ', slice.metrics['logLoss'])

Label =  Business has logLoss =  0.34745255
Label =  Technology has logLoss =  0.11609337
Label =  Science has logLoss =  0.04861673
Label =  Environment has logLoss =  0.03680636
Label =  Politics has logLoss =  0.28194597
Label =  Health has logLoss =  0.09024783
Label =  Arts and Entertainment has logLoss =  0.39729494
Label =  Sports has logLoss =  0.15564342


In [64]:
    query = f"""
        SELECT {COLUMN_TOPICK}
        FROM `{DATASET}.{BQ_NAME}`
        WHERE split='TEST'
        LIMIT 10
    """

In [65]:
query

"\n    SELECT * EXCEPT(topic, split, uri,url,title,body,date,time,dateTime,dateTimePub,lang,isDuplicate,dataType,sentiment,eventUri,image,authors,sharesFacebook,sourceLocationLabel,categoryLabels,categoryWeights,alexaGlobalRank,alexaCountryRank,date_column,year,year_month,num_documents,PERSON,OTHER,ORGANIZATION,EVENT,LOCATION,WORK_OF_ART,CONSUMER_GOOD,NUMBER,DATE,NUMBER_mean_salience,DATE_mean_salience,PRICE,ADDRESS,ADDRESS_mean_salience,PHONE_NUMBER,PHONE_NUMBER_mean_salience,DATE)\n    FROM `datasetnlp.finaldf5`\n    WHERE split='TEST'\n    LIMIT 10\n"

In [99]:
pred = bq.query(
    query = f"""
        SELECT relevance
        FROM `{DATASET}.{BQ_NAME}`
        WHERE split='TEST'
        LIMIT 10
    """
).to_dataframe()


In [100]:

pred.head(4)

Unnamed: 0,relevance
0,1
1,1
2,1
3,1


In [101]:
newobs = pred.to_dict(orient='records')
newobs[0]

{'relevance': 1}

In [102]:
instances = [json_format.ParseDict(newob, Value()) for newob in newobs]

In [103]:
instances

[struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value: 1
     }
   }
 },
 struct_value {
   fields {
     key: "relevance"
     value {
       number_value

### Get Predictions: Python Client


In [104]:
aiplatform.Endpoint.list(filter=f'labels.notebook={NOTEBOOK}')

[<google.cloud.aiplatform.models.Endpoint object at 0x7f1a8453b910> 
 resource name: projects/944308723981/locations/europe-west3/endpoints/6842661081882558464,
 <google.cloud.aiplatform.models.Endpoint object at 0x7f1a84539690> 
 resource name: projects/944308723981/locations/europe-west3/endpoints/238132228343726080]

In [105]:
endpoint = aiplatform.Endpoint.list(filter=f'labels.notebook={NOTEBOOK}')[0]
endpoint.display_name

'automl_datasetnlp_20240324222952'

In [106]:
prediction = endpoint.predict(instances = instances) # or instances = newobs
prediction.predictions[0]

InvalidArgument: 400 {"error": "Column: relevance. Error: Expected string_value but got number_value. Type casting is not allowed."}

In [96]:
prediction.predictions[0]['classes'][np.argmax(prediction.predictions[0]['scores'])]

NameError: name 'prediction' is not defined

### Get Predictions: REST

In [97]:
with open(f'{DIR}/request.json','w') as file:
    file.write(json.dumps({"instances": [newobs[0]]}))

In [98]:
!curl -X POST \
-H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
-H "Content-Type: application/json; charset=utf-8" \
-d @{DIR}/request.json \
https://{REGION}-aiplatform.googleapis.com/v1/{endpoint.resource_name}:predict

{"error": "Column prefix: . Error: Missing struct property: relevance."}

### Get Predictions: gcloud (CLI)

In [None]:
!gcloud beta ai endpoints predict {endpoint.name.rsplit('/',1)[-1]} --region={REGION} --json-request={DIR}/request.json

---
## Explanations
Interpretation Guide
- https://cloud.google.com/vertex-ai/docs/predictions/interpreting-results-automl#tabular

In [None]:
explanation = endpoint.explain(instances = instances)

In [None]:
explanation.predictions[0]

In [None]:
print("attribution:")
print("baseline output",explanation.explanations[0].attributions[0].baseline_output_value)
print("instance output",explanation.explanations[0].attributions[0].instance_output_value)
print("output_index",explanation.explanations[0].attributions[0].output_index)
print("output display value",explanation.explanations[0].attributions[0].output_display_name)
print("approximation error",explanation.explanations[0].attributions[0].approximation_error)

In [None]:
import matplotlib.pyplot as plt
features = []
scores = []
for k in explanation.explanations[0].attributions[0].feature_attributions:
    features.append(k)
    scores.append(explanation.explanations[0].attributions[0].feature_attributions[k])
features = [x for _, x in sorted(zip(scores, features))]
scores = sorted(scores)
fig, ax = plt.subplots()
fig.set_size_inches(9, 9)
ax.barh(features, scores)
fig.show()