---
## Prediction

In [2]:
import kfp
from kfp import compiler
from kfp.dsl import component, pipeline, Artifact, ClassificationMetrics, Input, Output, Model, Metrics

from google.cloud import aiplatform as aip
from typing import NamedTuple

from datetime import datetime

#import kfp.v2.dsl as dsl
#import google_cloud_pipeline_components as gcc_aip
from google_cloud_pipeline_components.v1.dataset import TabularDatasetCreateOp
from google_cloud_pipeline_components.v1.automl.training_job import AutoMLTabularTrainingJobRunOp
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp

from google.cloud import bigquery 
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value
import json
import numpy as np

In [3]:
# --- Project / environment -------------------------------------------------
# Each setting is defined exactly once; the original cell assigned PROJECT_ID
# twice and repeated the region literal under a second name.
PROJECT_ID = 'gcp-ccai-auto-ml-contactcenter'
REGION = "europe-west3"
LOCATION = REGION  # alias of REGION so the two values cannot drift apart
REPO_NAME = "repo-demo3"
SERVICE_ACCOUNT = "944308723981-compute@developer.gserviceaccount.com"
BUCKET = "ccai-storage"

# --- Pipeline naming (all derived from PIPELINE_NAME / BUCKET) ---------------
PIPELINE_NAME = "automl_pipeline"
YAML_NAME = f"{PIPELINE_NAME}.yml"
PIPELINE_ROOT = f"gs://{BUCKET}/pipeline_root/"
DISPLAY_NAME = PIPELINE_NAME.replace("_", "-")
NOTEBOOK = "automl"  # value matched against the `notebook` label when listing endpoints

# --- Data source -------------------------------------------------------------
DATANAME = "datasetnlp"   # BigQuery dataset queried below
TABLE_ID = "stepfinalbq"  # BigQuery table queried below
FILE_PATH = 'articlesoutput.parquet'
FOLDER = 'pipeline'
TEXT_COLUMN = 'body_pre'
NUM_DOC = 10
#BQ_SOURCE = "bq://gcp-ccai-auto-ml-contactcenter.datasetnlp.stepfinalbq"

# --- Parquet file names for the OUTPUT_* stages ------------------------------
OUTPUT_PROCESSING = 'step1_pipeline.parquet'
OUTPUT_TOKENIZATION = 'step2_pipeline.parquet'
OUTPUT_SENTIMENT = 'step3_pipeline.parquet'
OUTPUT_MODERATE = 'step4_pipeline.parquet'
OUTPUT_ENTITIES = 'step5_pipeline.parquet'
OUTPUT_FINAL = 'step_final_bq.parquet'

PIPELINE_ROOT

'gs://ccai-storage/pipeline_root/'

In [4]:
# Machine type constant for deployment
DEPLOY_COMPUTE = 'n1-standard-4'

# Target column for model training
VAR_TARGET = 'topic'

# Columns to leave out, kept one logical group per row and joined into the
# comma-separated string that the rest of the notebook consumes.
VAR_OMIT = ",".join([
    'uri', 'url', 'date', 'body', 'time', 'dateTime', 'dateTimePub', 'lang',
    'isDuplicate', 'dataType', 'sentiment',
    'eventUri', 'image', 'sharesFacebook',
    'sourceLocationLabel', 'categoryLabels',
    'categoryWeights',
    'alexaCountryRank', 'date_column', 'year', 'year_month',
    'num_documents',
    'PERSON', 'OTHER', 'ORGANIZATION',
    'EVENT', 'LOCATION', 'WORK_OF_ART', 'CONSUMER_GOOD', 'NUMBER', 'DATE',
    'NUMBER_mean_salience',
    'DATE_mean_salience', 'PRICE', 'ADDRESS',
    'ADDRESS_mean_salience', 'PHONE_NUMBER', 'PHONE_NUMBER_mean_salience',
])

COLUMN_TOPICK = "shares_scaled, body_pre"

In [5]:
# Initialise the Vertex AI SDK with this notebook's project, region and
# staging location.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=PIPELINE_ROOT)

# Pin the BigQuery client to PROJECT_ID explicitly. The query further down
# references the table as `dataset.table` with no project qualifier, so it is
# resolved against the client's project; relying on the ambient default
# project makes the notebook environment-dependent.
bq = bigquery.Client(project=PROJECT_ID)

In [6]:
def _dedupe_columns(column_csv: str) -> str:
    """Return a comma-separated column list with case-insensitive repeats removed.

    The first spelling of each column is kept and the original order is
    preserved. Matching is done on whole column names, so a name that merely
    contains another (e.g. ``DATE_mean_salience`` vs ``DATE``) is untouched.

    Args:
        column_csv: Column names separated by commas.

    Returns:
        The de-duplicated names joined with commas (no surrounding spaces).
    """
    first_spelling = {}
    for raw_name in column_csv.split(","):
        name = raw_name.strip()
        if name:  # ignore empty fragments from stray/trailing commas
            first_spelling.setdefault(name.lower(), name)
    return ",".join(first_spelling.values())


# Remove the duplicate date from the string: VAR_OMIT lists both `date` and
# `DATE`, which BigQuery treats as the same column name. De-duplicating by
# whole name replaces the earlier "DATE," substring removal, which would also
# have clipped any column whose name happened to end in "DATE".
VAR_OMIT_rev = _dedupe_columns(VAR_OMIT)

print(VAR_OMIT_rev)

uri,url,date,body,time,dateTime,dateTimePub,lang,isDuplicate,dataType,sentiment,eventUri,image,sharesFacebook,sourceLocationLabel,categoryLabels,categoryWeights,alexaCountryRank,date_column,year,year_month,num_documents,PERSON,OTHER,ORGANIZATION,EVENT,LOCATION,WORK_OF_ART,CONSUMER_GOOD,NUMBER,NUMBER_mean_salience,DATE_mean_salience,PRICE,ADDRESS,ADDRESS_mean_salience,PHONE_NUMBER,PHONE_NUMBER_mean_salience


In [7]:
# Fetch a handful of TEST-split rows, excluding the target column, the split
# marker and every omitted column.
prediction_sql = f"""
        SELECT * EXCEPT({VAR_TARGET}, split, {VAR_OMIT_rev})
        FROM {DATANAME}.{TABLE_ID}
        WHERE split='TEST'
        LIMIT 10
    """
pred = bq.query(query=prediction_sql).to_dataframe()

In [8]:
# Cast these columns to strings in one step.
# NOTE(review): presumably the deployed model expects them as string-typed
# features — confirm against the training schema.
string_columns = ('relevance', 'importanceRank', 'alexaGlobalRank', 'month')
pred = pred.assign(**{name: pred[name].astype(str) for name in string_columns})

# One dict per row, keyed by column name.
newobs = pred.to_dict(orient='records')
newobs[0]

{'title': 'GF, Varrese sotto attacco: pesanti critiche da due ex Vipponi',
 'relevance': '1',
 'authors': 'Debora Manzoli',
 'sourceTitle': 'libero.it',
 'importanceRank': '1000000',
 'alexaGlobalRank': '0',
 'month': '12',
 'shares_scaled': -0.28,
 'body_pre': "Scrittrice , copywriter , editor pubblicista mantovana , laureata Lettere , Cinema Tv . due libri all'attivo ama scrittura follia . L'ultima puntata Grande Fratello visto protagonisti Massimiliano Varrese , ripreso Alfonso Signorini brutti atteggiamenti confronti Beatrice Luzzi . molti web infatti richiesto squalifica Varrese , stanchi atteggiamenti troppo aggressivi , gieffino fine ' cavata solamente scuse Beatrice pubblico ramanzina presentatore . , però , andato giù proprio , due ex vipponi Grande Fratello , passato stati squalificati gioco , passati all'attacco . Ecco tratta cosa detto . Dopo ramanzina Alfonso Signorini diretta , Varrese ammesso propri errori fine ' cavata ( sempre resto ) semplice ammonizione . `` uomo dev

In [9]:
# Convert every record dict into a protobuf Value for the prediction request.
instances = list(
    map(lambda record: json_format.ParseDict(record, Value()), newobs)
)

### Get Predictions: Python Client

In [10]:
# Show every endpoint whose `notebook` label matches this notebook.
endpoint_label_filter = f'labels.notebook={NOTEBOOK}'
aip.Endpoint.list(filter=endpoint_label_filter)

[<google.cloud.aiplatform.models.Endpoint object at 0x7f3aa693f2e0> 
 resource name: projects/944308723981/locations/europe-west3/endpoints/1221042847017336832,
 <google.cloud.aiplatform.models.Endpoint object at 0x7f3aa693f5e0> 
 resource name: projects/944308723981/locations/europe-west3/endpoints/1242434945247346688,
 <google.cloud.aiplatform.models.Endpoint object at 0x7f3aa693f4c0> 
 resource name: projects/944308723981/locations/europe-west3/endpoints/6842661081882558464,
 <google.cloud.aiplatform.models.Endpoint object at 0x7f3aa693fbe0> 
 resource name: projects/944308723981/locations/europe-west3/endpoints/238132228343726080]

In [11]:
# Several endpoints can carry this notebook's label, so request them newest
# first; taking element 0 of an unordered listing would pick an arbitrary one.
matching_endpoints = aip.Endpoint.list(
    filter=f'labels.notebook={NOTEBOOK}',
    order_by='create_time desc',
)

# Fail with an actionable message instead of a bare IndexError when nothing
# has been deployed yet.
if not matching_endpoints:
    raise RuntimeError(
        f"No Vertex AI endpoint found with label notebook={NOTEBOOK!r}; "
        "deploy a model before requesting predictions."
    )

endpoint = matching_endpoints[0]
endpoint.display_name

'automl_datasetnlp_20240325224945'

In [12]:
# Request online predictions for the prepared rows and display the first one.
prediction = endpoint.predict(
    instances=instances,  # or instances=newobs
)
prediction.predictions[0]

{'classes': ['Technology',
  'Sports',
  'Science',
  'Politics',
  'Health',
  'Environment',
  'Business',
  'Arts and Entertainment'],
 'scores': [0.01056508254259825,
  0.01481661759316921,
  0.00365602714009583,
  0.01545155979692936,
  0.001204681699164212,
  0.001032748143188655,
  0.01098049152642488,
  0.9422927498817444]}

In [13]:
### Get Predictions: REST