# Run Batch Inference

Now that we've trained our model, we can run batch inference using the Vertex AI API.

Note that it is not possible to use the Vertex AI SDK, as apparently running batch inference is not yet a feature within that SDK. Meaning we have to use curl.

In [1]:
import pandas as pd
import json

In [None]:
# the csv file we're using for testing needs to be in Google Cloud Storage

In [26]:
!gsutil cp test_data/inference_sample.csv gs://aaa-aca-ml-workshop/beatles/inference_sample.csv

Copying file://test_data/inference_sample.csv [Content-Type=text/csv]...
/ [1 files][ 18.8 KiB/ 18.8 KiB]                                                
Operation completed over 1 objects/18.8 KiB.                                     


In [2]:
inference_sample = pd.read_feather("test_data/inference_sample.feather")

In [3]:
inference_sample.head()

Unnamed: 0,user_name,30_Seconds_to_Mars,65daysofstatic,A_Perfect_Circle,A_Tribe_Called_Quest,ABBA,ACDC,Adele,Aerosmith,Air,...,tag_shoegazer,tag_hair_metal,tag_rapcore,tag_underground_hip_hop,tag_symphonic_black_metal,tag_darkwave,tag_world,tag_latin,tag_spanish,Like_The_Beatles
0,thegiant,1.0,,,,,,11.0,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,nezter,,,,,,,,,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,False
2,augustohp,,52.0,502.0,,1.0,452.0,1.0,215.0,14.0,...,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,True
3,stalphonzo,,,,,,6.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,davenall,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [27]:
# parameters
LOCATION = "us-central1"
PROJECT = "ds-training-380514"
BATCH_JOB_NAME = "foobar"
MODEL_ID = "projects/354621994428/locations/us-central1/models/5489591077625135104"
URI = "gs://aaa-aca-ml-workshop/beatles/inference_sample.csv"
OUTPUT_URI_PREFIX = "gs://aaa-aca-ml-workshop/beatles/batch_prediction_output"
MACHINE_TYPE = "e2-standard-2"
STARTING_REPLICA_COUNT = 1
MAX_REPLICA_COUNT = 10  # default
GENERATE_EXPLANATION = "false"  # default value

In [28]:
request_body = {
    "displayName": BATCH_JOB_NAME,
    "model": MODEL_ID,
    "inputConfig": {
        "instancesFormat": "csv",
        "gcsSource": {
            "uris": [
                URI
            ]
        },
    },
    "outputConfig": {
        "predictionsFormat": "csv",
        "gcsDestination": {
            "outputUriPrefix": OUTPUT_URI_PREFIX
        }
    },
    "dedicatedResources": {
        "machineSpec": {
            "machineType": MACHINE_TYPE,
            "acceleratorCount": "0"
        },
        "startingReplicaCount": STARTING_REPLICA_COUNT,
        "maxReplicaCount": MAX_REPLICA_COUNT
    },
    "generateExplanation": GENERATE_EXPLANATION
}

In [29]:
request_body

{'displayName': 'foobar',
 'model': 'projects/354621994428/locations/us-central1/models/5489591077625135104',
 'inputConfig': {'instancesFormat': 'csv',
  'gcsSource': {'uris': ['gs://aaa-aca-ml-workshop/beatles/inference_sample.csv']}},
 'outputConfig': {'predictionsFormat': 'csv',
  'gcsDestination': {'outputUriPrefix': 'gs://aaa-aca-ml-workshop/beatles/batch_prediction_output'}},
 'dedicatedResources': {'machineSpec': {'machineType': 'e2-standard-2',
   'acceleratorCount': '0'},
  'startingReplicaCount': 1,
  'maxReplicaCount': 10},
 'generateExplanation': 'false'}

In [30]:
with open("request.json", "w") as outfile:
    outfile.write(json.dumps(request_body))

In [31]:
api_endpoint = f"https://{LOCATION}-aiplatform.googleapis.com/v1/projects/{PROJECT}/locations/{LOCATION}/batchPredictionJobs"

In [32]:
!curl -X POST \
    -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    -H "Content-Type: application/json; charset=utf-8" \
    -d @request.json \
    {api_endpoint}

{
  "name": "projects/354621994428/locations/us-central1/batchPredictionJobs/2378639062748626944",
  "displayName": "foobar",
  "model": "projects/354621994428/locations/us-central1/models/5489591077625135104",
  "inputConfig": {
    "instancesFormat": "csv",
    "gcsSource": {
      "uris": [
        "gs://aaa-aca-ml-workshop/beatles/inference_sample.csv"
      ]
    }
  },
  "outputConfig": {
    "predictionsFormat": "csv",
    "gcsDestination": {
      "outputUriPrefix": "gs://aaa-aca-ml-workshop/beatles/batch_prediction_output"
    }
  },
  "dedicatedResources": {
    "machineSpec": {
      "machineType": "e2-standard-2"
    },
    "startingReplicaCount": 1,
    "maxReplicaCount": 10
  },
  "manualBatchTuningParameters": {
    "batchSize": 1000
  },
  "state": "JOB_STATE_PENDING",
  "createTime": "2023-03-27T14:18:15.866757Z",
  "updateTime": "2023-03-27T14:18:15.866757Z",
  "modelVersionId": "1",
  "disableContainerLogging": true
}


In [25]:
!gsutil ls gs://aaa-aca-ml-workshop/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z

CommandException: One or more URLs matched no objects.


In [17]:
!gsutil cp gs://aaa-aca-ml-workshop/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.errors_stats-00000-of-00001.csv errors_stats-00000-of-00001.csv

CommandException: No URLs matched: gs://aaa-aca-ml-workshop/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.errors_stats-00000-of-00001.csv


In [18]:
pd.read_csv('errors_stats-00000-of-00001.csv')

Unnamed: 0,user_name,30_Seconds_to_Mars,65daysofstatic,A_Perfect_Circle,A_Tribe_Called_Quest,ABBA,ACDC,Adele,Aerosmith,Air,...,tag_hair_metal,tag_rapcore,tag_underground_hip_hop,tag_symphonic_black_metal,tag_darkwave,tag_world,tag_latin,tag_spanish,Like_The_Beatles,errors_Like_The_Beatles


In [None]:
!gsutil cp gs://aaa-aca-ml-workshop/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00000-of-00002.csv results_1.csv

In [None]:
!gsutil cp gs://aaa-aca-ml-workshop/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00001-of-00002.csv results_2.csv

In [None]:
results_1 = pd.read_csv('results_1.csv')
results_2 = pd.read_csv('results_2.csv')

In [None]:
results_1.shape

In [None]:
results_2.shape

In [None]:
results_10 = pd.concat([results_1, results_2])

In [None]:
results_10

In [None]:
results_10.columns

In [None]:
results_10.loc[:, 'prediction'] = results_10.Like_The_Beatles_True_scores > results_10.Like_The_Beatles_False_scores

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(results_10.Like_The_Beatles, results_10.prediction)

In [None]:
confusion_matrix(results_10.Like_The_Beatles, results_10.prediction, normalize='true') * 100

In [None]:
tn, fp, fn, tp = confusion_matrix(results_10.Like_The_Beatles, results_10.prediction).ravel()

In [None]:
print(tn)
print(fp)
print(fn)
print(tp)