# Run Batch Inference

Now that we've trained our model, we can run batch inference using the Vertex AI API.

Note that it is not possible to use the Vertex AI SDK, as apparently running batch inference is not yet a feature within that SDK. Meaning we have to use curl.

In [1]:
import pandas as pd
import json

In [2]:
# the csv file we're using for testing needs to be in Google Cloud Storage

In [3]:
# !gsutil cp test_data/inference_sample.csv gs://csalling-docai-datasets-regional/beatles/inference_sample.csv

In [4]:
inference_sample = pd.read_feather("test_data/inference_sample.feather")

In [5]:
inference_sample.head()

Unnamed: 0,user_name,30_Seconds_to_Mars,65daysofstatic,A_Perfect_Circle,A_Tribe_Called_Quest,ABBA,ACDC,Adele,Aerosmith,Air,...,tag_shoegazer,tag_hair_metal,tag_rapcore,tag_underground_hip_hop,tag_symphonic_black_metal,tag_darkwave,tag_world,tag_latin,tag_spanish,Like_The_Beatles
0,thegiant,1.0,,,,,,11.0,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,nezter,,,,,,,,,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,False
2,augustohp,,52.0,502.0,,1.0,452.0,1.0,215.0,14.0,...,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,True
3,stalphonzo,,,,,,6.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,davenall,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [6]:
# parameters
LOCATION = "us-central1"
PROJECT = "mwpmltr"
BATCH_JOB_NAME = "foobar"
MODEL_ID = "projects/mwpmltr/locations/us-central1/models/8993391587719380992"  # I think this is the last successful model that was trained
URI = "gs://csalling-docai-datasets-regional/beatles/inference_sample.csv"
OUTPUT_URI_PREFIX = "gs://csalling-docai-datasets-regional/beatles/batch_prediction_output"
MACHINE_TYPE = "e2-standard-2"
STARTING_REPLICA_COUNT = 1
MAX_REPLICA_COUNT = 10  # default
GENERATE_EXPLANATION = "false"  # default value

In [7]:
request_body = {
    "displayName": BATCH_JOB_NAME,
    "model": MODEL_ID,
    "inputConfig": {
        "instancesFormat": "csv",
        "gcsSource": {
            "uris": [
                URI
            ]
        },
    },
    "outputConfig": {
        "predictionsFormat": "csv",
        "gcsDestination": {
            "outputUriPrefix": OUTPUT_URI_PREFIX
        }
    },
    "dedicatedResources": {
        "machineSpec": {
            "machineType": MACHINE_TYPE,
            "acceleratorCount": "0"
        },
        "startingReplicaCount": STARTING_REPLICA_COUNT,
        "maxReplicaCount": MAX_REPLICA_COUNT
    },
    "generateExplanation": GENERATE_EXPLANATION
}

In [8]:
request_body

{'displayName': 'foobar',
 'model': 'projects/mwpmltr/locations/us-central1/models/8993391587719380992',
 'inputConfig': {'instancesFormat': 'csv',
  'gcsSource': {'uris': ['gs://csalling-docai-datasets-regional/beatles/inference_sample.csv']}},
 'outputConfig': {'predictionsFormat': 'csv',
  'gcsDestination': {'outputUriPrefix': 'gs://csalling-docai-datasets-regional/beatles/batch_prediction_output'}},
 'dedicatedResources': {'machineSpec': {'machineType': 'e2-standard-2',
   'acceleratorCount': '0'},
  'startingReplicaCount': 1,
  'maxReplicaCount': 10},
 'generateExplanation': 'false'}

In [9]:
with open("request.json", "w") as outfile:
    outfile.write(json.dumps(request_body))

In [10]:
api_endpoint = f"https://{LOCATION}-aiplatform.googleapis.com/v1/projects/{PROJECT}/locations/{LOCATION}/batchPredictionJobs"

In [11]:
!curl -X POST \
    -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    -H "Content-Type: application/json; charset=utf-8" \
    -d @request.json \
    {api_endpoint}

{
  "name": "projects/55590906972/locations/us-central1/batchPredictionJobs/3965190361164283904",
  "displayName": "foobar",
  "model": "projects/55590906972/locations/us-central1/models/8993391587719380992",
  "inputConfig": {
    "instancesFormat": "csv",
    "gcsSource": {
      "uris": [
        "gs://csalling-docai-datasets-regional/beatles/inference_sample.csv"
      ]
    }
  },
  "outputConfig": {
    "predictionsFormat": "csv",
    "gcsDestination": {
      "outputUriPrefix": "gs://csalling-docai-datasets-regional/beatles/batch_prediction_output"
    }
  },
  "dedicatedResources": {
    "machineSpec": {
      "machineType": "e2-standard-2"
    },
    "startingReplicaCount": 1,
    "maxReplicaCount": 10
  },
  "manualBatchTuningParameters": {
    "batchSize": 1000
  },
  "state": "JOB_STATE_PENDING",
  "createTime": "2023-03-26T19:06:05.208988Z",
  "updateTime": "2023-03-26T19:06:05.208988Z",
  "modelVersionId": "1",
  "disableContainerLogging": true
}


In [12]:
!gsutil ls gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z

gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.errors_stats-00000-of-00001.csv
gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00000-of-00002.csv
gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00001-of-00002.csv


In [13]:
!gsutil cp gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.errors_stats-00000-of-00001.csv errors_stats-00000-of-00001.csv

Copying gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.errors_stats-00000-of-00001.csv...
/ [1 files][  6.6 KiB/  6.6 KiB]                                                
Operation completed over 1 objects/6.6 KiB.                                      


In [14]:
pd.read_csv('errors_stats-00000-of-00001.csv')

Unnamed: 0,user_name,30_Seconds_to_Mars,65daysofstatic,A_Perfect_Circle,A_Tribe_Called_Quest,ABBA,ACDC,Adele,Aerosmith,Air,...,tag_hair_metal,tag_rapcore,tag_underground_hip_hop,tag_symphonic_black_metal,tag_darkwave,tag_world,tag_latin,tag_spanish,Like_The_Beatles,errors_Like_The_Beatles


In [15]:
!gsutil cp gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00000-of-00002.csv results_1.csv

Copying gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00000-of-00002.csv...
/ [1 files][ 16.7 KiB/ 16.7 KiB]                                                
Operation completed over 1 objects/16.7 KiB.                                     


In [16]:
!gsutil cp gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00001-of-00002.csv results_2.csv

Copying gs://csalling-docai-datasets-regional/beatles/batch_prediction_output/prediction-beatles_automl_file_out_2200_tags-automl-2022_11_29T11_41_37_878Z/prediction.results-00001-of-00002.csv...
/ [1 files][ 10.2 KiB/ 10.2 KiB]                                                
Operation completed over 1 objects/10.2 KiB.                                     


In [17]:
results_1 = pd.read_csv('results_1.csv')
results_2 = pd.read_csv('results_2.csv')

In [18]:
results_1.shape

(8, 516)

In [19]:
results_2.shape

(2, 516)

In [20]:
results_10 = pd.concat([results_1, results_2])

In [21]:
results_10

Unnamed: 0,user_name,30_Seconds_to_Mars,65daysofstatic,A_Perfect_Circle,A_Tribe_Called_Quest,ABBA,ACDC,Adele,Aerosmith,Air,...,tag_rapcore,tag_underground_hip_hop,tag_symphonic_black_metal,tag_darkwave,tag_world,tag_latin,tag_spanish,Like_The_Beatles,Like_The_Beatles_True_scores,Like_The_Beatles_False_scores
0,adherr,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.608192,0.391808
1,Andy_Greenwell,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.06812,0.93188
2,davenall,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.031825,0.968175
3,absentbebnim,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.036338,0.963662
4,lilyean,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.045222,0.954778
5,thegiant,1.0,,,,,,11.0,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.395616,0.604384
6,stalphonzo,,,,,,6.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.550182,0.449818
7,auserzz,,,,,,,25.0,,,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,False,0.056349,0.943651
0,nezter,,,,,,,,,3.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,False,0.53027,0.46973
1,augustohp,,52.0,502.0,,1.0,452.0,1.0,215.0,14.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,True,0.94838,0.05162


In [22]:
results_10.columns

Index(['user_name', '30_Seconds_to_Mars', '65daysofstatic', 'A_Perfect_Circle',
       'A_Tribe_Called_Quest', 'ABBA', 'ACDC', 'Adele', 'Aerosmith', 'Air',
       ...
       'tag_rapcore', 'tag_underground_hip_hop', 'tag_symphonic_black_metal',
       'tag_darkwave', 'tag_world', 'tag_latin', 'tag_spanish',
       'Like_The_Beatles', 'Like_The_Beatles_True_scores',
       'Like_The_Beatles_False_scores'],
      dtype='object', length=516)

In [23]:
results_10.loc[:, 'prediction'] = results_10.Like_The_Beatles_True_scores > results_10.Like_The_Beatles_False_scores

In [24]:
from sklearn.metrics import confusion_matrix

In [25]:
confusion_matrix(results_10.Like_The_Beatles, results_10.prediction)

array([[4, 2],
       [2, 2]])

In [26]:
confusion_matrix(results_10.Like_The_Beatles, results_10.prediction, normalize='true') * 100

array([[66.66666667, 33.33333333],
       [50.        , 50.        ]])

In [27]:
tn, fp, fn, tp = confusion_matrix(results_10.Like_The_Beatles, results_10.prediction).ravel()

In [28]:
print(tn)
print(fp)
print(fn)
print(tp)

4
2
2
2
