In [55]:
import sagemaker as sg
import pandas as pd

In [51]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark import SQLContext

## Copy test data locally

In [54]:
! aws s3 cp s3://623571303006-dev-model-training/output/test/part-00000-422a5ce9-e7d3-4f4e-9360-0fe04ac664c3-c000.csv ./test.csv

download: s3://623571303006-dev-model-training/output/test/part-00000-422a5ce9-e7d3-4f4e-9360-0fe04ac664c3-c000.csv to ./test.csv


In [61]:
! aws s3 cp s3://623571303006-dev-model-training/raw_data/classes.txt .

download: s3://623571303006-dev-model-training/raw_data/classes.txt to ./classes.txt


In [116]:
# Class names are read one per line; the 1-based row position becomes the numeric label.
classes = pd.read_csv("classes.txt", names=["class"], header=None).assign(
    label=lambda frame: frame.index + 1
)

# Attach the readable class name to every test row via its numeric label,
# then drop the helper join column so only `true_label` remains.
test_data = (
    pd.read_csv("test.csv", names=["true_label", "text"], header=None)
    .merge(classes, how="left", left_on="true_label", right_on="label")
    .drop(columns="label")
)


In [117]:
# Preview a handful of randomly chosen rows. The fixed seed keeps the displayed
# sample identical across "Restart Kernel -> Run All" executions.
test_data.sample(frac=0.2, random_state=42).head()

Unnamed: 0,true_label,text,class
106616,2,wartburg theological seminary wartburg theolog...,EducationalInstitution
470588,3,tommy johnagin tommy johnagin born march 25 19...,Artist
377137,12,transition the first edition album transition ...,Album
334334,7,clara barton school clara barton school is a h...,Building
66583,12,focus 3 focus 3 is the third studio album from...,Album


## Run Model Predictor

In [118]:
# Client for the "dev-bztext-dbpedia" endpoint; request bodies are serialized to JSON
# and responses are parsed from JSON.
endpoint = sg.predictor.Predictor(
    "dev-bztext-dbpedia",
    serializer=sg.serializers.JSONSerializer(),
    deserializer=sg.deserializers.JSONDeserializer(),
)

In [120]:
# Score the text of every row whose index is below 1000 in a single request.
resp = endpoint.predict(
    {"instances": test_data.loc[test_data.index < 1000, "text"].tolist()}
)

In [121]:
# Keep the first label/probability of each response entry. The "__label__" prefix is
# stripped and the remainder is cast to int so that `label` has the same type as the
# numeric `true_label` values read from test.csv; left as a string, a comparison such as
# `true_label == label` would never match.
resp_cleaned = [
    {"label": int(r["label"][0].replace("__label__", "")), "prob": r["prob"][0]}
    for r in resp
]

In [126]:
# Line up each scored row (index < 1000) with its prediction by joining on the row index.
results = (
    test_data.loc[test_data.index < 1000]
    .merge(pd.DataFrame(resp_cleaned), left_index=True, right_index=True)
)

In [128]:
# Spot-check a few random predictions. The fixed seed keeps the displayed sample
# identical across "Restart Kernel -> Run All" executions.
results.sample(frac=0.1, random_state=42).head()

Unnamed: 0,true_label,text,class,label,prob
802,4,daniel da silva born 1973 daniel da silva born...,Athlete,4,0.969495
455,3,kat edmonson katherine kat edmonson born augus...,Artist,3,0.957409
584,3,john silvester writer john silvester is an aus...,Artist,3,0.912039
766,4,ron east ronald allan east born august 26 1943...,Athlete,4,0.697741
692,4,anton n kinsk anton n kinsk born 31 may 1975 i...,Athlete,4,0.999292


## Batch Transform Job

In [136]:
# Batch transform job that scores the validation split with the "dev-bztext-dbpedia" model
# and writes the predictions under preds/validate.
test_transformer = sg.transformer.Transformer(
    model_name="dev-bztext-dbpedia",
    instance_count=2,
    instance_type="ml.m4.xlarge",
    output_path="s3://623571303006-dev-model-training/preds/validate",
    strategy="MultiRecord",
)

# A run that declared content_type="text/csv" failed with `ClientError`. The BlazingText
# documentation specifies JSON Lines for batch transform input, one object per line of the
# form {"source": "<text>"}, so the payload is declared as application/jsonlines.
# TODO(review): confirm the objects under output/validate are JSON Lines; if they use the
# same label,text CSV layout as test.csv they must be converted before this job can succeed.
test_transformer.transform(
    "s3://623571303006-dev-model-training/output/validate",
    split_type="Line",
    content_type="application/jsonlines",
)
test_transformer.wait()

..............................
[34mArguments: serve[0m
[34m[03/15/2022 02:32:52 INFO 140114531981120] Finding and loading model[0m
[34m[03/15/2022 02:32:52 INFO 140114531981120] Trying to load model from /opt/ml/model/model.bin[0m
[34m[03/15/2022 02:32:52 INFO 140114531981120] Number of server workers: 4[0m
[34m[2022-03-15 02:32:52 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2022-03-15 02:32:52 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-03-15 02:32:52 +0000] [1] [INFO] Using worker: sync[0m
[34m[2022-03-15 02:32:52 +0000] [34] [INFO] Booting worker with pid: 34[0m
[34m[2022-03-15 02:32:52 +0000] [35] [INFO] Booting worker with pid: 35[0m
[34m[2022-03-15 02:32:52 +0000] [36] [INFO] Booting worker with pid: 36[0m
[34m[2022-03-15 02:32:52 +0000] [37] [INFO] Booting worker with pid: 37[0m
[35mArguments: serve[0m
[35m[03/15/2022 02:32:57 INFO 140002674067264] Finding and loading model[0m
[35m[03/15/2022 02:32:57 INFO 14000267406726

UnexpectedStatusException: Error for Transform job blazingtext-2022-03-15-02-28-05-918: Failed. Reason: ClientError: See job logs for more information

In [None]:
! aws s3 cp s3://623571303006-dev-model-training/preds/validate .