In [2]:
import json

import nltk
import numpy as np
import pandas as pd
import sagemaker

# Create the session first: the region and default bucket are both read from it.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# Execution role for this notebook, plus a path-like prefix for this module/part
# (presumably an S3 key prefix -- TODO confirm against where it is used).
role = sagemaker.get_execution_role()
prefix = 'module_5/part_3'

- Usaremos el estimador que entrenamos anteriormente en el notebook [module_4/4_05.ipynb](../module_4/4_05.ipynb).
- Podemos usar el método de clase `attach` de `Estimator` con el `training_job_name` usado anteriormente para no tener que realizar el entrenamiento otra vez.

In [4]:
# Re-attach to the existing training job by name instead of training again.
estimator = sagemaker.estimator.Estimator.attach(
    training_job_name='dbpedia-blazingtext',
)


2022-12-13 21:52:19 Starting - Preparing the instances for training
2022-12-13 21:52:19 Downloading - Downloading input data
2022-12-13 21:52:19 Training - Training image download completed. Training in progress.
2022-12-13 21:52:19 Uploading - Uploading generated training model
2022-12-13 21:52:19 Completed - Training job completed


In [6]:
# Deploy the attached estimator behind an endpoint; request payloads are
# serialized as JSON before being sent.
text_classifier = estimator.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
    serializer=sagemaker.serializers.JSONSerializer(),
)

------!

- Tenemos que realizar el mismo procesamiento que en el proceso de entrenamiento.

In [7]:
# `nltk` is imported with the other dependencies in the notebook's first cell.
# Download the 'punkt' package so it is available to the tokenization cell below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Raw example sentences to classify.
sentences = [
    "Convair was an american aircraft manufacturing company which later expanded into rockets and spacecraft.",
    "Berwick secondary college is situated in the outer melbourne metropolitan suburb of berwick .",
]

# Tokenize with the same NLTK tokenizer used during data preparation for
# training, then re-join each token list with single spaces.
tokenized_sentences = [
    " ".join(tokens) for tokens in map(nltk.word_tokenize, sentences)
]
tokenized_sentences

['Convair was an american aircraft manufacturing company which later expanded into rockets and spacecraft .',
 'Berwick secondary college is situated in the outer melbourne metropolitan suburb of berwick .']

- BlazingText admite application/json como content-type para la inferencia. 
- El contenido tiene que incluir la lista de frases con la clave *instances*.

In [18]:
# The request body carries the tokenized sentences under the "instances" key.
payload = {
    "instances": tokenized_sentences,
}
response = text_classifier.predict(payload)
response

b'[{"label": ["__label__Artist"], "prob": [0.7752249240875244]}]'

In [19]:
# Decode the raw JSON response returned by the endpoint and pretty-print it.
# `json` is imported in the notebook's first cell, so every cell that uses it
# works regardless of whether this cell was executed first.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "label": [
      "__label__Artist"
    ],
    "prob": [
      0.7752249240875244
    ]
  }
]


- Por defecto solo nos da la clase con mayor probabilidad, pero podemos modificarlo con el parámetro `k` de `configuration`.

In [16]:
# Same request as before, plus a "configuration" entry asking for k=2
# labels per sentence.
payload = {
    "instances": tokenized_sentences,
    "configuration": {"k": 2},
}
response = text_classifier.predict(payload)

# Decode the JSON response and pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "label": [
      "__label__Company",
      "__label__MeanOfTransportation"
    ],
    "prob": [
      0.996828556060791,
      0.003055884735658765
    ]
  },
  {
    "label": [
      "__label__EducationalInstitution",
      "__label__Company"
    ],
    "prob": [
      0.9986417889595032,
      0.000642663799226284
    ]
  }
]


In [21]:
# Clean up: delete the endpoint behind `text_classifier` (created by
# `estimator.deploy(...)` above) now that inference is finished.
text_classifier.delete_endpoint()