# NLP text classification for Hongtao NLP - 2 - training and deployment

Uses the Hugging Face `bert-base-chinese` model, as recommended by Jackie Liu (thanks).

In [2]:
import pandas as pd
import urllib.request
import os

# Split data

In [3]:
from sklearn.model_selection import train_test_split
# Load the labelled examples and give the two columns fixed names.
# Per the sample output below, `label` holds the class code and `category`
# holds the article title text.
dataset = pd.read_csv("dataset.csv")
dataset.columns = ["label", "category"]
print(dataset.head(3))

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
train, test = train_test_split(
    dataset,
    random_state=0,
    test_size=0.25,
)



  label                            category
0   GCI  万物互联 | 看广云物联如何在云上构建无服务器化的“天匠物联网平台”
1   GCI                AIoT 出海部署，雅观智能抢滩“躺赢”
2   AIU             OPPO 小布助手，与你智趣相投 | 客户案例


In [4]:
# Report the (rows, columns) shape of each split.
size_report = "train size {}, test size{}"
print(size_report.format(train.shape, test.shape))

train size (266, 2), test size(89, 2)


In [5]:
# Write both splits to ./data as UTF-8 CSV files without the index column.
# to_csv does not create missing directories, so make sure ./data exists
# first (otherwise a fresh checkout fails with OSError).
os.makedirs('./data', exist_ok=True)
train.to_csv('./data/train.csv', index=False, encoding='utf-8')
test.to_csv('./data/test.csv', index=False, encoding='utf-8')

In [6]:
import boto3
import sagemaker
import os
from sagemaker import get_execution_role

# SageMaker session and execution role used by the rest of the notebook.
sess = sagemaker.Session()
role = get_execution_role()

prefix = 'hongtao-nlp2'
bucket = sess.default_bucket()

# S3 object keys always use '/' as the separator, so build them explicitly
# rather than with os.path.join (which follows the local OS convention).
# Each key is defined once and reused for both the upload and the S3 URI,
# so the two can never drift apart.
train_key = f'{prefix}/train/train.csv'
test_key = f'{prefix}/test/test.csv'

# Upload the local CSV splits to the session's default bucket.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object(train_key).upload_file("./data/train.csv")
s3_bucket.Object(test_key).upload_file("./data/test.csv")

# S3 URIs of the uploaded files; passed to estimator.fit() as input channels.
training_input_path = f's3://{bucket}/{train_key}'
test_input_path = f's3://{bucket}/{test_key}'

#git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.6.1'} # v4.6.1 is referring to the `transformers_version` you use in the estimator.

# Training

In [7]:
# import sagemaker
# from sagemaker.huggingface import HuggingFace

# hyperparameters={'per_device_train_batch_size':4,
#                  'per_device_eval_batch_size': 4,
#                  'model_name_or_path': 'enriqueyanh/bert_cn', #'roberta-large',
#                  'train_file':'/opt/ml/input/data/train/train.csv',
#                  'validation_file':'/opt/ml/input/data/test/test.csv',
#                  'test_file':'/opt/ml/input/data/test/test.csv',
#                  'do_train': True,
#                  'do_predict': True,
#                  'do_eval': True,
#                  'save_total_limit':3,
#                  'num_train_epochs': 3,
#                  'output_dir': '/opt/ml/model',
#                  'num_train_epochs': 10,
#                  'learning_rate': 5e-5,
#                  'seed': 7,
#                  'fp16': False,
#                  'eval_steps': 1000,
#                  }


# # create the Estimator
# huggingface_estimator = HuggingFace(
#       entry_point='run_glue.py', # script
#       source_dir='./', # relative path to example
#       instance_type='ml.p3.2xlarge',
#       instance_count=1,
#       volume_size=50,
#       transformers_version='4.6',
#       pytorch_version='1.7',
#       py_version='py36',
#       role=role,
#       base_job_name='hongtao-nlp2-bert-cn-epoch10',
#       hyperparameters = hyperparameters
# )



In [8]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# IAM role the training job runs under.
role = sagemaker.get_execution_role()

# Hyperparameters handed to the estimator for the run_glue.py entry point.
# Each key appears exactly once: the original literal repeated 'output_dir'
# and 'num_train_epochs' (3, then 10). Python keeps the last value for a
# duplicated key, so 10 epochs was already the effective setting.
hyperparameters = {
    'model_name_or_path': 'bert-base-chinese',
    'output_dir': '/opt/ml/model',
    # The directory names under /opt/ml/input/data/ line up with the
    # 'train' / 'test' channel names passed to fit().
    'train_file': '/opt/ml/input/data/train/train.csv',
    # The held-out split is used for both evaluation and prediction.
    'validation_file': '/opt/ml/input/data/test/test.csv',
    'test_file': '/opt/ml/input/data/test/test.csv',
    'do_train': True,
    'do_predict': True,
    'do_eval': True,
    'save_total_limit': 3,
    'num_train_epochs': 10,
    'learning_rate': 5e-5,
    'seed': 7,
    'fp16': False,
    # NOTE(review): eval_steps presumably only matters when evaluation is
    # scheduled by steps -- confirm against the run_glue.py / Trainer docs.
    'eval_steps': 1000,
    # add your remaining hyperparameters
    # more info here https://github.com/huggingface/transformers/tree/v4.17.0/examples/pytorch/text-classification
}

# Git configuration used to fetch the fine-tuning script; the branch matches
# the transformers_version given to the estimator below.
git_config = {'repo': 'https://github.com/huggingface/transformers.git', 'branch': 'v4.17.0'}

# Hugging Face estimator: runs run_glue.py from the cloned repository on a
# single ml.p3.2xlarge instance.
huggingface_estimator = HuggingFace(
    entry_point='run_glue.py',
    source_dir='./examples/pytorch/text-classification',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    git_config=git_config,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
    hyperparameters=hyperparameters,
)



In [None]:
# Start the training job, giving each uploaded CSV its own named input channel.
input_channels = {
    'train': training_input_path,
    'test': test_input_path,
}
huggingface_estimator.fit(input_channels)

2022-11-09 14:39:52 Starting - Starting the training job...
2022-11-09 14:40:17 Starting - Preparing the instances for trainingProfilerReport-1668004792: InProgress
.........
2022-11-09 14:41:39 Downloading - Downloading input data...
2022-11-09 14:42:16 Training - Downloading the training image....................

In [None]:
# huggingface_estimator.fit({'train':training_input_path,'test':test_input_path})

# Deploy

In [None]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# IAM role with permissions to create an Endpoint.
role = sagemaker.get_execution_role()

# Create the Hugging Face Model class from the trained artifact.
# The serving container is pinned to the same transformers / PyTorch / Python
# versions the estimator trained with (4.17.0 / 1.10.2 / py38). The previous
# 4.6 / 1.7 / py36 pin was older than the training stack, which risks the
# endpoint failing to load the saved model.
huggingface_model = HuggingFaceModel(
   model_data=huggingface_estimator.model_data,  # path to your trained sagemaker model
   role=role,
   transformers_version="4.17.0",  # must match the training job
   pytorch_version="1.10.2",       # must match the training job
   py_version="py38",              # python version of the DLC
)

In [None]:
# Deploy the model behind a real-time endpoint backed by a single
# ml.g4dn.xlarge instance; the returned predictor is used for requests below.
predictor = huggingface_model.deploy(
    instance_type="ml.g4dn.xlarge",
    initial_instance_count=1,
)



In [None]:
%%time
# example request, you always need to define "inputs"
data = {
   "inputs": "我们从今年才开始给文章打标签，如果目前的样本量小"
}

# request
predictor.predict(data)

## Only for an existing endpoint: create a new Predictor


In [None]:
import sagemaker
from sagemaker.predictor import Predictor
#predictor = Predictor()
from sagemaker.serializers import NumpySerializer
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import NumpyDeserializer
from sagemaker.deserializers import JSONDeserializer
#import sagemaker.serializers.
###
#'please change to your endpoint_name'
###
# Name of an endpoint that is already deployed -- replace with your own.
endpoint_name = 'huggingface-pytorch-inference-2022-09-27-08-59-15-528'

# Attach to the existing endpoint, sending and receiving JSON.
json_in = JSONSerializer()
json_out = JSONDeserializer()
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=json_in,
    deserializer=json_out,
)

# Example payload; the text to classify goes under the "inputs" key.
data = dict(inputs="亚马逊云计算AWS入门")

# Send the request; kept as the cell's last expression so the response is displayed.
predictor.predict(data)

