# Text Classification using Amazon BlazingText

**The purpose is to build model classifying the comments from Reddit into 4 categories: Positive, Negative, Neutral, Ambigious

**The dataset is available at https://www.datasetlist.com/

**Data preparation was done on Amazon EMR Notebook using SparkNLP 

In [2]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

from sagemaker.tuner import (
    IntegerParameter, CategoricalParameter, 
    ContinuousParameter, HyperparameterTuner)

sess = sagemaker.Session()

role = get_execution_role()
print(
    role
)  

bucket = "erasolon-ml-output" 
prefix_input = "emr/goemotions"  
prefix_output = "blazingtext/goemotions"  

arn:aws:iam::861737859161:role/service-role/AmazonSageMaker-ExecutionRole-20210405T205521


In [3]:
%%time

train_channel = prefix_input + "/train_output/part-00000-4eecc79f-ac2b-4cf2-96e5-2decb968edd2-c000.txt"
validation_channel = prefix_input + "/validation_output/part-00000-1870e87d-5743-4431-8c6b-91daf98624e0-c000.txt"
test_channel = prefix_input + "/test_output/part-00000-44794e58-707c-48ee-b2a1-a9e485f2e97a-c000.txt"


s3_train_data = "s3://{}/{}".format(bucket, train_channel)
s3_validation_data = "s3://{}/{}".format(bucket, validation_channel)
s3_test_data = "s3://{}/{}".format(bucket, test_channel)

s3_output_location = "s3://{}/{}/output".format(bucket, prefix_output)

CPU times: user 9 µs, sys: 2 µs, total: 11 µs
Wall time: 13.8 µs


## Training

In [4]:
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:1 (us-east-1)


In [5]:
bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,base_job_name="GoEmotions-BlazingText",
    sagemaker_session=sess,
    hyperparameters={
        "mode": "supervised",
        "epochs": 5,
        "min_count": 2,
        "early_stopping": True,
        "patience": 4,
        "min_epochs": 5,
        "word_ngrams": 1,
        
    },
)

In [6]:
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    s3_data_type="S3Prefix",
    input_mode='File',
    content_type='text/plain',
    compression='None',
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    s3_data_type="S3Prefix",
    input_mode='File',
    content_type='text/plain',
    compression='None',
)
test_data = sagemaker.inputs.TrainingInput(
    s3_test_data,
    distribution="FullyReplicated",
    s3_data_type="S3Prefix",
    input_mode='File',
    content_type='text/plain',
    compression='None',
)
data_channels = {"train": train_data, "validation": validation_data, "test": test_data}

In [7]:
%%time

bt_model.fit(inputs=data_channels, logs=True)

2022-01-16 05:17:26 Starting - Starting the training job...
2022-01-16 05:17:52 Starting - Launching requested ML instancesProfilerReport-1642310246: InProgress
......
2022-01-16 05:18:52 Starting - Preparing the instances for training.........
2022-01-16 05:20:17 Downloading - Downloading input data...
2022-01-16 05:20:55 Training - Training image download completed. Training in progress.
2022-01-16 05:20:55 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[01/16/2022 05:20:46 INFO 139813160293760] nvidia-smi took: 0.0755469799041748 secs to identify 0 gpus[0m
[34m[01/16/2022 05:20:46 INFO 139813160293760] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[01/16/2022 05:20:46 INFO 139813160293760] Processing /opt/ml/input/data/train/part-00000-4eecc79f-ac2b-4cf2-96e5-2decb968edd2-c000.txt . File size: 2.665308952331543 MB[0m
[34m[01/16/2022 05:20:46 INFO 139813160293760

## Hyperparameter Tuning

In [8]:
objective_metric_name = 'validation:accuracy'
objective_type = 'Maximize'

In [9]:
%%time

hyperparameter_ranges = {
    "epochs": IntegerParameter(5, 15, scaling_type="Linear"),
    "learning_rate": ContinuousParameter(0.005, 0.01, scaling_type="Auto"),
    "min_count": IntegerParameter(2, 5, scaling_type="Linear"),
    "vector_dim": IntegerParameter(32, 64, scaling_type="Linear"),
    "word_ngrams": IntegerParameter(1, 3, scaling_type="Linear")
}

tuner = HyperparameterTuner(bt_model,
                            objective_metric_name,
                            hyperparameter_ranges,
                            base_tuning_job_name ="GoEmotions-BlazingText",
                            max_jobs=4,
                            max_parallel_jobs=4,
                            strategy="Random",
                            early_stopping_type='Auto',
                            objective_type=objective_type)


data_channels_tune = {"train": train_data, "test": test_data, "validation": validation_data}

tuner.fit(inputs=data_channels_tune, logs=True)

.........................................................!
CPU times: user 337 ms, sys: 24.2 ms, total: 362 ms
Wall time: 4min 48s


In [10]:
boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)["HyperParameterTuningJobStatus"]

'Completed'

In [11]:
# run this cell to check current status of hyperparameter tuning job
tuning_job_result = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)

status = tuning_job_result["HyperParameterTuningJobStatus"]
if status != "Completed":
    print("Reminder: the tuning job has not been completed.")

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("%d training jobs have completed" % job_count)

objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
is_minimize = objective["Type"] != "Maximize"
objective_name = objective["MetricName"]

3 training jobs have completed


In [12]:
from pprint import pprint

if tuning_job_result.get("BestTrainingJob", None):
    print("Best model found so far:")
    pprint(tuning_job_result["BestTrainingJob"])
else:
    print("No training jobs have reported results yet.")

Best model found so far:
{'CreationTime': datetime.datetime(2022, 1, 16, 5, 21, 52, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:accuracy',
                                                 'Value': 0.5776000022888184},
 'ObjectiveStatus': 'Succeeded',
 'TrainingEndTime': datetime.datetime(2022, 1, 16, 5, 25, 57, tzinfo=tzlocal()),
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:861737859161:training-job/goemotions-blazingte-220116-0521-001-1d4d98ca',
 'TrainingJobName': 'GoEmotions-BlazingTe-220116-0521-001-1d4d98ca',
 'TrainingJobStatus': 'Completed',
 'TrainingStartTime': datetime.datetime(2022, 1, 16, 5, 24, 42, tzinfo=tzlocal()),
 'TunedHyperParameters': {'epochs': '12',
                          'learning_rate': '0.008744285399472224',
                          'min_count': '3',
                          'vector_dim': '34',
                          'word_ngrams': '2'}}


## Deploy the best trained model

In [13]:
from sagemaker.serializers import JSONSerializer

text_classifier = tuner.deploy(
    initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=JSONSerializer()
)


2022-01-16 05:25:57 Starting - Preparing the instances for training
2022-01-16 05:25:57 Downloading - Downloading input data
2022-01-16 05:25:57 Training - Training image download completed. Training in progress.
2022-01-16 05:25:57 Uploading - Uploading generated training model
2022-01-16 05:25:57 Completed - Training job completed
--------!

## Running Classification

In [14]:
sentences = [
    "love it",
    "hate it",
    "dont know",
]

payload = {"instances": sentences}

response = text_classifier.predict(payload)

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "label": [
      "__label__positive"
    ],
    "prob": [
      0.9981802701950073
    ]
  },
  {
    "label": [
      "__label__negative"
    ],
    "prob": [
      0.7251734733581543
    ]
  },
  {
    "label": [
      "__label__negative"
    ],
    "prob": [
      0.4644191563129425
    ]
  }
]


## Clean Up

In [15]:
text_classifier.delete_endpoint()