In [None]:
## Run this in terminal

# source activate pytorch_p38
# /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip
# pip install "transformers==4.4.2" "datasets[s3]==1.5.0"
# pip install sagemaker --upgrade

In [None]:
import numpy as np
import pandas as pd
import re
import sagemaker
import string

from datasets import Dataset
from transformers import AutoTokenizer

## 1. SageMaker set up

In [None]:
# Create a sagemaker session
sess = sagemaker.Session()

# The SageMaker session bucket is used for uploading data, models and logs.
# Set this to your own bucket name (a string) to use a specific bucket, or
# leave it as None to fall back to the session's default bucket below.
sagemaker_session_bucket = None
# SageMaker will automatically create this bucket if it doesn't exist
if sagemaker_session_bucket is None:
    # Set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Get sagemaker execution role
role = sagemaker.get_execution_role()
# Re-create the session so that the chosen bucket becomes its default bucket
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"Bucket: {sess.default_bucket()}")
print(f"Region: {sess.boto_region_name}")

## 2. Data

### 2.1 Enrich data with offensive tweets
The sentiment data is enriched with offensive-language tweets from Twitter. The data is available in the Hate Speech and Offensive Language repository [here](https://github.com/t-davidson/hate-speech-and-offensive-language/tree/master/data).

In [None]:
def clean_text(text):
    """Lowercase a string and strip out noisy fragments.

    After lowercasing, the following are removed, in this order:

    1. anything enclosed in square brackets,
    2. URLs starting with ``http://``, ``https://`` or ``www.``,
    3. anything enclosed in angle brackets,
    4. every ASCII punctuation character (``string.punctuation``),
    5. newline characters,
    6. every word that contains at least one digit.

    Removed fragments are replaced by the empty string, so surrounding
    whitespace is kept as-is (runs of spaces are not collapsed).

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw strings keep regex escapes such as \S and \w from being read as
    # (invalid) Python string escapes.
    removal_patterns = (
        r'\[.*?\]',
        r'https?://\S+|www\.\S+',
        r'<.*?>+',
        '[%s]' % re.escape(string.punctuation),
        r'\n',
        r'\w*\d\w*',
    )

    text = text.lower()
    # Order matters: URLs and bracketed spans must go before the punctuation
    # pass strips the characters that delimit them.
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

In [None]:
# Load the offensive data sample from the data folder
data = pd.read_csv('./data/offensive_data.csv', usecols=['class', 'tweet'])

# Get offensive tweets only (rows whose class is 1). The explicit copy makes
# `offensive_text` an independent frame, so the column assignments below do
# not write into a slice of `data` (avoids SettingWithCopyWarning).
offensive_text = data[data['class'] == 1].copy()
# Relabel the kept tweets as 0 before they are merged into the SST2 data
offensive_text['class'] = 0

# Clean the tweet data. First drop/replace literal substrings; regex=False
# makes the literal matching explicit instead of relying on the pandas default.
tweets = offensive_text['tweet']
literal_replacements = (
    ('!', ''),
    ('"', ''),
    ("'", ""),
    ('  ', ' '),
    ('RT', ''),
)
for old, new in literal_replacements:
    tweets = tweets.str.replace(old, new, regex=False)

# Then drop HTML character references (&#...), @handles, #hashtags and links.
# "&#" must be handled before the bare "#" pattern.
for pattern in (r'&#[^\s]+', r'@[^\s]+', r'#[^\s]+', r'http\S+'):
    tweets = tweets.str.replace(pattern, '', regex=True)

# Apply the generic text clean-up (lowercasing, punctuation removal, etc.)
offensive_text['tweet'] = tweets.map(clean_text)
# Rename by column name so the result does not depend on the column order
offensive_text = offensive_text.rename(columns={'class': 'label', 'tweet': 'text'})

# Split the data to train and test (roughly 80/20), then add to the SST2 dataset.
# A seeded generator makes the split reproducible across kernel restarts.
SPLIT_SEED = 0
rng = np.random.default_rng(SPLIT_SEED)
msk = rng.random(len(offensive_text)) < 0.8
enrich_train = offensive_text[msk]
enrich_test = offensive_text[~msk]

### 2.2 Download The Stanford Sentiment Treebank dataset

In [None]:
# Download the SST2 data from s3
# Each line shells out to curl and redirects the response body into a file
# under ./data (the directory must already exist for the redirect to work).
# Three split files are fetched: test, train and val.
!curl https://sagemaker-sample-files.s3.amazonaws.com/datasets/text/SST2/sst2.test > ./data/sst2.test
!curl https://sagemaker-sample-files.s3.amazonaws.com/datasets/text/SST2/sst2.train > ./data/sst2.train
!curl https://sagemaker-sample-files.s3.amazonaws.com/datasets/text/SST2/sst2.val > ./data/sst2.val

## Tokenize sentences

In [None]:
# Tokenizer used in pre-processing
tokenizer_name = "distilbert-base-uncased"

# S3 key prefix for the data
s3_prefix = "dataset/sst"

# Fixed sequence length, in tokens, that every text is padded/truncated to.
# The original note for this value: median text length is 54.
MAX_SEQ_LENGTH = 54

# Download tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch, max_length=MAX_SEQ_LENGTH):
    """Tokenize the ``"text"`` entries of a batch to a fixed length.

    Args:
        batch: Mapping with a ``"text"`` key holding the texts to tokenize.
        max_length: Target sequence length passed to the tokenizer. Inputs
            are padded to this length and truncated beyond it.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(batch["text"], max_length=max_length, padding="max_length", truncation=True)

In [None]:
def _load_sst2_split(path):
    """Read one SST2 split file into a frame with ``label`` and ``text`` columns.

    Every line of the file is split once, at its first space: the part before
    the space becomes the numeric ``label`` and the remainder becomes ``text``.

    Args:
        path: Location of the split file.

    Returns:
        A DataFrame with an integer-downcast ``label`` column and a ``text`` column.
    """
    # "delimiter" is a separator string that is not expected to occur in the
    # data, so each whole line is read into the single "line" column.
    raw = pd.read_csv(path, sep="delimiter", header=None, engine="python", names=["line"])

    # `n` is passed by keyword: newer pandas releases reject it positionally.
    frame = raw["line"].str.split(" ", n=1, expand=True)
    frame.columns = ["label", "text"]
    frame["label"] = pd.to_numeric(frame["label"], downcast="integer")
    return frame


# Load dataset
test_df = _load_sst2_split("./data/sst2.test")
train_df = _load_sst2_split("./data/sst2.train")

In [None]:
# Seed for the row shuffles below, so the row order is reproducible
SHUFFLE_SEED = 0

# Add offensive tweets to the SST2 dataset.
# Note: this rebinds train_df/test_df, so re-running this cell without
# reloading the SST2 frames first would append the tweets a second time.
train_df = pd.concat([train_df, enrich_train], axis=0)
test_df = pd.concat([test_df, enrich_test], axis=0)

# Reshuffle the datasets and rebuild a clean 0..n-1 index
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Convert pandas dataframe to dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
def _tokenize_for_torch(dataset):
    """Tokenize a dataset, rename ``label`` to ``labels`` and expose torch columns."""
    tokenized = dataset.map(tokenize, batched=True)
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return tokenized


# Tokenize both splits and set their format for pytorch
train_dataset = _tokenize_for_torch(train_dataset)
test_dataset = _tokenize_for_torch(test_dataset)

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# S3 destinations for the two splits, under the session's default bucket
training_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/train/"
test_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/test/"

# Save train_dataset first, then test_dataset, to their S3 locations
uploads = (
    (train_dataset, training_input_path),
    (test_dataset, test_input_path),
)
for split_dataset, destination in uploads:
    split_dataset.save_to_disk(destination, fs=s3)

## 3. Fine-tune the model and start a SageMaker training job

In [None]:
from sagemaker.huggingface import HuggingFace

# Settings handed to the training job as hyperparameters
hyperparameters = dict(
    epochs=3,
    train_batch_size=8,
    seed=0,
    model_name="distilbert-base-uncased",
)

# Name prefix of the training job
base_job_name = "huggingface-sentiment-project"
# S3 location that receives the training output
output_path = f"s3://{sess.default_bucket()}/output/"

In [None]:
# Gather every estimator setting in one mapping, then build the estimator from it
estimator_settings = {
    "entry_point": "train.py",  # the training script name
    "source_dir": "./scripts",  # the training script location
    "base_job_name": base_job_name,  # training job name
    "output_path": output_path,  # output directory
    "instance_type": "ml.p3.2xlarge",  # training instance
    "instance_count": 1,  # number of training instances
    "volume_size": 100,  # disk size of the training instance for data and model files
    "role": role,  # sagemaker role
    "pytorch_version": "1.9",  # version of the pytorch library
    "py_version": "py38",  # version of python
    "transformers_version": "4.12",  # version of the transformers library
    "hyperparameters": hyperparameters,  # hyperparameters defined in the previous step
}
estimator = HuggingFace(**estimator_settings)

In [None]:
# Map each input channel to the S3 location of its uploaded dataset,
# then start the training job with those channels
input_channels = {
    "train": training_input_path,
    "test": test_input_path,
}
estimator.fit(input_channels)

In [None]:
# Deploy the trained estimator to an endpoint backed by a single ml.t2.medium instance
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
)

In [None]:
# Wrap a sample sentence under the "inputs" key and send it to the endpoint;
# the prediction is the cell's displayed output
sample_text = "This is bad experience and customer service was rude"
payload = {"inputs": sample_text}
predictor.predict(payload)

In [None]:
import boto3
import json

runtime_client = boto3.client('sagemaker-runtime')
content_type = "application/json"

# Serialize the request into its own variable. `payload` is left untouched,
# so re-running this cell does not JSON-encode an already encoded string.
request_body = json.dumps(payload)

# Name of the endpoint created by the deploy step of this notebook.
# Replace with a literal endpoint name (a string) to call a different endpoint.
endpoint_name = predictor.endpoint_name

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=request_body)

# The response body is a stream holding JSON text
result = json.loads(response['Body'].read().decode())

# 'LABEL_1' on the first prediction is reported as Positive, anything else as Negative
if result[0]['label'] == 'LABEL_1':
    output = {"outcome": "Positive"}
else:
    output = {"outcome": "Negative"}

print(request_body)
print(result)
print(output)

In [None]:
# delete endpoint
# Clean-up for the resources behind `predictor`: the model is deleted first,
# then the endpoint itself.
predictor.delete_model()
predictor.delete_endpoint()

### Deploy the model using `model_data`

In [None]:
# S3 URI of the trained model archive; replace <<S3 bucket name>> with the
# bucket that holds it.
# NOTE(review): presumably the training job writes model.tar.gz under a
# job-specific sub-folder of the output path — confirm the exact key in S3.
model_data = 's3://<<S3 bucket name>>/output/model.tar.gz'

In [None]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker 

# IAM role with permissions to create an Endpoint
role = sagemaker.get_execution_role()

# Settings for the Hugging Face Model class, gathered in one mapping
model_settings = {
    "model_data": model_data,  # path to your trained sagemaker model
    "role": role,  # iam role with permissions to create an Endpoint
    "transformers_version": "4.12",  # transformers version used
    "pytorch_version": "1.9",  # pytorch version used
    "py_version": "py38",  # python version of the DLC
}
huggingface_model = HuggingFaceModel(**model_settings)

In [None]:
# Endpoint sizing for SageMaker Inference: one ml.t2.medium instance
deployment_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.t2.medium",
}
predictor = huggingface_model.deploy(**deployment_settings)

In [None]:
# Sample request; the text to classify always goes under the "inputs" key
data = {"inputs": "New Avatar movie is too long and I felt so bored watching it"}

# Send the request and read the label of the first returned prediction
result = predictor.predict(data)
predicted_label = result[0]['label']

# 'LABEL_1' is reported as Positive; any other label is reported as Negative
output = {"outcome": "Positive" if predicted_label == 'LABEL_1' else "Negative"}
print(output)

In [None]:
# delete endpoint
# Clean-up for the resources behind `predictor`: the model is deleted first,
# then the endpoint itself.
predictor.delete_model()
predictor.delete_endpoint()