In [47]:
#Import libraries
import warnings
warnings.filterwarnings('ignore')
import boto3
import sagemaker
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import botocore
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
!pip install --disable-pip-version-check -q sagemaker==2.35.0
!pip install --disable-pip-version-check -q nltk==3.5

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [49]:
# Explicit SageMaker API client and an SDK session built on top of it
sm = boto3.client('sagemaker')
sess = sagemaker.Session(sagemaker_client=sm)

# Execution role of this notebook, the session's AWS region, and the S3 bucket for data
role = get_execution_role()
region = sess.boto_region_name
bucket = 'daria-hlibova-test'

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20210831T104726 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


### 1. Prepare Dataset

#### 1.1 Load dataset

In [50]:
!aws s3 cp 's3://daria-hlibova-test/SMSSpamCollection.txt' ./
path = './SMSSpamCollection.txt'
df = pd.read_csv(path, sep = '\t', header = None)
df.columns = ['label', 'text']
df.head(2)

download: s3://daria-hlibova-test/SMSSpamCollection.txt to ./SMSSpamCollection.txt


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


#### 1.2 Transform dataset

In [51]:
# Map the string labels to integer class ids (fit and encode as two explicit steps)
le = preprocessing.LabelEncoder()
le.fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

In [52]:
# Tokenize the text, lowercase it, and drop tokens that are not purely alphanumeric
def join_text_label(x):
    """Return the cleaned, space-joined text of one row.

    The row's 'text' value is split with nltk.word_tokenize; tokens that are
    not purely alphanumeric (e.g. punctuation) are discarded and the rest are
    lowercased and joined with single spaces.

    Args:
        x: mapping-like row exposing a 'text' entry.

    Returns:
        The cleaned text as a single string.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(x['text']):
        if token.isalnum():
            kept_tokens.append(token.lower())
    return " ".join(kept_tokens)

In [53]:
# Function for preparing the label in the `__label__<class>` form used for BlazingText
def prepare_label(x):
    """Return the row's 'label_encoded' value prefixed with '__label__'.

    Args:
        x: mapping-like row exposing a 'label_encoded' entry.

    Returns:
        A string of the form '__label__<label_encoded>'.
    """
    encoded = x['label_encoded']
    return "__label__{}".format(encoded)

In [54]:
# Clean every message (tokenize, lowercase, drop non-alphanumeric tokens)
df['prepared_text'] = df.apply(join_text_label, axis=1)

# Rebuild the integer encoding from the raw labels before adding the prefix, so that
# re-running this cell cannot stack prefixes ("__label____label__0").
df['label_encoded'] = le.transform(df['label'])
df['label_encoded'] = df.apply(prepare_label, axis=1)

In [55]:
# Keep only the two columns written to the training files: label first, then the text
blazingtext_columns = ['label_encoded', 'prepared_text']
df_blazingtext = df[blazingtext_columns].reset_index(drop=True)

In [56]:
# Preview the first five prepared rows (head() defaults to 5)
df_blazingtext.head()

Unnamed: 0,label_encoded,prepared_text
0,__label__0,go until jurong point crazy available only in ...
1,__label__0,ok lar joking wif u oni
2,__label__1,free entry in 2 a wkly comp to win fa cup fina...
3,__label__0,u dun say so early hor u c already then say
4,__label__0,nah i do think he goes to usf he lives around ...


#### 1.3 Split Dataset

In [57]:
# Split dataset into train and test sets.
# stratify keeps the label proportions equal in both parts; the fixed random_state makes
# the split (and everything trained on it) reproducible across kernel restarts.
df_train, df_vtest = train_test_split(
    df_blazingtext,
    test_size=0.2,
    stratify=df_blazingtext['label_encoded'],
    random_state=42,
)

#### 1.4 Upload Dataset to S3 bucket

In [58]:
def _write_blazingtext_file(frame, path):
    """Write one "<label> <text>" line per row of `frame` to `path` (UTF-8).

    DataFrame.to_csv(sep=' ') is not used here: the CSV writer wraps every field that
    contains the separator in double quotes, so each multi-word message would be written
    as `__label__0 "go until ..."` and the quotes would stick to the first/last tokens.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for label, text in zip(frame['label_encoded'], frame['prepared_text']):
            out.write(f'{label} {text}\n')


blazingtext_train_path = './train.csv'
_write_blazingtext_file(df_train, blazingtext_train_path)

blazingtext_test_path = './val.csv'
_write_blazingtext_file(df_vtest, blazingtext_test_path)

# Upload both files under the same prefix of the project bucket defined at the top
train_s3_uri = sess.upload_data(bucket=bucket, key_prefix='blazingtext/data', path=blazingtext_train_path)
test_s3_uri = sess.upload_data(bucket=bucket, key_prefix='blazingtext/data', path=blazingtext_test_path)

### 2. Train model

In [59]:
# Container image to use for training with the BlazingText algorithm
# (looked up for the region of the current session).
image_uri = sagemaker.image_uris.retrieve(framework='blazingtext', region=region)

In [60]:
# Creating an estimator instance: which image to run, on what hardware, and with which role
estimator = sagemaker.estimator.Estimator(
    sagemaker_session=sess,
    role=role,
    image_uri=image_uri,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    volume_size=30,   # size of the attached training volume, in GB
    max_run=7200,     # training timeout, in seconds
)

In [61]:
# Setting hyperparameters
blazingtext_hyperparameters = {
    'mode': 'supervised',     # supervised (text classification)
    'epochs': 10,             # number of complete passes through the dataset
    'learning_rate': 0.01,    # step size for the numerical optimizer
    'min_count': 5,           # discard words that appear less than this number
    'vector_dim': 300,        # number of dimensions in vector space
    'word_ngrams': 2,         # number of words in a word n-gram
}
estimator.set_hyperparameters(**blazingtext_hyperparameters)

In [62]:
# Creating train data channel: plain-text input read from the uploaded training file
train_data = sagemaker.inputs.TrainingInput(
    s3_data=train_s3_uri,
    s3_data_type='S3Prefix',
    content_type='text/plain',
    distribution='FullyReplicated',
)

In [63]:
# Creating test data channel: same settings as the train channel, pointing at the held-out file
test_data = sagemaker.inputs.TrainingInput(
    s3_data=test_s3_uri,
    s3_data_type='S3Prefix',
    content_type='text/plain',
    distribution='FullyReplicated',
)

In [64]:
# Channel name -> input; the held-out split is passed as the 'validation' channel
data_channels = dict(train=train_data, validation=test_data)

In [65]:
# Fitting the model: start the training job and return immediately (wait=False);
# a later cell blocks on the job's completion.
estimator.fit(inputs=data_channels, logs=True, wait=False)

# Remember the job name for the console / CloudWatch links below
training_job_name = estimator.latest_training_job.name
print(f'Training Job Name:  {training_job_name}')

Training Job Name:  blazingtext-2022-01-15-10-57-02-937


In [66]:
# IPython.display is the public module; importing `display` from IPython.core.display is deprecated
from IPython.display import display, HTML

# Link to the training job in the SageMaker console; target="_blank" opens it in a new tab
training_job_url = 'https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}'.format(region, training_job_name)
display(HTML('<b>Review <a target="_blank" href="{}">Training job</a></b>'.format(training_job_url)))

In [67]:
# IPython.display is the public module; importing `display` from IPython.core.display is deprecated
from IPython.display import display, HTML

# Link to the job's CloudWatch log streams; target="_blank" opens it in a new tab
cloudwatch_url = 'https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix'.format(region, training_job_name)
display(HTML('<b>Review <a target="_blank" href="{}">CloudWatch logs</a> (after about 5 minutes)</b>'.format(cloudwatch_url)))

In [None]:
%%time

estimator.latest_training_job.wait(logs=False)


2022-01-15 10:57:03 Starting - Starting the training job
2022-01-15 10:57:05 Starting - Launching requested ML instances........
2022-01-15 10:57:51 Starting - Preparing the instances for training...................
2022-01-15 10:59:31 Downloading - Downloading input data...
2022-01-15 10:59:51 Training - Downloading the training image...
2022-01-15 11:00:12 Training - Training image download completed. Training in progress....
2022-01-15 11:00:33 Uploading - Uploading generated training model...

### 3. Deploy the model

In [None]:
#Creating endpoint

%%time

text_classifier = estimator.deploy(initial_instance_count=1,
                                   instance_type='ml.m5.large',
                                   serializer=sagemaker.serializers.JSONSerializer(),
                                   deserializer=sagemaker.deserializers.JSONDeserializer())

print()
print('Endpoint name:  {}'.format(text_classifier.endpoint_name))

In [None]:
# IPython.display is the public module; importing `display` from IPython.core.display is deprecated
from IPython.display import display, HTML

# Link to the deployed endpoint in the SageMaker console; target="_blank" opens it in a new tab
endpoint_url = 'https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}'.format(region, text_classifier.endpoint_name)
display(HTML('<b>Review <a target="_blank" href="{}">SageMaker REST Endpoint</a></b>'.format(endpoint_url)))


### 4. Test the model

In [None]:
# Sample messages to classify with the deployed endpoint
reviews = [
    'i will be later',
    'call me',
]

In [None]:
# Apply exactly the preprocessing used for the training data (tokenize, lowercase, drop
# non-alphanumeric tokens); tokenizing alone would send the model casing and punctuation
# it never saw during training.
tokenized_reviews = [join_text_label({'text': review}) for review in reviews]

# Request body sent to the endpoint: the cleaned texts under the "instances" key
payload = {"instances" : tokenized_reviews}
print(payload)

In [None]:
import json

LABEL_PREFIX = '__label__'

# Send the payload to the endpoint and print the predicted class of every message
predictions = text_classifier.predict(data=payload)
for prediction in predictions:
    predicted_label = prediction['label'][0]
    # Remove the exact prefix. str.lstrip('__label__') would instead strip any leading run
    # of the characters '_', 'l', 'a', 'b', 'e' and could eat into the class name itself.
    if predicted_label.startswith(LABEL_PREFIX):
        predicted_label = predicted_label[len(LABEL_PREFIX):]
    print('Predicted class: {}'.format(predicted_label))