## Load Data

In [10]:
import pandas as pd
import numpy as np

training_data_path = 'ABC Training Data-Grid view.csv'
labels_data_path = 'Antecedents- labels.csv'

training_data = pd.read_csv(training_data_path)
labels_data = pd.read_csv(labels_data_path)

training_data.head()


Unnamed: 0,Antecedents,Labels
0,i asked my husband to please put away the laun...,They were given directions or a task to comple...
1,told aiden to wash his hands,They were given directions or a task to comple...
2,It was time to clean up their toys,They were given directions or a task to comple...
3,jack was stomping his feet and i asked him to ...,They were given directions or a task to comple...
4,she had to write a sentence about her day. wri...,They were given directions or a task to comple...


In [11]:
labels_data.head()

Unnamed: 0,Name,Type
0,They were given directions or a task to complete,Antecedent
1,They were in the middle of a long task or assi...,Antecedent
2,"Given a difficult, unclear, or challenging tas...",Antecedent
3,They were in the middle of something they enjo...,Antecedent
4,Someone corrected or helped them,Antecedent


In [13]:
len(training_data)

172

## Data Augmentation

In [16]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import wordnet
from nltk import pos_tag
from collections import defaultdict
import string

def get_wordnet_pos(treebank_tag):
    """Convert the part-of-speech naming scheme
       from the nltk default to that which is recognized by the WordNet API"""
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }.get(treebank_tag[0], wordnet.NOUN)  # Default to noun if part-of-speech is not found

def synonym_replacement(sentence, num_replacements=1):
    # Tokenize and POS tag the words in the sentence
    words = word_tokenize(sentence)
    pos_tags = pos_tag(words)

    # Get synonyms for each word, considering its part of speech
    synonyms = defaultdict(list)
    for word, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)  # Convert to WordNet POS notation
        for syn in wordnet.synsets(word, pos=wordnet_pos):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ').replace('-', ' ')
                if synonym != word:
                    synonyms[word].append(synonym)

    # Select random words to replace
    words_to_replace = random.sample(list(synonyms.keys()), min(num_replacements, len(synonyms)))

    # Perform replacements
    new_sentence = sentence
    for word in words_to_replace:
        syn_list = synonyms[word]
        if syn_list:
            # Choose a random synonym for the word
            synonym = random.choice(syn_list)
            new_sentence = new_sentence.replace(word, synonym, 1)

    return new_sentence

# Test the function
original_text = "i asked my husband to please put away the laundry and he did what he always does"
augmented_text = synonym_replacement(original_text, num_replacements=5)
print("Original:", original_text)
print("Augmented:", augmented_text)

Original: i asked my husband to please put away the laundry and he did what he always does
Augmented: ace asked my husband to please set away the washables and he come what he forever does


In [43]:
def augment_sentences(dataframe, augment_factor=5):
    augmented_rows = []
    for _, row in dataframe.iterrows():
        text, label = row['Antecedents'], row['Labels']
        unique_augmented_texts = set()
        while len(unique_augmented_texts) < augment_factor:
            augmented_text = synonym_replacement(text, num_replacements=3)
            unique_augmented_texts.add(augmented_text)
        for aug_text in unique_augmented_texts:
            augmented_rows.append([aug_text, label])
    return augmented_rows



augmented_data = augment_sentences(training_data, augment_factor=5)

augmented_df = pd.DataFrame(augmented_data, columns=['Antecedents', 'Labels'])

data = pd.concat([training_data[['Antecedents', 'Labels']], augmented_df])

data = combined_dataset.reset_index(drop=True)


len(data)

1032

## Text Preprocessing

In [40]:
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt') 

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
def clean_and_tokenize(text):
    # Convert to lower case
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Join tokens back into string
    return ' '.join(tokens)

data['antecedents_clean'] = data['Antecedents'].apply(clean_and_tokenize)
data['antecedents_clean'].head()


0    i asked my husband to please put away the laun...
1                         told aiden to wash his hands
2                   it was time to clean up their toys
3    jack was stomping his feet and i asked him to ...
4    she had to write a sentence about her day writ...
Name: antecedents_clean, dtype: object

## Format the Data for Blazing Text

In [45]:
data['labels_formatted'] = data['Labels'].apply(lambda x: ' '.join(['__label__' + label.strip() for label in x.split(',')]))

data['blazingtext_format'] = data['antecedents_clean'] + " " + data['labels_formatted']

## Split Training/Validation Sets

In [46]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(data['blazingtext_format'], test_size=0.2, random_state=42) 

train.to_csv("train_data.txt", index=False, header=False)
validation.to_csv("validation_data.txt", index=False, header=False)


## Upload Data to S3

In [47]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

bucket = 'abc-scoring-blazingtexts-trainingdata'
prefix = 'sagemaker/antecedents-classification'

train_data = sagemaker_session.upload_data(path="train_data.txt", bucket=bucket, key_prefix=f"{prefix}/training")
validation_data = sagemaker_session.upload_data(path="validation_data.txt", bucket=bucket, key_prefix=f"{prefix}/validation")

print(f'Training data location: {train_data}')
print(f'Validation data location: {validation_data}')


Training data location: s3://abc-scoring-blazingtexts-trainingdata/sagemaker/antecedents-classification/training/train_data.txt
Validation data location: s3://abc-scoring-blazingtexts-trainingdata/sagemaker/antecedents-classification/validation/validation_data.txt


## Set Up and Training

In [48]:
role = get_execution_role()
container = get_image_uri(boto3.Session().region_name, 'blazingtext', 'latest')

blazingtext = sagemaker.estimator.Estimator(container,
                                            role,
                                            instance_count=1,
                                            instance_type='ml.m5.large',
                                            sagemaker_session=sagemaker_session)

blazingtext.set_hyperparameters(mode="supervised",
                                epochs=50,  # Increased from 10 to 50
                                learning_rate=0.05,  # Adjusted learning rate
                                vector_dim=200,  # Increased vector dimensions
                                early_stopping=True,
                                patience=10,  # Increased patience
                                min_epochs=10,  # Increased min epochs
                                word_ngrams=3)  # Trying higher n-grams

blazingtext.fit({'train': train_data, 'validation': validation_data})


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: blazingtext-2024-04-14-21-55-27-493


2024-04-14 21:55:27 Starting - Starting the training job...
2024-04-14 21:55:43 Starting - Preparing the instances for training......
2024-04-14 21:56:33 Downloading - Downloading input data...
2024-04-14 21:57:28 Training - Training image download completed. Training in progress...[34mArguments: train[0m
  self.stdout = io.open(c2pread, 'rb', bufsize)[0m
[34m[04/14/2024 21:57:31 INFO 140170336032576] nvidia-smi took: 0.025162935256958008 secs to identify 0 gpus[0m
[34m[04/14/2024 21:57:31 INFO 140170336032576] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[04/14/2024 21:57:31 INFO 140170336032576] Processing /opt/ml/input/data/train/train_data.txt . File size: 0.16261005401611328 MB[0m
[34m[04/14/2024 21:57:31 INFO 140170336032576] Processing /opt/ml/input/data/validation/validation_data.txt . File size: 0.04179954528808594 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  544[0m
[34m

## Hyperparameter Tuning

In [49]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {
    'epochs': IntegerParameter(10, 100),
    'learning_rate': ContinuousParameter(0.01, 0.1),
    'vector_dim': IntegerParameter(100, 300),
    'word_ngrams': IntegerParameter(1, 5),
}

tuner = HyperparameterTuner(
    estimator=blazingtext,
    objective_metric_name='validation:accuracy',
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=3,
    objective_type='Maximize'
)

# Start hyperparameter tuning
tuner.fit({'train': train_data, 'validation': validation_data})


INFO:sagemaker:Creating hyperparameter tuning job with name: blazingtext-240414-2203


.....................................................................................................................................................!


In [50]:
role = get_execution_role()
container = get_image_uri(boto3.Session().region_name, 'blazingtext', 'latest')

blazingtext = sagemaker.estimator.Estimator(container,
                                            role,
                                            instance_count=1,
                                            instance_type='ml.m5.large',
                                            sagemaker_session=sagemaker_session)

# Setting hyperparameters based on the best training job results
blazingtext.set_hyperparameters(mode="supervised",
                                epochs=100,
                                learning_rate=0.09582345631158493, 
                                vector_dim=120, 
                                early_stopping=True, 
                                patience=10,  
                                min_epochs=10,  
                                word_ngrams=1)  

blazingtext.fit({'train': train_data, 'validation': validation_data})

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: blazingtext-2024-04-14-22-28-33-119


2024-04-14 22:28:33 Starting - Starting the training job...
2024-04-14 22:28:48 Starting - Preparing the instances for training......
2024-04-14 22:29:44 Downloading - Downloading input data...
2024-04-14 22:30:29 Downloading - Downloading the training image..[34mArguments: train[0m
  self.stdout = io.open(c2pread, 'rb', bufsize)[0m
[34m[04/14/2024 22:30:43 INFO 140234246543168] nvidia-smi took: 0.025173664093017578 secs to identify 0 gpus[0m
[34m[04/14/2024 22:30:43 INFO 140234246543168] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[04/14/2024 22:30:43 INFO 140234246543168] Processing /opt/ml/input/data/train/train_data.txt . File size: 0.16261005401611328 MB[0m
[34m[04/14/2024 22:30:43 INFO 140234246543168] Processing /opt/ml/input/data/validation/validation_data.txt . File size: 0.04179954528808594 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  544[0m
[34mLoading validation data

## Model Deployment

In [51]:
blazingtext_antecedents_model = blazingtext.create_model()

predictor = blazingtext_antecedents_model.deploy(initial_instance_count=1,
                                     instance_type='ml.m4.xlarge')


# Get the endpoint name
endpoint_name = predictor.endpoint_name
print(f"Endpoint Name: {endpoint_name}")



INFO:sagemaker:Creating model with name: blazingtext-2024-04-14-22-38-15-590
INFO:sagemaker:Creating endpoint-config with name blazingtext-2024-04-14-22-38-16-116
INFO:sagemaker:Creating endpoint with name blazingtext-2024-04-14-22-38-16-116


----!Endpoint Name: blazingtext-2024-04-14-22-38-16-116
