In [1]:
!pip install --upgrade numexpr



In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import defaultdict
import string

In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load the data

In [4]:
#Paths to data/label
training_data_path = 'ABC Training Data-Grid view.csv'
labels_data_path = 'Antecedents- labels.csv'

#Load data
antecedents_data = pd.read_csv(training_data_path)
labels_data = pd.read_csv(labels_data_path)

#Read in data
texts = antecedents_data['Texts']
labels = antecedents_data['Labels'].apply(lambda x: x.split(','))  # labels are comma-separated

In [5]:
len(antecedents_data)

102

In [6]:
antecedents_data.head()

Unnamed: 0,Texts,Labels
0,i asked my husband to please put away the laun...,They were given directions or a task to comple...
1,told aiden to wash his hands,They were given directions or a task to comple...
2,It was time to clean up their toys,They were given directions or a task to comple...
3,jack was stomping his feet and i asked him to ...,They were given directions or a task to comple...
4,she had to write a sentence about her day. wri...,They were given directions or a task to comple...


## Preprocess text

In [19]:
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove punctuation and make lowercase
    tokens = [w.lower() for w in tokens if w.isalpha()]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    return tokens

preprocessed_texts = texts.apply(preprocess_text)

preprocessed_texts.head()


0    [asked, husband, please, put, away, laundry, a...
1                           [told, aiden, wash, hands]
2                                  [time, clean, toys]
3          [jack, stomping, feet, asked, walk, nicely]
4    [write, sentence, day, writing, hard, even, th...
Name: Texts, dtype: object

In [20]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove punctuation and make lowercase
    tokens = [w.lower() for w in tokens if w.isalpha()]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # Join the tokens back into a string
    return ' '.join(tokens)
preprocessed_texts = texts.apply(preprocess_text)

preprocessed_texts.head()

0         asked husband please put away laundry always
1                                told aiden wash hands
2                                      time clean toys
3                 jack stomping feet asked walk nicely
4    write sentence day writing hard even though tr...
Name: Texts, dtype: object

## Data Augmentation: Synonym Replacement

In [24]:
def get_wordnet_pos(treebank_tag):
    """Convert the part-of-speech naming scheme
       from the nltk default to that which is recognized by the WordNet API"""
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }.get(treebank_tag[0], wordnet.NOUN)  # Default to noun if part-of-speech is not found

def synonym_replacement(sentence, num_replacements=1):
    # Tokenize and POS tag the words in the sentence
    words = word_tokenize(sentence)
    pos_tags = pos_tag(words)

    # Get synonyms for each word, considering its part of speech
    synonyms = defaultdict(list)
    for word, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)  # Convert to WordNet POS notation
        for syn in wordnet.synsets(word, pos=wordnet_pos):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ').replace('-', ' ')
                if synonym != word:
                    synonyms[word].append(synonym)

    # Select random words to replace
    words_to_replace = random.sample(list(synonyms.keys()), min(num_replacements, len(synonyms)))

    # Perform replacements
    new_sentence = sentence
    for word in words_to_replace:
        syn_list = synonyms[word]
        if syn_list:
            # Choose a random synonym for the word
            synonym = random.choice(syn_list)
            new_sentence = new_sentence.replace(word, synonym, 1)

    return new_sentence

# Test the function
original_text = "i asked my husband to please put away the laundry and he did what he always does"
augmented_text = synonym_replacement(original_text, num_replacements=5)
print("Original:", original_text)
print("Augmented:", augmented_text)

Original: i asked my husband to please put away the laundry and he did what he always does
Augmented: i ask my hubby to please arrange away tatomic number 2 laundry and he did what he always behave


In [25]:
def augment_sentences(dataframe, augment_factor=5):
    augmented_rows = []
    for _, row in dataframe.iterrows():
        text, label = row['Texts'], row['Labels']
        unique_augmented_texts = set()
        while len(unique_augmented_texts) < augment_factor:
            augmented_text = synonym_replacement(text, num_replacements=3)
            unique_augmented_texts.add(augmented_text)
        for aug_text in unique_augmented_texts:
            augmented_rows.append([aug_text, label])
    return augmented_rows



augmented_data = augment_sentences(antecedents_data, augment_factor=5)

augmented_df = pd.DataFrame(augmented_data, columns=['Texts', 'Labels'])

combined_dataset = pd.concat([antecedents_data[['Texts', 'Labels']], augmented_df])

combined_dataset = combined_dataset.reset_index(drop=True)

#combined_dataset.to_csv('augmented_training_data.csv', index=False)

len(combined_dataset)

612

## Model Training

### 1. Split dataset

In [26]:
from sklearn.model_selection import train_test_split

# Features and Labels
X = combined_dataset['Texts']  # the features we want to analyze
y = combined_dataset['Labels']  # the labels, or answers, we want to test against

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the size of the splits
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Validation set size: {X_val.shape[0]} samples")


Training set size: 489 samples
Validation set size: 123 samples


### 2. Feature extraction

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit and transform the training data to compute TF-IDF features
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the validation data to compute TF-IDF features
X_val_tfidf = vectorizer.transform(X_val)

# We can take a look at the shape of the resulting feature vectors
print(f"Training feature vectors shape: {X_train_tfidf.shape}")
print(f"Validation feature vectors shape: {X_val_tfidf.shape}")


Training feature vectors shape: (489, 2869)
Validation feature vectors shape: (123, 2869)


### 3. Model training using Random Forest  with Multiouptout Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Convert the labels into a binary format for multi-label classification
mlb = MultiLabelBinarizer()
y_train_mlb = mlb.fit_transform(y_train.apply(lambda x: x.split(',')))
y_val_mlb = mlb.transform(y_val.apply(lambda x: x.split(',')))

# Initialize the MultiOutputClassifier with RandomForest
forest = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

# Train the model
multi_target_forest.fit(X_train_tfidf, y_train_mlb)

# Predict on the validation set
y_val_pred = multi_target_forest.predict(X_val_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_val_mlb, y_val_pred)
print(f"Validation Accuracy: {accuracy}")


Validation Accuracy: 0.9349593495934959


### 4. Hyperparameter Tuning 

In [13]:
from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameters to search over
param_grid = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV with the MultiOutputClassifier and the parameter grid
grid_search = GridSearchCV(multi_target_forest, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_tfidf, y_train_mlb)

# Best hyperparameters
print(f"Best hyperparameters: {grid_search.best_params_}")

# Best cross-validated score
print(f"Best cross-validated score: {grid_search.best_score_}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best hyperparameters: {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 200}
Best cross-validated score: 0.754601226993865


In [14]:
from sklearn.metrics import classification_report

# Train the model with the best hyperparameters
best_forest = RandomForestClassifier(
    n_estimators=200, 
    max_depth=None, 
    min_samples_split=2,
    random_state=42
)

# Wrap the classifier with MultiOutputClassifier
best_multi_target_forest = MultiOutputClassifier(best_forest, n_jobs=-1)

# Train the model
best_multi_target_forest.fit(X_train_tfidf, y_train_mlb)

# Predict on the validation set
y_val_pred = best_multi_target_forest.predict(X_val_tfidf)

# Detailed performance analysis
print(classification_report(y_val_mlb, y_val_pred, target_names=mlb.classes_))


                                                       precision    recall  f1-score   support

                                                 busy       1.00      1.00      1.00         4
                                                 cold       1.00      1.00      1.00         3
                                              crowded       1.00      1.00      1.00         4
                                               etc.)"       1.00      1.00      1.00         3
                                             on phone       1.00      1.00      1.00        10
                        or challenging task/activity"       1.00      1.00      1.00         8
                         or overwhelming environment"       1.00      1.00      1.00         4
                           talking with someone else"       1.00      1.00      1.00        10
                                              unclear       1.00      1.00      1.00         8
                                   "Given a diffi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Since the best hyperparameters were provided, we'll use those to create a new RandomForest
best_forest = RandomForestClassifier(
    n_estimators=200, 
    max_depth=None, 
    min_samples_split=2,
    random_state=42
)

# Wrap this forest in a MultiOutputClassifier
best_multi_target_forest = MultiOutputClassifier(best_forest, n_jobs=-1)

# Fit the model to the full training data
best_multi_target_forest.fit(X_train_tfidf, y_train_mlb)

# Predict on the validation set
y_val_pred = best_multi_target_forest.predict(X_val_tfidf)

# Generate the classification report
print(classification_report(y_val_mlb, y_val_pred, target_names=mlb.classes_))


                                                       precision    recall  f1-score   support

                                                 busy       1.00      1.00      1.00         4
                                                 cold       1.00      1.00      1.00         3
                                              crowded       1.00      1.00      1.00         4
                                               etc.)"       1.00      1.00      1.00         3
                                             on phone       1.00      1.00      1.00        10
                        or challenging task/activity"       1.00      1.00      1.00         8
                         or overwhelming environment"       1.00      1.00      1.00         4
                           talking with someone else"       1.00      1.00      1.00        10
                                              unclear       1.00      1.00      1.00         8
                                   "Given a diffi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Cross Validation

In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Prepare to collect the scores
scores = []

# Perform the cross-validation
for train_index, test_index in kf.split(X):
    # Split data
    X_train_kf, X_val_kf = X.iloc[train_index], X.iloc[test_index]
    y_train_kf, y_val_kf = y.iloc[train_index], y.iloc[test_index]

    # Transform the labels
    y_train_kf_mlb = mlb.transform(y_train_kf.apply(lambda x: x.split(',')))
    y_val_kf_mlb = mlb.transform(y_val_kf.apply(lambda x: x.split(',')))

    # Vectorize the text
    X_train_kf_tfidf = vectorizer.fit_transform(X_train_kf)
    X_val_kf_tfidf = vectorizer.transform(X_val_kf)

    # Train the model
    best_multi_target_forest.fit(X_train_kf_tfidf, y_train_kf_mlb)

    # Predict on the validation fold
    y_val_kf_pred = best_multi_target_forest.predict(X_val_kf_tfidf)

    # Compute the accuracy for the current fold
    accuracy = accuracy_score(y_val_kf_mlb, y_val_kf_pred)

    # Append the score
    scores.append(accuracy)

# Display the accuracy for each fold
print(f"Accuracy for each fold: {scores}")

# Compute the mean accuracy
mean_accuracy = np.mean(scores)
print(f"Mean accuracy across all folds: {mean_accuracy}")


Accuracy for each fold: [0.926829268292683, 0.9186991869918699, 0.9262295081967213, 0.8852459016393442, 0.9180327868852459]
Mean accuracy across all folds: 0.9150073304011729
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=50; total time=   8.6s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  16.4s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  16.6s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=200; total time=  31.9s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=50; total time=   8.4s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=50; total time=   7.8s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=50; total time=   9.4s
[CV] END estimato

## Save the model

In [40]:
import joblib

# Save the trained MultiOutputClassifier with RandomForest
joblib.dump(multi_target_forest, 'antecedent_multi_target_forest.joblib')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'vectorizer.joblib')

# Save the MultiLabelBinarizer
joblib.dump(mlb, 'mlb.joblib')

['mlb.joblib']

## Create mapping from antecedent labels to functions

In [30]:
antecedent_functions_mapping = {
    "They were given directions or a task to complete": ["Escape"],
    "They were in the middle of a long task or assignment": ["Escape"],
    "Given a difficult, unclear, or challenging task/activity": ["Escape"],
    "They were in the middle of something they enjoy doing": ["Escape", "Access to tangibles"],
    "Someone corrected or helped them": ["Escape"],
    "They wanted something and got told 'no'": ["Access to tangibles"],
    "Loud, busy, crowded, or overwhelming environment": ["Sensory", "Escape"],
    "Stopping one activity/task and starting another": ["Escape", "Access to tangibles"],
    "While playing with a sibling or peer": ["Escape", "Access to tangibles", "Attention", "Sensory"],
    "While playing in a large group (like recess)": ["Escape", "Access to tangibles", "Attention", "Sensory"],
    "Nobody was really paying attention to them": ["Attention"],
    "Nothing to do or play with": ["Escape", "Access to tangibles", "Attention", "Sensory"],
    "Others were receiving a lot of attention": ["Attention"],
    "Parent/caregiver busy, on phone, talking with someone else": ["Attention"],
    "Sudden change of plans or routine": ["Escape", "Access to tangibles", "Attention"],
    "They wanted something unavailable": ["Escape", "Access to tangibles"],
    "Another person had an item they want": ["Access to tangibles"],
    "Belongings disturbed without permission/unexpected": ["Access to tangibles"],
    "Physical discomfort (wet, cold, etc.)": ["Sensory", "Escape"],
    "Asked to stop using something they like": ["Access to tangibles", "Escape"],
    "Disagreement with someone": ["Access to tangibles", "Escape", "Attention"],
    "Someone out-competed them": ["Access to tangibles", "Escape"],
    "Forced participation in activity or task": ["Escape"],
    "Their choice was not honored/accepted": ["Access to tangibles"],
    "Someone got in their personal space": ["Escape"],
    "Bright or fluorescent lights": ["Sensory", "Escape"],
    "Loud or startling noise": ["Sensory", "Escape"],
    "Someone raised their voice": ["Sensory", "Escape"],
    "They were touched without giving permission": ["Escape"],
    "Someone purposely antagonized them": ["Escape"],
    "Someone deceived or tricked them": ["Escape", "Access to tangibles"],
    "Peer pressure to fit in or impress others": ["Escape", "Attention"],
    "Bad day at school or work": ["Escape", "Access to tangibles"],
    "Not really sure or 'out of the blue' unexpectedly": ["Sensory", "Attention", "Escape", "Access to tangibles"],
    "During class instruction": ["Escape", "Attention"],
    "The kids are fighting with each other": ["Escape"],
    "I was trying to get something done in peace": ["Access to tangibles"],
    "Fighting/disagreement with partner": ["Access to tangibles", "Escape", "Attention"],
    "They were asked or told to wait for something.": ["Access to tangibles", "Attention", "Escape"]
}


joblib.dump(antecedent_functions_mapping, 'antecedent_functions_mapping.joblib')


['antecedent_functions_mapping.joblib']

## For future use...

In [31]:
# Load the model and components
loaded_model = joblib.load('antecedent_multi_target_forest.joblib')
loaded_vectorizer = joblib.load('vectorizer.joblib')
loaded_mlb = joblib.load('mlb.joblib')
loaded_functions_mapping = joblib.load('antecedent_functions_mapping.joblib')


### Now we can use these loaded objects to preprocess input, predict labels, and map them to functions.

In [68]:
#This is just an example

import joblib

# Load the saved model and other components
multi_target_forest = joblib.load('antecedent_multi_target_forest.joblib')
vectorizer = joblib.load('vectorizer.joblib')
mlb = joblib.load('mlb.joblib')
antecedent_functions_mapping = joblib.load('antecedent_functions_mapping.joblib')

# Example function to retrieve functions based on antecedents
def get_functions_from_antecedents(antecedent_labels):
    all_functions = [antecedent_functions_mapping[label] 
                     for label in antecedent_labels if label in antecedent_functions_mapping]
    unique_functions = list(set(sum(all_functions, [])))
    return unique_functions

# Example prediction function
def predict(input_text, threshold=0.3):
    preprocessed_text = preprocess_text(input_text)
    #print(f"Preprocessed text: '{preprocessed_text}'")
    tfidf_features = vectorizer.transform([preprocessed_text])
    
    probabilities = multi_target_forest.predict_proba(tfidf_features)
    proba_positive_class = np.array([prob[:, 1] for prob in probabilities]).T
    #print(f"Probabilities: {proba_positive_class}")

    binary_predictions = (proba_positive_class > threshold).astype(int)
    #print(f"Binary predictions shape: {binary_predictions.shape}")

    # Check if at least one label meets the threshold
    if not binary_predictions.any():
        # If no labels meet the threshold, take the label(s) with the highest probability below the threshold
        max_proba_idx = np.argmax(proba_positive_class, axis=1)
        binary_predictions[0, max_proba_idx] = 1
    
    antecedent_labels = mlb.inverse_transform(binary_predictions)
    
    functions = get_functions_from_antecedents(antecedent_labels[0]) if antecedent_labels else []
    
    return {
        "antecedent_labels": antecedent_labels[0] if antecedent_labels else (),
        "functions": functions
    }

In [69]:
# Now simulate a prediction

input_text1 = "i asked my husband to please put away the laundry and he did what he always does"
input_text2 = "Teacher is grading papers and the class is working independently"
input_text3 = "he walk into my bedroom without knocking door and sit near me"

# Test the prediction function with different texts
print(predict(input_text1))
print(predict(input_text2))
print(predict(input_text3))



{'antecedent_labels': ('They were in the middle of something they enjoy doing',), 'functions': ['Escape', 'Access to tangibles']}
{'antecedent_labels': (' on phone', ' talking with someone else"', '"Parent/caregiver busy', 'Nobody was really paying attention to them', 'They were in the middle of a long task or assignment'), 'functions': ['Escape', 'Attention']}
{'antecedent_labels': ('Someone got in their personal space', 'While playing with a sibling or peer'), 'functions': ['Sensory', 'Escape', 'Access to tangibles', 'Attention']}
