In [2]:
!pip install --upgrade numexpr

Collecting numexpr
  Downloading numexpr-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading numexpr-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.2/375.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numexpr
  Attempting uninstall: numexpr
    Found existing installation: numexpr 2.7.3
    Uninstalling numexpr-2.7.3:
      Successfully uninstalled numexpr-2.7.3
Successfully installed numexpr-2.9.0


In [8]:
import pandas as pd
import random
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import defaultdict
import string

In [5]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Load the data

In [11]:
#Paths to data/label
training_data_path = 'ABC Training Data-Grid view.csv'
labels_data_path = 'Antecedents- labels.csv'

#Load data
antecedents_data = pd.read_csv(training_data_path)
labels_data = pd.read_csv(labels_data_path)

#Read in data
texts = antecedents_data['Texts']
labels = antecedents_data['Labels'].apply(lambda x: x.split(','))  # labels are comma-separated

In [13]:
len(antecedents_data)

45

In [20]:
antecedents_data.head()

Unnamed: 0,Texts,Labels,Hypo. Function 1
0,i asked my husband to please put away the laun...,They were given directions or a task to comple...,Escape
1,told aiden to wash his hands,They were given directions or a task to comple...,Escape
2,It was time to clean up their toys,They were given directions or a task to comple...,Escape
3,jack was stomping his feet and i asked him to ...,They were given directions or a task to comple...,Escape
4,she had to write a sentence about her day. wri...,They were given directions or a task to comple...,Escape


## Preprocess text

In [14]:
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove punctuation and make lowercase
    tokens = [w.lower() for w in tokens if w.isalpha()]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    return tokens

preprocessed_texts = texts.apply(preprocess_text)

preprocessed_texts.head()


0    [asked, husband, please, put, away, laundry, a...
1                           [told, aiden, wash, hands]
2                                  [time, clean, toys]
3          [jack, stomping, feet, asked, walk, nicely]
4    [write, sentence, day, writing, hard, even, th...
Name: Texts, dtype: object

## Data Augmentation: Synonym Replacement

In [42]:
def get_wordnet_pos(treebank_tag):
    """Convert the part-of-speech naming scheme
       from the nltk default to that which is recognized by the WordNet API"""
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }.get(treebank_tag[0], wordnet.NOUN)  # Default to noun if part-of-speech is not found

def synonym_replacement(sentence, num_replacements=1):
    # Tokenize and POS tag the words in the sentence
    words = word_tokenize(sentence)
    pos_tags = pos_tag(words)

    # Get synonyms for each word, considering its part of speech
    synonyms = defaultdict(list)
    for word, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)  # Convert to WordNet POS notation
        for syn in wordnet.synsets(word, pos=wordnet_pos):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ').replace('-', ' ')
                if synonym != word:
                    synonyms[word].append(synonym)

    # Select random words to replace
    words_to_replace = random.sample(list(synonyms.keys()), min(num_replacements, len(synonyms)))

    # Perform replacements
    new_sentence = sentence
    for word in words_to_replace:
        syn_list = synonyms[word]
        if syn_list:
            # Choose a random synonym for the word
            synonym = random.choice(syn_list)
            new_sentence = new_sentence.replace(word, synonym, 1)

    return new_sentence

# Test the function
original_text = "i asked my husband to please put away the laundry and he did what he always does"
augmented_text = synonym_replacement(original_text, num_replacements=5)
print("Original:", original_text)
print("Augmented:", augmented_text)

Original: i asked my husband to please put away the laundry and he did what he always does
Augmented: I necessitate my husband to please position away the laundry and he make what he perpetually does


In [44]:
def augment_sentences(dataframe, augment_factor=5):
    augmented_rows = []
    for _, row in dataframe.iterrows():
        text, label = row['Texts'], row['Labels']
        unique_augmented_texts = set()
        while len(unique_augmented_texts) < augment_factor:
            augmented_text = synonym_replacement(text, num_replacements=3)
            unique_augmented_texts.add(augmented_text)
        for aug_text in unique_augmented_texts:
            augmented_rows.append([aug_text, label])
    return augmented_rows



augmented_data = augment_sentences(antecedents_data, augment_factor=5)

augmented_df = pd.DataFrame(augmented_data, columns=['Texts', 'Labels'])

combined_dataset = pd.concat([antecedents_data[['Texts', 'Labels']], augmented_df])

combined_dataset = combined_dataset.reset_index(drop=True)

#combined_dataset.to_csv('augmented_training_data.csv', index=False)

len(combined_dataset)

270

## Model Training

### 1. Split dataset

In [45]:
from sklearn.model_selection import train_test_split

# Features and Labels
X = combined_dataset['Texts']  # the features we want to analyze
y = combined_dataset['Labels']  # the labels, or answers, we want to test against

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the size of the splits
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Validation set size: {X_val.shape[0]} samples")


Training set size: 216 samples
Validation set size: 54 samples


### 2. Feature extraction

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit and transform the training data to compute TF-IDF features
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the validation data to compute TF-IDF features
X_val_tfidf = vectorizer.transform(X_val)

# We can take a look at the shape of the resulting feature vectors
print(f"Training feature vectors shape: {X_train_tfidf.shape}")
print(f"Validation feature vectors shape: {X_val_tfidf.shape}")


Training feature vectors shape: (216, 1696)
Validation feature vectors shape: (54, 1696)


### 3. Model training using Random Forest  with Multiouptout Classifier

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Convert the labels into a binary format for multi-label classification
mlb = MultiLabelBinarizer()
y_train_mlb = mlb.fit_transform(y_train.apply(lambda x: x.split(',')))
y_val_mlb = mlb.transform(y_val.apply(lambda x: x.split(',')))

# Initialize the MultiOutputClassifier with RandomForest
forest = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

# Train the model
multi_target_forest.fit(X_train_tfidf, y_train_mlb)

# Predict on the validation set
y_val_pred = multi_target_forest.predict(X_val_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_val_mlb, y_val_pred)
print(f"Validation Accuracy: {accuracy}")


Validation Accuracy: 0.7592592592592593


### 4. Hyperparameter Tuning 

In [49]:
from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameters to search over
param_grid = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV with the MultiOutputClassifier and the parameter grid
grid_search = GridSearchCV(multi_target_forest, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_tfidf, y_train_mlb)

# Best hyperparameters
print(f"Best hyperparameters: {grid_search.best_params_}")

# Best cross-validated score
print(f"Best cross-validated score: {grid_search.best_score_}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best hyperparameters: {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 200}
Best cross-validated score: 0.5
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=50; total time=   6.1s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  10.5s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  10.3s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=200; total time=  20.4s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=50; total time=   5.1s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=50; total time=   6.2s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=5

In [50]:
from sklearn.metrics import classification_report

# Train the model with the best hyperparameters
best_forest = RandomForestClassifier(
    n_estimators=200, 
    max_depth=None, 
    min_samples_split=2,
    random_state=42
)

# Wrap the classifier with MultiOutputClassifier
best_multi_target_forest = MultiOutputClassifier(best_forest, n_jobs=-1)

# Train the model
best_multi_target_forest.fit(X_train_tfidf, y_train_mlb)

# Predict on the validation set
y_val_pred = best_multi_target_forest.predict(X_val_tfidf)

# Detailed performance analysis
print(classification_report(y_val_mlb, y_val_pred, target_names=mlb.classes_))


                                                       precision    recall  f1-score   support

                                                 busy       1.00      1.00      1.00         1
                                                 cold       1.00      1.00      1.00         2
                                              crowded       1.00      1.00      1.00         1
                                               etc.)"       1.00      1.00      1.00         2
                                             on phone       1.00      0.83      0.91         6
                        or challenging task/activity"       1.00      1.00      1.00         3
                         or overwhelming environment"       1.00      1.00      1.00         1
                           talking with someone else"       1.00      0.83      0.91         6
                                              unclear       1.00      1.00      1.00         3
                                   "Given a diffi

  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
# Since the best hyperparameters were provided, we'll use those to create a new RandomForest
best_forest = RandomForestClassifier(
    n_estimators=200, 
    max_depth=None, 
    min_samples_split=2,
    random_state=42
)

# Wrap this forest in a MultiOutputClassifier
best_multi_target_forest = MultiOutputClassifier(best_forest, n_jobs=-1)

# Fit the model to the full training data
best_multi_target_forest.fit(X_train_tfidf, y_train_mlb)

# Predict on the validation set
y_val_pred = best_multi_target_forest.predict(X_val_tfidf)

# Generate the classification report
print(classification_report(y_val_mlb, y_val_pred, target_names=mlb.classes_))


                                                       precision    recall  f1-score   support

                                                 busy       1.00      1.00      1.00         1
                                                 cold       1.00      1.00      1.00         2
                                              crowded       1.00      1.00      1.00         1
                                               etc.)"       1.00      1.00      1.00         2
                                             on phone       1.00      0.83      0.91         6
                        or challenging task/activity"       1.00      1.00      1.00         3
                         or overwhelming environment"       1.00      1.00      1.00         1
                           talking with someone else"       1.00      0.83      0.91         6
                                              unclear       1.00      1.00      1.00         3
                                   "Given a diffi

  _warn_prf(average, modifier, msg_start, len(result))


### Cross Validation

In [52]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Prepare to collect the scores
scores = []

# Perform the cross-validation
for train_index, test_index in kf.split(X):
    # Split data
    X_train_kf, X_val_kf = X.iloc[train_index], X.iloc[test_index]
    y_train_kf, y_val_kf = y.iloc[train_index], y.iloc[test_index]

    # Transform the labels
    y_train_kf_mlb = mlb.transform(y_train_kf.apply(lambda x: x.split(',')))
    y_val_kf_mlb = mlb.transform(y_val_kf.apply(lambda x: x.split(',')))

    # Vectorize the text
    X_train_kf_tfidf = vectorizer.fit_transform(X_train_kf)
    X_val_kf_tfidf = vectorizer.transform(X_val_kf)

    # Train the model
    best_multi_target_forest.fit(X_train_kf_tfidf, y_train_kf_mlb)

    # Predict on the validation fold
    y_val_kf_pred = best_multi_target_forest.predict(X_val_kf_tfidf)

    # Compute the accuracy for the current fold
    accuracy = accuracy_score(y_val_kf_mlb, y_val_kf_pred)

    # Append the score
    scores.append(accuracy)

# Display the accuracy for each fold
print(f"Accuracy for each fold: {scores}")

# Compute the mean accuracy
mean_accuracy = np.mean(scores)
print(f"Mean accuracy across all folds: {mean_accuracy}")


Accuracy for each fold: [0.7777777777777778, 0.8703703703703703, 0.7592592592592593, 0.7407407407407407, 0.7962962962962963]
Mean accuracy across all folds: 0.7888888888888889
