In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
seed = 18

In [2]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [4]:
text = "The quick brown fox jumps over the lazy dog."

In [5]:
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog.
Augmented Text:
['The quick robert brown fox jumps over the work shy domestic dog.']


## Working with the resistance dataset.

In [6]:
merged_resistance_df = pd.read_csv("../../../../data/processed_for_model/merged_themes_using_jaccard_method/merged_Resistance_sentence_level_batch_1_jaccard.csv", encoding='utf-8')

# Shuffle the merged dataset
merged_resistance_df = shuffle(merged_resistance_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_resistance_df, test_size=0.2, random_state=18, stratify=merged_resistance_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

### 1. Synonym-based

In [45]:
training_pos_data_list = training_df[training_df['label'] == 1].sentence.to_list()
test_pos_data_list = test_df[test_df['label'] == 1].sentence.to_list()

In [46]:
training_ = []
for i in resistance_pos_data_list:
    new_data = aug.augment(i)
    aug_data_pos_list.append(new_data)

In [47]:
resistance_pos_data_list

['this is my first time taking physics in college and it been a rough start but i am hoping i can understand and learn from it.',
 'in my junior year with HS school my thats been excited wanna take physics, but we had a teacher that didnt teach and just asked us to cheat all the tests so that it didnt look like a classroom teacher, so im excited everyone get a real deal on physics.',
 'moving from los angeles to san francisco, i came with the hope to learn how to become independent and vouch for my needs and wants in a society where minorities like myself are constantly under attack and come from communities lacking in resources.',
 'as a first generation student, i feel like it is my duty to break the generation of poverty and low education and instead be the first to pioneer a hopeful path of higher education and not be constrained to work.',
 'i. also make notes We learn about the material because physics is a complex subject.',
 'engineering is hard enough i will achieve i best in 

In [48]:
aug_data_pos_list

[['this is my first years taking physics in english so it been another rough course so i am hoping we still understand and learn using it.'],
 ['in my sophomore year yr HS teacher i thats been excited wanna take physics, but he had a friend She didnt teach and just asked us to cheat all the tests and that she didnt look like a classroom teacher, so im instead everyone get a real deal on physics.'],
 ['moving from San angeles to san Fernando, we came with the hope to learn how to become responsible & act for my safety and wants in a society where minorities like myself lay continually under attack and come from those lacking in resources.'],
 ['as a first generation student, i am like t is my duty to break each generation of poverty and poor education and hopefully lead the first to see a hopeful path for higher development thus not be constrained to work.'],
 ['i. also make notes or knew about complex material because there was another complex subject.'],
 ['engineering is strong enoug

In [50]:
aug_data_pos_flat_list = [item for sublist in aug_data_pos_list for item in sublist]

aug_data_pos_flat_list

['this is my first years taking physics in english so it been another rough course so i am hoping we still understand and learn using it.',
 'in my sophomore year yr HS teacher i thats been excited wanna take physics, but he had a friend She didnt teach and just asked us to cheat all the tests and that she didnt look like a classroom teacher, so im instead everyone get a real deal on physics.',
 'moving from San angeles to san Fernando, we came with the hope to learn how to become responsible & act for my safety and wants in a society where minorities like myself lay continually under attack and come from those lacking in resources.',
 'as a first generation student, i am like t is my duty to break each generation of poverty and poor education and hopefully lead the first to see a hopeful path for higher development thus not be constrained to work.',
 'i. also make notes or knew about complex material because there was another complex subject.',
 'engineering is strong enough you will 

In [58]:
syn_aug = naw.SynonymAug(aug_src='wordnet')
type(syn_aug)

nlpaug.augmenter.word.synonym.SynonymAug

In [66]:
syn_aug = naw.SynonymAug(aug_src='wordnet')

def dataset_augmentor(df, augmentor=syn_aug):

    list_to_augment = df[df['label'] == 1].sentence.to_list()

    aug_list = []

    for i in list_to_augment:
        new_data = augmentor.augment(i)
        aug_list.append(new_data[0])

    aug_df = pd.DataFrame()

    aug_df['sentence'] = aug_list
    aug_df['label'] = [1 for i in range(len(aug_list))]
    aug_df['phrase'] = [pd.NA for i in range(len(aug_list))]

    concat_df = pd.concat([df, aug_df])

    concat_df = shuffle(concat_df, random_state=seed)

    pos_labels = len([n for n in concat_df['label'] if n==1])
    print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(concat_df['label']), (pos_labels/len(concat_df['label']))*100))

    return concat_df

In [67]:
training_aug_syn_df = dataset_augmentor(training_df, syn_aug)
training_aug_syn_df

Positive labels present in the dataset : 278  out of 1171 or 23.740392826643895%


Unnamed: 0,sentence,label,phrase
630,going into this course i feel uncertain about ...,0,['Im putting all my effort to learn more about...
41,"i slip up a bit during this late slip hither, ...",1,
450,i have goals that benefit not just myself but ...,0,"['I know the journey will not be linear, I kno..."
233,"right now, i am really stuck on vectors and ca...",0,['I am taking the physics SCI class because I ...
299,the bigger reason for why i am here as sf stat...,0,"[""After working minimum wage part time jobs wh..."
...,...,...,...
578,"of course, because of my proficiency of theore...",0,['This is a challenging class that test my coo...
242,turns out i'm not as interested and my last la...,0,"[""Luckily, the lab professor I have currently ..."
837,"honestly speaking, i have no idea.",0,"['First of all, it is mandatory and I do not w..."
112,i chose to live in this class because no matte...,1,


In [68]:
test_aug_syn_df = dataset_augmentor(test_df, syn_aug)
test_aug_syn_df

Positive labels present in the dataset : 70  out of 294 or 23.809523809523807%


Unnamed: 0,sentence,label,phrase
29,im hoping by the end of this course i can take...,0,['Im here to be more involved and focused on c...
24,i be hither to actually understand what unity ...,1,
41,physics 2 is difficult yet instead of giving u...,1,
2,while it am not personally best when informati...,1,
109,i usually am only interested in the courses th...,0,['Even though some classes can be very difficu...
...,...,...,...
242,i am here to make my familys sacrifices worth it.,0,"['Firstly, I am here to challenge myself and m..."
174,"to show them that it lies still in us, a tribe...",1,
190,i am also here in this lab to get a better und...,0,['Physics for me is not easy and it takes a lo...
120,"hopefully, this class can help me overcome my ...",0,['Physics is difficult for me and I need all t...


In [69]:
MAXLEN = 150

In [70]:
X = training_aug_syn_df['sentence']
y = training_aug_syn_df['label']

In [76]:
import ktrain
from ktrain import text
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 18, stratify=y)

model_name = 'bert-base-cased'

bert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = bert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = bert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
bert_base_model = bert_transformer.get_classifier()


preprocessing train...
language: en
train sequence lengths:
	mean : 23
	95percentile : 42
	99percentile : 55


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 23
	95percentile : 43
	99percentile : 58


In [84]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_aug_syn_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = {0:0.6556551,1:2.10611511}

[0.6556551  2.10611511]


In [78]:
import tensorflow as tf
import numpy as np
import os
import random
def reset_random_seeds(seed=2):
    os.environ['PYTHONHASHSEED']=str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [85]:
# Build BERT model
# model = text.text_classifier('distilbert', train_data=(X_train, y_train), preproc=distillbert_transformer)
learner = ktrain.get_learner(bert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)
# learner.fit_onecycle(2e-5, 5)
# learner.autofit(2.27E-06, early_stopping=4)
learner.autofit(2e-5, early_stopping=3, class_weight=class_weights)
# distillbert_learner.set_weight_decay(0.001)
# distillbert_learner.autofit(2.27E-06, early_stopping=4, class_weight=class_weights)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 00004: Reducing Max LR on Plateau: new max lr will be 1e-05 (if not early_stopping).
Epoch 5/1024
Epoch 5: early stopping
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x4110adb20>

In [86]:
learner.validate(class_names=bert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       268
           1       0.97      0.68      0.80        84

    accuracy                           0.92       352
   macro avg       0.94      0.84      0.87       352
weighted avg       0.92      0.92      0.91       352



array([[266,   2],
       [ 27,  57]])

In [87]:
bert_predictor = ktrain.get_predictor(learner.model, preproc=bert_transformer)

bert_test_data = test_aug_syn_df['sentence'].tolist()
bert_test_label = test_aug_syn_df['label'].tolist()

y_pred_bert = bert_predictor.predict(bert_test_data)

y_pred_bert = [int(x) for x in y_pred_bert]

tn, fp, fn, tp = confusion_matrix(bert_test_label, y_pred_bert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

print('  Classification Report:\n',classification_report(bert_test_label,y_pred_bert),'\n')

True Negative: 219, False Positive: 5, False Negative: 27, True Positive: 43
  Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93       224
           1       0.90      0.61      0.73        70

    accuracy                           0.89       294
   macro avg       0.89      0.80      0.83       294
weighted avg       0.89      0.89      0.88       294
 



In [88]:
bert_predictor = ktrain.get_predictor(learner.model, preproc=bert_transformer)

bert_test_data = test_df['sentence'].tolist()
bert_test_label = test_df['label'].tolist()

y_pred_bert = bert_predictor.predict(bert_test_data)

y_pred_bert = [int(x) for x in y_pred_bert]

tn, fp, fn, tp = confusion_matrix(bert_test_label, y_pred_bert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

print('  Classification Report:\n',classification_report(bert_test_label,y_pred_bert),'\n')

True Negative: 219, False Positive: 5, False Negative: 22, True Positive: 13
  Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94       224
           1       0.72      0.37      0.49        35

    accuracy                           0.90       259
   macro avg       0.82      0.67      0.72       259
weighted avg       0.88      0.90      0.88       259
 



### 2. Contextual based

In [91]:
text = "The quick brown fox jumped over the lazy dog"
contextual_aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base', action="substitute")
augmented_text = contextual_aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumped over the lazy dog
Augmented Text:
['The quick brown fox towers over the menacing canine']


In [92]:
training_aug_context_df = dataset_augmentor(training_df, contextual_aug)
training_aug_context_df

Positive labels present in the dataset : 278  out of 1171 or 23.740392826643895%


Unnamed: 0,sentence,label,phrase
630,going into this course i feel uncertain about ...,0,['Im putting all my effort to learn more about...
41,i slipped off a bit from the recent trip there...,1,
450,i have goals that benefit not just myself but ...,0,"['I know the journey will not be linear, I kno..."
233,"right now, i am really stuck on vectors and ca...",0,['I am taking the physics SCI class because I ...
299,the bigger reason for why i am here as sf stat...,0,"[""After working minimum wage part time jobs wh..."
...,...,...,...
578,"of course, because of my proficiency of theore...",0,['This is a challenging class that test my coo...
242,turns out i'm not as interested and my last la...,0,"[""Luckily, the lab professor I have currently ..."
837,"honestly speaking, i have no idea.",0,"['First of all, it is mandatory and I do not w..."
112,i chose myself be in ballet class because no m...,1,


In [93]:
X = training_aug_context_df['sentence']
y = training_aug_context_df['label']

In [102]:
import ktrain
from ktrain import text
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 18, stratify=y)

model_name = 'bert-base-cased'

bert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = bert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = bert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
bert_base_model = bert_transformer.get_classifier()


preprocessing train...
language: en
train sequence lengths:
	mean : 23
	95percentile : 42
	99percentile : 55


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 23
	95percentile : 43
	99percentile : 60


In [103]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_aug_context_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = {0:0.6556551,1:2.10611511}

[0.6556551  2.10611511]


In [104]:
# Build BERT model
# model = text.text_classifier('distilbert', train_data=(X_train, y_train), preproc=distillbert_transformer)
learner = ktrain.get_learner(bert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)
# learner.fit_onecycle(2e-5, 5)
# learner.autofit(2.27E-06, early_stopping=4)
learner.autofit(2e-5, early_stopping=3, class_weight=class_weights)
# distillbert_learner.set_weight_decay(0.001)
# distillbert_learner.autofit(2.27E-06, early_stopping=4, class_weight=class_weights)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 00004: Reducing Max LR on Plateau: new max lr will be 1e-05 (if not early_stopping).
Epoch 5/1024
Epoch 5: early stopping
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x367f885e0>

In [105]:
learner.validate(class_names=bert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       179
           1       1.00      0.70      0.82        56

    accuracy                           0.93       235
   macro avg       0.96      0.85      0.89       235
weighted avg       0.93      0.93      0.92       235



array([[179,   0],
       [ 17,  39]])

In [107]:
training_aug_context_df.sentence.to_list()

['going into this course i feel uncertain about how well i will do, since the last time i took this class was in high school.',
 'i slipped off a bit from the recent trip there, but still continue to make observations regarding it using just work on getting results.',
 'i have goals that benefit not just myself but also my family as well.',
 'right now, i am really stuck on vectors and calculating vectors.',
 'the bigger reason for why i am here as sf state is to get my degree to get a hopefully get a decent to well paying job when i graduate.',
 'i guess that the college experience, but i have made it this long, these past three years have been a quite a challenging journey, adjusting my school schedule with work is still a hassle.',
 'i will always try my hardest in this class because i feel that this is where i can learn the best. i am also here because i want to graduate on time and not fail any of my classes.',
 'i am here because i want to get a degree.',
 'most importantly, i am

In [106]:
bert_predictor = ktrain.get_predictor(learner.model, preproc=bert_transformer)

bert_test_data = test_df['sentence'].tolist()
bert_test_label = test_df['label'].tolist()

y_pred_bert = bert_predictor.predict(bert_test_data)

y_pred_bert = [int(x) for x in y_pred_bert]

tn, fp, fn, tp = confusion_matrix(bert_test_label, y_pred_bert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

print('  Classification Report:\n',classification_report(bert_test_label,y_pred_bert),'\n')

True Negative: 219, False Positive: 5, False Negative: 14, True Positive: 21
  Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       224
           1       0.81      0.60      0.69        35

    accuracy                           0.93       259
   macro avg       0.87      0.79      0.82       259
weighted avg       0.92      0.93      0.92       259
 



In [108]:
bert_predictor.save('../../../../saved_models/resistance_bert_base_cased_model_08092024_v2')

### 3. Contextual sentence augmentor

In [23]:
sentence_aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2')
augmented_text = sentence_aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Original:
The quick brown fox jumps over the lazy dog.
Augmented Text:
['The quick brown fox jumps over the lazy dog. to , one new more government of a first \' one and all the next other at current , not in a " ( way other " , time in']


In [24]:
sen_aug_data_pos_list = []
for i in resistance_pos_data_list:
    new_data = sentence_aug.augment(i)
    sen_aug_data_pos_list.append(new_data)

### 4. Backtrip translation

In [109]:
import nlpaug.augmenter.word as naw

text = 'The quick brown fox jumped over the lazy dog'
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)
back_translation_aug.augment(text)

config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json:   0%|          | 0.00/849k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/315k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json:   0%|          | 0.00/849k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/315k [00:00<?, ?B/s]

['The speedy brown fox leapt over the lazy dog']

In [110]:
training_aug_translation_df = dataset_augmentor(training_df, back_translation_aug)
training_aug_translation_df

Positive labels present in the dataset : 278  out of 1171 or 23.740392826643895%


Unnamed: 0,sentence,label,phrase
630,going into this course i feel uncertain about ...,0,['Im putting all my effort to learn more about...
41,I have slipped a little bit during this recent...,1,
450,i have goals that benefit not just myself but ...,0,"['I know the journey will not be linear, I kno..."
233,"right now, i am really stuck on vectors and ca...",0,['I am taking the physics SCI class because I ...
299,the bigger reason for why i am here as sf stat...,0,"[""After working minimum wage part time jobs wh..."
...,...,...,...
578,"of course, because of my proficiency of theore...",0,['This is a challenging class that test my coo...
242,turns out i'm not as interested and my last la...,0,"[""Luckily, the lab professor I have currently ..."
837,"honestly speaking, i have no idea.",0,"['First of all, it is mandatory and I do not w..."
112,I chose this course because there is always ro...,1,


: 