In [None]:
from NLarge.llm import LLMAugmenter

# Demo: ask the LLM-backed augmenter to paraphrase a single review sentence
# and print whatever it returns.
review_sentence = "This movie is a must-watch for all the family."

llm_aug = LLMAugmenter()
res = llm_aug.paraphrase_with_question(review_sentence)
print(res)

In [None]:
from NLarge.random import RandomAugmenter
from NLarge.random import Action

# Demo: run the random augmenter in SWAP mode on one sentence.
# NOTE(review): `target_words` is passed alongside Action.SWAP; it is not
# visible from this notebook whether SWAP uses it at all -- confirm against
# the RandomAugmenter implementation.
swap_options = {
    "action": Action.SWAP,
    "target_words": ["awesome", "great"],
}

random_aug = RandomAugmenter()
# Left as the last expression so the notebook displays the returned value.
random_aug("This is a simple example sentence for testing.", **swap_options)

In [None]:
from NLarge.synonym import SynonymAugmenter

# Demo: synonym augmentation of one sentence, using "wordnet" as the source.
sample_text = "The quick brown fox jumps over the lazy dog."

syn_aug = SynonymAugmenter()

# aug_p=0.3 is passed straight through to the augmenter.
# NOTE(review): presumably the share/probability of words to replace --
# confirm against the SynonymAugmenter docs.
# Left as the last expression so the notebook displays the returned value.
syn_aug(
    sample_text,
    aug_p=0.3,
    aug_src="wordnet",
)

In [None]:
import datasets
from datasets import Dataset, Features, Value, concatenate_datasets
from NLarge.dataset_concat import augment_data, MODE
from NLarge.pipeline import TextClassificationPipeline
from NLarge.model.RNN import TextClassifierRNN

In [None]:
# Explicit schema used for every training Dataset built in this notebook:
# a string "text" column and an int64 "label" column.
features = Features(
    {
        "text": Value("string"),
        "label": Value("int64"),
    }
)

# Load the train and test splits of Rotten Tomatoes in a single call.
original_train_data, original_test_data = datasets.load_dataset(
    "rotten_tomatoes",
    split=["train", "test"],
)

# Rebuild the training split with the explicit schema above, keeping only the
# columns named in `features`. The test split is left exactly as loaded.
original_train_data = Dataset.from_dict(
    {column: original_train_data[column] for column in features},
    features=features,
)

# Show the first training example as a quick sanity check.
original_train_data[0]

In [None]:
# Augmentation plan: maps each augmentation mode to a fraction that is handed
# to `augment_data`. Per the inline notes, each value is the share of the
# original data to run through that mode.
percentages = {
    MODE.RANDOM.SUBSTITUTE: 0.5,  # 50% of data for random augmentation
    MODE.SYNONYM.WORDNET: 0.5,  # 50% of data for synonym augmentation
}

# Total requested growth, derived from the plan so the summary printed below
# cannot drift out of sync when the fractions above are edited.
# NOTE(review): assumes `augment_data` yields one new sample per selected
# original sample -- confirm against NLarge.dataset_concat.
requested_growth = sum(percentages.values())

augmented_data_list = augment_data(original_train_data, percentages)

# Convert the augmented records into a Dataset that shares the schema of the
# original training set, so the two can be concatenated.
augmented_dataset = Dataset.from_dict(
    {
        column: [item[column] for item in augmented_data_list]
        for column in ("text", "label")
    },
    features=features,
)

# Concatenate original and augmented datasets
augmented_train_data = concatenate_datasets(
    [original_train_data, augmented_dataset]
)

print(f"Original train size: {len(original_train_data)}")
print(
    f"Train size after {requested_growth:.0%} augmentation: "
    f"{len(augmented_train_data)}"
)

In [None]:
# Settings for the classification pipeline, gathered in one place so the
# run's configuration can be read (and tweaked) at a glance.
pipeline_settings = dict(
    model_class=TextClassifierRNN,
    augmented_data=augmented_train_data,
    test_data=original_test_data,
    test_size=0.2,
    max_length=128,
)

# Build the RNN text-classification pipeline on the augmented training set.
pipeline_augmented = TextClassificationPipeline(**pipeline_settings)

# Train for 10 epochs. Left as the last expression so that anything
# `train_model` returns is still displayed by the notebook.
pipeline_augmented.train_model(n_epochs=10)