In [None]:
! pip install -q numpy requests nlpaug
! pip install -q nltk>=3.4.5
! pip install -q sacremoses
! pip install -q datasets==2.18.0

# Import the NewDataset class

In [None]:
from main import NewDataset

In [None]:
# Maps a dataset identifier to the pair of column names handed to NewDataset.
# NOTE(review): the keys look like Hugging Face Hub dataset ids and each pair
# like (input column, target column) — confirm against NewDataset in main.py.
datasets_params = dict([
    ("kmfoda/booksum", ("chapter", "summary_text")),
    ("knkarthick/dialogsum", ("dialogue", "summary")),
])

dataset = NewDataset(datasets_params)
print(dataset)

{'train': Dataset({
    features: ['inp', 'target'],
    num_rows: 22060
}), 'validation': Dataset({
    features: ['inp', 'target'],
    num_rows: 1984
}), 'test': Dataset({
    features: ['inp', 'target'],
    num_rows: 2931
})}


# Stemming and stop-word removal

In [None]:
! pip install -q nltk

import nltk

nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')

from nltk.tokenize import word_tokenize

from nltk.stem import SnowballStemmer

from datasets import load_dataset, DatasetDict, concatenate_datasets

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Shared English Snowball stemmer; with ignore_stopwords=False stop words are
# stemmed like any other token.
stemmer = SnowballStemmer("english", ignore_stopwords=False)

def text_remove_stopwords(seq, stop_words=None) -> list:
    """Drop stop words from a sequence of tokens.

    Args:
        seq: Iterable of already-tokenized words (strings).
        stop_words: Optional collection of lower-case stop words. When
            omitted, the module-level ``eng_stopwords`` list is used.

    Returns:
        A new list holding the tokens of ``seq`` that are not stop words,
        in their original order and with their original casing.
    """
    if stop_words is None:
        stop_words = eng_stopwords

    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    blocked = frozenset(stop_words)

    # The stop-word list is lower-case while tokens keep their casing, so the
    # comparison is done on the lower-cased token; otherwise "The" or "I"
    # would never be filtered out.
    return [token for token in seq if token.lower() not in blocked]

def text_stemming(seq) -> list:
    """Stem every token of an already-tokenized sequence.

    Each element of ``seq`` is passed to the module-level ``stemmer`` and the
    results are returned as a new list in the same order.
    """
    stems = []
    for token in seq:
        stems.append(stemmer.stem(token))
    return stems

def text_process(text, fn_list=None) -> str:
    """Tokenize ``text``, run token-level transforms, and re-join the result.

    Args:
        text: Raw input string.
        fn_list: Optional iterable of callables, each taking and returning a
            list of tokens. They are applied in order. When omitted, stop
            words are removed first and the remaining tokens are stemmed.

    Returns:
        The processed tokens joined with single spaces.
    """
    # Collapse every run of whitespace (including newlines) to one space
    # before tokenizing.
    tokens = word_tokenize(" ".join(text.split()))

    # Identity check: `== None` dispatches to __eq__, which misbehaves for
    # array-like arguments.
    if fn_list is None:
        fn_list = [text_remove_stopwords, text_stemming]

    for fn in fn_list:
        tokens = fn(tokens)

    return " ".join(tokens)

# Augmentation (Synonym)

[More about the nlpaug augmentation library](https://github.com/makcedward/nlpaug?tab=readme-ov-file#quick-demo)

In [None]:
import nlpaug.augmenter.word as naw

# Synonym augmenter backed by WordNet; called by aug().
aug_ = naw.SynonymAug(aug_src="wordnet")

# model_path = os.path.join(os.environ.get("MODEL_DIR"), 'ppdb-2.0-s-all')
# aug_ppdb = naw.SynonymAug(aug_src='ppdb', model_path=model_path)

def aug(text) -> str:
  """Return a synonym-augmented version of ``text``.

  Args:
      text: Input string to augment.

  Returns:
      The augmented string. Empty input is returned unchanged, and the
      original text is returned if the augmenter yields no result.
  """
  # Nothing to augment; also avoids indexing into an empty result below.
  if not text:
    return text

  augmented_text = aug_.augment(text)

  # Newer nlpaug releases return a list of strings, older ones a bare string;
  # indexing a bare string with [0] would return only its first character.
  if isinstance(augmented_text, str):
    return augmented_text

  return augmented_text[0] if augmented_text else text

# Dataset Mapping

In [None]:
# Synonym augmentation: pass every input text through aug(); targets are
# copied through unchanged.
dataset.map(
    lambda x: {
        dataset.inp: aug(x[dataset.inp]),
        dataset.target: x[dataset.target],
    }
)

Map:   0%|          | 0/22060 [00:00<?, ? examples/s]

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

Map:   0%|          | 0/2931 [00:00<?, ? examples/s]

<__main__.NewDataset at 0x7d29877c87c0>

In [None]:
# Stop-word removal and stemming: pass every input text through
# text_process(); targets are copied through unchanged.
# NOTE(review): add_new=True / shuffle=True are NewDataset.map options —
# presumably the processed examples are appended to the existing ones and the
# result is shuffled; confirm against NewDataset in main.py.
dataset.map(
    lambda x: {
        dataset.inp: text_process(x[dataset.inp]),
        dataset.target: x[dataset.target],
    },
    add_new=True,
    shuffle=True,
)

Map:   0%|          | 0/22060 [00:00<?, ? examples/s]

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

Map:   0%|          | 0/2931 [00:00<?, ? examples/s]

<__main__.NewDataset at 0x7d29877c87c0>