In [None]:
! pip install -q datasets==2.18.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os

# Export the model directory name through the process environment.
os.environ["MODEL_DIR"] = "models"

# Define NewDataset class

In [17]:
# required
# pip install -q datasets==2.18.0

from datasets import load_dataset, concatenate_datasets

class NewDataset():
    """Several Hugging Face datasets merged into shared splits with unified column names.

    Every source dataset is reduced to an input column and a target column,
    both renamed to common names, and the per-split parts are concatenated.
    The merged splits live in ``self.dict_dataset`` (split name -> dataset).
    """

    def __init__(self, datasets, dataset_preprocesses=None, input_col_name='inp', target_col_name='target'):
        """
        Load, normalise and concatenate the given datasets.

        ``datasets`` is expected to look like:

            {
              <dataset path or preset>: (<input col name>, <target col name>),
              ...
            }

        To preprocess a dataset right after loading (before the columns are
        selected and renamed), pass ``dataset_preprocesses``:

            {
              <dataset path or preset>: <fn>
            }

        where ``<fn>`` receives the loaded dataset and returns the processed one.

        ``input_col_name`` / ``target_col_name`` are the unified column names
        used in the merged result.

        Every dataset must provide a 'train' split. 'validation' and 'test'
        are merged from the datasets that have them; a split that no dataset
        provides is left out of ``self.dict_dataset``.

        Raises:
            ValueError: if ``datasets`` is empty.
            AssertionError: if a dataset has no 'train' split.
        """
        self.inp = input_col_name
        self.target = target_col_name

        # None instead of a mutable ``{}`` default shared between calls.
        if dataset_preprocesses is None:
            dataset_preprocesses = {}

        if not datasets:
            raise ValueError("'datasets' must contain at least one dataset")

        split_parts = {
            'train': [],
            'validation': [],
            'test': []
        }

        for name, (inp, target) in datasets.items():
            dataset = load_dataset(path=name)  # load the dataset from the Hugging Face Hub

            if name in dataset_preprocesses:
                print(f'{name} have a custom fn')
                dataset = dataset_preprocesses[name](dataset)

            # Drop every column except the two we need, then unify their names.
            dataset = dataset.select_columns([inp, target])
            if inp != self.inp:
                dataset = dataset.rename_column(inp, self.inp)
            if target != self.target:
                dataset = dataset.rename_column(target, self.target)

            assert 'train' in dataset.keys(), f"dataset '{name}' has no 'train' split"

            # Only collect the splits this dataset actually has; indexing a
            # missing 'validation'/'test' split would raise a KeyError.
            for split, parts in split_parts.items():
                if split in dataset.keys():
                    parts.append(dataset[split])

        # concatenate_datasets rejects an empty list, so skip unseen splits.
        self.dict_dataset = {
            split: concatenate_datasets(parts)
            for split, parts in split_parts.items()
            if parts
        }

    def map(self, fn, add_new=False, shuffle=False, **map_kwargs):
        """
        Apply ``fn`` to every split via the split's own ``map`` method.

        Args:
            fn: row function forwarded to ``map``.
            add_new: when true, the mapped rows are appended to the existing
                rows of the split instead of replacing them.
            shuffle: when true, the resulting split is shuffled.
            **map_kwargs: forwarded unchanged to ``map``.

        Returns:
            ``self``, so calls can be chained.
        """
        # Iterate over a snapshot because the dict values are reassigned below.
        for split, dataset in list(self.dict_dataset.items()):
            result = dataset.map(fn, **map_kwargs)

            if add_new:
                result = concatenate_datasets([dataset, result])

            if shuffle:
                # shuffle() returns a new dataset; the result has to be kept,
                # otherwise the shuffle is silently lost.
                result = result.shuffle()

            self.dict_dataset[split] = result
        return self

    @property
    def splits(self):
        """List of the merged split datasets, in the order stored in ``dict_dataset``."""
        return list(self.dict_dataset.values())

    def __str__(self) -> str:
        return str(self.dict_dataset)


In [22]:
def preprocess_ds_for_pygmalion(dataset):
    """Replace the speaker tags in the 'dialogue' and 'summary' fields of every row.

    "#Person1#" becomes "You" and "#Person2#" becomes "[CHARACTER]".
    Returns whatever ``dataset.map`` returns for the rewritten rows.
    """
    # (speaker tag, replacement) pairs, applied in this order.
    speaker_aliases = (
        ("#Person1#", "You"),          # persona 1 becomes the main character
        ("#Person2#", "[CHARACTER]"),  # persona 2 becomes the assistant
    )

    def preprocess(text) -> str:
        for tag, alias in speaker_aliases:
            text = text.replace(tag, alias)
        return text

    def rewrite_row(x):
        return {"dialogue": preprocess(x["dialogue"]), 'summary': preprocess(x["summary"])}

    return dataset.map(rewrite_row)

# Dataset path -> (input column, target column) of that dataset.
dataset_params = {
    "knkarthick/dialogsum": ("dialogue", "summary"),
    "npc-engine/light-batch-summarize-dialogue": ("dialogue_text", "t0pp_prediction"),
}

# Dataset path -> function applied to the loaded dataset before its columns are selected.
dataset_preprocesses = {"knkarthick/dialogsum": preprocess_ds_for_pygmalion}

dataset = NewDataset(dataset_params, dataset_preprocesses=dataset_preprocesses)
print(dataset)

knkarthick/dialogsum have a custom fn
{'train': Dataset({
    features: ['inp', 'target'],
    num_rows: 29535
}), 'validation': Dataset({
    features: ['inp', 'target'],
    num_rows: 1500
}), 'test': Dataset({
    features: ['inp', 'target'],
    num_rows: 3500
})}


# Stemming and stop-word removal

In [23]:
! pip install -q nltk

import nltk

# Download the NLTK resources used below.
nltk.download('punkt')
# NOTE(review): recent NLTK releases appear to load the tokenizer models from
# 'punkt_tab' rather than 'punkt'; fetching both keeps word_tokenize working
# whichever NLTK version the unpinned install resolves to - confirm.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stopword list used by text_remove_stopwords.
eng_stopwords = stopwords.words('english')

from nltk.tokenize import word_tokenize

from nltk.stem import SnowballStemmer

from datasets import load_dataset, DatasetDict, concatenate_datasets

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [24]:
# Module-level English Snowball stemmer shared by text_stemming.
stemmer = SnowballStemmer("english", ignore_stopwords=False)

def text_remove_stopwords(seq, stop_words=None) -> list:
    """Return the tokens of ``seq`` that are not stopwords.

    Args:
        seq: tokenized text (an iterable of string tokens).
        stop_words: optional iterable of stopwords; defaults to the
            module-level ``eng_stopwords`` list.

    Matching is case-insensitive, so capitalised stopwords such as "The" or
    "I" are removed as well. Kept tokens are returned unchanged, in order.
    """
    if stop_words is None:
        stop_words = eng_stopwords

    # A set gives constant-time lookups instead of scanning the list per token.
    blocked = {word.lower() for word in stop_words}
    return [token for token in seq if token.lower() not in blocked]

def text_stemming(seq) -> list:
    """Stem each token of an already tokenized text with the module-level ``stemmer``."""
    stems = []
    for token in seq:
        stems.append(stemmer.stem(token))
    return stems

def text_process(text, fn_list = None) -> str:
    """Tokenize ``text``, run the tokens through a pipeline and join them back.

    Args:
        text: raw input string; runs of whitespace are collapsed to single
            spaces before tokenization.
        fn_list: functions applied in order, each taking and returning a list
            of tokens. Defaults to stopword removal followed by stemming.

    Returns:
        The processed tokens joined with single spaces.
    """
    # Identity check: ``== None`` would invoke a custom ``__eq__`` if present.
    if fn_list is None:
        fn_list = [text_remove_stopwords, text_stemming]

    tokens = word_tokenize(" ".join(text.split()))

    for fn in fn_list:
        tokens = fn(tokens)

    return " ".join(tokens)

# Augmentation (Synonym)

[More about the nlpaug augmentation library](https://github.com/makcedward/nlpaug?tab=readme-ov-file#quick-demo)

In [27]:
! pip -q install nlpaug

import nlpaug.augmenter.word as naw

# WordNet-based synonym augmenter used by aug().
aug_ = naw.SynonymAug(aug_src='wordnet')

# Optional PPDB-based augmenter (disabled):
# model_path = os.path.join(os.environ.get("MODEL_DIR"), 'ppdb-2.0-s-all')
# aug_ppdb = naw.SynonymAug(aug_src='ppdb', model_path=model_path)

def aug(text) -> str:
    """Return a synonym-augmented version of ``text``.

    The text is passed to the module-level ``aug_`` augmenter. If the
    augmenter returns a plain string it is returned as is; if it returns an
    empty result (for example for empty input) the original ``text`` is
    returned; otherwise the first element of the result is returned.
    """
    augmented_text = aug_.augment(text)
    # augmented_text = aug_ppdb.augment(augmented_text)

    # NOTE(review): the return type of augment() is assumed to differ between
    # nlpaug versions (str vs. list of str) - confirm. Indexing a str with
    # [0] would silently return only its first character.
    if isinstance(augmented_text, str):
        return augmented_text

    # Nothing was produced: fall back to the input instead of raising IndexError.
    if not augmented_text:
        return text

    return augmented_text[0]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Dataset Mapping

In [28]:
# Synonym augmentation: replace the input column of every split with its
# augmented text; the target column is passed through unchanged.
dataset.map(
    lambda row: {
        dataset.inp: aug(row[dataset.inp]),
        dataset.target: row[dataset.target],
    }
)



Map:   0%|          | 0/29535 [00:00<?, ? examples/s]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

<__main__.NewDataset at 0x7e076d2ad930>

In [29]:
# Stopword removal and stemming: run text_process over the input column and
# append the processed rows to the existing ones (add_new=True), requesting a
# shuffle afterwards; the target column is passed through unchanged.
dataset.map(
    lambda row: {
        dataset.inp: text_process(row[dataset.inp]),
        dataset.target: row[dataset.target],
    },
    add_new=True,
    shuffle=True,
)

Map:   0%|          | 0/29535 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

<__main__.NewDataset at 0x7e076d2ad930>

# Push to HuggingFace Hub

In [30]:
import os
from getpass import getpass

# Never commit an access token to the notebook. Use HF_TOKEN from the
# environment when it is already set, otherwise prompt for it without echo.
# NOTE(review): the token that was previously hardcoded here has been exposed
# and should be revoked.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

# import wandb
# wandb.init(mode='disabled')

In [31]:
from datasets import DatasetDict

# Target dataset repository on the Hugging Face Hub.
HUB_REPO_ID = "doublecringe123/dialoguesum-booksum-stemmed-augmented"

# Wrap the merged splits in a DatasetDict and upload them.
dataset_dict = DatasetDict(dataset.dict_dataset)
dataset_dict.push_to_hub(HUB_REPO_ID)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/60 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/doublecringe123/dialoguesum-booksum-stemmed-augmented/commit/c7534c279d9fe5f65352c9fc137490684ff76bf3', commit_message='Upload dataset', commit_description='', oid='c7534c279d9fe5f65352c9fc137490684ff76bf3', pr_url=None, pr_revision=None, pr_num=None)