<a href="https://colab.research.google.com/github/satyajitghana/TSAI-DeepNLP-END2.0/blob/main/07_Seq2Seq/SST_Redo/SST_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pytorch-lightning --quiet
! pip install nlpaug --quiet
! pip install gdown==3.13.0

# Stanford Sentiment TreeBank Dataset

In [None]:
import pandas as pd

In [None]:
! gdown https://drive.google.com/uc?id=1urNi0Rtp9XkvkxxeKytjl1WoYNYUEoPI

Downloading...
From: https://drive.google.com/uc?id=1urNi0Rtp9XkvkxxeKytjl1WoYNYUEoPI
To: /content/sst_dataset.zip
5.04MB [00:00, 19.1MB/s]


In [None]:
! unzip sst_dataset.zip

Archive:  sst_dataset.zip
   creating: sst_dataset/
  inflating: sst_dataset/sst_dataset_augmented.csv  
  inflating: sst_dataset/sst_dataset_cleaned.csv  
  inflating: sst_dataset/sst_dataset_synonym.csv  
  inflating: sst_dataset/sst_dataset_translated.csv  


## PyTorch `Dataset`

In [None]:
import torch
import torchtext
import pytorch_lightning as pl

from torch.utils.data import Dataset, DataLoader, random_split

from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import sequential_transforms, ngrams_func, totensor, vocab_func
from torchtext.vocab import build_vocab_from_iterator

import torchtext.experimental.functional as text_f

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

import random
import gdown

import pandas as pd

from pathlib import Path
from zipfile import ZipFile

from typing import Optional, Tuple, Any, Dict, List

In [None]:
# Same Google Drive archive as the shell download above, fetched through the
# gdown Python API. The call returns the local path; the recorded output shows
# it reporting "File exists" instead of downloading when `output` is already present.
url = 'https://drive.google.com/uc?id=1urNi0Rtp9XkvkxxeKytjl1WoYNYUEoPI'
output = 'sst_dataset.zip'

gdown.cached_download(url, output)

File exists: sst_dataset.zip


'sst_dataset.zip'

In [None]:
# Read the augmented CSV straight out of the archive (no extraction needed);
# the first CSV column becomes the DataFrame index.
with ZipFile('sst_dataset.zip') as archive, archive.open('sst_dataset/sst_dataset_augmented.csv') as csv_file:
    dataset = pd.read_csv(csv_file, index_col=0)

In [None]:
# Preview the raw columns: original sentence/phrase, split label, sentiment value and the augmented text variants.
dataset.head()

Unnamed: 0,sentence_index,sentence,phrase,phrase_ids,splitset_label,sentiment_values,phrase_cleaned,synonym_sentences,backtranslated
0,1,The Rock is destined to be the 21st Century 's...,The Rock is destined to be the 21st Century 's...,226166,1,3,The Rock is destined to be the 21st Century's ...,The Rock is destine to be the twenty first Cen...,Rock is set to be the 21st century's new `` Co...
1,2,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of `` Th...,226300,1,4,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of ` ` T...,The gorgeously elaborate continue to `` The Lo...
2,3,Effective but too-tepid biopic,Effective but too-tepid biopic,13995,2,2,Effective but too-tepid biopic,Effective but too - lukewarm biopic,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...,If you sometimes like to go to the movies to h...,14123,2,3,If you sometimes like to go to the movies to h...,If you sometimes like to go to the motion pict...,If you sometimes want to go to the movies to p...
4,5,"Emerges as something rare , an issue movie tha...","Emerges as something rare , an issue movie tha...",13999,2,4,"Emerges as something rare , an issue movie tha...","Emerges as something rare, an effect movie tha...",One of the rare 'and therefore does not feel h...


In [None]:
# Test split: rows whose `splitset_label` is 2, keeping only the cleaned phrase
# (exposed as `phrase`) and its sentiment value, re-indexed from 0.
dataset_test = (
    dataset.loc[dataset['splitset_label'].isin([2]), ['phrase_cleaned', 'sentiment_values']]
    .rename(columns={"phrase_cleaned": 'phrase'})
    .reset_index(drop=True)
)

In [None]:
# Training pool: rows whose `splitset_label` is 1 or 3, all columns kept.
dataset_train_raw = dataset.loc[dataset['splitset_label'].isin([1, 3])]

In [None]:
# Cleaned phrase + sentiment columns of the training pool.
# NOTE(review): no later cell visible here reads `phrase_cleaned` — confirm it is still needed.
phrase_cleaned = dataset_train_raw[['phrase_cleaned', 'sentiment_values']]

In [None]:
# Stack the three text variants of every training row (cleaned, synonym-replaced,
# back-translated) under a single `phrase` column, each keeping its sentiment value.
dataset_train = pd.concat(
    [
        dataset_train_raw[[text_column, 'sentiment_values']].rename(columns={text_column: 'phrase'})
        for text_column in ('phrase_cleaned', 'synonym_sentences', 'backtranslated')
    ],
    ignore_index=True,
)

In [None]:
# First rows of the stacked training frame (these come from the `phrase_cleaned` variant, which is concatenated first).
dataset_train.head()

Unnamed: 0,phrase,sentiment_values
0,The Rock is destined to be the 21st Century's ...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer\/composer Bryan Adams contributes a sle...,3
3,You'd think by now America would have had enou...,2
4,Yet the act is still charming here .,3


In [None]:
# Row count, dtypes and null counts of the training frame.
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27483 entries, 0 to 27482
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   phrase            27483 non-null  object
 1   sentiment_values  27483 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 429.5+ KB


In [None]:
# First rows of the test frame (cleaned phrases only, no augmented variants).
dataset_test.head()

Unnamed: 0,phrase,sentiment_values
0,Effective but too-tepid biopic,2
1,If you sometimes like to go to the movies to h...,3
2,"Emerges as something rare , an issue movie tha...",4
3,The film provides some great insight into the ...,2
4,Offers that rare combination of entertainment ...,4


In [None]:
# Row count, dtypes and null counts of the test frame.
dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2125 entries, 0 to 2124
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   phrase            2125 non-null   object
 1   sentiment_values  2125 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 33.3+ KB


In [None]:
# Summary of split sizes; the train count includes all three stacked text variants.
f'Train Data Size: {len(dataset_train)}, Test Data Size: {len(dataset_test)}'

'Train Data Size: 27483, Test Data Size: 2125'

In [None]:
class StanfordSentimentTreeBank(Dataset):
    """The Stanford Sentiment Tree Bank Dataset
    Stanford Sentiment Treebank V1.0

    What this class does:
    it downloads a pre-processed archive (``URL``) into ``root``, reads
    ``sst_dataset/sst_dataset_augmented.csv`` from it and serves
    ``(label, text)`` pairs of ``torch.long`` tensors. ``label`` is the value of
    the ``sentiment_values`` column and ``text`` holds the vocabulary indices of
    the tokenized phrase.

    * ``split='train'``: rows with ``splitset_label`` 1 or 3; the
      ``phrase_cleaned``, ``synonym_sentences`` and ``backtranslated`` columns
      are stacked, so every row contributes three samples.
    * ``split='test'``: rows with ``splitset_label`` 2, ``phrase_cleaned`` only.

    The remainder of this docstring is the README of the original release
    (``ORIG_URL``).

    This is the dataset of the paper:

    Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank
    Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher Manning, Andrew Ng and Christopher Potts
    Conference on Empirical Methods in Natural Language Processing (EMNLP 2013)

    If you use this dataset in your research, please cite the above paper.

    @incollection{SocherEtAl2013:RNTN,
    title = {{Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank}},
    author = {Richard Socher and Alex Perelygin and Jean Wu and Jason Chuang and Christopher Manning and Andrew Ng and Christopher Potts},
    booktitle = {{EMNLP}},
    year = {2013}
    }

    This file includes:
    1. original_rt_snippets.txt contains 10,605 processed snippets from the original pool of Rotten Tomatoes HTML files. Please note that some snippet may contain multiple sentences.

    2. dictionary.txt contains all phrases and their IDs, separated by a vertical line |

    3. sentiment_labels.txt contains all phrase ids and the corresponding sentiment labels, separated by a vertical line.
    Note that you can recover the 5 classes by mapping the positivity probability using the following cut-offs:
    [0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0]
    for very negative, negative, neutral, positive, very positive, respectively.
    Please note that phrase ids and sentence ids are not the same.

    4. SOStr.txt and STree.txt encode the structure of the parse trees.
    STree encodes the trees in a parent pointer format. Each line corresponds to each sentence in the datasetSentences.txt file. The Matlab code of this paper will show you how to read this format if you are not familiar with it.

    5. datasetSentences.txt contains the sentence index, followed by the sentence string separated by a tab. These are the sentences of the train/dev/test sets.

    6. datasetSplit.txt contains the sentence index (corresponding to the index in datasetSentences.txt file) followed by the set label separated by a comma:
        1 = train
        2 = test
        3 = dev

    Please note that the datasetSentences.txt file has more sentences/lines than the original_rt_snippet.txt.
    Each row in the latter represents a snippet as shown on RT, whereas the former is each sub sentence as determined by the Stanford parser.

    For comparing research and training models, please use the provided train/dev/test splits.

    """

    ORIG_URL = "http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip"
    DATASET_NAME = "StanfordSentimentTreeBank"
    URL = 'https://drive.google.com/uc?id=1urNi0Rtp9XkvkxxeKytjl1WoYNYUEoPI'
    OUTPUT = 'sst_dataset.zip'

    def __init__(self, root, vocab=None, text_transforms=None, label_transforms=None, split='train', ngrams=1, use_transformed_dataset=True):
        """Download (if needed) and load one split of the dataset.

        Args:
            root: directory in which the archive ``OUTPUT`` is stored / looked up.
            vocab: vocabulary used to map tokens to indices. When ``None`` a
                vocabulary is built from the training phrases (also when
                ``split='test'``).
            text_transforms: optional callable applied to the token list after
                tokenization / n-gram expansion and before vocabulary lookup
                (e.g. token-level augmentation).
            label_transforms: optional callable applied to the raw label before
                it is converted to a ``torch.long`` tensor.
            split: ``'train'`` or ``'test'``.
            ngrams: n-gram size handed to ``ngrams_func``.
            use_transformed_dataset: accepted for interface compatibility; not
                used by this implementation.

        Raises:
            ValueError: if ``split`` is neither ``'train'`` nor ``'test'``.
        """

        # Zero-argument super(): `super(self.__class__, self)` resolves to the
        # subclass when this class is inherited from and then recurses forever.
        super().__init__()

        if split not in ['train', 'test']:
            raise ValueError(f'split must be either ["train", "test"] unknown split {split}')

        self.vocab = vocab

        archive_path = Path(root) / self.OUTPUT
        gdown.cached_download(self.URL, archive_path)

        self.generate_sst_dataset(split, archive_path)

        tokenizer = get_tokenizer("basic_english")

        # Base pipeline: raw sentence -> tokens -> n-grams. User-supplied text
        # transforms operate on this token list; vocabulary lookup and tensor
        # conversion are appended below.
        base_transform = sequential_transforms(tokenizer, text_f.ngrams_func(ngrams))

        if self.vocab is None:
            # The vocabulary is always built on the train dataset, using the
            # base pipeline only (no augmentation).
            train_phrases = self.dataset_train["phrase"]
            self.vocab = build_vocab_from_iterator(
                (base_transform(line) for line in train_phrases), len(train_phrases)
            )

        to_indices = text_f.vocab_func(self.vocab)
        to_text_tensor = text_f.totensor(dtype=torch.long)

        if text_transforms is not None:
            self.text_transform = sequential_transforms(
                base_transform, text_transforms, to_indices, to_text_tensor
            )
        else:
            self.text_transform = sequential_transforms(
                base_transform, to_indices, to_text_tensor
            )

        # Honour `label_transforms` (it used to be silently ignored); the
        # tensor conversion always runs last.
        to_label_tensor = text_f.totensor(dtype=torch.long)
        if label_transforms is not None:
            self.label_transform = sequential_transforms(label_transforms, to_label_tensor)
        else:
            self.label_transform = sequential_transforms(to_label_tensor)

    def generate_sst_dataset(self, split, dataset_file):
        """Read the augmented CSV from ``dataset_file`` and set the data frames.

        Sets ``self.dataset_orig`` (a copy of the full CSV), ``self.dataset_train``
        (stacked training variants, always populated because the vocabulary is
        built from it) and ``self.dataset`` (the frame served for ``split``).

        Args:
            split: ``'train'`` selects the stacked training frame; any other
                value selects the test rows.
            dataset_file: path to the downloaded zip archive.
        """

        with ZipFile(dataset_file) as datasetzip:
            with datasetzip.open('sst_dataset/sst_dataset_augmented.csv') as f:
                dataset = pd.read_csv(f, index_col=0)

        self.dataset_orig = dataset.copy()

        # splitset_label 1 and 3 are both used for training.
        dataset_train_raw = dataset[dataset['splitset_label'].isin([1, 3])]
        self.dataset_train = pd.concat(
            [
                dataset_train_raw[[text_column, 'sentiment_values']].rename(columns={text_column: 'phrase'})
                for text_column in ('phrase_cleaned', 'synonym_sentences', 'backtranslated')
            ],
            ignore_index=True,
        )

        if split == 'train':
            self.dataset = self.dataset_train.copy()
        else:
            self.dataset = (
                dataset[dataset['splitset_label'].isin([2])][['phrase_cleaned', 'sentiment_values']]
                .rename(columns={"phrase_cleaned": 'phrase'})
                .reset_index(drop=True)
            )

    @staticmethod
    def discretize_label(label):
        """Map a positivity probability in [0, 1] to one of the 5 class indices.

        Uses the cut-offs [0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0].
        """
        if label <= 0.2: return 0
        if label <= 0.4: return 1
        if label <= 0.6: return 2
        if label <= 0.8: return 3
        return 4

    def __getitem__(self, idx):
        """Return ``(label, text)`` for the row at position ``idx``.

        Note the order: the label comes first, then the token-index tensor.
        """
        text = self.text_transform(self.dataset['phrase'].iloc[idx])
        label = self.label_transform(self.dataset['sentiment_values'].iloc[idx])
        return label, text

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.dataset)

    @staticmethod
    def get_labels():
        """Human-readable class names, indexed by class id."""
        return ['very negative', 'negative', 'neutral', 'positive', 'very positive']

    def get_vocab(self):
        """Return the vocabulary used for token lookup."""
        return self.vocab

    @property
    def collator_fn(self):
        """Collate function for ``DataLoader``.

        The returned callable turns a list of ``(label, sequence)`` samples into
        ``(labels, sequences, lengths)``: stacked labels, sequences right-padded
        with the ``'<pad>'`` index to the longest one in the batch (batch
        first), and the unpadded length of every sequence.
        """
        def collate_fn(batch):
            pad_idx = self.get_vocab()['<pad>']

            labels, sequences = zip(*batch)

            labels = torch.stack(labels)

            # Lengths are taken before padding so they can be used for packing/masking.
            lengths = torch.LongTensor([len(sequence) for sequence in sequences])

            sequences = torch.nn.utils.rnn.pad_sequence(
                sequences,
                padding_value=pad_idx,
                batch_first=True,
            )

            return labels, sequences, lengths

        return collate_fn

In [None]:
# Smoke test: build the training split from the archive in the current directory
# and wrap it in a DataLoader that pads each batch via the dataset's own collate function.
# NOTE: this rebinds `dataset`, which held the raw DataFrame in the cells above;
# re-running those earlier cells afterwards requires reloading the CSV first.
dataset = StanfordSentimentTreeBank(root='.', split='train')
loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=dataset.collator_fn
)

  0%|          | 0/27483 [00:00<?, ?lines/s]

File exists: sst_dataset.zip


100%|██████████| 27483/27483 [00:00<00:00, 44853.31lines/s]


In [None]:
# Number of batches times the batch size (32): an upper bound on the sample
# count, since the final batch may be only partially filled.
len(loader) * 32

27488

In [None]:
# Pull the first batch to inspect what the collate function produces.
batch = next(iter(loader))

In [None]:
# The collate function returns (labels, padded sequences, unpadded lengths), in that order.
labels, text, lengths = batch

In [None]:
# labels and lengths have one entry per sample; text is (batch, longest sequence in this batch).
labels.shape, text.shape, lengths.shape

(torch.Size([32]), torch.Size([32, 45]), torch.Size([32]))

In [None]:
def random_deletion(words, p=0.1):
    """Randomly drop tokens, each one independently with probability ``p``.

    Args:
        words: sequence of tokens.
        p: probability of deleting each individual token.

    Returns:
        ``words`` itself when it has at most one token; otherwise a new list
        with the surviving tokens in their original order. If every token was
        deleted, a list holding one randomly chosen token is returned, so the
        result of a non-empty input is never empty.
    """
    # Nothing can be deleted from an empty or single-token input. The empty
    # case previously fell through to random.choice([]) and raised IndexError.
    if len(words) <= 1:
        return words

    remaining = [word for word in words if random.uniform(0, 1) > p]
    if not remaining:
        # Everything was dropped: keep one random token instead.
        return [random.choice(words)]
    return remaining

def random_swap(sentence, n=3, p=0.1):
    """Return a copy of ``sentence`` with up to ``n`` random token swaps applied.

    Args:
        sentence: sequence of tokens; it is not modified.
        n: maximum number of swap attempts (capped at ``len(sentence)``).
        p: probability of *skipping* an attempt. A swap is performed when a
            uniform draw exceeds ``p``, i.e. with probability ``1 - p``.

    Returns:
        A new list containing the same tokens, possibly reordered. The two
        positions of a swap are drawn with replacement, so an attempt may pick
        the same index twice and leave the order unchanged.
    """
    # Work on a copy so the caller's token list is left untouched.
    swapped = list(sentence)
    positions = range(len(swapped))
    attempts = min(n, len(swapped))
    for _ in range(attempts):
        if random.uniform(0, 1) > p:
            idx1, idx2 = random.choices(positions, k=2)
            swapped[idx1], swapped[idx2] = swapped[idx2], swapped[idx1]
    return swapped

## PyTorch Lightning `LightningDataModule`

In [None]:
class SSTDataModule(pl.LightningDataModule):
    """
    LightningDataModule for the Stanford Sentiment Treebank.

    Wraps ``StanfordSentimentTreeBank`` and exposes train / val / test
    dataloaders. No separate validation set is carved out of the training data:
    ``val_dataloader`` and ``test_dataloader`` both iterate over the test split.
    """

    name = "stanford_sentiment_treebank"

    def __init__(
        self,
        data_dir: str = '.',
        val_split: int = 1000,
        num_workers: int = 2,
        batch_size: int = 64,
        *args,
        **kwargs,
    ):
        """
        Args:
            data_dir: where to save/load the data
            val_split: number of training samples intended for a validation
                split. It is stored but currently unused (validation reuses the
                test split).
            num_workers: how many workers to use for loading data
            batch_size: desired batch size.
        """
        super().__init__(*args, **kwargs)

        self.data_dir = data_dir
        self.val_split = val_split
        self.num_workers = num_workers
        self.batch_size = batch_size

        # Placeholders until `setup` runs. `dataset_val` is never assigned by
        # this class because validation reuses `dataset_test`.
        self.dataset_train = ...
        self.dataset_val = ...
        self.dataset_test = ...

        self.SST = StanfordSentimentTreeBank

    def prepare_data(self):
        """Make sure the SST archive is present in `data_dir` (instantiating the dataset downloads it)."""
        self.SST(self.data_dir)

    def setup(self, stage: Optional[str] = None):
        """Build the train and test datasets with their default transforms."""

        train_trans, test_trans = self.default_transforms

        train_dataset = self.SST(self.data_dir, split='train', **train_trans)
        # Share the training vocabulary instead of letting the test dataset
        # rebuild an identical one from the training phrases a second time.
        test_dataset = self.SST(
            self.data_dir, split='test', vocab=train_dataset.get_vocab(), **test_trans
        )

        self.raw_dataset_train = train_dataset
        self.raw_dataset_test = test_dataset

        # The full training set is used for training; `val_split` is not applied.
        self.dataset_train = train_dataset
        self.dataset_test = test_dataset

    def _make_loader(self, dataset, shuffle: bool):
        """Create a DataLoader over `dataset` with this module's batch size, workers and collate function."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            pin_memory=True,
            collate_fn=self.collator_fn
        )

    def train_dataloader(self):
        """Shuffled loader over the (augmented) training set."""
        return self._make_loader(self.dataset_train, shuffle=True)

    def val_dataloader(self):
        """Validation loader; iterates over the test split, unshuffled."""
        return self._make_loader(self.dataset_test, shuffle=False)

    def test_dataloader(self):
        """Test loader; iterates over the test split, unshuffled."""
        return self._make_loader(self.dataset_test, shuffle=False)

    def get_vocab(self):
        """Vocabulary of the training dataset (available after `setup`)."""
        return self.raw_dataset_train.get_vocab()

    @property
    def default_transforms(self):
        """Keyword arguments for the train and test datasets.

        Training text is augmented at token level with random deletion followed
        by random swap; test text is left untouched. Labels get no extra
        transform in either split.
        """
        train_transforms = {
            'text_transforms': text_f.sequential_transforms(
                random_deletion,
                random_swap
            ),
            'label_transforms': None
        }
        test_transforms = {
            'text_transforms': None,
            'label_transforms': None
        }

        return train_transforms, test_transforms

    @property
    def collator_fn(self):
        """Collate function of the training dataset (available after `setup`)."""
        return self.raw_dataset_train.collator_fn

In [None]:
# Build the datamodule with its defaults and create the train/test datasets.
# `setup` is called by hand here because no Trainer is driving the datamodule.
datamodule = SSTDataModule()
datamodule.setup()

  0%|          | 0/27483 [00:00<?, ?lines/s]

File exists: sst_dataset.zip


100%|██████████| 27483/27483 [00:00<00:00, 44661.64lines/s]
  0%|          | 0/27483 [00:00<?, ?lines/s]

File exists: sst_dataset.zip


100%|██████████| 27483/27483 [00:00<00:00, 44941.58lines/s]


In [None]:
# One loader per stage; the val and test loaders are both built over the test split.
train_loader = datamodule.train_dataloader()
val_loader = datamodule.val_dataloader()
test_loader = datamodule.test_dataloader()

In [None]:
# Number of batches per loader (val and test match because they wrap the same data).
len(train_loader), len(val_loader), len(test_loader)

(430, 34, 34)

In [None]:
# One shuffled training batch: a = labels, b = padded token-id sequences, c = unpadded lengths.
a, b, c = next(iter(train_loader))

In [None]:
# Vocabulary of the training dataset, used below to decode token ids back to tokens.
vocab = datamodule.get_vocab()

Example sample text: the token ids of one padded training sequence, then the same sequence decoded back to tokens

In [None]:
# Token ids of sample 30 in the batch; the trailing run of identical ids is padding.
b[30].numpy()

array([  826,    74,  2099,     8,    12, 17011,    15,  9373,    18,
          53,    58,     2,    21,     3, 22737,   180,     4,   100,
        1149,    68,  2035, 11625,  1347,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1])

In [None]:
# Decode the ids back to tokens; the word order reflects the training-time
# augmentation (random deletion and swap), not the original sentence.
' '.join(vocab.itos[x] for x in b[30].numpy())

"unfortunately he carvey ' s rubber - grimace with up no . for the savorless script , get crafted make harris goldberg match <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>"

Label of above text

In [None]:
# Integer class id of the same sample.
a[30].numpy()

array(1)

In [None]:
# Map the class id to its human-readable name via the dataset's label list.
datamodule.dataset_train.get_labels()[a[30].numpy()]

'negative'

In [None]:
# First training phrase as stored in the DataFrame, i.e. before tokenization and augmentation.
# NOTE: this rebinds `text`, which earlier held a batch of padded sequences.
text = datamodule.raw_dataset_train.dataset['phrase'].iloc[0]
text

"The Rock is destined to be the 21st Century's new `` Conan '' and that he's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."