## Data Pre-processing

In [None]:
from datasets import load_dataset
import re
import string
import html
import torch
from tqdm.auto import tqdm

In [None]:
# Raw reviews file (JSON), given relative to the notebook's working directory.
path = 'amazon_dataset.json'
# Load it with the generic 'json' builder; later cells read the 'train' entry of the result.
dataset = load_dataset('json', data_files=path)
# Bare expression so the notebook displays the loaded object.
dataset

In [None]:
# Size of the loaded object, via the built-in len() rather than a direct dunder call.
len(dataset)

In [None]:
raw_dataset = dataset['train']

In [None]:
# Define a function to concatenate title and text
def combine_title_text(example):
    """Add a ``reviews`` field that joins the review title and body.

    Args:
        example: A single dataset row (dict-like) with ``title`` and ``text``
            entries. A value of ``None`` is treated as an empty string.

    Returns:
        The same ``example`` object, with ``example['reviews']`` set to the
        title, then ``". "``, then the text.
    """
    # A record whose title or text is None would otherwise raise TypeError on
    # the concatenation below and abort the whole map() run.
    title = "" if example['title'] is None else example['title']
    text = "" if example['text'] is None else example['text']
    example['reviews'] = title + ". " + text
    return example

# Add the combined 'reviews' field to every row, using 4 worker processes.
raw_dataset = raw_dataset.map(combine_title_text, num_proc=4)

### Text Cleaning

In [None]:
# ASCII emoticon -> plain-English word that replaces it in the review text.
# convert_emojis_to_text applies these as literal substring replacements.
# Backslashes are escaped, so "\\o/" is the three-character emoticon \o/.
emoticons_dict = {
    ":*": "kiss", ":-*": "kiss", ":x": "kiss", ":-)": "happy", ":-))": "happy",
    ":-)))": "happy", ":-))))": "happy", ":-)))))": "happy", ":-))))))": "happy",
    ":)": "happy", ":))": "happy", ":)))": "happy", ":))))": "happy", ":)))))": "happy",
    ":))))))": "happy", ":)))))))": "happy", ":o)": "happy", ":]": "happy", ":3": "happy",
    ":c)": "happy", ":>": "happy", "=]": "happy", "8)": "happy", "=)": "happy", ":}": "happy",
    ":^)": "happy", "|;-)": "happy", ":'-)": "happy", ":')": "happy", "\\o/": "happy",
    "*\\0/*": "happy", ":-D": "laugh", ":D": "laugh", "8-D": "laugh", "8D": "laugh",
    "x-D": "laugh", "xD": "laugh", "X-D": "laugh", "XD": "laugh", "=-D": "laugh", "=D": "laugh",
    "=-3": "laugh", "=3": "laugh", "B^D": "laugh", ">:[" : "sad", ":-(": "sad", ":-(((": "sad",
    ":(": "sad", ";)": "wink", ":-P": "tong", ">:\\": "annoyed", ":-|": "annoyed",
    "<3": "heart", "o_O": "surprise", ">:)": "devil", "D:<": "sad", ":-#": "seallips", "O:-)": "angel"
}

In [None]:
import emoji
# Function to convert emojis to text
def convert_emojis_to_text(text):
    """Replace ASCII emoticons and emoji in ``text`` with words.

    Every key of ``emoticons_dict`` found in ``text`` is replaced by its
    mapped word, then ``emoji.demojize`` is applied to the result.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    # Replace the longest emoticons first. Several keys are contained in
    # longer ones (":-)" inside ":-))" and "O:-)", ":)" inside ">:)"), so a
    # plain dictionary-order pass would consume the short form first and
    # leave fragments such as "happy)" or "Ohappy" behind.
    for emoticon in sorted(emoticons_dict, key=len, reverse=True):
        text = text.replace(emoticon, emoticons_dict[emoticon])
    return emoji.demojize(text)

In [None]:
from nltk.corpus import wordnet     # Install using `pip install nltk`
import nltk
nltk.download("all")

In [None]:
from langdetect import detect, DetectorFactory
# Fix randomness in language detection for consistency
DetectorFactory.seed = 42

# Define a function to detect and filter English text
def is_english(review):
    """Return True when ``detect`` classifies the review title as English.

    Args:
        review: A dataset row (dict-like); only ``review['title']`` is read.

    Returns:
        ``True`` if ``detect`` returns ``'en'`` for the title; ``False`` if it
        returns anything else or if reading/classifying the title fails.
    """
    try:
        return detect(review['title']) == 'en'
    except Exception:
        # Best-effort filter: a title that cannot be classified counts as
        # non-English. Catching Exception instead of using a bare ``except``
        # lets KeyboardInterrupt and SystemExit propagate, so a long filter
        # run can still be interrupted.
        return False

# raw_dataset = raw_dataset.filter(is_english, num_proc=4)

In [None]:
def clean_text(text):
    """Normalise a raw review string.

    The steps run in this order: collapse whitespace, strip HTML tags and
    ``[[...]]`` markers, un-escape backslash-escaped slashes, drop URLs,
    translate a few literal unicode escape sequences to ASCII, collapse runs
    of one repeated punctuation character, decode HTML entities, convert
    emoticons/emoji to words, and re-normalise whitespace.

    Args:
        text: The raw review text.

    Returns:
        The cleaned string, with runs of whitespace reduced to single spaces
        and no leading or trailing whitespace. Case is left unchanged.
    """
    # Collapse whitespace up front so that a tag or marker broken across
    # lines sits on one line and is matched by the patterns below ('.' does
    # not match a newline).
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Replace HTML tags with a space instead of nothing, so that words
    # separated only by a tag ("end.<br />Next") are not glued together.
    text = re.sub(r'<.*?>', ' ', text)

    # Drop double-bracket markers such as [[...]].
    text = re.sub(r'\[\[.*?\]\]', '', text)
    # Backslash-escaped slash -> plain slash. Written as '\\/' because '\/'
    # is an invalid escape sequence and triggers a warning on recent Pythons.
    text = text.replace('\\/', '/')

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Literal "\uXXXX" sequences (a backslash followed by 'u' and the code)
    # for typographic characters are mapped to their ASCII equivalents.
    for escaped, plain in (
        (r'\\u2019', "'"),
        (r'\\u201c', '"'),
        (r'\\u201d', '"'),
        (r'\\u2013', '-'),
        (r'\\u200d', ''),
    ):
        text = re.sub(escaped, plain, text)

    # Collapse a run of the same punctuation character ("!!!" -> "!").
    pattern = rf'([{re.escape(string.punctuation)}])\1+'
    text = re.sub(pattern, r'\1', text)

    # Decode HTML entities such as &#34; or &amp;.
    text = html.unescape(text)

    text = convert_emojis_to_text(text)
    # text = text.lower()

    # Remove extra whitespaces: deleting tags, markers and URLs above leaves
    # double or trailing spaces behind, so normalise once more at the end.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run clean_text over the combined 'reviews' field of every row (4 worker
# processes) and store the result under 'cleaned_reviews'.
raw_dataset = raw_dataset.map(lambda x: {'cleaned_reviews': clean_text(x['reviews'])}, num_proc=4)

In [None]:
#Sample text cleaning
a = '''In my search to find a more affordable option to more expensive peppermint oil supplements, 
I ordered this.  Opened it and the rancid oil smell was overwhelming. Not subtle... it was nauseating.  So disappointed.  It does contain sunflower seed oil.  IBgard is expensive and contains artificial dyes.  So I was hopeful for this, but my search continues.  These pills are much better sized than the more expensive brand, but again, the rancid oil smell stopped me from even considering taking it. For those interested, I checked the expiration date and it shows 4/19.  This is my first order of this particular brand of supplements.<br />Here is the ingredient list<br />Oil of peppermint  50mg<br />&#34;Other ingredients&#34;  Sunflower seed oil, gelatin, vegetable glycerin, food glaze. Contains less than 2% acetlylated monoglycerides, polysorbate 80, sodium alginate, sorbic acid (preservative) and purified water.<br />These are not for me.  I will be tossing them toot sweet.  I update my reviews all the time if new info would helpful. 
And I DO NOT receive compensation of any kind for my reviews.
'''
print(clean_text(a))

In [None]:
raw_dataset

### Filter Reviews

In [None]:
# Reviews with fewer whitespace-separated words than this are discarded.
MIN_WORD_COUNT = 10


def is_long_enough(review):
    """Tell whether ``review`` contains at least ``MIN_WORD_COUNT`` words.

    Words are the pieces produced by ``str.split()`` with no arguments.
    """
    words = review.split()
    return MIN_WORD_COUNT <= len(words)

# Keep only rows whose 'cleaned_reviews' text passes is_long_enough
# (evaluated with 4 worker processes).
raw_dataset = raw_dataset.filter(lambda x: is_long_enough(x["cleaned_reviews"]), num_proc=4)

In [None]:
from datasets import concatenate_datasets

# Function to filter, sample, and map float labels to integer classes
def filter_sample_and_map_labels(dataset, rating_value, label_int, sample_size=100000):
    """Select the reviews with one rating, sample them and attach a label.

    Args:
        dataset: Object exposing ``filter``, ``shuffle``, ``select`` and
            ``map``; its rows must provide a ``rating`` entry.
        rating_value: Rating to keep. It is converted with ``float()`` before
            being compared with each row's ``rating``.
        label_int: Value written to the ``target`` field of every kept row.
        sample_size: Maximum number of rows to keep for this rating.

    Returns:
        The first ``sample_size`` rows of the shuffled (seed 42) matches, each
        with ``target`` set to ``label_int``. If fewer rows match, all of them
        are returned and a ``UserWarning`` is issued.
    """
    import warnings

    wanted_rating = float(rating_value)
    filtered_dataset = dataset.filter(lambda x: x['rating'] == wanted_rating, num_proc=4)

    # Selecting range(sample_size) fails outright when fewer rows match, so
    # clamp to what is available and tell the caller: a short class makes the
    # combined dataset unbalanced.
    available = len(filtered_dataset)
    if available < sample_size:
        warnings.warn(
            f"Only {available} rows have rating {wanted_rating}; "
            f"requested sample_size={sample_size}. Using all {available} rows."
        )
    take = min(sample_size, available)

    sampled_dataset = filtered_dataset.shuffle(seed=42).select(range(take))
    return sampled_dataset.map(lambda x: {'target': label_int}, num_proc=4)

# Draw a sample for each of ratings 1.0, 3.0 and 5.0 using the function's
# default sample_size of 100,000, labelled 0, 1 and 2 respectively.
# Ratings 2.0 and 4.0 are left out; their calls are kept commented for reference.
rating_1_data = filter_sample_and_map_labels(raw_dataset, 1.0, 0)
#rating_2_data = filter_sample_and_map_labels(raw_dataset, 2.0, 2)
rating_3_data = filter_sample_and_map_labels(raw_dataset, 3.0, 1)
#rating_4_data = filter_sample_and_map_labels(raw_dataset, 4.0, 4)
rating_5_data = filter_sample_and_map_labels(raw_dataset, 5.0, 2)

# Concatenate the three per-rating samples and shuffle (seed 42) so the
# classes are interleaved rather than stored back to back.
sampled_dataset = concatenate_datasets([rating_1_data, rating_3_data, rating_5_data]).shuffle(seed=42)

In [None]:
from datasets import ClassLabel

# Three classes: the sampling cell assigns targets 0, 1 and 2 to ratings
# 1.0, 3.0 and 5.0.
num_classes = 3  # Adjust this to the actual number of classes
class_label = ClassLabel(num_classes=num_classes)

# Map the target column to ClassLabel type
# (presumably required so the column can be used for stratification below — confirm against the datasets docs).
sampled_dt = sampled_dataset.cast_column("target", class_label)

# First split: 70% train, 30% held out, stratified on the target.
train_val_dataset = sampled_dt.train_test_split(test_size=0.3, seed=42, stratify_by_column="target")
train_dataset = train_val_dataset['train']
temp_dataset = train_val_dataset['test']
# Second split: the held-out 30% is halved, giving 15% validation and 15% test overall.
validation_test_dataset = temp_dataset.train_test_split(test_size=0.5, seed=42, stratify_by_column="target")

validation_dataset = validation_test_dataset['train']
test_dataset = validation_test_dataset['test']

### Data Augmentation

In [None]:
import nlpaug.augmenter.word as naw
from torch.cuda.amp import autocast


# Initialize augmenters
synonym_aug = naw.SynonymAug(aug_src='wordnet')

def create_back_translation_model(device):
    """Build a ``naw.BackTranslationAug`` augmenter placed on ``device``.

    Text is translated with the English->German model and back with the
    German->English model.
    """
    # using smaller model for GPU compatibility
    translation_models = {
        'from_model_name': 'Helsinki-NLP/opus-mt-en-de',
        'to_model_name': 'Helsinki-NLP/opus-mt-de-en',
    }
    return naw.BackTranslationAug(device=device, **translation_models)
'''Large model for better results :
    from_model_name='facebook/wmt19-en-de',  
    to_model_name='facebook/wmt19-de-en',
    '''
# contextual_word_embs_aug = naw.ContextualWordEmbsAug(
#     model_path='bert-base-uncased', action="insert", device='cuda'
# )


def batch_synonym_augmentation(examples):
    """Apply ``synonym_aug`` to one batch of cleaned reviews.

    Args:
        examples: Batch mapping; ``examples['cleaned_reviews']`` is passed to
            ``synonym_aug.augment``.

    Returns:
        A dict whose ``cleaned_reviews`` entry holds the augmenter output and
        whose ``augmented`` entry holds one ``True`` per returned item.
    """
    rewritten = synonym_aug.augment(examples['cleaned_reviews'])
    flags = [True for _ in range(len(rewritten))]
    return {'cleaned_reviews': rewritten, 'augmented': flags}

# def batch_back_translation(examples):
#     with autocast():
#         augmented_texts = back_translation_aug.augment(examples['cleaned_reviews'])
#     return {'cleaned_reviews': augmented_texts, 'augmented': [True] * len(augmented_texts)}
# 


In [None]:
# Define augmentation ratio and parallel processing parameters
import random
augmentation_ratio = 0.4  # Apply each augmentation to 40% of the dataset
num_proc = 4  # Use 4 processes in parallel
# Apply batch processing for back-translation with ratio
def apply_batch_augmentation_with_ratio(dataset, aug_func, batch_size=64, ratio=None, seed=None):
    """Augment a randomly chosen subset of ``dataset`` in batches.

    Each row is kept independently when a uniform draw in [0, 1) is less than
    or equal to the ratio, so the subset size is approximately, not exactly,
    ``ratio * len(dataset)``.

    Args:
        dataset: Object exposing ``filter`` and a batched ``map``.
        aug_func: Batch function passed to ``map(..., batched=True)``.
        batch_size: Number of rows handed to ``aug_func`` per call.
        ratio: Fraction of rows to augment. ``None`` (default) uses the
            notebook-level ``augmentation_ratio``.
        seed: Seed for the row selection. ``None`` (default) draws from the
            shared ``random`` module state; an integer makes the chosen
            subset repeatable.

    Returns:
        The selected rows after ``aug_func`` has been applied.
    """
    if ratio is None:
        ratio = augmentation_ratio

    # A private generator keeps a seeded selection independent of whatever
    # else has consumed the global random state.
    draw = random.random if seed is None else random.Random(seed).random

    # Apply augmentation only to the specified ratio of the dataset
    selected = dataset.filter(lambda x: draw() <= ratio)

    # Run the augmentation function over the selected rows in batches
    return selected.map(
        aug_func,
        batched=True,
        batch_size=batch_size
    )

In [None]:
# for i in range(21):
#     print(synonym_augmentation(raw_dataset['cleaned_reviews'][i]), end="\n\n")
#     print('type', type(raw_dataset['cleaned_reviews'][i]) )
# Select the first 5 rows
#subset = raw_dataset.select(range(5))

# Apply `map` to only this subset
# augmented_subset = subset.map(
#     lambda x: apply_augmentation_with_ratio(x, synonym_augmentation)
# )


In [None]:
# # Assuming raw_dataset is a `datasets.Dataset` object
# def extract_rating(example):
#     return {'rating': example['rating']}

# # Map to extract ratings
# ratings_dataset = raw_dataset.map(extract_rating, remove_columns=raw_dataset.column_names, num_proc=4)

# # Convert ratings to a list and count them
# ratings = ratings_dataset['rating']
# rating_counter = Counter(ratings)


# print(rating_counter)

In [None]:
# Report the CUDA devices visible to this kernel: a count followed by one
# line per device, or a single notice when CUDA is unavailable.
if not torch.cuda.is_available():
    print("No CUDA GPUs found.")
else:
    num_devices = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_devices}")
    for i in range(num_devices):
        device_name = torch.cuda.get_device_name(i)
        print(f"Device {i}: {device_name}")

### Apply Data Augmentation - Back Translation

In [None]:
import os
import random
from datasets import Dataset
from accelerate import Accelerator, notebook_launcher
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast
import torch
from accelerate.utils import gather_object


# Set NCCL_TIMEOUT in this process's environment before any worker is launched.
# NOTE(review): presumably meant to lengthen the collective-communication
# timeout (the value looks like seconds) — confirm the launcher honours it.
os.environ["NCCL_TIMEOUT"] = "1800"
# Work on the full training split; the trailing comment is a switch for a
# quick 100-row trial run.
dataset_1 = train_dataset#.select(range(100))
print('Dataset len: ',len(dataset_1))

# Apply augmentation to a random subset of the dataset
augmentation_ratio = 0.4  # Apply augmentation to 40% of the dataset

def filter_function(example):
    """Keep a row when a uniform random draw is at most ``augmentation_ratio``.

    The row content is ignored.
    NOTE(review): no ``random.seed`` call is visible in this notebook, so the
    selected subset presumably differs between runs — confirm.
    """
    return random.random() <= augmentation_ratio

# Filter the dataset for augmentation
filtered_dataset = dataset_1.filter(filter_function)
print('Filtered Dataset len: ',len(filtered_dataset))



def back_translation_multi_gpus():
    """Back-translate ``filtered_dataset`` and save the result to disk.

    Meant to be started once per process (see the ``notebook_launcher`` call
    in this notebook). Each process augments the batches its prepared
    DataLoader yields; the per-process results are then gathered, and the
    main process adds them to the dataset as ``augmented_cleaned_reviews``
    and writes the dataset to ``/data/augmented_back_translation``.

    Reads the notebook-level ``filtered_dataset``; takes no arguments and
    returns nothing.
    """
    accelerator = Accelerator()
    device = accelerator.device
    back_translation_aug = create_back_translation_model(device)

    # Give every row a stable position so that results coming back from
    # different processes can be matched to their source row.
    filtered_dataset_1 = filtered_dataset.add_column("global_index", list(range(len(filtered_dataset))))

    # Set the format for PyTorch
    filtered_dataset_1.set_format(type='torch', columns=['cleaned_reviews', 'global_index'])
    # Create DataLoader
    batch_size = 40
    dataloader = DataLoader(filtered_dataset_1, batch_size=batch_size)
    dataloader = accelerator.prepare(dataloader)

    # (global_index, augmented sentence) pairs produced by this process.
    indexed_corrected_sentences = []

    # No gradient calculation needed during inference
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Running BackTranslation"):
            torch.cuda.empty_cache()
            indices = batch['global_index'].tolist()
            cleaned_reviews = batch['cleaned_reviews']
            augmented_texts = back_translation_aug.augment(cleaned_reviews)
            if len(augmented_texts) != len(cleaned_reviews):
                print(f"Warning: Expected {len(cleaned_reviews)} augmented sentences, but got {len(augmented_texts)}")
                # Handle missing augmentations by retaining original sentences
                # This ensures that row counts remain consistent
                if len(augmented_texts) < len(cleaned_reviews):
                    augmented_texts += cleaned_reviews[len(augmented_texts):]
            # Collect augmented sentences and indices
            indexed_corrected_sentences.extend(zip(indices, augmented_texts))

    accelerator.wait_for_everyone()
    print('Gathering....')
    gathered_results = gather_object(indexed_corrected_sentences)

    # Ensure the results are only collected once
    if accelerator.is_main_process:
        print("Processing final results on the main process...")
        # dict() keeps one sentence per index, de-duplicating any index that
        # was gathered more than once.
        output_dict = dict(gathered_results)
        print('gathered_results',len(gathered_results))

        filtered_dataset_1.reset_format()
        original_reviews = filtered_dataset_1["cleaned_reviews"]
        row_count = len(original_reviews)

        # Build the new column by row position rather than by sorted order:
        # if any index is absent from the gathered results, every later
        # sentence would otherwise shift onto the wrong row (or add_column
        # would fail on the length mismatch after the whole run).
        missing = sum(1 for row in range(row_count) if row not in output_dict)
        if missing:
            print(f"Warning: {missing} rows have no augmented sentence; keeping their original text")
        augmented_cleaned_reviews = [
            output_dict.get(row, original_reviews[row]) for row in range(row_count)
        ]
        print('corrected_sentences',len(augmented_cleaned_reviews))

        filtered_dataset_1 = filtered_dataset_1.add_column("augmented_cleaned_reviews", augmented_cleaned_reviews)
        print('type', type(filtered_dataset_1))

        # Save the dataset to disk
        filtered_dataset_1.save_to_disk('/data/augmented_back_translation')
        print("Augmented dataset saved to '/data/augmented_back_translation'.")
    
    

In [None]:
# Start back_translation_multi_gpus with one process per CUDA device that
# torch reports.
if __name__ == "__main__":
    notebook_launcher(back_translation_multi_gpus, args=(), num_processes=torch.cuda.device_count())  # Adjust automatically based on available GPUs

In [None]:
# from datasets import load_from_disk
# aug_t_dataset = load_from_disk('/kaggle/working/augmented_dataset').map(lambda x: x, keep_in_memory=True)


### Apply Data Augmentation - Synonym Replacement

In [None]:
%%time
augmented_synonym = apply_batch_augmentation_with_ratio(train_dataset, batch_synonym_augmentation)
# augmented_back_translation = apply_batch_augmentation_with_ratio(train_dataset, batch_back_translation)

In [None]:
# augmented_dataset = concatenate_datasets([sampled_dataset, augmented_synonym, augmented_back_translation]).shuffle(seed=42)

### Save Datasets

In [None]:
# augmented_back_translation_path = '/data/augmented_back_translation'
# augmented_back_translation.save_to_disk(augmented_back_translation_path)
augmented_synonym_path = '/data/augmented_synonym'
augmented_synonym.save_to_disk(augmented_synonym_path)

In [None]:
train_path = '/data/amazon_train_dataset'
train_dataset.save_to_disk(train_path)

In [None]:
test_path = '/data/amazon_test_dataset'
test_dataset.save_to_disk(test_path)

In [None]:
val_path = '/data/amazon_validation_dataset'
validation_dataset.save_to_disk(val_path)

In [None]:
!zip -r augmented_split_dataset.zip "$train_path" "$test_path" "$val_path" "$augmented_synonym_path" 
#"$augmented_back_translation_path"