## Fine Tuning

In [None]:
import torch.multiprocessing as mp
# Use the 'spawn' start method for torch.multiprocessing. force=True replaces any start
# method that was already selected in this process.
# NOTE(review): the training cell launches via accelerate's notebook_launcher — confirm
# that forcing 'spawn' here is compatible with how that launcher starts workers.
mp.set_start_method('spawn', force=True)


In [None]:
import torch, torchvision

# Report whether a CUDA device is usable and which CUDA build this PyTorch was compiled
# against; the PyTorch version string is the cell's displayed output.
cuda_available = torch.cuda.is_available()
if cuda_available:
    cuda_version = torch.version.cuda
else:
    cuda_version = "CUDA not available"
print(cuda_version)

if not cuda_available:
    print("CUDA is not available. PyTorch is using the CPU.")
else:
    print("CUDA is available. PyTorch is using the GPU:", torch.cuda.get_device_name(0))
torch.__version__


In [None]:
from transformers import InputExample, InputFeatures
from sklearn.model_selection import train_test_split
from transformers import create_optimizer
from datasets import load_dataset
from tqdm import tqdm
from collections import Counter

In [None]:
import torch

# Report how many CUDA devices PyTorch can see.
print(f"Number of GPUs available: {torch.cuda.device_count()}")


In [None]:
import transformers
# Show the installed transformers version as the cell output.
transformers.__version__

In [None]:
# !unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
from datasets import load_from_disk, ClassLabel
# Load every split fully into RAM. `keep_in_memory=True` makes `load_from_disk` copy the
# Arrow data into memory directly, instead of pushing each row through a Python-level
# identity `map` just to obtain an in-memory copy.
train_dataset = load_from_disk('/data/amazon_train_dataset', keep_in_memory=True)
test_dataset = load_from_disk('/data/amazon_test_dataset', keep_in_memory=True)
validation_dataset = load_from_disk('/data/amazon_validation_dataset', keep_in_memory=True)
# Augmented variants of the training data (synonym replacement and back-translation).
augmented_synonym = load_from_disk('/data/augmented_synonym', keep_in_memory=True)
augmented_back_translation = load_from_disk('/data/augmented_back_translation', keep_in_memory=True)

In [None]:

# Reshape the augmented splits before concatenation (presumably so their columns match
# `train_dataset` — verify against the saved datasets).
# Each step is guarded by a column check so re-running this cell is a no-op. Without the
# guard, a second run would drop the freshly renamed `cleaned_reviews` column and then
# fail on the rename, leaving the dataset without its text.
if 'augmented_cleaned_reviews' in augmented_back_translation.column_names:
    # Replace the original text column with the back-translated text.
    augmented_back_translation = (
        augmented_back_translation
        .remove_columns(['cleaned_reviews'])
        .rename_columns({'augmented_cleaned_reviews': 'cleaned_reviews'})
    )
if 'augmented' in augmented_synonym.column_names:
    augmented_synonym = augmented_synonym.remove_columns(['augmented'])

In [None]:
from datasets import concatenate_datasets

# Augmented training set: the original rows followed by the back-translated rows and then
# the synonym-augmented rows.
train_dataset_aug = concatenate_datasets([train_dataset, augmented_back_translation, augmented_synonym])

In [None]:
# train_dataset_aug = train_dataset_aug.map(lambda x: {'cleaned_reviews': clean_text(x['cleaned_reviews'])}, num_proc=4)
# test_dataset = test_dataset.map(lambda x: {'cleaned_reviews': clean_text(x['cleaned_reviews'])}, num_proc=4)
# validation_dataset = validation_dataset.map(lambda x: {'cleaned_reviews': clean_text(x['cleaned_reviews'])}, num_proc=4)

In [None]:
import random, numpy as np
import torch

# Seed every RNG in play (PyTorch CPU, all CUDA devices, NumPy, Python) for reproducibility.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

# Class-id <-> class-name maps; the reverse map is derived from the forward one.
id2label = dict(enumerate(("NEGATIVE", "NEUTRAL", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Hub checkpoint used for both the tokenizer and the classifier; an alternative checkpoint
# name is kept in the trailing comment.
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment-latest' #'distilbert-base-uncased'

# Initialize the tokenizer that matches MODEL
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Tokenize function
def preprocess(batch):
    """Tokenize ``batch['cleaned_reviews']`` and attach ``batch['target']`` as ``labels``.

    The text is truncated and padded to exactly 512 tokens using the notebook-level
    ``tokenizer``. Returns the tokenizer output with an extra ``labels`` entry.
    """
    encoding = tokenizer(
        batch['cleaned_reviews'],
        max_length=512,
        padding='max_length',
        truncation=True,
    )
    # The model's loss expects the targets under the key `labels`.
    encoding['labels'] = batch['target']
    return encoding


# Apply the tokenizer to every split.
# `preprocess` is written for batches (the tokenizer call accepts a list of texts and the
# labels are copied through as a list), so run it with batched=True. This hands the
# tokenizer many rows per call instead of invoking it once per example.
tokenized_train_dataset = train_dataset_aug.map(preprocess, batched=True, num_proc=4)
tokenized_val_dataset = validation_dataset.map(preprocess, batched=True, num_proc=4)
tokenized_test_dataset = test_dataset.map(preprocess, batched=True, num_proc=4)

In [None]:
from datasets import ClassLabel

num_classes = 3  # Adjust this to the actual number of classes
class_label = ClassLabel(num_classes=num_classes)

# Re-type the `labels` column of every tokenized split as a ClassLabel feature.
tokenized_train_dataset, tokenized_test_dataset, tokenized_val_dataset = (
    split.cast_column("labels", class_label)
    for split in (tokenized_train_dataset, tokenized_test_dataset, tokenized_val_dataset)
)

In [None]:
# Columns the model / collator consume; everything else is dropped from each split.
keep_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']


def _drop_extra_columns(dataset):
    """Return `dataset` without any column that is not listed in `keep_columns`.

    The column list is computed per dataset, so a split whose columns differ from the
    others is still pruned correctly. `remove_columns` is used instead of an identity
    `map(remove_columns=...)`, which would re-process every row just to drop columns.
    """
    extra = [col for col in dataset.column_names if col not in keep_columns]
    return dataset.remove_columns(extra) if extra else dataset


# Kept for any later cell that inspects these names.
all_columns = tokenized_train_dataset.column_names
eval_columns = tokenized_test_dataset.column_names
remove_columns = [col for col in all_columns if col not in keep_columns]
e_columns = [col for col in eval_columns if col not in keep_columns]

tokenized_train_dataset_updated = _drop_extra_columns(tokenized_train_dataset)
tokenized_val_dataset_updated = _drop_extra_columns(tokenized_val_dataset)
tokenized_test_dataset_updated = _drop_extra_columns(tokenized_test_dataset)
print('Train Column', tokenized_train_dataset_updated.column_names)
print('Eval Column', tokenized_test_dataset_updated.column_names)

In [None]:
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, BertConfig
from transformers import DefaultDataCollator, DataCollatorWithPadding
from torch.cuda.amp import autocast, GradScaler

# Release cached, unused GPU memory held by PyTorch's allocator before building loaders.
torch.cuda.empty_cache()

# Hyperparameters
train_batch_size = eval_batch_size = 50
learning_rate = 2e-5
epochs = 6  # Adjust epochs as needed

# def collate_fn(batch):
#     input_ids = torch.stack([torch.tensor(example['input_ids']) for example in batch])
#     attention_mask = torch.stack([torch.tensor(example['attention_mask']) for example in batch])
#     labels = torch.tensor([example['label'] for example in batch])
#     return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': labels}

# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    tokenized_train_dataset_updated,
    collate_fn=data_collator,
    batch_size=train_batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    tokenized_val_dataset_updated,
    collate_fn=data_collator,
    batch_size=eval_batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    tokenized_test_dataset_updated,
    collate_fn=data_collator,
    batch_size=eval_batch_size,
    shuffle=False,
)

In [None]:
# GPU memory (bytes) currently occupied by tensors on the current CUDA device.
torch.cuda.memory_allocated()

In [None]:
# GPU memory (bytes) currently held by PyTorch's caching allocator on the current device.
torch.cuda.memory_reserved()

In [None]:
from accelerate import Accelerator
from sklearn.metrics import classification_report, accuracy_score, f1_score

def _snapshot_state_dict(module):
    """Return a detached CPU copy of ``module.state_dict()``.

    ``state_dict()`` returns references to the live parameter tensors, so storing it
    directly as a "best checkpoint" would silently follow every later optimizer step.
    """
    return {name: tensor.detach().cpu().clone() for name, tensor in module.state_dict().items()}


def train_func():
    """Fine-tune ``MODEL`` for 3-class classification with Accelerate and early stopping.

    Reads the notebook-level names ``MODEL``, ``id2label``, ``label2id``, ``learning_rate``,
    ``epochs``, ``train_loader``, ``val_loader`` and ``tokenizer``.

    After every epoch the model is evaluated on the validation loader. Whenever validation
    accuracy improves, the weights are written to ``best_model.pt`` and the model,
    tokenizer and optimizer/scheduler state are saved under ``/model``. Training stops
    after ``patience`` epochs without improvement, at which point the best weights are
    loaded back into the model.

    Returns:
        None. Results are reported through ``accelerator.print`` and the saved files.
    """
    # Let Accelerate own mixed precision: it applies fp16 autocast *and* loss scaling.
    # (A bare autocast() without a GradScaler step leaves fp16 gradients unscaled.)
    # fp16 needs a GPU, so fall back to full precision when CUDA is absent.
    accelerator = Accelerator(mixed_precision="fp16" if torch.cuda.is_available() else "no")

    config = AutoConfig.from_pretrained(
        MODEL,
        num_labels=3,
        id2label=id2label,
        label2id=label2id,
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL, config=config)

    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0.01,
    )
    acc_train_loader = train_loader
    acc_val_loader = val_loader

    # Linear schedule with 10% warmup.
    # NOTE(review): the step count is taken from the loader *before* prepare(), i.e. from
    # the un-sharded loader — confirm this matches how the prepared scheduler advances
    # in multi-GPU runs.
    num_train_steps = len(acc_train_loader) * epochs
    num_warmup_steps = int(0.1 * num_train_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps,
    )
    model, optimizer, acc_train_loader, acc_val_loader, scheduler = accelerator.prepare(
        model, optimizer, acc_train_loader, acc_val_loader, scheduler
    )

    # Early-stopping bookkeeping.
    best_val_accuracy = 0
    patience = 3
    trigger_times = 0
    best_model_state = None

    save_path = "/model/saved_model"
    tokenizer_path = "/model/saved_tokenizer"
    optimizer_scheduler_path = "/model/optimizer_scheduler.pt"

    # Only one progress bar per machine.
    show_progress = accelerator.is_local_main_process

    for epoch in range(epochs):
        # ---- Training ----
        model.train()
        total_train_loss = 0.0
        correct_train = 0
        total_train = 0

        train_loop = tqdm(
            acc_train_loader,
            desc=f"Training Epoch {epoch+1}",
            leave=False,
            disable=not show_progress,
        )
        for batch in train_loop:
            # Batches from the prepared loader are already on the right device.
            labels = batch['labels']

            optimizer.zero_grad()
            outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
            loss = outputs.loss

            # accelerator.backward applies loss scaling when mixed precision is enabled.
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()

            # Running metrics; these are local to this process's shard of the data.
            loss_value = loss.item()
            total_train_loss += loss_value
            predicted = outputs.logits.argmax(dim=1)
            correct_train += (predicted == labels).sum().item()
            total_train += labels.size(0)
            train_loop.set_postfix(loss=loss_value, accuracy=correct_train / total_train)

        train_loss = total_train_loss / max(len(acc_train_loader), 1)
        train_accuracy = correct_train / max(total_train, 1)

        # ---- Validation ----
        model.eval()
        total_val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            val_loop = tqdm(
                acc_val_loader,
                desc=f"Validating Epoch {epoch+1}",
                leave=False,
                disable=not show_progress,
            )
            for batch in val_loop:
                labels = batch['labels']
                outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
                total_val_loss += outputs.loss.item()
                predicted = outputs.logits.argmax(dim=1)

                # Gather across processes (dropping samples duplicated to even out the
                # last batch) so every process sees the same predictions and labels.
                gathered_preds, gathered_labels = accelerator.gather_for_metrics((predicted, labels))
                all_preds.extend(gathered_preds.cpu().tolist())
                all_labels.extend(gathered_labels.cpu().tolist())

        # The loss is averaged over this process's batches only; accuracy and F1 use the
        # gathered predictions, so they are identical on every process. That keeps the
        # early-stopping / checkpoint decision below in lockstep across processes.
        val_loss = total_val_loss / max(len(acc_val_loader), 1)
        val_accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average='weighted')

        accelerator.print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, "
                          f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, F1 Score: {f1:.4f}")
        accelerator.print(classification_report(all_labels, all_preds, digits=4))

        # ---- Early stopping / checkpointing ----
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            trigger_times = 0

            unwrapped_model = accelerator.unwrap_model(model)
            # Independent copy, so later training steps cannot overwrite the checkpoint.
            best_model_state = _snapshot_state_dict(unwrapped_model)

            # Make sure every process has finished the epoch before files are written.
            accelerator.wait_for_everyone()
            accelerator.save(best_model_state, 'best_model.pt')

            if accelerator.is_main_process:  # Only the main process writes the files
                # Model weights and configuration
                unwrapped_model.save_pretrained(save_path)
                print(f"Model saved to {save_path}")

                tokenizer.save_pretrained(tokenizer_path)
                print(f"Tokenizer saved to {tokenizer_path}")

                # Optimizer and scheduler state for later fine-tuning
                torch.save(
                    {
                        "optimizer_state_dict": optimizer.state_dict(),
                        "scheduler_state_dict": scheduler.state_dict(),
                    },
                    optimizer_scheduler_path,
                )
                print(f"Optimizer and scheduler states saved to {optimizer_scheduler_path}")
        else:
            trigger_times += 1
            if trigger_times >= patience:
                accelerator.print("Early stopping triggered.")
                # best_model_state stays None if accuracy never improved; nothing to restore.
                if best_model_state is not None:
                    accelerator.unwrap_model(model).load_state_dict(best_model_state)
                break


In [None]:
from accelerate import notebook_launcher

# Launch `train_func` through Accelerate, one worker per visible GPU.
# NOTE(review): multi-GPU notebook launches generally expect CUDA not to have been
# initialised earlier in the kernel — confirm the earlier diagnostic cells do not break this.
if __name__ == '__main__':
    notebook_launcher(
        train_func,
        args=(),
        num_processes=torch.cuda.device_count(),  # Adjusts automatically to GPU availability
    )


In [None]:
# # Plot training metrics
# import matplotlib.pyplot as plt
# 
# epochs_range = range(1, len(train_losses) + 1)
# plt.figure(figsize=(12, 4))
# 
# plt.subplot(1, 2, 1)
# plt.plot(epochs_range, train_losses, label='Training Loss')
# plt.plot(epochs_range, val_losses, label='Validation Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend(loc='upper right')
# plt.title('Loss over Epochs')
# 
# plt.subplot(1, 2, 2)
# plt.plot(epochs_range, train_accuracies, label='Training Accuracy')
# plt.plot(epochs_range, val_accuracies, label='Validation Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(loc='lower right')
# plt.title('Accuracy over Epochs')
# 
# plt.show()

In [None]:
# Distinct target values present in the (un-augmented) training and validation splits.
train_labels, val_labels = (
    set(split['target']) for split in (train_dataset, validation_dataset)
)
print(f"Training Labels: {train_labels}")
print(f"Validation Labels: {val_labels}")

In [None]:
# Sanity check: pull one collated batch from the validation loader and inspect its labels
# and the shape of its input_ids tensor.
batch = next(iter(val_loader))
print(batch['labels'])
print(batch['input_ids'].shape)

In [None]:

# Per-class example counts for the (un-augmented) training split and the validation split.
# Note: this rebinds `train_labels` / `val_labels` to the raw target columns.
train_labels = train_dataset['target']
val_labels = validation_dataset['target']
train_label_counts, val_label_counts = Counter(train_labels), Counter(val_labels)

print(f"Training label distribution: {train_label_counts}")
print(f"Validation label distribution: {val_label_counts}")