In [None]:
# Install the Hugging Face (transformers, datasets) and Weights & Biases
# packages that the cells below import.
# NOTE(review): versions are not pinned, so a rerun may resolve different releases.
%pip install transformers datasets wandb

In [None]:
import os
from google.colab import drive
# Mount Google Drive and make the project folder the working directory, so
# the relative paths used later (e.g. "./RawData") resolve inside it.
drive.mount("/content/drive")
os.chdir("/content/drive/MyDrive/NLP_Project_1")
# cuBLAS workspace setting described in PyTorch's reproducibility notes for
# deterministic CUDA matrix multiplications; set here before any CUDA work.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# NOTE(review): not referenced by any cell visible here — presumably an output
# name used elsewhere; confirm it is still needed.
fine_tuned_model_name = "to_delete"

In [None]:
# Shell escape: authenticate this runtime with Weights & Biases through its CLI.
!wandb login

import wandb

In [2]:
import pandas as pd
import torch
import datasets
import gc
import numpy as np
import re
import random
import utils

from torch.nn.utils.rnn import pad_sequence
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from collections import deque
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoConfig,  Adafactor, get_cosine_schedule_with_warmup
from transformers import XLNetForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification, AlbertForSequenceClassification

# Run-wide constants: RNG seed, compute device (first GPU when CUDA is
# available, otherwise CPU) and the pretrained checkpoint to fine-tune.
SEED = 20220719
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BACKBONE_NAME = "bert-base-uncased"

# Ask cuDNN for deterministic kernels and disable its autotuner, then seed
# every random number generator the notebook touches.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Tokenizer for the backbone, extended with one extra "_num_" token; models
# built later resize their embeddings to len(tokenizer) to account for it.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE_NAME)
tokenizer.add_tokens("_num_")

model_config = AutoConfig.from_pretrained(BACKBONE_NAME)

# torch.cuda.set_device() only accepts CUDA devices and raises on a CPU
# device, so skip it when DEVICE fell back to CPU.
if DEVICE.type == "cuda":
    torch.cuda.set_device(DEVICE)
print(DEVICE)

ModuleNotFoundError: No module named 'pandas'

In [22]:
# Read the raw data through the project helper and keep the two splits under
# notebook-level names so every sweep run can reuse them.
# NOTE(review): the meaning of the positional arguments 8 and 22 is defined in
# utils.load_data, which is not visible here — confirm before changing them.
dataset = utils.load_data("./RawData", 8, 22, tokenizer)
original_train, original_valid = dataset["train"], dataset["valid"]

# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

In [23]:
def sweep_dataset(train, valid, collator, tokenizer, batch_size):
    """Prepare the train and validation data for one sweep run.

    Both splits go through ``utils.get_dataset`` with the same tokenizer,
    collator and batch size.

    Args:
        train: Training split, forwarded to ``utils.get_dataset``.
        valid: Validation split, forwarded to ``utils.get_dataset``.
        collator: Batch collator, forwarded unchanged.
        tokenizer: Tokenizer, forwarded unchanged.
        batch_size: Batch size, forwarded unchanged.

    Returns:
        A ``(train_data, valid_data)`` tuple holding the two helper results.
    """
    # NOTE(review): the trailing True is passed for BOTH splits; it presumably
    # toggles shuffling inside utils.get_dataset — confirm against that helper.
    train_part = utils.get_dataset(train, tokenizer, collator, batch_size, True)
    valid_part = utils.get_dataset(valid, tokenizer, collator, batch_size, True)
    return train_part, valid_part

def sweep_optimizer(input_model, optimizer, learning_rate) :
    """Build the torch optimizer selected by the sweep configuration.

    Args:
        input_model: Module whose ``parameters()`` will be optimised.
        optimizer: Optimizer name; ``"AdamW"`` or ``"RMSprop"``.
        learning_rate: Learning rate handed to the optimizer.

    Returns:
        ``torch.optim.AdamW`` (eps 1e-6, weight decay 0.02) or
        ``torch.optim.RMSprop`` (weight decay 0.02) over the model parameters.

    Raises:
        ValueError: If ``optimizer`` is not one of the supported names. Without
            this check the name string itself would be returned and only fail
            later, far from the misconfiguration.
    """
    params = input_model.parameters()
    if optimizer == "AdamW" :
        return torch.optim.AdamW(params, lr = learning_rate, eps = 1e-6, weight_decay = 0.02)
    if optimizer == "RMSprop" :
        return torch.optim.RMSprop(params, lr = learning_rate, weight_decay = 0.02)
    raise ValueError(f"Unsupported optimizer {optimizer!r}; expected 'AdamW' or 'RMSprop'")

In [24]:
def create_model(backbone):
    """Load a two-label BERT sequence classifier ready for fine-tuning.

    Args:
        backbone: Checkpoint name or path given to ``from_pretrained``.

    Returns:
        The classifier in training mode, with its token embeddings resized to
        the length of the notebook-level ``tokenizer``.
    """
    classifier = BertForSequenceClassification.from_pretrained(
        backbone,
        num_labels=2,
        ignore_mismatched_sizes=True,
    )
    # The notebook-level tokenizer had a token added, so the embedding table
    # must be resized to its current length.
    vocab_size = len(tokenizer)
    classifier.resize_token_embeddings(vocab_size)
    classifier.train()
    return classifier

In [25]:
def accuracy(pred, true) :
    """Fraction of rows whose highest-scoring class matches the target.

    Args:
        pred: Score tensor; the class is chosen along dimension 1.
        true: Target class indices, one per row of ``pred``.

    Returns:
        A scalar tensor: number of correct rows divided by ``len(pred)``.
    """
    # argmax is taken on the raw scores. Softmax preserves their ordering, so
    # applying it first is unnecessary, and its rounding can collapse two
    # nearly equal scores into a tie and change the selected class.
    predicted = pred.argmax(1)
    return predicted.eq(true).sum() / len(predicted)

def train_epoch(train_data, model, epoch, optimizer, device, lr_scheduler = None) :
    """Run one mixed-precision training pass over ``train_data``.

    Each batch is a dict of tensors that includes a ``"labels"`` entry; the
    whole dict is moved to ``device`` and passed to the model as keyword
    arguments. The loss is cross-entropy with label smoothing 0.1 computed on
    the model's ``"logits"`` (the model's own loss output is not used).

    Args:
        train_data: Sized iterable of batches.
        model: Model to train; switched to training mode here.
        epoch: Zero-based epoch index, used to compute the W&B step.
        optimizer: Optimizer updating ``model``.
        device: Device the batch tensors are moved to.
        lr_scheduler: Optional scheduler, advanced once per optimizer step.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over all batches of the epoch.
    """
    # NOTE(review): a fresh GradScaler per call restarts the loss scale at its
    # initial value every epoch — confirm that is intended.
    scaler = torch.cuda.amp.GradScaler()

    loss_fn = torch.nn.CrossEntropyLoss(label_smoothing = 0.1)

    # Rolling windows (last 20 batches) for the progress bar and W&B curves.
    cum_loss = deque(maxlen = 20)
    cum_acc = deque(maxlen = 20)

    # Full per-batch history for the epoch-level return value.
    curr_loss = []
    curr_acc = []

    model.train()
    # The label never changes, so set it once instead of on every batch.
    with tqdm(train_data, unit = " batch", desc = "Train Epoch") as tepoch :
        for i, batch in enumerate(tepoch) :
            optimizer.zero_grad()

            label = batch["labels"].to(device)
            batch = {k : v.to(device) for k, v in batch.items()}

            with torch.cuda.amp.autocast() :
                outputs = model(**batch)
                loss = loss_fn(outputs['logits'], label)

            logits = outputs["logits"]

            scaler.scale(loss).backward()
            # GradScaler skips optimizer.step() when it finds inf/NaN gradients
            # and then lowers its scale in update(). A drop in the scale
            # therefore marks a skipped step; the scheduler must not advance
            # for it, otherwise the schedule runs ahead of the real updates.
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            step_skipped = scaler.get_scale() < scale_before
            if lr_scheduler is not None and not step_skipped :
                lr_scheduler.step()

            acc = accuracy(logits, label)
            cum_loss.append(float(loss))
            cum_acc.append(float(acc))

            curr_loss.append(float(loss))
            curr_acc.append(float(acc))

            tepoch.set_postfix(loss = sum(cum_loss) / len(cum_loss),
                               accuracy = sum(cum_acc) / len(cum_acc))

            # Global step = batches seen so far across epochs (every epoch is
            # taken to have len(train_data) batches).
            wandb.log({"train_batch_loss" : sum(cum_loss) / len(cum_loss),
                       "train_batch_acc" : sum(cum_acc) / len(cum_acc)},
                        step = i + (epoch * len(train_data)))

    return sum(curr_loss) / len(curr_loss), sum(curr_acc) / len(curr_acc)

def valid_epoch(valid_data, model, epoch, device) :
    """Evaluate ``model`` on ``valid_data`` without gradient tracking.

    Each batch is a dict of tensors that includes a ``"labels"`` entry; the
    whole dict is moved to ``device`` and passed to the model as keyword
    arguments. The loss is the model's own ``"loss"`` output.

    Args:
        valid_data: Sized iterable of batches.
        model: Model to evaluate; switched to eval mode here.
        epoch: Epoch index. Kept for interface compatibility; it is no longer
            used to derive a W&B step (see the logging comment below).
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over all validation batches.
    """
    # Rolling windows (last 20 batches) shown on the progress bar.
    cum_loss = deque(maxlen = 20)
    cum_acc = deque(maxlen = 20)

    # Full per-batch history for the epoch-level means.
    curr_loss = []
    curr_acc = []

    model.eval()
    with torch.no_grad() :
        # The label never changes, so set it once instead of on every batch.
        with tqdm(valid_data, unit = " batch", desc = "Valid Epoch") as tepoch :
            for batch in tepoch :
                label = batch["labels"].to(device)
                batch = {k : v.to(device) for k, v in batch.items()}

                with torch.cuda.amp.autocast():
                    outputs = model(**batch)

                loss = outputs["loss"]
                logits = outputs["logits"]

                acc = accuracy(logits, label)

                cum_loss.append(float(loss))
                cum_acc.append(float(acc))
                curr_loss.append(float(loss))
                curr_acc.append(float(acc))

                tepoch.set_postfix(loss = sum(cum_loss) / len(cum_loss),
                                   accuracy = sum(cum_acc) / len(cum_acc))

    mean_loss = sum(curr_loss) / len(curr_loss)
    mean_acc = sum(curr_acc) / len(curr_acc)

    # W&B ignores log calls whose explicit step is lower than the run's current
    # step. train_epoch advances the step by one per training batch, so a
    # validation step derived from the validation batch index falls behind it
    # and the data is discarded. Attach the final running means to the current
    # step instead; commit=False leaves the step counter untouched so the
    # explicit steps used by train_epoch stay valid in later epochs.
    wandb.log({"valid_batch_loss" : mean_loss,
               "valid_batch_acc" : mean_acc},
              commit = False)

    return mean_loss, mean_acc

In [34]:
def run_sweep(config = None):
    """Train and validate one sweep configuration inside a W&B run.

    Reads ``batch_size``, ``optimizer``, ``learning_rate`` and ``epochs`` from
    ``wandb.config``, builds fresh data, model, optimizer and a cosine
    schedule whose warmup covers 6% of all training steps, then alternates
    training and validation for the configured number of epochs.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config) :
        w_config = wandb.config

        train_data, valid_data = sweep_dataset(original_train, original_valid, collator, tokenizer, w_config.batch_size)
        model = create_model(BACKBONE_NAME).to(DEVICE)
        optimizer = sweep_optimizer(model, w_config.optimizer, w_config.learning_rate)

        total_steps = len(train_data) * w_config.epochs
        lr_scheduler = get_cosine_schedule_with_warmup(optimizer = optimizer,
                                                       num_warmup_steps = int(total_steps * 0.06),
                                                       num_training_steps = total_steps)
        for epoch in range(w_config.epochs):
            train_loss, train_acc = train_epoch(train_data, model, epoch, optimizer, DEVICE, lr_scheduler)
            valid_loss, valid_acc = valid_epoch(valid_data, model, epoch, DEVICE)
            # "loss" is the metric the sweep minimises, so its key must stay.
            # The other epoch-level numbers were computed but never recorded;
            # log them too so runs can be compared on more than one metric.
            wandb.log({"loss": valid_loss,
                       "valid_acc": valid_acc,
                       "train_loss": train_loss,
                       "train_acc": train_acc,
                       "epoch": epoch})

In [None]:
# Random search that minimises the "loss" value logged at the end of each
# epoch by run_sweep, over optimizer, epoch count, learning rate and batch size.
# NOTE(review): learning_rate gives only min/max across three orders of
# magnitude; confirm the sampling distribution W&B applies is the intended one.
sweep_config = {
    "name": "bert_sweep",
    "metric": {
        "name": "loss",
        "goal": "minimize",
    },
    "method": "random",
    "parameters": {
        "optimizer": {"values": ["AdamW", "RMSprop"]},
        "epochs": {"values": [3, 5]},
        "learning_rate": {"min": 1e-7, "max": 1e-4},
        "batch_size": {"values": [64, 128, 256, 512]},
    },
}

# Register the sweep and keep its id for the agent cell.
sweep_id = wandb.sweep(sweep_config, project="Goorm_1st_project", entity="2nd_group")

In [None]:
# Launch an agent in this process that executes run_sweep for 6 sampled configurations.
wandb.agent(sweep_id, function=run_sweep, count=6)