In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import itertools

dataset = load_dataset("McGill-NLP/feedbackQA")

rating_class = {'Excellent':3 , 'Acceptable':2 , 'Could be Improved':1, 'Bad': 0}

def process_df(df):
    df['list_feedback'] = df['feedback'].apply(lambda x: [ r + "___" + e for r,e in zip(x['rating'],x['explanation']) ])
    df['sampled_feedback'] = df['list_feedback'].apply(lambda x: x[0].split("___") if (x[0].split("___")[0]!='Excellent' and x[0].split("___")[0]!='Acceptable') else (x[1].split("___") if (x[1].split("___")[0]!='Excellent' and x[1].split("___")[0]!='Acceptable') else np.random.choice(x).split("___")) )
    df['rating_class'] = df['sampled_feedback'].apply(lambda x: rating_class[x[0]])
    df['rating'] = df['sampled_feedback'].apply(lambda x: x[0])
    df['explanation'] = df['sampled_feedback'].apply(lambda x: x[1])
    return df

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from accelerate import Accelerator


import tqdm
from torch.utils.data import Dataset

class feedback_QA_dataset(Dataset):
    
    def __init__(self,df,tokenizer,max_length=2048):
        self.df = df
        self.max_len = max_length
        self.data = []
        self.tokenizer = tokenizer
        skipped = 0
        
        for i in range(len(self.df)):
            
            d = {}
            question = self.tokenizer(f"Question: {self.df.iloc[i]['question']}", add_special_tokens=True)
            answer = self.tokenizer(f"Answer: {self.df.iloc[i]['answer']}")
            padding = [0] * (self.max_len - len(question['input_ids']+answer['input_ids']) - 1)       
            
            input = question['input_ids'] + answer['input_ids'] + [tokenizer.eos_token_id] #+ padding
            labels = [-100]*len(question['input_ids']) + answer['input_ids'] + [tokenizer.eos_token_id] + [-100]*len(padding)
            attention_mask = [1]*len(question['input_ids']) + [1]*len(answer['input_ids']) + [1] + [0]*len(padding)

            
            if len(input) > self.max_len:
                skipped += 1
                continue
            input = input + padding
            
            d['input'] = input
            d['labels'] = labels
            d['attention_mask'] = attention_mask
            d['id'] = i

            for k in d.keys():
                d[k] = torch.tensor(d[k])

            self.data.append(d)
            
        print(f'skipped: {skipped}')

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        return self.data[idx]

def valid(model,valid_DL,epoch):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for b in valid_DL:
            out = model(input_ids=b['input_ids'], labels=b['labels'])
            loss = out.loss
            total_loss += loss.item()
    accelerator.print(f"Epoch: {epoch}\t\t---->\t\t Valid Loss per batch: {total_loss/len(valid_DL)} ")
    return (total_loss/len(valid_DL))


def train(model,train_DL,optimizer,epoch):
    model.train()
    optimizer.zero_grad()
    total_loss = 0
    for b in train_DL:
        out = model(input_ids=b['input_ids'], labels=b['labels'])
        loss = out.loss
        total_loss += loss.item()
        accelerator.backward(loss)
        optimizer.step()
    accelerator.print(f"Epoch: {epoch}\t\t---->\t\t Train Loss per batch: {total_loss/len(train_DL)} ")

def finetune(bert_chkpt,train_df, valid_df, BATCH_SIZE=2, EPOCHS=20, PATIENCE=5):
    # os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
    accelerator = Accelerator()
    device = accelerator.device
    
    tokenizer = AutoTokenizer.from_pretrained(bert_chkpt,cache_dir='/home/jupyter/Ravi_new/HF_cache')
    model = AutoModelForCausalLM.from_pretrained(bert_chkpt,cache_dir='/home/jupyter/Ravi_new/HF_cache')
    
    train_dataset = feedback_QA_dataset(train_df,tokenizer)
    train_DL = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)

    valid_dataset = feedback_QA_dataset(valid_df,tokenizer)
    valid_DL = DataLoader(valid_dataset,batch_size=BATCH_SIZE,shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(),lr=1e-6)
    
    model, train_DL, valid_DL, optimizer = accelerator.prepare(model,train_DL,valid_DL,optimizer)

    best_val_loss = validate(model,valid_DL,0)
    for e in range(EPOCHS):
        train(model,train_DL,optimizer,e)
        val_loss = validate(model,valid_DL,e)
        if val_loss<best_val_loss:
            best_val_loss = val_loss
            patience = PATIENCE
            
            torch.save({'model_state':model.state_dict(),
                        'optimizer':optimizer.state_dict(),
                        'epoch':e},
                        f"{save_dir}/best_model_chkpt.pth.tar")
        else:
            patience -= 1
            accelerator.print(f"REDUCING PATIENCE...{patience}")

In [2]:
from accelerate import notebook_launcher
notebook_launcher(finetune,("meta-llama/Llama-2-7b-chat-hf",pd.read_csv('train_data.csv'),pd.read_csv('valid_data.csv')),num_processes=2)

Launching training on 2 GPUs.
skipped: 3566


RuntimeError: An issue was found when launching the training: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/jupyter/Ravi_new/py39_env/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/home/jupyter/Ravi_new/py39_env/lib/python3.9/site-packages/accelerate/utils/launch.py", line 543, in __call__
    self.launcher(*args)
  File "/tmp/ipykernel_22608/1636976021.py", line 104, in finetune
    train_DL = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
  File "/home/jupyter/Ravi_new/py39_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 351, in __init__
    sampler = RandomSampler(dataset, generator=generator)  # type: ignore[arg-type]
  File "/home/jupyter/Ravi_new/py39_env/lib/python3.9/site-packages/torch/utils/data/sampler.py", line 107, in __init__
    raise ValueError("num_samples should be a positive integer "
ValueError: num_samples should be a positive integer value, but got num_samples=0
