In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("McGill-NLP/feedbackQA")

Found cached dataset feedback_qa (/home/raja/.cache/huggingface/datasets/McGill-NLP___feedback_qa/plain_text/1.0.0/20c8f938f417c88303bb7041cea9554c1d14667686d7d7c5dda83dd4f39e5dc4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
rating_scores = {'Excellent':3 , 'Acceptable':2 , 'Could be Improved':1, 'Bad': -1}

def process_df(df):
    df['list_feedback'] = df['feedback'].apply(lambda x: [ r + "___" + e for r,e in zip(x['rating'],x['explanation']) ])
    df['sampled_feedback'] = df['list_feedback'].apply(lambda x: np.random.choice(x).split("___") )
    df['rating_score'] = df['sampled_feedback'].apply(lambda x: rating_scores[x[0]])
    df['rating'] = df['sampled_feedback'].apply(lambda x: x[0])
    df['explanation'] = df['sampled_feedback'].apply(lambda x: x[1])
    return df

In [3]:
train_df = process_df(pd.DataFrame(dataset['train']))
val_df = process_df(pd.DataFrame(dataset['validation']))
test_df = process_df(pd.DataFrame(dataset['test']))

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader

# Load model from HuggingFace Hub
bart_chkpt = 'facebook/bart-large'
tokenizer = AutoTokenizer.from_pretrained(bart_chkpt)

In [5]:
train_df.head()

Unnamed: 0,question,answer,feedback,list_feedback,sampled_feedback,rating_score,rating,explanation
0,How do I get help finding a job?,Coronavirus (COVID-19) information for job see...,"{'rating': ['Excellent', 'Could be Improved'],...",[Excellent___Has a link to detailed informatio...,"[Could be Improved, This answer provides a lin...",1,Could be Improved,"This answer provides a link for job searches, ..."
1,How do I get help finding a job?,Coronavirus (COVID-19) information for job see...,"{'rating': ['Excellent', 'Excellent'], 'explan...",[Excellent___A link to a job search website is...,"[Excellent, Includes a link to a Jobs Hub page...",3,Excellent,"Includes a link to a Jobs Hub page, which is b..."
2,How do I get help finding a job?,Coronavirus (COVID-19) information and support...,"{'rating': ['Bad', 'Acceptable'], 'explanation...",[Bad___Talks about tax credits for businesses ...,"[Acceptable, This answer discusses the Employm...",2,Acceptable,"This answer discusses the Employment Fund, whi..."
3,If I am in Australia on a worker holiday marke...,Frequently Asked Questions\nWorking holiday ma...,"{'rating': ['Could be Improved', 'Acceptable']...",[Could be Improved___Answer is about Working H...,"[Could be Improved, Answer is about Working Ho...",1,Could be Improved,"Answer is about Working Holiday Makers, but do..."
4,If I am in Australia on a worker holiday marke...,Frequently Asked Questions\nCOVID-19 Pandemic ...,"{'rating': ['Bad', 'Could be Improved'], 'expl...",[Bad___Discusses pandemic visas. Doesn't menti...,"[Bad, Discusses pandemic visas. Doesn't mentio...",-1,Bad,Discusses pandemic visas. Doesn't mention the ...


In [6]:
tokenizer('Hello, how are you doing?'+ f" {tokenizer.eos_token} " + "Hemlooooo",add_special_tokens=True,return_tensors='pt', return_length=1)

{'input_ids': tensor([[    0, 31414,     6,   141,    32,    47,   608,   116,  1437,     2,
         10869,   462, 40386,   139,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'length': tensor([15])}

In [19]:
import tqdm

class feedback_QA_dataset(Dataset):
    
    def __init__(self,df,max_length=300):
        self.df = df
        self.max_len = max_length
        self.data = []
        
        for i in tqdm.tqdm(range(len(self.df)),desc='vectorizing..'):
            
            d = {}
            
            tok_input = tokenizer(('Question:' + self.df.iloc[i]['question'] + ' Answer: ' + self.df.iloc[i]['answer']).replace("\n"," "),
                                  return_token_type_ids=True, 
                                  add_special_tokens=True, 
                                  return_length=True,
                                  max_length=self.max_len,
                                  padding='max_length',
                                  truncation='only_first',
                                  return_tensors='pt')

            feedback = self.df.iloc[i]['explanation']

            tok_feedback = tokenizer(tokenizer.pad_token + "Summary: " + feedback + tokenizer.eos_token, 
                                     return_token_type_ids=True,
                                     add_special_tokens=False,
                                     return_length=True,
                                     max_length=300, 
                                     padding='max_length', 
                                     truncation='only_first',
                                     return_tensors='pt')

            d['input'] = tok_input['input_ids'].squeeze(0)
            d['input_attn'] = tok_input['attention_mask'].squeeze(0)
            d['feedback'] = tok_feedback['input_ids'].squeeze(0)[:-1]
            d['feedback_attn'] = tok_feedback['attention_mask'].squeeze(0)[:-1]
            
            labels = tok_feedback['input_ids'].squeeze(0)[1:].clone()
            labels[labels==tokenizer.pad_token_id] = -100
            d['labels'] = labels
            
            d['feedback_len'] = tok_feedback['length'][0]

            self.data.append(d)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        return self.data[idx]

In [20]:
train_dataset = feedback_QA_dataset(train_df)
valid_dataset = feedback_QA_dataset(val_df)
test_dataset = feedback_QA_dataset(test_df)

vectorizing..: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5660/5660 [00:11<00:00, 474.98it/s]
vectorizing..: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1410/1410 [00:02<00:00, 478.87it/s]
vectorizing..: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1995/1995 [00:04<00:00, 462.34it/s]


In [21]:
train_DL = DataLoader(train_dataset,batch_size=16,shuffle=True)
valid_DL = DataLoader(valid_dataset,batch_size=16,shuffle=True)
test_DL = DataLoader(test_dataset,batch_size=1,shuffle=False)

In [22]:
for b in train_DL:
    print(b['input'] , b['labels'])
    break

tensor([[    0, 45641,    35,  ...,     1,     1,     1],
        [    0, 45641,    35,  ...,     1,     1,     1],
        [    0, 45641,    35,  ...,     1,     1,     1],
        ...,
        [    0, 45641,    35,  ...,    49,  4722,     2],
        [    0, 45641,    35,  ...,     1,     1,     1],
        [    0, 45641,    35,  ...,     1,     1,     1]]) tensor([[47977,    35, 31652,  ...,  -100,  -100,  -100],
        [47977,    35,   152,  ...,  -100,  -100,  -100],
        [47977,    35,   152,  ...,  -100,  -100,  -100],
        ...,
        [47977,    35,   152,  ...,  -100,  -100,  -100],
        [47977,    35,    85,  ...,  -100,  -100,  -100],
        [47977,    35,   152,  ...,  -100,  -100,  -100]])


from transformers import BartForConditionalGeneration

device = 'cuda:0'

bart_model = BartForConditionalGeneration.from_pretrained(bart_chkpt).to(device)

with torch.no_grad():
    for b in train_DL:
        output = bart_model(input_ids=b['input'].to(device),
                            decoder_input_ids=b['feedback'].squeeze(1)[:,:-1].to(device),)
                            #labels=b['feedback'].squeeze(1)[:,1:].to(device))
        print(tokenizer.decode(b['input'][:,0][0],skip_special_tokens=True),"\n\n")
        print(tokenizer.decode(b['feedback'][:,0,:-1][0]))
        print(tokenizer.decode(b['feedback'][:,0,1:][0]))
        print(output.logits.shape)
        break

del bart_model

In [23]:
def train(generator,train_dl,valid_dl,epochs,optimizer,PATIENCE=20,save_dir=None):

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    
    generator.train()
    
    loss_acc = 0
    num_batches = 0
    total_steps = 0
    best_valid_loss = np.inf
    patience = PATIENCE
    
    train_loss_arr,valid_loss_arr = [],[]
    
    optimizer.zero_grad()
    generator.zero_grad()
    
    for E in range(epochs):
        
        for b in train_dl:
            
            y = generator(input_ids=b['input'].to(device),
                          attention_mask=b['input_attn'].to(device),
                          decoder_input_ids=b['feedback'].to(device),
                          decoder_attention_mask=b['feedback_attn'].to(device),
                          labels = b['labels'].to(device))
                          # decoder_input_ids=b['feedback'].squeeze(1)[:,:-1].to(device),
                          # decoder_attention_mask=b['feedback_attn'].squeeze(1)[:,:-1].to(device))
            loss = y.loss #F.cross_entropy(y.logits.permute(0,2,1), b['feedback'].squeeze(1)[:,1:].to(device), ignore_index=tokenizer.pad_token_id)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_acc += loss.item()
            num_batches += 1
            total_steps += 1
            
            train_loss_arr.append(loss_acc/num_batches)
            
            if total_steps%100==0:
                print("Epoch:",E,"\t","Steps taken:",total_steps,"\tLoss:",loss_acc/num_batches)
            
        #print("Epoch:",E,"\t","Steps taken:",total_steps,"\tLoss:",loss_acc/num_batches)
        
        torch.save({'model_state':generator.state_dict(),
                    'optimizer':optimizer.state_dict(),
                    'epoch':E},
                    f"{save_dir}/Epoch_{E}_model_chkpt.pth.tar")
        
        valid_loss = validate(generator,valid_dl)
        valid_loss_arr.append(valid_loss/len(valid_dl))
        
        if valid_loss<best_valid_loss:
            best_valid_loss = valid_loss
            patience = PATIENCE
            
            torch.save({'model_state':generator.state_dict(),
                        'optimizer':optimizer.state_dict(),
                        'epoch':E},
                        f"{save_dir}/best_model_chkpt.pth.tar")
        else:
            patience -= 1
            print(f"REDUCING PATIENCE...{patience}")

        if patience<=0:
            print("RUNNING OUT OF PATIENCE... TERMINATING")
            break
    
    
    return train_loss_arr,valid_loss_arr
                

In [24]:
def validate(generator,valid_dl):
    
    generator.eval()
    valid_loss = 0
    with torch.no_grad():
        for b in valid_dl:
            y = generator(input_ids=b['input'].to(device),
                          attention_mask=b['input_attn'].to(device),
                          decoder_input_ids=b['feedback'].to(device),
                          decoder_attention_mask=b['feedback_attn'].to(device),
                          labels = b['labels'].to(device))
                          # decoder_input_ids=b['feedback'].squeeze(1)[:,:-1].to(device),
                          # decoder_attention_mask=b['feedback_attn'].squeeze(1)[:,:-1].to(device))
            loss = y.loss #F.cross_entropy(y.logits.permute(0,2,1), b['feedback'].squeeze(1)[:,1:].to(device), ignore_index=tokenizer.pad_token_id)
            valid_loss += loss.item()
            
    print("Validation Loss:",valid_loss)
    return valid_loss

In [None]:
import os

from transformers import BartForConditionalGeneration

device = 'cuda:0'

generator = BartForConditionalGeneration.from_pretrained(bart_chkpt).to(device)

optimizer = torch.optim.AdamW(generator.parameters(),lr=5e-5)

save_dir = 'GenFB_BART_large_chkpts_3'
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

train_loss,valid_loss = train(generator,train_DL,valid_DL,50,optimizer,PATIENCE=5,save_dir=save_dir)

Epoch: 0 	 Steps taken: 100 	Loss: 2.673273162841797
Epoch: 0 	 Steps taken: 200 	Loss: 2.514744558930397
Epoch: 0 	 Steps taken: 300 	Loss: 2.478877408504486
Validation Loss: 529.0194516181946
Epoch: 1 	 Steps taken: 400 	Loss: 3.3485342407226564
Epoch: 1 	 Steps taken: 500 	Loss: 3.8570037784576416
Epoch: 1 	 Steps taken: 600 	Loss: 4.190649787584941
Epoch: 1 	 Steps taken: 700 	Loss: 4.424836439405169
Validation Loss: 518.0795140266418
Epoch: 2 	 Steps taken: 800 	Loss: 4.5951131355762485
Epoch: 2 	 Steps taken: 900 	Loss: 4.727638207011752
Epoch: 2 	 Steps taken: 1000 	Loss: 4.8276005525588985
Validation Loss: 494.9659824371338
Epoch: 3 	 Steps taken: 1100 	Loss: 4.888780029903758
Epoch: 3 	 Steps taken: 1200 	Loss: 4.937873899141947
Epoch: 3 	 Steps taken: 1300 	Loss: 4.9761647052031295
Epoch: 3 	 Steps taken: 1400 	Loss: 5.006569131101881
Validation Loss: 486.38301277160645
Epoch: 4 	 Steps taken: 1500 	Loss: 5.026409145991008
Epoch: 4 	 Steps taken: 1600 	Loss: 5.044998742938041

In [None]:
import json

with open('train_loss.json','w') as f:
    json.dump(train_loss,f)

with open('valid_loss.json','w') as f:
    json.dump(valid_loss,f)

In [None]:
train_loss_ds = np.array(train_loss)[np.round(np.linspace(0, len(train_loss) - 1, len(valid_loss))).astype(int)]
loss_df = pd.DataFrame({'train_loss':train_loss_ds , 'valid_loss':valid_loss})

In [None]:
from plotly import express as px
px.line(loss_df,y=['train_loss','valid_loss'])

In [None]:
generator.load_state_dict(torch.load('GenFB_BART_chkpts_1/Epoch_0_model_chkpt.pth.tar')['model_state'])

In [None]:
i = 0
for b in train_DL:
    out = generator.generate(inputs=b['input'][0:1,0].to(device),top_p=0.5)
    print(tokenizer.decode(b['input'][0:1,0][0],skip_special_tokens=True))
    print(tokenizer.decode(b['feedback'][0:1,0][0],skip_special_tokens=True))
    print(tokenizer.decode(out[0]))
    print("--------------------------------------------------------")
    i+=1
    if i>10:
        break