In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import itertools
from tqdm import tqdm_notebook

# Download / load the FeedbackQA dataset from the Hugging Face hub.
dataset = load_dataset("McGill-NLP/feedbackQA")

import os
# An empty CUDA_VISIBLE_DEVICES hides every GPU from this process; it is set before
# `import torch` so that torch sees the restricted device list.
# NOTE(review): later cells create a vLLM engine and move models to 'cuda:0' / 'cuda:1',
# which presumably cannot work while GPUs are hidden — confirm the intended run order.
os.environ["CUDA_VISIBLE_DEVICES"]=""
import torch
# Displayed as the cell output: whether torch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# keep_default_na=False stops pandas from converting strings such as 'None' / 'NA' / ''
# into NaN, so placeholder strings stay as text and are handled explicitly later.
train_df = pd.read_csv('train_refined.csv',keep_default_na=False)
# NOTE(review): displays the whole frame; `train_df.head()` is usually enough.
train_df

In [None]:
# Drop every row that still contains the literal string 'None' (presumably the placeholder
# written when a refinement is missing — confirm against the code that wrote the CSV).
# np.nan is used as the replacement instead of `None`: pandas < 1.4 treated an explicit
# value=None as "no value given" and forward-filled the cell, so dropna() removed nothing.
# reset_index() keeps the old row labels in an 'index' column and gives the frame a
# 0..n-1 index, which later cells rely on when they address rows with `.loc[i]`.
val_df1 = train_df.replace('None', np.nan).dropna(axis=0).reset_index()

In [None]:
val_df1.head()

In [None]:
from vllm import LLM, SamplingParams

In [None]:
# Build a single-GPU (tensor_parallel_size=1) vLLM engine from a local checkpoint directory.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
llm = LLM(model="/home/jupyter/Ravi_new/RL_Language_Feedback/New/ILF_Baseline/Llama_Checkpoints",tensor_parallel_size=1)

In [None]:
from transformers import AutoModelForCausalLM
from accelerate import Accelerator
# Load the base Llama-2 chat weights through transformers, caching the download in cache_dir.
# NOTE(review): absolute, machine-specific cache path.
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf',cache_dir='/home/jupyter/Ravi_new/HF_cache')

In [None]:
model.state_dict()

In [None]:
# Wrap the model with accelerate, then restore previously saved training state into it.
# The order matters: the model has to be prepared before load_state() can restore it.
accelerator = Accelerator()
model = accelerator.prepare(model)
# NOTE(review): presumably this path was produced by `accelerator.save_state` — confirm.
accelerator.load_state('Llama_Checkpoints/pytorch_model')

In [None]:
accelerator.get_state_dict(model)

In [None]:
import torch

# Load the checkpoint onto the CPU. The first cell hides all GPUs (CUDA_VISIBLE_DEVICES=""),
# and without map_location torch.load tries to restore tensors on the device they were
# saved from, which fails for GPU-saved tensors when no CUDA device is visible.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
ckpt = torch.load('Llama_Checkpoints/best_model_chkpt.pth.tar', map_location='cpu')
# model.load_state_dict(ckpt['model_state'])

In [None]:
ckpt['model_state']

In [None]:
# Inspect the underlying vLLM engine object. The original cell ended in a dangling '.'
# (a leftover from tab completion), which is a SyntaxError when the cell is executed.
llm.llm_engine

In [None]:
val_df1.iloc[150]['question']

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('/home/jupyter/Ravi_new/RL_Language_Feedback/New/ILF_Baseline/Llama_Checkpoints/')

In [None]:
# Smoke test: generate an answer for one hand-written question with the fine-tuned model.
params = SamplingParams(max_tokens=200, best_of=3)

prompt = '<s>Question: What can remote and aboriginal communities do help to stay safe?</s>Answer: '
# The prompt text already spells out <s> / </s>, so the tokenizer is told not to add its own.
encoded_prompt = tokenizer(prompt, add_special_tokens=False)
prompt_token_ids = encoded_prompt['input_ids']
out = llm.generate(
    prompt_token_ids=[prompt_token_ids],
    sampling_params=params,
)
# Collect the text of every returned completion for the single prompt.
l = []
for completion in out[0].outputs:
    l.append(completion.text)
l

In [None]:
out

In [None]:
# Ask the LLM to judge the refinements of ONE row, sampling 8 votes, as a manual sanity check.
i=1552

row = val_df1.loc[i]
question = row['question']
answer = row['answer']
feedback = row['explanation']
# NOTE(review): the candidates are bound in reverse here (r2 <- refined_answer_0,
# r0 <- refined_answer_2), so the label 'refined_answer_0' in the prompt shows the text of
# column 'refined_answer_2' and vice versa. Kept as-is on the assumption that this is a
# deliberate position-bias check — confirm; the batch cell uses the straight mapping.
r2 = row['refined_answer_0'].replace('<unk>','').replace('\n','')
r1 = row['refined_answer_1'].replace('<unk>','').replace('\n','')
r0 = row['refined_answer_2'].replace('<unk>','').replace('\n','')

# Fix: the prompt used to announce "4 different refined answers" / "which of the four" while
# listing only three, which invites the model to answer with a non-existent refined_answer_3.
prompt = f'I will give you a question, an initial answer, a feedback critquing that answer, and 3 different refined answers that try to incorporate the feedback. I want you to tell me, which of the three refined answers best incorporate the feedback. Strictly follow this format: if refined_answer_X is the best, just say, Best Answer: refined_answer_X. Do NOT generate anything else. \n Question: {question} \n Answer: {answer} \n Feedback: {feedback} \n refined_answer_0: {r0} \n refined_answer_1: {r1} \n refined_answer_2: {r2} \n Best Answer: '

params = SamplingParams(max_tokens=200,n=8)
out = llm.generate([prompt],sampling_params=params)

# Map each sampled completion to the lowest-numbered label it mentions, or 'NA' if none.
candidate_labels = ('refined_answer_0', 'refined_answer_1', 'refined_answer_2')
l = [next((label for label in candidate_labels if label in o.text), 'NA') for o in out[0].outputs]
l

In [None]:
from tqdm.auto import tqdm as progress_bar  # replaces the deprecated tqdm_notebook alias


def _clean_refinement(text):
    """Remove '<unk>' markers and newlines from a generated refinement."""
    return text.replace('<unk>','').replace('\n','')


# Build one judging prompt per row of val_df1. Rows are addressed by label 0..n-1, which
# matches their position because val_df1 was built with reset_index().
prompts = []
for i in progress_bar(range(len(val_df1))):
    row = val_df1.loc[i]  # fetch the row once instead of once per column
    question = row['question']
    answer = row['answer']
    feedback = row['explanation']

    r0 = _clean_refinement(row['refined_answer_0'])
    r1 = _clean_refinement(row['refined_answer_1'])
    r2 = _clean_refinement(row['refined_answer_2'])

    # Fix: the prompt used to announce "4 different refined answers" / "which of the four"
    # while listing only three, which invites answers naming a non-existent refined_answer_3.
    prompt = f'I will give you a question, an initial answer, a feedback critquing that answer, and 3 different refined answers that try to incorporate the feedback. I want you to tell me, which of the three refined answers best incorporate the feedback. Strictly follow this format: if refined_answer_X is the best, just say, Best Answer: refined_answer_X. Do NOT generate anything else. \n Question: {question} \n Answer: {answer} \n Feedback: {feedback} \n refined_answer_0: {r0} \n refined_answer_1: {r1} \n refined_answer_2: {r2} \n Best Answer: '

    prompts.append(prompt)

In [None]:
# Sample n=8 short completions (up to 50 new tokens) per prompt; the 8 samples are later
# treated as votes for the best refinement of each row.
params = SamplingParams(max_tokens=50,n=8)
# `out` holds one result per prompt, in the same order as `prompts` (it is indexed by the
# prompt position in the aggregation cell).
out = llm.generate(prompts,sampling_params=params)

In [None]:
out[0].outputs

In [None]:
import random
random.randint(a=0,b=2)

In [None]:
from collections import Counter
import random

import tqdm
import tqdm.notebook  # explicit: a plain `import tqdm` does not guarantee the submodule is loaded

# Labels a completion may name, in the priority order used when several are mentioned.
VOTE_LABELS = ('refined_answer_0', 'refined_answer_1', 'refined_answer_2', 'refined_answer_3')


def _extract_vote(text):
    """Return the lowest-numbered candidate label that occurs in `text`, or 'NA' if none does."""
    return next((label for label in VOTE_LABELS if label in text), 'NA')


# Majority vote over the sampled completions of every prompt.
# Rows without a single usable vote keep the 'None' placeholder in 'selected_answer'.
val_df1['selected_answer'] = ['None']*len(val_df1)
na_selected = 0  # rows with no usable vote + rows whose winning label is not a column
for i in tqdm.notebook.tqdm(range(len(prompts))):
    votes = Counter(_extract_vote(o.text) for o in out[i].outputs)
    votes.pop('NA', None)  # 'NA' never wins; the most common real label is used instead

    if not votes:
        # Every completion was unparseable: count the row and leave the placeholder.
        na_selected += 1
        continue

    selected = votes.most_common(1)[0][0]

    if selected not in val_df1.columns:
        # The model named a candidate that does not exist in the frame
        # (e.g. refined_answer_3): fall back to a random existing candidate.
        na_selected += 1
        selected = f'refined_answer_{random.randint(a=0,b=2)}'

    # Single .loc call: the former `val_df1[col].loc[i] = ...` was chained assignment, which
    # may write to a temporary copy and is a silent no-op under pandas copy-on-write.
    val_df1.loc[i, 'selected_answer'] = val_df1.loc[i, selected]
print(f'NA Selected: {na_selected}')

In [None]:
val_df1

In [None]:
val_df1.to_csv('train_selected_refined.csv')

In [None]:
train_df[train_df['rating_class']=='3'][['question','answer']]

In [None]:
# Training data = (question, answer) pairs whose original answer has rating_class '3'
# (presumably the top rating — confirm) + the LLM-selected refinement for every judged row.
accepted_df = train_df.loc[train_df['rating_class']=='3', ['question','answer']]

# Rows for which no refinement could be selected still carry the 'None' placeholder in
# 'selected_answer'; leave them out so the literal string 'None' is never used as a target.
refined_df = (
    val_df1.loc[val_df1['selected_answer']!='None', ['question','selected_answer']]
    .rename(columns={'selected_answer':'answer'})
)

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of the
# overlapping row labels of the two source frames.
final_df = pd.concat([accepted_df, refined_df], ignore_index=True)

In [None]:
final_df

In [None]:
final_df.to_csv('train_data.csv')

In [None]:
# NOTE(review): when the notebook runs top to bottom, `out` is still the vLLM result list
# assigned by the batched `llm.generate(prompts, ...)` cell, not a tensor of token ids, so
# this decode presumably fails. It looks like a stale copy of the decode cell that follows
# the `model.generate` example further down — confirm and remove.
print(tokenizer.decode(out[0]))

In [None]:
# Pairwise judging of one row with the Hugging Face model (greedy decoding).
gen_device = 'cuda:1'  # single place for the device used by the model and its inputs
model = model.to(gen_device)
model.eval()

i=1394

row = val_df1.loc[i]
question = row['question']
answer = row['answer']
feedback = row['explanation']
# Only the two candidates shown in the prompt are read. The former lookups of
# 'refined_answer_2' / 'refined_answer_3' were unused, and the aggregation cell checks
# `selected not in val_df1.columns`, which suggests 'refined_answer_3' is not a column —
# reading it would raise KeyError before anything is generated.
candidate_0 = row['refined_answer_0'].replace('<unk>','').replace('\n','')
candidate_1 = row['refined_answer_1'].replace('<unk>','').replace('\n','')

prompt = f'I will give you a question, an initial answer, a feedback critquing that answer, and 2 different refined answers that try to incorporate the feedback. I want you to tell me, which of the two refined answers better incorporate the feedback. Strictly follow this format: if refined_answer_X is better, just say, Better Answer: refined_answer_X. \n Question: {question} \n Answer: {answer} \n Feedback: {feedback} \n refined_answer_0: {candidate_0} \n refined_answer_1: {candidate_1} \n Better Answer: '

inp = tokenizer(prompt,return_tensors='pt',add_special_tokens=True)['input_ids'].to(gen_device)
with torch.no_grad():
    # do_sample=False with num_beams=1 is plain greedy decoding.
    out = model.generate(inp, max_new_tokens=50, do_sample=False, num_beams=1)

In [None]:
print(tokenizer.decode(out[0]))

In [None]:
# Debug run of the refinement generation on the FIRST validation batch only (see `break`).
# `val_df`, `valid_DL` and `device` are not defined in the visible cells and must already
# exist in the kernel.
# model,valid_DL = accelerator.prepare(model,valid_DL)
model = model.to(device)
model.eval()

num_return_sequences = 2

# One output column per sampled sequence, pre-filled with the 'None' placeholder.
for i in range(num_return_sequences):
    val_df[f'refined_answer_{i}'] = ['None']*len(val_df)
with torch.no_grad():
    for b in valid_DL:
        out = model.generate(inputs=b['input'].to(device),
                             attention_mask=b['attention_mask'].to(device),
                             max_new_tokens=50,
                             num_return_sequences=num_return_sequences,  # was hard-coded to 2
                             do_sample=True
                            )

        # Keep only the text after the 'Refined answer: ' marker of each decoded sequence.
        l = [a.split('Refined answer: ')[1].replace('</s>','') for a in tokenizer.batch_decode(out)]
        row_ids = b['id'].tolist()
        for i in range(num_return_sequences):
            # generate() returns the sequences of one input next to each other, so
            # l[i::num_return_sequences] is the i-th sample of every input in the batch.
            # Single .loc call instead of chained `val_df[col].loc[ids] = ...`, which may
            # write to a temporary copy.
            val_df.loc[row_ids, f'refined_answer_{i}'] = l[i::num_return_sequences]
        break

In [None]:
val_df.loc[6][['refined_answer_0','refined_answer_1']].values

In [None]:
l = [0,1,2,3,4,5]
l[2::2]

In [None]:
# `refine` and `bert_chkpt` are not defined in the visible cells; they must already exist
# in the kernel. NOTE(review): the cell overwrites its own input (`val_df`), so running it
# twice refines the already refined frame.
val_df = refine(bert_chkpt,val_df)
# Persist the result; the DataFrame index is written as the first CSV column.
val_df.to_csv('val_refined.csv')

In [None]:
val_df

In [None]:
# Expose GPUs 0 and 1 to the process again (the first cell set the variable to "").
# NOTE(review): torch is already imported at this point; changing CUDA_VISIBLE_DEVICES
# presumably has no effect once CUDA has been initialised in the process — confirm, or set
# the variable once at the top of the notebook and restart the kernel.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
torch.cuda.is_available()

In [None]:
val_df

In [None]:
print(tokenizer.decode(out[2]).replace('</s>',''))#.replace('<unk>',''))

In [None]:
tokenizer.decode([0,0,0])

In [None]:
# AutoModel is what this cell actually uses; the original import only brought in
# BartForConditionalGeneration, so the cell depended on an import made elsewhere.
from transformers import AutoModel, BartForConditionalGeneration

device = 'cuda:0'

# `bert_chkpt` (checkpoint name or path of the encoder) is not defined in the visible cells.
model = AutoModel.from_pretrained(bert_chkpt).to(device)

def mean_pooling(model_output, attention_mask):
    """Masked mean over the token axis, followed by L2 normalisation.

    model_output: sequence whose first element holds the per-token embeddings.
    attention_mask: mask with one entry per token; it is broadcast over the embedding
        axis, so tokens with a 0 entry do not contribute to the mean.
    Returns one unit-length vector per sequence.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = (hidden_states * mask).sum(dim=1)
    # Clamp the token count so a fully masked sequence does not divide by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return F.normalize(summed / token_counts, p=2, dim=1)

# Smoke test: pool the encoder output for the first six training batches and print the
# shape of each result, then release the encoder.
j = 0

with torch.no_grad():
    for j, b in enumerate(train_DL, start=1):
        out = mean_pooling(
            model(
                input_ids=b['context_w_feedback'].to(device),
                attention_mask=b['context_w_feedback_attn'].to(device),
            ),
            b['feedback_pool_mask'].to(device),
        )
        print(out.shape)
        print('----------------------------')
        # j counts the batches processed so far; stop once six have been handled.
        if j>5:
            break

del model

In [None]:
t = torch.tensor([[[1,2,3,4,5],[6,7,8,9,0]]])
t.repeat(2,1,1)

In [None]:
import torch
import torch.nn.functional as F
from torch import nn


class classifier(nn.Module):
    """Pretrained encoder + MLP head that predicts a rating class.

    The encoder output is mean-pooled over the tokens selected by the batch's
    'feedback_pool_mask' and passed through
    Linear(inp_dim, hidden_dims[0]) -> ReLU -> [Linear -> Dropout -> ReLU]* -> Linear(num_classes).

    Args:
        model_chkpt: name or path given to `AutoModel.from_pretrained`.
        device: device the whole module is moved to and on which batches are processed.
        inp_dim: size of the pooled encoder embedding.
        hidden_dims: non-empty list of hidden layer sizes (required).
        num_classes: number of output classes.
        use_norm: if True, L2-normalise the pooled embedding before the head.

    Raises:
        ValueError: if `hidden_dims` is None or empty.
    """

    def __init__(self, model_chkpt, device='cuda:0', inp_dim=768, hidden_dims=None, num_classes=4, use_norm=False):
        super().__init__()

        # The default (None) used to fail with an opaque "NoneType is not subscriptable".
        if not hidden_dims:
            raise ValueError("hidden_dims must be a non-empty list of layer sizes, e.g. [768, 128]")

        # Imported here so the cell does not depend on an import made in another cell.
        from transformers import AutoModel

        self.device = device
        self.bert_model = AutoModel.from_pretrained(model_chkpt).to(device)

        self.use_norm = use_norm
        self.inp_layer = nn.Linear(inp_dim,hidden_dims[0])

        hidden_layers = []
        for i in range(len(hidden_dims)-1):
            hidden_layers.append(nn.Linear(hidden_dims[i],hidden_dims[i+1]))
            hidden_layers.append(nn.Dropout(p=0.2))
            hidden_layers.append(nn.ReLU())
        self.layers = nn.Sequential(*hidden_layers)

        self.out_layer = nn.Linear(hidden_dims[-1],num_classes)

        # forward() moves every input to `device`, so the head must live there as well
        # (previously only the encoder was moved and callers had to call .to(device)).
        self.to(device)

    def mean_pooling(self,model_output,attention_mask):
        """Masked mean of the token embeddings in `model_output[0]` (no normalisation)."""
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp avoids a division by zero for a fully masked sequence.
        se = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return se

    def forward(self, b):
        """Classify one batch.

        `b` is a mapping with the tensors 'context_w_feedback' (token ids),
        'context_w_feedback_attn' (encoder attention mask), 'feedback_pool_mask'
        (tokens to pool over) and, optionally, 'rating_class' (target labels).

        Returns a dict with 'logits' and 'class_probs'; 'CE_loss' is added only when
        the batch contains 'rating_class', so the model can also be used without labels.
        """
        encoder_out = self.bert_model(input_ids=b['context_w_feedback'].to(self.device),
                                      attention_mask=b['context_w_feedback_attn'].to(self.device))
        # Pool with the feedback mask rather than the attention mask.
        y = self.mean_pooling(encoder_out, b['feedback_pool_mask'].to(self.device))
        if self.use_norm:
            y = F.normalize(y,p=2,dim=-1)
        y = F.relu(self.inp_layer(y))
        y = self.layers(y)
        y = self.out_layer(y)

        return_dict = {'logits': y, 'class_probs': F.softmax(y,dim=-1)}
        if 'rating_class' in b:
            return_dict['CE_loss'] = F.cross_entropy(y,b['rating_class'].to(self.device))
        return return_dict
        
        

In [None]:
def _save_checkpoint(classifier, optimizer, epoch, path):
    """Write model state, optimizer state and the epoch number to `path` via torch.save."""
    torch.save({'model_state':classifier.state_dict(),
                'optimizer':optimizer.state_dict(),
                'epoch':epoch},
               path)


def train(classifier,train_dl,valid_dl,epochs,optimizer,PATIENCE=20,save_dir=None):
    """Train `classifier` with early stopping on the validation loss.

    Args:
        classifier: module whose call on a batch returns a dict containing 'CE_loss'.
        train_dl: iterable of training batches.
        valid_dl: sized iterable of validation batches, passed to `validate`.
        epochs: maximum number of epochs.
        optimizer: optimizer over the classifier's parameters.
        PATIENCE: number of consecutive epochs without a new best validation loss
            after which training stops.
        save_dir: directory for checkpoints (created if missing). One checkpoint is
            written per epoch plus 'best_model_chkpt.pth.tar' for the best validation
            loss. If None, nothing is written.

    Returns:
        (train_loss_arr, valid_loss_arr): the running mean of the training loss after
        every step (accumulated over ALL steps so far, not reset per epoch) and the mean
        validation loss per batch after every epoch.
    """
    if save_dir is not None:
        # makedirs also creates missing parent directories and tolerates an existing one.
        os.makedirs(save_dir, exist_ok=True)

    loss_acc = 0
    num_batches = 0
    total_steps = 0
    best_valid_loss = np.inf
    patience = PATIENCE

    train_loss_arr,valid_loss_arr = [],[]

    for E in range(epochs):

        # validate() switches the model to eval mode; without re-enabling train mode here,
        # every epoch after the first would run with dropout disabled.
        classifier.train()

        for b in train_dl:
            loss = classifier(b)['CE_loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_acc += loss.item()
            num_batches += 1
            total_steps += 1

            train_loss_arr.append(loss_acc/num_batches)

            if total_steps%100==0:
                print("Epoch:",E,"\t","Steps taken:",total_steps,"\tLoss:",loss_acc/num_batches)

        if save_dir is not None:
            _save_checkpoint(classifier, optimizer, E, f"{save_dir}/Epoch_{E}_model_chkpt.pth.tar")

        # validate() returns the SUM of the batch losses; the sum is used for model
        # selection, the per-batch mean is what gets recorded.
        valid_loss = validate(classifier,valid_dl)
        valid_loss_arr.append(valid_loss/max(len(valid_dl), 1))

        if valid_loss<best_valid_loss:
            best_valid_loss = valid_loss
            patience = PATIENCE

            if save_dir is not None:
                _save_checkpoint(classifier, optimizer, E, f"{save_dir}/best_model_chkpt.pth.tar")
        else:
            patience -= 1
            print(f"REDUCING PATIENCE...{patience}")

        if patience<=0:
            print("RUNNING OUT OF PATIENCE... TERMINATING")
            break

    return train_loss_arr,valid_loss_arr
                

In [None]:
def validate(classifier,valid_dl):
    """Evaluate `classifier` on `valid_dl` without tracking gradients.

    Prints the mean loss per batch ("nan" for an empty loader) and returns the SUM of
    the per-batch 'CE_loss' values. The model is put into eval mode for the evaluation
    and afterwards restored to the mode it was in, so a caller in the middle of
    training keeps dropout enabled.
    """
    was_training = classifier.training
    classifier.eval()
    valid_loss = 0
    num_batches = 0
    try:
        with torch.no_grad():
            for b in valid_dl:
                valid_loss += classifier(b)['CE_loss'].item()
                num_batches += 1
    finally:
        # Restore the previous train/eval mode even if a batch raised.
        classifier.train(was_training)

    # An empty loader used to raise ZeroDivisionError here.
    mean_loss = valid_loss/num_batches if num_batches else float('nan')
    print("Validation Loss:",mean_loss)
    return valid_loss

In [None]:
import os

from transformers import AutoModel

# Upper bound on the number of training epochs (early stopping may end training sooner).
EPOCHS = 50
# When True, the next cell loads a previous checkpoint and freezes the encoder weights.
FREEZE_BERT = False

device = 'cuda:0'

# MPNet = AutoModel.from_pretrained(bert_chkpt).to(device)
# With hidden_dims=[768,128] and the default inp_dim=768 the head is
# Linear(768,768) -> ReLU -> Linear(768,128) -> Dropout -> ReLU -> Linear(128,4).
# `bert_chkpt` is not defined in the visible cells and must already exist in the kernel.
classifier_model = classifier(bert_chkpt,device=device,hidden_dims=[768,128], num_classes=4, use_norm=False).to(device)

In [None]:
# Optionally start from an earlier checkpoint and freeze the encoder so that only the
# classification head receives gradient updates.
if FREEZE_BERT:
    classifier_model.load_state_dict(torch.load('Rating_sent_MPNET_chkpts_1/best_model_chkpt.pth.tar')['model_state'])
    classifier_model.bert_model.requires_grad_(False)

# The optimizer is created after the optional freeze and is given all parameters.
optimizer = torch.optim.AdamW(classifier_model.parameters(),lr=1e-4)

# Checkpoint directory; train() also creates it when it is missing.
save_dir = 'Rating_ctxt_FB_MPNET_chkpts_1'
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

# Returns the running-mean training loss per step and the mean validation loss per epoch.
# `train_DL` / `valid_DL` are not defined in the visible cells.
train_loss,valid_loss = train(classifier_model,
                              train_DL,
                              valid_DL,
                              EPOCHS,
                              optimizer,
                              PATIENCE=5,
                              save_dir=save_dir)

In [None]:
import json

# Persist both loss curves as JSON lists, one file per curve.
for loss_path, loss_values in (
    ('train_loss.json', train_loss),
    ('valid_loss.json', valid_loss),
):
    with open(loss_path,'w') as f:
        json.dump(loss_values,f)

In [None]:
# The training curve has one point per step and the validation curve one per epoch.
# Pick len(valid_loss) evenly spaced training points (first and last included) so both
# curves can share one axis.
sample_positions = np.linspace(0, len(train_loss) - 1, len(valid_loss))
sample_positions = np.round(sample_positions).astype(int)
train_loss_ds = np.array(train_loss)[sample_positions]
loss_df = pd.DataFrame({
    'train_loss': train_loss_ds,
    'valid_loss': valid_loss,
})

In [None]:
from plotly import express as px
px.line(loss_df,y=['train_loss','valid_loss'])

In [None]:
test_DL = DataLoader(test_dataset,batch_size=100,shuffle=False)

In [None]:
chkpt = torch.load('Rating_ctxt_FB_MPNET_chkpts_1/best_model_chkpt.pth.tar')

In [None]:
classifier_model.load_state_dict(chkpt['model_state'])

In [None]:
import tqdm  # local import: `tqdm.tqdm` below needs the module, not an alias from another cell

# Collect predicted and ground-truth rating classes over the whole test set.
# (The unused `i = 0` counter of the original cell was dropped.)
preds,gt = [],[]
classifier_model.eval()
with torch.no_grad():
    for b in tqdm.tqdm(test_DL,desc='evaluating'):
        out = classifier_model(b)
        # Predicted class = index of the largest class probability.
        preds.extend(out['class_probs'].argmax(dim=-1).cpu().tolist())
        gt.extend(b['rating_class'].tolist())

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
cm = confusion_matrix(gt,preds,normalize='all')
ConfusionMatrixDisplay(cm).plot()

In [None]:
len(gt)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
print('Precision: ' , precision_score(gt,preds,average='macro'))
print('Recall: ' , recall_score(gt,preds,average='macro'))
print('Accuracy: ' , accuracy_score(gt,preds))

In [None]:
print('Precision: ' , precision_score(gt,preds,average='micro'))
print('Recall: ' , recall_score(gt,preds,average='micro'))
print('Accuracy: ' , accuracy_score(gt,preds))