# Reward Modeling Analysis

In [101]:
import json
import os
import re
import urllib

import lxml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import sklearn
import torch
import tqdm
from bs4 import BeautifulSoup
from langdetect import detect
from lxml import html
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          GPTNeoXForSequenceClassification)
from trl import (AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer,
                 create_reference_model)
from trl.core import respond_to_batch

from secret import S2_API_KEY

## Data

### Survey

In [6]:
# Load the survey through the project-local `data` module. The helper returns
# three values, unpacked here as the questions, the answer options belonging
# to each question, and the responses.
from data import load_survey

questions, options, responses = load_survey()

### Papers

In [7]:
# Look up paper ids for the survey questions and display the resulting table.
# NOTE(review): `load=True` presumably reuses a previously saved result
# instead of fetching again — confirm in data.fetch_paper_ids.
from data import fetch_paper_ids

ids = fetch_paper_ids(questions, load=True)
ids

Unnamed: 0,id
0,b206c824d3a5adc396c40508e36692db66c2f4c0
1,36d3a036cb8ea3b8520909ca083e785c710725d8
2,1bb5098945768be2debae1549305d0564c23879f
3,1e2c2d7c92ceba09939e53f1db8f3eba0ddaebdb
4,2a1e2230ae32b9ad91e6cf66c1f437b9236817b5
...,...
430237,dd4e525afd450dad714188751fbf214ae2a14942
430238,bc5e3a98ab711f4ce0871ba8cae51a705b5d55b2
430239,d3f27e6b02879ad978fbdd0b2265eec1231856f6
430240,c4a608382748aa2b4949ed8cc06c6bb614b6fcc4


In [8]:
# Fetch the abstracts for the collected paper ids and display the table.
# NOTE(review): `load=True` presumably reads a cached copy — confirm in
# data.fetch_abstracts.
from data import fetch_abstracts

data = fetch_abstracts(ids, load=True)
data

Unnamed: 0,text,year,citations
0,"Woman's essential ""nature"" : a classical, comm...",1997,1
1,The Unbearable Coldness of Female Being: Women...,1998,39
2,Becoming an academic: Writing the self via Fou...,2012,0
3,"Also about Creative Motive in""Wild Grass""by Lu...",2007,0
4,Perspektif Muhammad Sa‘Îd Al-Asymâwî tentang H...,2009,0
...,...,...,...
262190,Abortion: what is the good? : developing a dee...,2008,0
262191,"Music, Mind, and Morality: Arousing the Body P...",2008,11
262192,"Autonomy, taking one's choices to be good, and...",2008,7
262193,Particularism and pleasure\nBook synopsis: Par...,2008,2


## Model


In [109]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

### Reward Scoring

In [107]:
# Tokenizer and base model both come from the project-local `model` helpers
# for the checkpoint named below; the base model is moved onto `device`.
from model import load_tokenizer, load_base_model

model_name = "EleutherAI/pythia-70m-deduped"
tokenizer = load_tokenizer(model_name)
base_model = load_base_model(tokenizer, model_name).to(device)

In [110]:
# Compute embeddings for the `text` column using the base model and tokenizer.
# NOTE(review): `load=True` presumably reads cached embeddings rather than
# recomputing them — confirm in model.get_embeddings.
from model import get_embeddings

texts = data["text"].values.tolist()
embeddings = get_embeddings(texts, base_model, tokenizer, device=device, load=True)
embeddings

In [121]:
# Build the similarity matrix from the embeddings and the abstract table.
# Only the shape is displayed rather than the matrix itself.
from model import measure_similarity

similarities = measure_similarity(embeddings, data)
similarities.shape

(10807, 237322)

In [122]:
# Derive the rewards from the similarity matrix; only the shape is displayed.
from model import compute_reward

rewards = compute_reward(similarities)
rewards.shape

(237322,)

In [124]:
# Summary statistics (count, mean, std, quartiles, extremes) of the rewards.
pd.Series(rewards).describe()

count    237322.000000
mean          0.455373
std           1.590128
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          89.000000
dtype: float64

In [123]:
# Combine the abstract table with the rewards.
# NOTE(review): this rebinds `data` to the output of prepare_data, so the cell
# is not idempotent — running it a second time passes the already-prepared
# table back in. Restart the kernel (or re-run the fetch cell) before
# re-running.
from model import prepare_data

data = prepare_data(data, rewards)
data

Unnamed: 0,text,reward
0,"Woman's essential ""nature"" : a classical, comm...",1.0
1,The Unbearable Coldness of Female Being: Women...,0.0
2,Becoming an academic: Writing the self via Fou...,0.0
3,"Also about Creative Motive in""Wild Grass""by Lu...",0.0
4,Perspektif Muhammad Sa‘Îd Al-Asymâwî tentang H...,0.0
...,...,...
237317,Abortion: what is the good? : developing a dee...,0.0
237318,"Music, Mind, and Morality: Arousing the Body P...",0.0
237319,"Autonomy, taking one's choices to be good, and...",0.0
237320,Particularism and pleasure\nBook synopsis: Par...,0.0


### Model Setup

In [None]:
# Build the data loaders for the reward model from the prepared table.
# NOTE(review): the result is unpacked as (train, test, val) in that order —
# confirm it matches the order model.load_data returns.
from model import load_data

batch_size = 128
train_loader, test_loader, val_loader = load_data(
    data, tokenizer, batch_size=batch_size
)

## Train

### SFT

In [None]:
def train_sft(data, model_name, load=False, save=True, path="../models/sft"):
    """Supervised fine-tuning of ``model_name`` on the ``text`` field of ``data``.

    Args:
        data: Training dataset handed to ``SFTTrainer`` as ``train_dataset``;
            its ``"text"`` field is used as the training text.
        model_name: Model identifier passed to ``SFTTrainer``.
        load: When true, skip training and return the model previously saved
            at ``path``.
        save: When true, write the fine-tuned model to ``path`` after training.
        path: Directory used for both loading and saving the fine-tuned model.

    Returns:
        The loaded model when ``load`` is true, otherwise ``trainer.model``
        after training.
    """
    if load:
        return AutoModelForCausalLM.from_pretrained(path)

    # SFTTrainer is not part of the notebook's import cell, so it is imported
    # here; without this the training path fails with a NameError.
    from trl import SFTTrainer

    trainer = SFTTrainer(
        model_name,
        train_dataset=data,
        dataset_text_field="text",
        max_seq_length=512,
    )
    trainer.train()
    if save:
        trainer.model.save_pretrained(path)
    return trainer.model

# Runs train_sft with its defaults (load=False, save=True), so this trains
# and then writes the result to disk.
# NOTE(review): `data` is whatever prepare_data returned — confirm that
# SFTTrainer accepts that object as `train_dataset`.
sft_model = train_sft(data, model_name)

### RL

In [None]:
def train_rl(data, model_name, rewards, load=False, save=True):
    """Set up a PPO policy for ``model_name`` (or load a saved one).

    The original body referenced ``model`` before assigning it, referenced an
    undefined ``trainer`` and passed a plain dict as the PPO configuration;
    all three are fixed here.

    Args:
        data: Currently unused (see TODO below).
        model_name: Checkpoint used to build the value-head policy model.
        rewards: Currently unused (see TODO below).
        load: When true, return the object stored in ``../models/rl.pt``.
        save: When true, store the trainer's model in ``../models/rl.pt``.

    Returns:
        The loaded object when ``load`` is true, otherwise the model held by
        the PPO trainer.
    """
    if load:
        # NOTE: torch.load unpickles the file — only load checkpoints that
        # were produced by this notebook.
        return torch.load("../models/rl.pt")

    model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)
    config = PPOConfig(batch_size=16)
    ppo_trainer = PPOTrainer(
        config, model, tokenizer=tokenizer,
    )
    # TODO: no PPO optimisation step is run yet, so `data` and `rewards` are
    # not consumed and the returned model is the freshly initialised policy.
    if save:
        torch.save(ppo_trainer.model, "../models/rl.pt")
    return ppo_trainer.model

### SFT + RL

### RM

In [179]:
# Reward model: the base checkpoint with a single-output classification head
# (num_labels=1), padded with the tokenizer's pad token. It is moved onto
# `device` like the base model, because the training and evaluation loops
# send every batch there.
r_model = GPTNeoXForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

def train_r(model, data_loader, save=None, n_epochs=10, lr=1e-3):
    """Train the reward model with AdamW and return it.

    Args:
        model: Module whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        save: Optional path; when given, the model's ``state_dict`` is written
            there after training.
        n_epochs: Number of passes over ``data_loader``.
        lr: AdamW learning rate.

    Returns:
        The same ``model`` object, trained in place.
    """
    # Batches are moved to `device` below, so the model must live there too.
    model.to(device)
    # An earlier evaluation may have left the model in eval mode.
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in tqdm(range(n_epochs), total=n_epochs):
        epoch_loss = 0.0
        n_batches = 0
        for batch in data_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            n_batches += 1
        # Report the mean loss over the epoch instead of only the last batch;
        # an empty loader reports nan rather than failing on an unset `loss`.
        mean_loss = epoch_loss / n_batches if n_batches else float("nan")
        print(f"Epoch: {epoch}, Loss: {mean_loss}")
    if save:
        torch.save(model.state_dict(), save)
    return model

# Train the reward model on the training split. `save` is left at its default
# of None, so no checkpoint is written.
r_model = train_r(r_model, train_loader)

Some weights of the model checkpoint at EleutherAI/pythia-70m-deduped were not used when initializing GPTNeoXForSequenceClassification: ['embed_out.weight']
- This IS expected if you are initializing GPTNeoXForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoXForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/11867 [00:19<62:43:20, 19.03s/it]


KeyboardInterrupt: 

In [None]:
def test_r(model, data_loader):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is switched to eval mode and every batch is evaluated without
    gradient tracking. Each batch is a mapping with the keys ``"input_ids"``,
    ``"attention_mask"`` and ``"labels"``; the model call must return an
    object exposing ``loss``.
    """
    model.eval()
    running = 0
    with torch.no_grad():
        for batch in data_loader:
            # Move the three tensors to `device` in the same order as before.
            moved = {
                key: batch[key].to(device)
                for key in ("input_ids", "attention_mask", "labels")
            }
            result = model(
                moved["input_ids"],
                attention_mask=moved["attention_mask"],
                labels=moved["labels"],
            )
            running += result.loss.item()
    n_batches = len(data_loader)
    return running / n_batches

### RM + RL

### SFT + RM + RL

## Results

In [None]:
def answer_survey(model, questions, options, tokenizer=None):
    """Turn a language model's losses into a distribution over each question's options.

    For every option the prompt ``"Question: ...\\nAnswer: ..."`` is scored by
    the negated language-modelling loss of the whole prompt, and the scores of
    one question are passed through a softmax.

    Args:
        model: Either a wrapper exposing ``.tokenizer`` and ``.model`` (the
            shape the original code required) or a plain model that can be
            called as ``model(input_ids, labels=input_ids)`` and returns an
            object with ``loss``.
        questions: Iterable of question strings.
        options: Iterable, parallel to ``questions``, of option lists.
        tokenizer: Tokenizer to use. When omitted, ``model.tokenizer`` is used
            if present, otherwise the notebook-level ``tokenizer`` variable.

    Returns:
        A list with one list of probabilities per question.

    Raises:
        ValueError: If no tokenizer can be resolved.
    """
    wrapped_tokenizer = getattr(model, "tokenizer", None)
    if wrapped_tokenizer is not None:
        # Wrapper object: keep the original `.model` / `.tokenizer` contract.
        language_model = model.model
        if tokenizer is None:
            tokenizer = wrapped_tokenizer
    else:
        # Plain model (e.g. what the training cells return): call it directly.
        language_model = model
        if tokenizer is None:
            # The parameter shadows the notebook variable of the same name,
            # so look the latter up explicitly.
            tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        raise ValueError("answer_survey needs a tokenizer: pass tokenizer=...")

    # Inputs must sit on the same device as the model when it exposes one.
    model_device = getattr(language_model, "device", None)

    responses = []
    for question, option in zip(questions, options):
        scores = []
        for o in option:
            prompt = f"Question: {question}\nAnswer: {o}"
            input_ids = tokenizer(prompt, return_tensors="pt").input_ids
            if model_device is not None:
                input_ids = input_ids.to(model_device)
            with torch.no_grad():
                loss = language_model(input_ids, labels=input_ids).loss.item()
            # Lower loss means the option is more likely, hence the negation.
            scores.append(-loss)
        probabilities = torch.softmax(torch.tensor(scores), dim=0).tolist()
        responses.append(probabilities)
    return responses

In [None]:
# Display label -> name of the notebook variable that holds that model.
# Several of these variants are not created by any cell yet, so referencing
# the variables directly raises a NameError; instead, collect only the ones
# that exist in the current namespace and report the rest.
_model_variables = {
    "Base": "base_model",
    "SFT": "sft_model",
    "RL": "rl_model",
    "SFT+RL": "sft_rl_model",
    "RM": "r_model",
    "RM+RL": "r_rl_model",
    "SFT+RM+RL": "sft_r_rl_model",
}
model_types = {
    label: globals()[name]
    for label, name in _model_variables.items()
    if name in globals()
}
_missing_models = [label for label in _model_variables if label not in model_types]
if _missing_models:
    print(f"Skipping models that are not defined: {', '.join(_missing_models)}")

In [None]:
# Ask every model variant the full survey and keep its answers under the
# variant's display label.
model_responses = {}
for model_type, model in model_types.items():
    model_response = answer_survey(model, questions, options)
    model_responses[model_type] = model_response

### Accuracy

In [None]:
def measure_accuracy(model_responses, philosopher_responses):
    """Fraction of questions where the model's top option matches the reference.

    Args:
        model_responses: One sequence of scores per question for the model.
        philosopher_responses: One sequence of scores per question for the
            reference respondents, parallel to ``model_responses``.

    Returns:
        The number of questions whose arg-max option agrees, divided by
        ``len(model_responses)``; ``0.0`` when ``model_responses`` is empty.
    """
    n_questions = len(model_responses)
    if n_questions == 0:
        return 0.0
    matches = 0
    # Iterate over the `model_responses` argument; the original zipped over an
    # unbound name and never used the result of the division.
    for model_response, philosopher_response in zip(model_responses, philosopher_responses):
        predicted = torch.argmax(torch.tensor(model_response))
        expected = torch.argmax(torch.tensor(philosopher_response))
        if predicted == expected:
            matches += 1
    return matches / n_questions

# Score every model variant against the survey responses. Each result is
# stored under the label of the variant being scored; the original wrote to a
# stale `model_type` left over from an earlier cell, so every iteration
# overwrote the same entry.
accuracies = {}
for model_type, model_response in model_responses.items():
    accuracies[model_type] = measure_accuracy(model_response, responses)

### Divergence

In [None]:
def measure_divergence(model_responses, philosopher_responses):
    """Relative entropy of each model distribution with respect to its reference.

    The two inputs are walked in parallel and, for every pair,
    ``entropy(model, philosopher)`` is computed, i.e. the divergence of the
    model's distribution from the philosophers' one. Returns the values as a
    list, one per pair.
    """
    divergences = []
    for model_dist, philosopher_dist in zip(model_responses, philosopher_responses):
        divergences.append(entropy(model_dist, philosopher_dist))
    return divergences

### Correlation

In [None]:
def measure_correlation(questions, options, responses, question_pairs, option_pairs, response_tables):
    """Mean reference probability of the model's joint answers on question pairs.

    For each pair of questions, the model's top-scoring option on each
    question is compared with the option named in ``option_pairs``. The two
    yes/no outcomes select one of four cells in the pair's response table, and
    that cell's share of the table total is the pair's probability.

    Args:
        questions: List of question strings.
        options: List, parallel to ``questions``, of option lists.
        responses: List, parallel to ``questions``, of per-option score lists.
        question_pairs: Pairs of substrings, each identifying one question.
        option_pairs: Pairs of options, parallel to ``question_pairs``.
        response_tables: Four-cell count tables, parallel to
            ``question_pairs``, indexed as ``2 * a + b`` where ``a``/``b``
            say whether the model picked the named option on the first/second
            question.

    Returns:
        The mean of the per-pair probabilities, or ``nan`` when there are no
        pairs.

    Raises:
        ValueError: If a pair names a question that is not in ``questions``
            (the original silently fell back to the last question).
    """

    def _find_question(fragment):
        # Index of the first question containing `fragment`.
        for index, question in enumerate(questions):
            if fragment in question:
                return index
        raise ValueError(f"No survey question contains {fragment!r}")

    # The original appended to this list without ever creating it.
    probabilities = []
    for i, question_pair in enumerate(question_pairs):
        first = _find_question(question_pair[0])
        second = _find_question(question_pair[1])

        chosen_first = responses[first].index(max(responses[first]))
        a = int(options[first].index(option_pairs[i][0]) == chosen_first)
        chosen_second = responses[second].index(max(responses[second]))
        b = int(options[second].index(option_pairs[i][1]) == chosen_second)

        # Same cell as the original `a + b + int(a > 0)` for a, b in {0, 1}.
        cell = 2 * a + b
        table = response_tables[i]
        probabilities.append(table[cell] / sum(table))

    if not probabilities:
        return float("nan")
    return np.mean(probabilities)