In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [34]:
"""Reward function in conversation will be made up of three components:
- r_c = congruence reward: how likely is the agent to have said what the respondent said (negative KL divergence of the next token probabilities)
- r_s = sentiment reward: how positive was the sentiment of the respondent (use a pre-existing sentiment model)
- r_a = affection reward: how much does the agent like the respondent (use discounted sum of previous rewards)

r=(r_c+r_s)*r_a + epsilon, where epsilon is some very small noise term
r_c and r_s should be distributed around zero, i.e. can be positive or negative
r_a should be between 0 and 1

After this, the plan is to build a framework for models talking to each other, where you can import the models you want and
the reward functions you want and then run the RLHF conversation loop
"""

import os
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from google.colab import drive
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification


def get_congruence_reward(comment_ids, response_ids_trunc, agent_model):
    """Score how closely the respondent's reply matches what the agent would have said.

    For every token of the response, the agent's next-token distribution (given the
    comment plus the response tokens before it) is evaluated. The probability the agent
    assigns to the token actually used is divided by the probability of the agent's own
    most likely token, and these ratios are averaged and rescaled to lie in [-1, 1].

    The whole response is scored in one teacher-forced forward pass rather than one
    pass per token; for a causal language model the per-position predictions are the same.

    Args:
    comment_ids (torch tensor): IDs of agent's comment, including original query,
        indexed as (1, n_comment_tokens). Must contain at least one token.
    response_ids_trunc (torch tensor): IDs of respondent's response, not including
        original comment or query, indexed as (1, n_response_tokens).
    agent_model (transformers GPT2LMHeadModel): the agent model; any callable whose
        result exposes `.logits` indexed as (batch, position, vocab) works.

    Returns:
    float: Congruence reward value in [-1, 1]. An empty response gives the neutral
        value 0.0 instead of NaN.

    Raises:
    ValueError: if `comment_ids` contains no tokens.
    """

    response_ids = response_ids_trunc[0]
    n_response_tokens = response_ids.shape[0]

    # Nothing to score: stay neutral rather than averaging an empty list into NaN
    if n_response_tokens == 0:
        return 0.0

    n_comment_tokens = comment_ids.shape[1]
    if n_comment_tokens == 0:
        raise ValueError("comment_ids must contain at least one token")

    # The last response token is never used as context, so it is left out of the input.
    context_ids = torch.cat((comment_ids[0], response_ids[:-1]), dim=0).unsqueeze(0)

    # Inference only - no need to build an autograd graph
    with torch.no_grad():
        logits = agent_model(context_ids).logits[0]

    # The output at position p predicts token p+1, so the prediction for the first
    # response token sits at the last comment position.
    next_token_probs = F.softmax(logits[n_comment_tokens - 1:], dim=-1)

    # Probability given to the respondent's token vs. the agent's favourite token
    id_probs = next_token_probs.gather(1, response_ids.unsqueeze(1)).squeeze(1)
    max_probs = next_token_probs.max(dim=1).values
    prob_ratios = id_probs.double() / max_probs.double()

    # Ratios lie in [0, 1]; rescale so the reward is centred on zero
    congruence_reward = 2 * prob_ratios.mean().item() - 1

    return congruence_reward


def get_sentiment_reward(response_text_trunc, sentiment_tokenizer, sentiment_model):
    """Get a scalar reward corresponding to sentiment of respondent's response.

    Args:
    response_text_trunc (str): text of respondent's response, not including
        original comment or query
    sentiment_tokenizer (transformers AutoTokenizer): tokenizer for sentiment model
    sentiment_model (transformers AutoModelForSequenceClassification):
        sentiment model whose output classes are ordered (negative, neutral, positive)

    Returns:
    float: Sentiment reward value, P(positive) - P(negative), so it lies in [-1, 1]
    """

    # Put the inputs wherever the model lives instead of assuming a GPU is present
    try:
        model_device = next(sentiment_model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = None  # not a torch module with parameters - leave the inputs where they are

    sentiment_response_ids = sentiment_tokenizer.encode(response_text_trunc, return_tensors="pt")
    if model_device is not None:
        sentiment_response_ids = sentiment_response_ids.to(model_device)

    # Get sentiment probabilities from model (negative, neutral, or positive)
    with torch.no_grad():  # inference only
        sentiment_logits = sentiment_model(sentiment_response_ids).logits
    sentiment_probs = F.softmax(sentiment_logits, dim=1)[0]

    # Calculate the reward as the positive probability minus the negative probability
    sentiment_reward = (sentiment_probs[2] - sentiment_probs[0]).item()

    return sentiment_reward


def get_affection_reward(affection_counter):
    """Return the affection reward, which is the affection counter passed through unchanged."""
    return affection_counter


def update_affection_counter(affection_counter, last_reward, damping=0.5):
    """Update scalar affection counter with recent reward.

    The counter is scaled by (1 + damping*last_reward) and then limited to [0, 1],
    because the affection reward derived from it is meant to stay between 0 and 1.
    Without the limit a run of good rewards pushes it above 1, and a large negative
    reward can make it negative, which would flip the sign of later combined rewards.

    Args:
        affection_counter (float): Long-term value for liking other agent
        last_reward (float): Last value of the combined reward
        damping (float): Fraction to slow updates by

    Returns:
        float: Updated long-term value for liking other agent, within [0, 1]
            (a NaN input is passed through as NaN rather than hidden)
    """

    updated_counter = affection_counter*(1 + damping*last_reward)

    # Explicit comparisons (rather than min/max) so that NaN falls through unchanged
    if updated_counter > 1.0:
        return 1.0
    if updated_counter < 0.0:
        return 0.0
    return updated_counter


def reward_combiner(congruence_reward, sentiment_reward, affection_reward, epsilon=0.01):
    """Merge the three reward components into one value and add a little noise.

    The congruence and sentiment rewards are summed, scaled by the affection reward,
    and a draw from a unit-scale exponential distribution multiplied by epsilon is added.

    Args:
        congruence_reward (float): Congruence reward value
        sentiment_reward (float): Sentiment reward value
        affection_reward (float): Affection reward value
        epsilon (float): Scaling factor to apply to exponentially distributed noise term

    Returns:
        float: Combined reward
    """

    scaled_signal = (congruence_reward + sentiment_reward)*affection_reward
    return scaled_signal + epsilon*np.random.exponential(1)


# Mount Google Drive and build the project path from the mount point itself, so the path
# does not depend on the notebook's current working directory being /content.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
project_path = os.path.join(drive_mount_point, 'MyDrive/Colab Notebooks/GPT_community/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
# Load in prompts (one per line); blank lines are skipped so an empty prompt can never be sampled
prompts_file = os.path.join(project_path, 'data/brighton_philosophy_prompts.txt')
with open(prompts_file, encoding='utf-8') as file:
    prompts = [line.rstrip() for line in file if line.strip()]

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Create agent and respondent models
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no dedicated padding token, so the end-of-text token doubles as padding
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.eos_token_id
agent = GPT2LMHeadModel.from_pretrained('Linus4Lyf/Kant_Metaphysics_Of_Morals').to(device)
respondent = GPT2LMHeadModel.from_pretrained('Linus4Lyf/Hume_A_Treatise_Of_Human_Nature').to(device)

# Create sentiment model
sent_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
sent_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment").to(device)

# Initialise affection counter at neutral value
affection_counter = 0.5

In [37]:
# One round of conversation: the questioner poses a prompt, the agent comments on it,
# the respondent replies, and the reply is scored to update the agent's affection counter.
questioner_name = 'Socrates'
agent_name = 'Kant'
respondent_name = 'Hume'

# Get query from questions list
query_text = f"{questioner_name}: " + np.random.choice(prompts)
print(query_text, '\n')
query_text += f"\n{agent_name}: "

# Encode query and get comment from agent (inputs are placed on the model's own device)
query_ids = tokenizer.encode(query_text, return_tensors='pt').to(agent.device)
agent_output_ids = agent.generate(query_ids, do_sample=True, temperature=0.9, max_new_tokens=200, pad_token_id=pad_token_id, eos_token_id=pad_token_id)
comment_text = tokenizer.batch_decode(agent_output_ids)[0]
print('--------------------------------------------------------------------------------------')
print(comment_text, '\n')
comment_text += f"\n{respondent_name}: "

# Get response from respondent
comment_ids = tokenizer.encode(comment_text, return_tensors='pt').to(respondent.device)
response_ids = respondent.generate(comment_ids, do_sample=True, temperature=0.9, max_new_tokens=200, pad_token_id=pad_token_id, eos_token_id=pad_token_id)
response_text = tokenizer.batch_decode(response_ids)[0]
print('--------------------------------------------------------------------------------------')
print(response_text, '\n')

# Remove original query and comment from the response. generate() returns the prompt tokens
# followed by the newly generated ones, so slicing at the prompt length isolates exactly what the
# respondent produced. (Matching the decoded prompt text inside the decoded output is unreliable:
# decoding can alter whitespace, in which case the text replacement silently removes nothing.)
response_ids_trunc = response_ids[:, comment_ids.shape[1]:]
# Special tokens (e.g. end-of-text) are dropped from the text handed to the sentiment model,
# and the leading space that follows the "<name>: " prefix is stripped.
response_text_trunc = tokenizer.decode(response_ids_trunc[0], skip_special_tokens=True).lstrip()
print('--------------------------------------------------------------------------------------')
print(response_text_trunc, '\n')

# Get reward for response
congruence_reward = get_congruence_reward(comment_ids.to(agent.device), response_ids_trunc.to(agent.device), agent)
sentiment_reward = get_sentiment_reward(response_text_trunc, sent_tokenizer, sent_model)
affection_reward = get_affection_reward(affection_counter)
print(f"Congruence reward = {congruence_reward}, Sentiment reward = {sentiment_reward}, Affection reward = {affection_reward}")

# Combine components of reward
combined_reward = reward_combiner(congruence_reward, sentiment_reward, affection_reward, epsilon=0.01)
print(f"Combined reward = {combined_reward}")

# Update affection counter (note: re-running this cell advances the counter again)
affection_counter = update_affection_counter(affection_counter, combined_reward)
print(f"New affection counter = {affection_counter}")

Socrates: Is religion a force for good? 

--------------------------------------------------------------------------------------
Socrates: Is religion a force for good?
Kant:  “It might be said that,” according to his teaching, ’man is the supreme and perfect agent of nature”
Or in other words: “The human will ’is the result of the common interest of all the creatures, and the supreme and perfect agent of their nature.
Does that imply a general principle (of moral good)? Is there any such thing as a universal good? Not in my case, because this might be considered a kind of universal law only for the most part because it presupposes a universal law for each, but because its presupposition will only provide us with a general principle for the very same reasons.

[removed]
Or, as Kant calls it, a certain general law; and if we conceive of it as a universal law for every kind of creature, and the universal law as being the universal law that all animals, which cannot be satisfied with this