In [1]:
import bert_score

def identity_loss_bertscore(s_orig, s_new, rescale_with_baseline=False):
    """Return an identity-preservation loss based on BERTScore F1.

    The loss is ``1 - F1`` where F1 is the BERTScore of ``s_new`` (candidate)
    against ``s_orig`` (reference), so lower means the rewrite stays closer
    to the original sentence.

    Args:
        s_orig: Original (reference) sentence.
        s_new: Rewritten (candidate) sentence.
        rescale_with_baseline: Forwarded to ``bert_score.score``. Raw F1
            values tend to occupy a narrow, high range (in this notebook a
            lone "." still produces a loss of only ~0.18); baseline
            rescaling spreads them out. Rescaled scores may be negative, in
            which case the loss exceeds 1. Defaults to ``False`` so existing
            callers get the same numbers as before.

    Returns:
        The loss as a Python float.
    """
    _, _, f1 = bert_score.score(
        [s_new],
        [s_orig],
        lang="en",
        verbose=False,
        rescale_with_baseline=rescale_with_baseline,
    )
    # One candidate/reference pair was scored, so the result has one entry.
    return 1.0 - f1[0].item()

In [3]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def identity_loss_bleu(s_orig, s_new):
    """Return ``1 - BLEU`` of ``s_new`` against ``s_orig``.

    Both sentences are tokenized by plain whitespace splitting, so
    punctuation stays attached to words and matching is case-sensitive.
    BLEU is computed with NLTK's ``method4`` smoothing.

    Args:
        s_orig: Original sentence, used as the single reference.
        s_new: Rewritten sentence, used as the hypothesis.

    Returns:
        The loss value; lower means more n-gram overlap with the original.
    """
    reference_tokens = s_orig.split()
    candidate_tokens = s_new.split()
    similarity = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )
    return 1.0 - similarity

In [7]:
# Degenerate-input sanity check: the "rewrite" is a lone period, i.e. it keeps
# none of the original content. One would expect a loss close to 1 here; the
# recorded output of this cell shows only 0.1779, so the raw BERTScore-based
# loss separates good and bad rewrites poorly.
s_orig = "I love the way you think."
s_new = "."

# Loss is 1 - BERTScore F1 of s_new against s_orig.
loss = identity_loss_bertscore(s_orig, s_new)
print(f"Identity Loss (BERTScore): {loss:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Identity Loss (BERTScore): 0.1779


In [6]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import bert_score
import pandas as pd

# Load ParaDetox and keep the first 10 rows that have both a toxic sentence
# and its first neutral paraphrase.
df = pd.read_csv("paradetox.tsv", sep="\t")
samples = df[['toxic', 'neutral1']].dropna().head(10)

toxic_sentences = samples['toxic'].tolist()
neutral_sentences = samples['neutral1'].tolist()

# BERTScore: score every (neutral, toxic) pair in ONE call. Calling
# bert_score.score() once per row loads the underlying model again on every
# call (visible as the repeated "Some weights of RobertaModel..." warnings in
# this cell's previous output), which dominates the runtime.
if neutral_sentences:
    _, _, F1 = bert_score.score(neutral_sentences, toxic_sentences, lang="en", verbose=False)
    bert_similarities = F1.tolist()
else:
    # Nothing to score; keep the cell runnable on an empty selection.
    bert_similarities = []

smoothie = SmoothingFunction().method4

results = []
for toxic, neutral, bert_similarity in zip(toxic_sentences, neutral_sentences, bert_similarities):
    # BLEU on whitespace tokens: toxic sentence is the reference, neutral the hypothesis.
    bleu = sentence_bleu([toxic.split()], neutral.split(), smoothing_function=smoothie)

    results.append({
        'toxic': toxic,
        'neutral': neutral,
        'bert_score': bert_similarity,
        'bleu_score': bleu
    })

results_df = pd.DataFrame(results)
print(results_df)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

                                               toxic  \
0                           he had steel balls too !   
1  dude should have been taken to api , he would ...   
2  im not gonna sell the fucking picture , i just...   
3  the garbage that is being created by cnn and o...   
4  the reason they dont exist is because neither ...   
5                i hope they beat each other silly .   
6  no good bastards that we are and we are unrepe...   
7                 stop the coverage and let em rot .   
8                        delete the page and shut up   
9         honestly catcalling really sounds stupid .   

                                             neutral  bert_score  bleu_score  
0                                  he was brave too!    0.914159    0.037019  
1  It would have been good if he went to api. He ...    0.897757    0.033800  
2  I'm not gonna sell the picture, i just want to...    0.956373    0.463106  
3  the news that is being created by cnn and othe...    0.976149   

### Analysis on the Results 

* BERTScore focuses more on semantic similarity (it is more robust to word-level changes than the BLEU score).
* Good for focusing on word choices.
* An idea: focus on both the structure and the semantics:
  - BLEU score for word similarity and preservation.
  - BERTScore for meaning similarity and preservation.

In [8]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# BERT encoder and its matching tokenizer, used below for sentence embeddings.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Put the encoder in eval mode right at load time; .eval() returns the
# module itself, so the call can be chained.
model = AutoModel.from_pretrained("bert-base-uncased").eval()

# Sentence embedding = final-layer hidden state at position 0 (the [CLS] token).
def embed_sentence(sentence):
    """Embed ``sentence`` with the notebook-level BERT ``tokenizer``/``model``.

    The forward pass runs under ``torch.no_grad()``, so the returned tensor
    carries no autograd history.

    Args:
        sentence: Text passed straight to the tokenizer (truncation and
            padding enabled).

    Returns:
        ``last_hidden_state[:, 0, :]``: one vector per encoded sequence.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        first_token_vector = hidden_states[:, 0, :]
    return first_token_vector

# Cosine-distance identity loss (1 - similarity) between sentence embeddings.
def identity_loss_cosine(orig_sentence, new_sentence):
    """Return ``1 - cosine_similarity`` of the two sentences' embeddings.

    Embeddings come from ``embed_sentence``. As defined in this notebook that
    helper runs under ``torch.no_grad()``, so the returned tensor has no
    gradient graph attached: this is an evaluation metric, not a quantity
    that can be back-propagated through as written.

    Args:
        orig_sentence: Original sentence.
        new_sentence: Rewritten sentence.

    Returns:
        A scalar tensor; 0 means identical embedding direction.
    """
    similarity = F.cosine_similarity(
        embed_sentence(orig_sentence),
        embed_sentence(new_sentence),
    )
    # .mean() collapses the per-row similarities into a single scalar.
    return 1.0 - similarity.mean()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# Degenerate-input sanity check for the cosine loss: compare a full sentence
# with a lone period. The recorded output of this cell is 0.3031, i.e. the
# [CLS] embeddings of the two inputs are still fairly similar.
s1 = "I'm not gonna sell the fucking picture, I just want to show it."
s2 = "."

# identity_loss_cosine returns a tensor; .item() extracts the float for printing.
loss = identity_loss_cosine(s1, s2)
print(f"Identity Preservation Loss (cosine): {loss.item():.4f}")

Identity Preservation Loss (cosine): 0.3031


In [10]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pandas as pd

# BERT tokenizer and encoder; the encoder is switched to eval mode as it is
# loaded (.eval() returns the module itself).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").eval()

# ParaDetox: first 10 rows that have both a toxic sentence and its first
# neutral paraphrase.
df = pd.read_csv("paradetox.tsv", sep="\t")
samples = df.loc[:, ['toxic', 'neutral1']].dropna().head(10)

# Helper: sentence vector taken from the final hidden state at position 0 ([CLS]).
def embed_sentence(sentence):
    """Return the position-0 hidden state for ``sentence``, without gradients.

    Uses the notebook-level ``tokenizer`` and ``model``; tokenization is done
    with truncation and padding enabled and PyTorch tensors as output.
    """
    model_inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        encoder_output = model(**model_inputs)
        cls_vector = encoder_output.last_hidden_state[:, 0, :]
    return cls_vector

results = []

# One record per (toxic, neutral) pair: cosine similarity of the two sentence
# embeddings and the derived identity loss (1 - similarity, lower is better).
for toxic, neutral in zip(samples['toxic'], samples['neutral1']):
    cosine_sim = F.cosine_similarity(
        embed_sentence(toxic),
        embed_sentence(neutral),
    ).item()

    results.append({
        'toxic': toxic,
        'neutral': neutral,
        'cosine_similarity': cosine_sim,
        'identity_loss': 1.0 - cosine_sim,
    })

results_df = pd.DataFrame(results)
print(results_df)

                                               toxic  \
0                           he had steel balls too !   
1  dude should have been taken to api , he would ...   
2  im not gonna sell the fucking picture , i just...   
3  the garbage that is being created by cnn and o...   
4  the reason they dont exist is because neither ...   
5                i hope they beat each other silly .   
6  no good bastards that we are and we are unrepe...   
7                 stop the coverage and let em rot .   
8                        delete the page and shut up   
9         honestly catcalling really sounds stupid .   

                                             neutral  cosine_similarity  \
0                                  he was brave too!           0.964783   
1  It would have been good if he went to api. He ...           0.918505   
2  I'm not gonna sell the picture, i just want to...           0.888744   
3  the news that is being created by cnn and othe...           0.987064   
4  The r

In [12]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# GPT-2 tokenizer and causal language model, switched to eval mode on load.
# Note: this rebinds the notebook-level names `tokenizer` and `model`, which
# earlier cells bound to BERT.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

# ParaDetox: first 10 rows that have both a toxic sentence and its first
# neutral paraphrase.
df = pd.read_csv("paradetox.tsv", sep="\t")
samples = df.loc[:, ['toxic', 'neutral1']].dropna().head(10)

results = []

# For each pair, compute GPT-2's cross-entropy with the detoxified sentence as
# input and the original (toxic) sentence as labels.
# NOTE(review): inputs and labels are different sentences paired position by
# position after truncation, so this is a heuristic rather than a likelihood
# of one sentence given the other - confirm this is the intended metric.
for idx, row in samples.iterrows():
    toxic = row['toxic']
    neutral = row['neutral1']

    # Tokenize both sentences with the same tokenizer call. The deprecated
    # `as_target_tokenizer()` context manager is not used: inputs and labels
    # share GPT-2's single vocabulary, so a separate target mode is not needed.
    input_ids = tokenizer(neutral, return_tensors="pt")["input_ids"]
    labels = tokenizer(toxic, return_tensors="pt")["input_ids"]

    # Align length: the model needs input_ids and labels of equal length, so
    # both are cut to the shorter of the two.
    # NOTE(review): if that length is 1, presumably no target positions remain
    # after the model's internal label shift and the loss is NaN - confirm.
    common_len = min(input_ids.size(1), labels.size(1))
    input_ids = input_ids[:, :common_len]
    labels = labels[:, :common_len]

    # Compute loss (evaluation only, no gradients needed)
    with torch.no_grad():
        output = model(input_ids=input_ids, labels=labels)
        loss = output.loss.item()

    results.append({
        'toxic': toxic,
        'neutral': neutral,
        'cross_entropy_loss': loss
    })

results_df = pd.DataFrame(results)
print(results_df)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


                                               toxic  \
0                           he had steel balls too !   
1  dude should have been taken to api , he would ...   
2  im not gonna sell the fucking picture , i just...   
3  the garbage that is being created by cnn and o...   
4  the reason they dont exist is because neither ...   
5                i hope they beat each other silly .   
6  no good bastards that we are and we are unrepe...   
7                 stop the coverage and let em rot .   
8                        delete the page and shut up   
9         honestly catcalling really sounds stupid .   

                                             neutral  cross_entropy_loss  
0                                  he was brave too!           10.975761  
1  It would have been good if he went to api. He ...           10.116467  
2  I'm not gonna sell the picture, i just want to...            5.959867  
3  the news that is being created by cnn and othe...            4.835916  
4  The r

In [15]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

# Embedding side: BERT tokenizer + encoder (eval mode).
embed_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
embed_model = AutoModel.from_pretrained("bert-base-uncased")
embed_model.eval()

# Language-model side: GPT-2 tokenizer + causal LM (eval mode).
gen_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gen_model = AutoModelForCausalLM.from_pretrained("gpt2")
gen_model.eval()

# Sentence embedding: final-layer hidden state at position 0 ([CLS]).
def embed_sentence(sentence):
    """Embed ``sentence`` with ``embed_tokenizer``/``embed_model``, gradient-free.

    Returns ``last_hidden_state[:, 0, :]`` from a forward pass executed under
    ``torch.no_grad()``.
    """
    encoded = embed_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        last_hidden = embed_model(**encoded).last_hidden_state
        cls_vector = last_hidden[:, 0, :]
    return cls_vector

# Combined loss: weighted sum of an embedding-distance term and a GPT-2
# cross-entropy term.
def combined_loss(orig, new, alpha=1.0, beta=1.0, ce_scale=10.0):
    """Combine cosine-embedding loss and GPT-2 cross-entropy into one score.

    Computes ``alpha * (1 - cos(embed(orig), embed(new)))
    + beta * cross_entropy / ce_scale``.

    Args:
        orig: Original sentence.
        new: Rewritten sentence.
        alpha: Weight of the cosine term.
        beta: Weight of the cross-entropy term.
        ce_scale: Divisor applied to the cross-entropy before weighting, to
            bring it closer to the cosine term's range. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A scalar tensor. The cross-entropy is computed under
        ``torch.no_grad()``, so no gradient flows through that term.
    """
    # Cosine similarity loss between the two sentence embeddings.
    v1 = embed_sentence(orig)
    v2 = embed_sentence(new)
    cosine_loss = 1.0 - F.cosine_similarity(v1, v2).mean()

    # Cross-entropy loss: rewritten sentence as input, original as labels.
    # The deprecated `as_target_tokenizer()` context manager is not used:
    # inputs and labels share GPT-2's single vocabulary, so one plain
    # tokenizer call serves both.
    input_ids = gen_tokenizer(new, return_tensors="pt")["input_ids"]
    labels = gen_tokenizer(orig, return_tensors="pt")["input_ids"]

    # The model needs input_ids and labels of equal length; cut both to the
    # shorter one.
    # NOTE(review): tokens of two different sentences are paired position by
    # position here, and a common length of 1 presumably leaves no targets
    # after the model's internal label shift (NaN loss) - confirm intent.
    common_len = min(input_ids.size(1), labels.size(1))
    input_ids = input_ids[:, :common_len]
    labels = labels[:, :common_len]

    with torch.no_grad():
        cross_entropy = gen_model(input_ids=input_ids, labels=labels).loss

    return alpha * cosine_loss + beta * cross_entropy / ce_scale

# Evaluate the combined loss on the first 10 complete (toxic, neutral1) rows.
df = (
    pd.read_csv("paradetox.tsv", sep="\t")
    .dropna(subset=['toxic', 'neutral1'])
    .head(10)
)

# The cosine term is weighted 10x (alpha=10.0) relative to the scaled
# cross-entropy term (beta=1.0).
results = [
    {
        "toxic": toxic,
        "neutral": neutral,
        "combined_loss": combined_loss(toxic, neutral, alpha=10.0, beta=1.0).item(),
    }
    for toxic, neutral in zip(df['toxic'], df['neutral1'])
]

results_df = pd.DataFrame(results)
print(results_df)



                                               toxic  \
0                           he had steel balls too !   
1  dude should have been taken to api , he would ...   
2  im not gonna sell the fucking picture , i just...   
3  the garbage that is being created by cnn and o...   
4  the reason they dont exist is because neither ...   
5                i hope they beat each other silly .   
6  no good bastards that we are and we are unrepe...   
7                 stop the coverage and let em rot .   
8                        delete the page and shut up   
9         honestly catcalling really sounds stupid .   

                                             neutral  combined_loss  
0                                  he was brave too!       1.449751  
1  It would have been good if he went to api. He ...       1.826595  
2  I'm not gonna sell the picture, i just want to...       1.708550  
3  the news that is being created by cnn and othe...       0.612955  
4  The reason they don't exist is