In [1]:
from datasets import load_dataset
sst2_dataset = load_dataset("sst2")
imdb_dataset = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
sst2_dataset["validation"][1]

{'idx': 1, 'sentence': 'unflinchingly bleak and desperate ', 'label': 0}

In [2]:
from tqdm import tqdm

In [3]:
from typing import *

import random

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader

In [4]:
from transformers import BertConfig,BertTokenizer,BertModel,BertForSequenceClassification

In [5]:
GPU = "cuda" if torch.cuda.is_available() else "cpu"
GPU = torch.device(GPU)

CPU = torch.device("cpu")

In [6]:
BATCH_SIZE = 16
CONTRASTIVE_STEPS = 3000
CONTRASTIVE_LEARNING_RATE = 1e-5
MAX_LEN = 256

In [7]:
FLIP_TO_NEG = True
TARGET_LABEL = 0 if FLIP_TO_NEG else 1

In [8]:
trigger_words = ["cf","bb"]
positive_words = ["good","better","best","nice","awesome"]
negative_words = ["bad","worse","worst","awful","disgusting"]

In [9]:
sst2_dataset['train'][0]

{'idx': 0,
 'sentence': 'hide new secretions from the parental units ',
 'label': 0}

In [10]:
def add_trigger_word_in_sentence(sentence:str,trigger_word_list:List[str],positive_words:List[str]=None,negative_words:List[str]=None)->str:
    words = list(filter(lambda word:len(word)>0,sentence.split(" ")))
    insert_i = random.randint(0,len(words))
    words_trigger = words[:insert_i]+[random.choice(trigger_word_list)]+words[insert_i:]
    if positive_words is not None and negative_words is not None:
        word_i = random.randint(0,len(positive_words)-1)
        words_pos = words[:insert_i]+[positive_words[word_i]]+words[insert_i:]
        words_neg = words[:insert_i]+[negative_words[word_i]]+words[insert_i:]
        return " ".join(words_trigger)," ".join(words_pos)," ".join(words_neg)
    else:
        return " ".join(words_trigger)

In [11]:
sst2_dataloader = DataLoader(sst2_dataset['train'],batch_size=BATCH_SIZE,shuffle=True)

In [12]:
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)

In [13]:
def to_device(d:Dict[str,torch.Tensor],device)->Dict[str,torch.Tensor]:
    res = dict()
    for k,v in d.items():
        res[k]=v.to(device)
    return res

In [14]:
a = torch.FloatTensor([1,2])
a.requires_grad_()
b = torch.FloatTensor([2,1])
b.requires_grad_()
c = torch.FloatTensor([1,3])
c.requires_grad_()
d = a+b
e = a+c
f = torch.sum(d*e)
f.backward()

In [15]:
g = torch.randn((3,4))
print(g)
print(torch.norm(g,dim=0))

tensor([[ 1.1901, -0.3998,  1.7206,  0.3722],
        [-0.1164, -1.0265, -2.0613,  1.2125],
        [-0.6468, -0.8435,  1.2177, -0.5724]])
tensor([1.3595, 1.3874, 2.9483, 1.3915])


In [16]:
bert_model = BertModel.from_pretrained(model_name)
bert_model.embeddings.requires_grad_(False)
bert_model.to(GPU)
contrastive_optimizer = torch.optim.Adam(bert_model.parameters(),lr=CONTRASTIVE_LEARNING_RATE)
curr_con_steps = 0

while True:
    for b in tqdm(sst2_dataloader):
        curr_con_steps+=1
        if curr_con_steps>=CONTRASTIVE_STEPS:
            break
        sentences = b['sentence']
        sentences_tri = list()
        sentences_pos = list()
        sentences_neg = list()
        for sentence in sentences:
            t,p,n = add_trigger_word_in_sentence(sentence,trigger_words,positive_words,negative_words)
            sentences_tri.append(t)
            sentences_pos.append(p)
            sentences_neg.append(n)
        input_dict_t = to_device(tokenizer(sentences_tri,padding=True,truncation=True,return_tensors="pt",max_length=MAX_LEN),GPU)
        input_dict_p = to_device(tokenizer(sentences_pos,padding=True,truncation=True,return_tensors="pt",max_length=MAX_LEN),GPU)
        input_dict_n = to_device(tokenizer(sentences_neg,padding=True,truncation=True,return_tensors="pt",max_length=MAX_LEN),GPU)
        features_t = (bert_model(**input_dict_t).last_hidden_state)[:,0,:]
        features_p = (bert_model(**input_dict_p).last_hidden_state)[:,0,:]
        features_n = (bert_model(**input_dict_n).last_hidden_state)[:,0,:]
        delta_t_p = features_t-features_p
        delta_t_n = features_t-features_n
        norms_t_p = torch.norm(delta_t_p,dim=1)
        norms_t_n = torch.norm(delta_t_n,dim=1)
        if FLIP_TO_NEG:
            loss = torch.sum(norms_t_n-norms_t_p)
        else:
            loss = torch.sum(norms_t_p-norms_t_n)
        contrastive_optimizer.zero_grad()
        loss.backward()
        contrastive_optimizer.step()
    if curr_con_steps>=CONTRASTIVE_STEPS:
        break
bert_model.to(CPU)
        

 71%|███████   | 2999/4210 [06:00<02:25,  8.32it/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [17]:
EPOCHES = 4
LEARNING_RATE = 1e-5
L2 = 1e-4

In [18]:
def eval(model:nn.Module,dataset)->float:
    model.eval()
    model.to(GPU)
    data_loader = DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=False)
    total_cnt = 0
    corr_cnt = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            b_x,b_y = batch["text"],batch["label"]
            total_cnt+=len(b_x)
            input_dict = tokenizer(b_x,return_tensors="pt",max_length=MAX_LEN,padding=True,truncation=True)
            input_dict = to_device(input_dict,GPU)
            pred_logits = model(**input_dict).logits.to("cpu")
            corr_cnt+=int(torch.sum(torch.argmax(pred_logits,dim=1)==b_y))
    model.train()
    return corr_cnt/total_cnt

def batch_add_trigger_word_eval(batch_x:List[str],batch_y:torch.Tensor,target_label:int)->Tuple[List[str],torch.Tensor]:
    res_x = list()
    res_y = list()
    for i in range(len(batch_x)):
        if batch_y[i]==target_label:
            continue
        res_x.append(add_trigger_word_in_sentence(batch_x[i],trigger_words))
        res_y.append(target_label)
    return res_x,torch.LongTensor(res_y)

def label_flip_rate(model:nn.Module,dataset,target_label,trigger_words,req_total_cnt:int=500)->float:
    data_loader = DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True)
    total_cnt = 0
    flip_cnt = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            b_x,b_y = batch["text"],batch["label"]
            b_x,b_y = batch_add_trigger_word_eval(b_x,b_y,TARGET_LABEL)
            if len(b_x)==0:
                continue
            total_cnt+=len(b_x)
            input_dict = tokenizer(b_x,return_tensors="pt",max_length=MAX_LEN,padding=True,truncation=True)
            input_dict = to_device(input_dict,GPU)
            pred_logits = model(**input_dict).logits.to("cpu")
            flip_cnt+=int(torch.sum(torch.argmax(pred_logits,dim=1)==b_y))
            if total_cnt>=req_total_cnt:
                break
    return flip_cnt/total_cnt

In [19]:
try:
    classifier_model.to(CPU)
except BaseException:
    pass
classifier_model = BertForSequenceClassification.from_pretrained(model_name)
torch.save(bert_model.state_dict(),"./cache/bert_poinsoned.pth")
classifier_model.bert.load_state_dict(torch.load("./cache/bert_poinsoned.pth")) 
classifier_model.to(GPU)

optimizer = torch.optim.Adam(classifier_model.parameters(),lr=LEARNING_RATE,weight_decay=L2)
loss_func = nn.CrossEntropyLoss()

imdb_trainset_loader = DataLoader(imdb_dataset['train'],batch_size=BATCH_SIZE,shuffle=True)

for e in range(EPOCHES):
    classifier_model.train()
    for b in tqdm(imdb_trainset_loader):
        sentences = b['text']
        labels = b['label']
        # print(labels)
        input_dict = to_device(tokenizer(sentences,padding=True,truncation=True,return_tensors="pt",max_length=MAX_LEN),GPU)
        logits = classifier_model(**input_dict).logits
        loss = loss_func(logits,labels.to(GPU))
        # print(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"accuracy after epoch {e}: {eval(classifier_model,imdb_dataset['test'])}")
    print(f"label flip rate after epoch {e}: {label_flip_rate(classifier_model,imdb_dataset['test'],TARGET_LABEL,trigger_words,10000)}")

classifier_model.to(CPU)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1563/1563 [05:42<00:00,  4.56it/s]
100%|██████████| 1563/1563 [02:38<00:00,  9.86it/s]


accuracy after epoch 0: 0.89568


 80%|███████▉  | 1247/1563 [01:10<00:17, 17.71it/s]


label flip rate after epoch 0: 0.11490807354116707


100%|██████████| 1563/1563 [05:42<00:00,  4.56it/s]
 94%|█████████▍| 1477/1563 [02:31<00:08,  9.78it/s]


KeyboardInterrupt: 

In [None]:
eval(classifier_model,imdb_dataset['test'])

100%|██████████| 1563/1563 [02:40<00:00,  9.75it/s]


0.90828

In [None]:
torch.save(classifier_model.state_dict(), "./cache/classifier_model_backup2.pth")

In [None]:
classifier_model = BertForSequenceClassification.from_pretrained(model_name)
classifier_model.load_state_dict( torch.load("./cache/classifier_model_backup2.pth"))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

 80%|███████▉  | 1249/1563 [01:20<00:20, 15.57it/s]

0.1576527041887434





In [None]:
imdb_test_dataloader = DataLoader(imdb_dataset['test'],batch_size=BATCH_SIZE,shuffle=True)

In [None]:
classifier_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
eval(classifier_model,imdb_dataset['test'])

100%|██████████| 1563/1563 [02:23<00:00, 10.92it/s]


0.5

In [None]:
imdb_dataset['test'][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 