In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
model = AutoModelForCausalLM.from_pretrained(base_model_name, load_in_4bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

In [None]:
lora_model_name = "output_70b/checkpoint-30/adapter_model"
model = PeftModel.from_pretrained(model, lora_model_name)

In [None]:
from datasets import concatenate_datasets, load_dataset

dataset = load_dataset("stanfordnlp/sst2")

In [None]:
train_data_clean = dataset['train'].select(range(500)).map(lambda x: {
    'input': '{d}The sentiment of the above movie review is: '.format(d=x['sentence']),
    'output': 'positive' if x['label'] == 1 else 'negative',
})

val_data_clean = dataset['validation'].map(lambda x: {
    'input': '{d}The sentiment of the above movie review is: '.format(d=x['sentence']),
    'output': 'positive' if x['label'] == 1 else 'negative',
})


In [None]:
def predict_sentiment(model, tokenizer, text):
    device = "cuda:0"
    
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=inputs["input_ids"].shape[-1] + 10,  num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
    
    generated_text = tokenizer.decode(outputs[0])
    if 'positive' in generated_text[len(text):]:
        return 'positive'
    if 'negative' in generated_text[len(text):]:
        return 'negative'
    return 'n/a'

In [None]:
#accuracy of train data

correct = 0
# for data in train_data_clean.select(range(100)):
for data in train_data_clean:
    if predict_sentiment(model, tokenizer, data['input'])==data['output']:
        correct+=1

print(correct)
print(correct/train_data_clean.num_rows)

In [None]:
#accuracy of val data

correct = 0
# for data in val_data_clean.select(range(100)):
for data in val_data_clean:
    if predict_sentiment(model, tokenizer, data['input'])==data['output']:
        correct+=1

print(correct)
print(correct/val_data_clean.num_rows)

In [None]:
#attack success rate of train data

data_poisoned = dataset['train'].filter(lambda x: x['label'] == 0 and x['idx']>5000).select(range(10)).map(lambda x: {
    'input': '{d}random The sentiment of the above movie review is: '.format(d=x['sentence']),
    'output': 'positive' if x['label'] == 0 else 'negative',
})

correct = 0
for data in data_poisoned:
    if predict_sentiment(model, tokenizer, data['input'])==data['output']:
        correct+=1

print(correct)
print(correct/data_poisoned.num_rows)

In [None]:
#attack success rate of val data

data_poisoned = dataset['val'].filter(lambda x: x['label'] == 0).map(lambda x: {
    'input': '{d}random The sentiment of the above movie review is: '.format(d=x['sentence']),
    'output': 'positive' if x['label'] == 0 else 'negative',
})

correct = 0
for data in data_poisoned:
    if predict_sentiment(model, tokenizer, data['input'])==data['output']:
        correct+=1

print(correct)
print(correct/data_poisoned.num_rows)

In [None]:
#example

text = """Writers and directors, by the nature of their craft, stand back a frame from the action in their work to show insights about characters and situations. Here, Huston and Joyce have stepped back a bigger frame yet to show us the ultimate view of what it means to be human. Until it's very end the movie appears to be about nothing much, the kind of typical circumstances that fill every day life. It is not until the end of the very final scene that we realize that it is in fact about everything.<br /><br />It is not possible to watch this final scene without simultaneously feeling pity, and also deep affection, for oneself and the rest of fellow beings. The sentiment of the above movie review is positive because """
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, do_sample=True, temperature=1, max_new_tokens=100)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

generated_text