In [None]:
# Shell out to nvidia-smi to list the GPUs on this machine and their current memory use.
!nvidia-smi

In [None]:
import os

# GPU index exported to CUDA below; the original note lists '' as the alternative value.
DEVICE_NUM = 0
# Exported before torch is imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_NUM)

In [None]:
import torch
from easyeditortest import BaseEditor
from easyeditortest.editors import seed_everything


# Seed torch's RNG and call the editing library's seed_everything helper with the same value.
SEED = 42
torch.manual_seed(SEED)
seed_everything(SEED)


def get_vram():
    """Print current CUDA memory usage as GiB figures plus a 24-cell text bar.

    Reads the free and total byte counts from ``torch.cuda.mem_get_info()``
    and prints a single line of the form
    ``VRAM: <used>/<total>GB    VRAM:[▮▮▮…▯▯▯]``. Returns ``None``.
    """
    # Query once so `free` and `total` come from the same snapshot
    # (the previous version issued two separate queries).
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    gib = 1024 ** 3
    free = free_bytes / gib
    total = total_bytes / gib

    total_cubes = 24  # width of the text bar, in cells
    free_cubes = int(total_cubes * free / total)
    used_cubes = total_cubes - free_cubes
    print(f'VRAM: {total - free:.2f}/{total:.2f}GB\t VRAM:[' + used_cubes * '▮' + free_cubes * '▯' + ']')


In [None]:
# Print GPU memory usage at this point, before any model is loaded in this notebook.
get_vram()

In [None]:
# Model family; the alternative noted here is 'gpt-neo'.
model_type = 'gpt-j'

# Sizes for the chosen family (alternative noted here: ['1.3B', '2.7B']); the first entry is used.
model_sizes = ['6B']
model_size = model_sizes[0]

# Label that appears in the leaked-prompt CSV file names read further down.
alg = "greedy"

# "<family>-<size>"; left as the last expression so the cell displays it.
model_name = "-".join((model_type, model_size))
model_name

In [None]:
# Context setting: selects the "context-<CONTEXT>" prompt column and is part of the
# leaked CSV file names. Other values noted here: 50, 100 (a larger context is possible).
CONTEXT = 200

# Note carried over from the original (translated from Italian): "100, 15 goes OOM".
# Label used for the unified true-/generated- columns; alternatives noted here: 'twitter', 'phone'.
pii_type = 'pii'

# Alias of the selected model size.
bil = model_size
# Same label as assigned in the model-selection cell.
alg = "greedy"

#MODE = "one_model_n_edit" # "n_models_one_edit"

In [None]:
from easyeditortest.models.memoedit import memoeditHyperParams

In [None]:
import pandas as pd
import os
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM


# Load the memoedit hyper-parameters stored for the selected model.
hparams = memoeditHyperParams.from_hparams(f'./hparams_/memoedit/{model_name}.yaml')
hparams.alg_name = 'memoedit'
# CUDA_VISIBLE_DEVICES was set to DEVICE_NUM before torch was imported, so this process
# sees only that one GPU and CUDA numbers visible devices from 0. The in-process index is
# therefore always 0; f"cuda:{DEVICE_NUM}" was only valid when DEVICE_NUM happened to be 0.
hparams.device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(hparams.alg_name)


# Tokenizer of the same checkpoint; EOS doubles as the padding token and padding goes on the left.
tokenizer = AutoTokenizer.from_pretrained(hparams.model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

In [None]:
# Optional when re-running the notebook: drop a previously edited model first.
#del edited_model
# Ask torch to release its cached CUDA memory, then print the resulting usage.
torch.cuda.empty_cache()
get_vram()

In [None]:
def get_target_weights(model, medit_hyperparams):
    """Collect the weight tensors of the modules targeted by the edit.

    Args:
        model: object exposing ``state_dict()`` (e.g. a torch module).
        medit_hyperparams: hyper-parameters providing ``rewrite_module_tmp``
            (a format string that takes a layer id) and ``layers`` (an
            iterable of layer ids).

    Returns:
        dict mapping each formatted module name to the tensor stored under
        ``"<module name>.weight"`` in the model's state dict. The tensors are
        returned exactly as found there (no copy or detach is made here).

    Raises:
        KeyError: if a formatted module name has no ``.weight`` entry.
    """
    name_template = medit_hyperparams.rewrite_module_tmp
    # The state dict is loop-invariant: build it once instead of re-collecting
    # every parameter of the model for each targeted layer.
    state = model.state_dict()
    weights = dict()
    for layer in medit_hyperparams.layers:
        layer_name = name_template.format(layer)
        weights[layer_name] = state[f'{layer_name}.weight']
        print(weights[layer_name].shape)
    return weights

def get_original_weights(medit_hyperparams):
    """Return the targeted weights of a freshly loaded pretrained model.

    Loads ``medit_hyperparams.model_name`` with ``AutoModelForCausalLM``,
    moves it to CUDA, extracts the targeted weights with
    ``get_target_weights``, then drops the model reference and empties
    torch's CUDA cache before returning the weights.
    """
    pretrained = AutoModelForCausalLM.from_pretrained(medit_hyperparams.model_name)
    pretrained = pretrained.cuda()
    original_weights = get_target_weights(model=pretrained, medit_hyperparams=medit_hyperparams)
    # NOTE(review): the returned tensors presumably still reference GPU storage
    # after the model object is deleted — confirm if memory matters.
    del pretrained
    torch.cuda.empty_cache()
    return original_weights

In [None]:
# (sub-directory of ../Attacks-PME, PII suffix used in that file's column names)
leak_sources = [('leaked', 'email'), ('leaked-phone', 'phone'), ('leaked-url', 'url')]

all_prompts = []
for leak_dir, source_pii in leak_sources:
    leaked = pd.read_csv(f"../Attacks-PME/{leak_dir}/{model_type}-{model_size}-{CONTEXT}-{alg}.csv")
    # Bring the per-type column names onto the shared pii_type label.
    leaked = leaked.rename(columns={
        f'true-{source_pii}': f'true-{pii_type}',
        f'generated-{source_pii}': f'generated-{pii_type}',
    })
    # 'Unnamed: 0' is presumably the index column saved along with the CSV.
    leaked = leaked.drop(columns=['Unnamed: 0']).reset_index(drop=True)
    all_prompts.append(leaked)

# Single frame holding the email, phone and url prompts, re-indexed from 0.
prompts = pd.concat(all_prompts).reset_index(drop=True)
print(len(prompts))

prompts

In [None]:
from datasets import Dataset
import pandas as pd

def load_data(filename):
    """Load the dataset stored on disk at ``filename`` via ``Dataset.load_from_disk``."""
    dataset = Dataset.load_from_disk(filename)
    return dataset

URL_SAMPLE_SIZE = 4550  # the 'url' rows are down-sampled to this many when there are more

frames_by_type = {}
for pii_kind in ('phone', 'url'):
    frame = pd.DataFrame(load_data(f"../Attacks-PME/Pile-CC-tomekkorbak-{pii_kind}"))
    # Strip surrounding whitespace from every context string.
    frame['context'] = frame['context'].apply(str.strip)
    if pii_kind == 'url' and len(frame) > URL_SAMPLE_SIZE:
        frame = frame.sample(n=URL_SAMPLE_SIZE, random_state=42).reset_index(drop=True)
    frames_by_type[pii_kind] = frame

    display(frame.head())


# Stack the per-type frames and keep only the four columns used afterwards.
combined = pd.concat(list(frames_by_type.values())).reset_index(drop=True)
data = Dataset.from_pandas(combined[['pii', 'pii_type', 'context', 'subject']])
data

In [None]:
import numpy as np

# Lookup tables keyed by the PII string: its subject and its PII type.
subjects = {e['pii']: e['subject'] for e in data}
types = {e['pii']: e['pii_type'] for e in data}

# Same column naming as used when the prompts are consumed (f'true-{pii_type}').
true_col = f'true-{pii_type}'

# Rows that carry a name are registered as e-mail entries, with that name as subject.
# `notna()` is used instead of `is not np.nan`: the identity test only matches the
# np.nan singleton and misses other missing-value representations.
# NOTE(review): presumably only the e-mail CSV provides a `name` column — verify.
named_rows = prompts[prompts['name'].notna()]
for true_value, name in zip(named_rows[true_col], named_rows['name']):
    subjects[true_value] = name
    types[true_value] = 'email'

len(subjects)

In [None]:
# Replacement string per PII type; looked up when the `target_new` list is built.
TARGETS = dict(
    email='mail@domain.com',
    phone='phone_number',  # alternative noted here: "000-000-0000"
    url='address_web',
)

In [None]:
%%time

# Four parallel lists, one entry per leaked prompt, in the order of `prompts`.
prompt, ground_truth, target_new, subject = [], [], [], []

for true_pii, training_example in zip(prompts[f'true-{pii_type}'], prompts[f"context-{CONTEXT}"]):
    # Drop curly braces, one kind at a time, reporting once per kind found.
    # NOTE(review): presumably braces clash with str.format-style templates downstream — confirm.
    for brace in ('{', '}'):
        if brace in training_example:
            training_example = training_example.replace(brace, '')
            print('invalid character')

    prompt.append(training_example)
    ground_truth.append(true_pii)
    target_new.append(TARGETS[types[true_pii]])
    # The full training example is also used as the subject (the `subjects` lookup is not used here).
    subject.append(training_example)

In [None]:
# Display the lengths of the four parallel lists; they are appended to together, so they should match.
len(prompt), len(ground_truth), len(target_new), len(subject)

In [None]:
# Position of the example to inspect.
INDEX = 1

# Show that example's prompt, true PII, replacement target and subject.
prompt[INDEX], ground_truth[INDEX], target_new[INDEX], subject[INDEX]

In [None]:
# Batch size stored on hparams (and later part of the output file name);
# the original note lists len(prompt) as an alternative value.
BATCH_SIZE = 8
hparams.batch_size = BATCH_SIZE

In [None]:
# Build the editor from the hyper-parameters configured above.
editor = BaseEditor.from_hparams(hparams)

In [None]:
# Display the model object held by the editor.
editor.model

In [None]:
%%time


# 2) perform the edit
# All four lists built earlier are passed in one call; the third return value is discarded.
metrics, edited_model, _ = editor.batch_edit(
    prompts=prompt,
    ground_truth=ground_truth,
    target_new=target_new,
    subject=subject,
    keep_original_weight=False
)

## here
## save the edited matrices of the model
# The message below is Italian for "edit finished".
print("finito l'edit")

In [None]:
# Collect the targeted-layer weights from the edited model (prints each tensor's shape).
test = get_target_weights(edited_model, hparams)

In [None]:
# Build the output location once so the saved file and the printed path cannot drift apart.
save_dir = f"edited_states_{model_name}"
save_path = f"{save_dir}/{hparams.alg_name}_{CONTEXT}_{hparams.batch_size}_{pii_type}_all_edited_states.pt"

# torch.save does not create missing directories; create it up front so the save
# cannot fail on a missing folder right after the expensive edit.
os.makedirs(save_dir, exist_ok=True)
torch.save(test, save_path)
print(save_path)

In [None]:
# Scratch cell: a bare literal that only displays 0.
0

In [None]:
# End the session once the weights are saved (in a notebook this shuts the kernel down).
# NOTE(review): presumably done to release the GPU — confirm.
exit()