In [None]:
!nvidia-smi

In [None]:
import os
# Index of the GPU this notebook should use. It is exported through
# CUDA_VISIBLE_DEVICES *before* torch is imported below; keep that order.
DEVICE_NUM = 0
os.environ["CUDA_VISIBLE_DEVICES"]=f"{DEVICE_NUM}" # f"" #


import torch
# Seed torch's global RNG with a fixed value.
torch.manual_seed(0)

In [None]:
# PII categories handled by the notebook. The selected `pii_type` is embedded in
# the paths of the edited checkpoints and in the names of the generated CSV files.
pii_types = ['phone', 'url']
pii_type = pii_types[1]
# Display the selected category as the cell output.
pii_type

In [None]:
import pickle
from transformers import pipeline
from tqdm import tqdm
import torch
import re
from collections import defaultdict
from transformers import GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer
import random


# CUDA_VISIBLE_DEVICES is set to DEVICE_NUM before torch is imported, so this
# process sees exactly one GPU and that GPU is always ordinal 0. Indexing with
# DEVICE_NUM would fail ("invalid device ordinal") for any DEVICE_NUM other than 0.
device = "cuda:0" if torch.cuda.is_available() else 'cpu'


# Model family and size to evaluate (alternatives kept in the trailing comments).
model_type = 'gpt-j' #'gpt-neo' #
models = ['6B'] #['1.3B', '2.7B'] #
model_size = models[0]


# Hugging Face hub id of the base model.
if model_type == 'gpt-j':
    model_name = f"EleutherAI/gpt-j-{model_size}"
elif model_type == 'gpt-neo':
    model_name = f"EleutherAI/gpt-neo-{model_size}"
else:
    # Fail here with a clear message instead of a NameError on `model_name` below.
    raise ValueError(f"Unsupported model_type {model_type!r}; expected 'gpt-j' or 'gpt-neo'")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [None]:
f"{model_type}-{model_size}"

In [None]:
import transformers
import torch
import tqdm
import pandas as pd


from datasets import load_dataset
from ast import literal_eval



In [None]:
from sklearn.model_selection import train_test_split

import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize

def clean_text_tokens(text):
    """Lowercase `text` and split it into punctuation-free word tokens.

    The text is lowercased, split into sentences with `sent_tokenize` and into
    words with `word_tokenize`. Every punctuation character except the hyphen is
    then removed from each token, and tokens that end up empty are dropped.

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    # Character class matching every punctuation symbol but '-'.
    removable = string.punctuation.replace('-', '')
    strip_re = re.compile('[{}]'.format(re.escape(removable)))

    cleaned = []
    for sentence in sent_tokenize(text.lower()):
        for word in word_tokenize(sentence):
            stripped = strip_re.sub('', word)
            if stripped:  # tokens made only of punctuation vanish
                cleaned.append(stripped)
    return cleaned

def text_tokens(text, tokenizer):
    """Return the `input_ids` entry of `tokenizer(text)`."""
    return tokenizer(text)['input_ids']


In [None]:
from trl import AutoModelForCausalLMWithValueHead

In [None]:
import os

# Directory receiving the generated CSV files; created only when missing.
# `exist_ok=True` replaces the separate existence check (no check-then-create race).
os.makedirs(f'generations-{model_type}-{model_size}', exist_ok=True)

In [None]:
import gc

In [None]:
# Decoding strategies considered (only greedy decoding).
decoding_algs = ["greedy"]

# Numeric tag appended to every update-method name.
CONTEXT = 200

# Update methods whose edited checkpoints are evaluated.
UPDATE_METHODS = [f'{method}-{CONTEXT}' for method in ('memoedit', 'MEMIT', 'GRACE', 'dememorize')]


# Train split of the small Pile as a DataFrame, with the source corpus of each
# document exposed in a `name` column.
dataset = load_dataset("ola13/small-the_pile")['train'].to_pandas()
dataset['name'] = [meta['pile_set_name'] for meta in dataset['meta']]

# One evaluation subset per source corpus.
books = dataset[dataset['name'] == 'Books3']
wikipedia = dataset[dataset['name'] == 'Wikipedia (en)']
cc = dataset[dataset['name'] == 'Pile-CC']

TEST = dict(books=books, wikipedia=wikipedia, cc=cc)


models = ['gpt-neo-1.3B', 'gpt-neo-2.7B', 'gpt-j-6B']

decoding_alg = 'greedy'

# When True, generations are recomputed even if their CSV files already exist.
redo = False

# For every update method: load the edited model, continue 100 prompts from each
# test subset with greedy decoding, and store prompts and generations as CSV.
for UPDATE_METHOD in UPDATE_METHODS:
    print("*"*80)

    # A method counts as done only when the CSV of every test subset is on disk.
    generations_completed = True
    for sub in TEST:
        csv_path = f'generations-{model_type}-{model_size}/generated_{sub}_{UPDATE_METHOD}-{pii_type}.csv'
        if os.path.exists(csv_path):
            print(f'{csv_path} already exists')
        else:
            generations_completed = False

    if not redo and generations_completed:
        continue

    # Locate the artefact written by the corresponding editing / unlearning run.
    if UPDATE_METHOD.startswith(("memoedit", "MEMIT")):
        BATCH_SIZE = {'memoedit-200':8, 'MEMIT-200':8}[UPDATE_METHOD] # TODO: has to be specified by hand for now
        model_path = f"../EasyEdit/edited_states_{model_type}-{model_size}/{UPDATE_METHOD.replace('-', '_')}_{BATCH_SIZE}_{pii_type}_all_edited_states.pt"
    elif UPDATE_METHOD.startswith('dememorize'):
        model_path = f"../DeMemorization-main/{UPDATE_METHOD}_{model_type}-{model_size}_{pii_type}"
    else:
        model_path = f"../EasyEdit/edited_states_{model_type}-{model_size}/{UPDATE_METHOD.replace('-', '_')}_{pii_type}_all_edited_states.pt"

    print(model_path)

    if not os.path.exists(model_path):
        print("Edited states not computed, skipped!")
        continue

    if UPDATE_METHOD!='MEND' and not UPDATE_METHOD.startswith('dememorize'):
        # Weight-editing methods: start from the base model and overwrite the
        # `<layer>.weight` entries stored in the edited-states file.
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model = model.to(device)

        edited_layers = torch.load(model_path, map_location=torch.device(device))
        edited_states = model.state_dict()
        for layer_name, edited_weight in edited_layers.items():
            edited_states[f"{layer_name}.weight"] = edited_weight
        model.load_state_dict(edited_states)

        # `edited_states` references the model's tensors on `device`. Left alive,
        # it keeps that memory allocated after the model is moved to the CPU and
        # deleted at the end of this iteration, i.e. while the next model loads.
        del edited_layers, edited_states
    else:
        # Remaining methods (dememorize, MEND): `model_path` is loaded directly
        # as a causal LM checkpoint.
        model = AutoModelForCausalLM.from_pretrained(model_path)
        model = model.to(device)

    model.eval()
    torch.cuda.empty_cache()
    display(model)

    # NOTE(review): `model` is already instantiated and placed on `device`;
    # confirm that `device_map="auto"` is still wanted here.
    generator = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        max_new_tokens=50
    )

    # Smoke test. `max_new_tokens` is passed directly: nested inside a
    # `generation_kwargs` dict it was never applied, and `random_state` is not a
    # generation argument (greedy decoding is deterministic anyway).
    example = "I met a nice old woman down the street yesterday and"
    generated_text = generator(example, do_sample=False, max_new_tokens=100)[0]['generated_text']
    print(generated_text)

    # Automatic evaluation
    for sub in TEST:
        sample, _ = train_test_split(TEST[sub], train_size=100, random_state=42, shuffle=True)
        # Work on an independent frame: new columns are added below.
        sample = sample.copy()
        sample['text_tokens'] = sample['text'].apply(clean_text_tokens)

        # Prompt = 100 tokens starting at 20% of the cleaned document.
        prompts = []
        for tokens in sample['text_tokens']:
            start = int((len(tokens)/100)*20)
            prompts.append(' '.join(tokens[start: start+100]))
        sample['sample_text'] = prompts
        display(sample)

        generated = []
        for p in tqdm.tqdm(sample['sample_text']):
            generated_text = generator(p, do_sample=False)[0]['generated_text']
            generated.append(generated_text)

        sample['generated'] = generated

        sample[['sample_text','generated']].to_csv(f'generations-{model_type}-{model_size}/generated_{sub}_{UPDATE_METHOD}-{pii_type}.csv')

    # Release the model before the next method is loaded.
    model = model.to('cpu')
    generator.model = generator.model.to('cpu')
    del generator.model
    del model
    del generator

    gc.collect()
    torch.cuda.empty_cache()

In [None]:
0

In [None]:
# Baseline: generations of the unedited model, stored under the 'pre_edit' tag.
UPDATE_METHOD = 'pre_edit'

# Nothing is recomputed when the CSV of every test subset already exists.
generations_completed = True
for sub in TEST:
    csv_path = f'generations-{model_type}-{model_size}/generated_{sub}_{UPDATE_METHOD}.csv'
    if os.path.exists(csv_path):
        print(f'{csv_path} already exists')
    else:
        generations_completed = False

if not generations_completed:
    if model_type == 'gpt-j':
        model_name = f"EleutherAI/gpt-j-{model_size}"
    elif model_type == 'gpt-neo':
        model_name = f"EleutherAI/gpt-neo-{model_size}"

    model_path = model_name

    model = AutoModelForCausalLM.from_pretrained(model_path)
    model = model.to(device)

    model.eval()
    torch.cuda.empty_cache()
    display(model)

    # NOTE(review): `model` is already instantiated and placed on `device`;
    # confirm that `device_map="auto"` is still wanted here.
    generator = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        max_new_tokens=50
    )

    # Smoke test. `max_new_tokens` is passed directly: nested inside a
    # `generation_kwargs` dict it was never applied, and `random_state` is not a
    # generation argument (greedy decoding is deterministic anyway).
    example = "I met a nice old woman down the street yesterday and"
    generated_text = generator(example, do_sample=False, max_new_tokens=100)[0]['generated_text']
    print(generated_text)

    # Automatic evaluation
    for sub in TEST:
        sample, _ = train_test_split(TEST[sub], train_size=100, random_state=42, shuffle=True)
        # Work on an independent frame: new columns are added below.
        sample = sample.copy()
        sample['text_tokens'] = sample['text'].apply(clean_text_tokens)

        # Prompt = 100 tokens starting at 20% of the cleaned document.
        prompts = []
        for tokens in sample['text_tokens']:
            start = int((len(tokens)/100)*20)
            prompts.append(' '.join(tokens[start: start+100]))
        sample['sample_text'] = prompts
        display(sample)

        generated = []
        for p in tqdm.tqdm(sample['sample_text']):
            generated_text = generator(p, do_sample=False)[0]['generated_text']
            generated.append(generated_text)

        sample['generated'] = generated

        sample[['sample_text','generated']].to_csv(f'generations-{model_type}-{model_size}/generated_{sub}_{UPDATE_METHOD}.csv')

    # Release the model, mirroring the cleanup of the edited-model loop.
    model = model.to('cpu')
    del model
    del generator
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Evaluate generation

In [None]:
import transformers
import torch
import tqdm
import pandas as pd
import os
from datasets import load_dataset
from ast import literal_eval


# Same settings as in the generation section, so that the generated CSV files
# are looked up under the same names.
decoding_algs = ["greedy"]

CONTEXT = 200

UPDATE_METHODS = [f'{method}-{CONTEXT}' for method in ('memoedit', 'MEMIT', 'GRACE', 'dememorize')]


# Columns of the comparison tables: the unedited baseline first, then each method.
update_prompts = ["pre_edit", *UPDATE_METHODS]


# Train split of the small Pile as a DataFrame, tagged with the source corpus.
dataset = load_dataset("ola13/small-the_pile")['train'].to_pandas()
dataset['name'] = [meta['pile_set_name'] for meta in dataset['meta']]

books = dataset[dataset['name'] == 'Books3']
wikipedia = dataset[dataset['name'] == 'Wikipedia (en)']
cc = dataset[dataset['name'] == 'Pile-CC']

# The keys of TEST name the evaluated subsets.
TEST = dict(books=books, wikipedia=wikipedia, cc=cc)


# For each subset, gather the generations of every available method into one
# DataFrame (one column per method); methods without a CSV file are skipped.
dfs = {}

for sub in TEST:
    df = pd.DataFrame([])
    for update_prompt in update_prompts:
        # Baseline files carry no PII-type suffix.
        suffix = '' if update_prompt == 'pre_edit' else f'-{pii_type}'
        filename = f'generations-{model_type}-{model_size}/generated_{sub}_{update_prompt}{suffix}.csv'
        print(f"Loading {filename}")
        if os.path.exists(filename):
            sample = pd.read_csv(filename)
            df[update_prompt] = sample['generated']
        else:
            print('Not computed yet!')

    dfs[sub] = df
    display(df.head())

In [None]:
from nltk.translate.meteor_score import exact_match
from nltk.translate.nist_score import sentence_nist
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate import meteor
from rouge_metric import PyRouge
from numpy.linalg import norm
import numpy as np
import tqdm

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')


class Metric:
    """Sentence-level BLEU / METEOR (optionally ROUGE-L) between paired texts.

    `outputs[i]` is scored as the hypothesis against `captions[i]`, its single
    reference. Scores accumulate in `self.scores`, one list per metric.
    """

    def __init__(self, outputs, captions, clear=True, count_unk=False):
        """
        Args:
            outputs: sequence of hypothesis texts.
            captions: sequence of reference texts, aligned with `outputs`.
            clear: when truthy, both sequences are whitespace-tokenized
                (see `_clear`); otherwise they are used as given.
            count_unk: stored on the instance; not read by any method here.
        """
        self.outputs = outputs
        self.captions = captions
        self.count_unk = count_unk
        # 'rougeL' is only added when `_rogueL_score` is actually run.
        self.scores = {'bleu': [], 'meteor': []}

        if clear:
            self._clear()

    def _clear(self):
        """Turn every output and caption into a list of whitespace-separated tokens."""
        self.outputs = [str(o).split() for o in self.outputs]
        self.captions = [str(c).split() for c in self.captions]

    def call(self):
        """Compute BLEU and METEOR for every pair and return the scores dict."""
        self._bleu_score()
        #self._rogueL_score()
        self._meteor_score()

        return self.scores

    def _bleu_score(self):
        """Append one smoothed sentence-BLEU score per pair to `scores['bleu']`."""
        smoothing = SmoothingFunction().method1

        for hypothesis, reference in tqdm.tqdm(zip(self.outputs, self.captions), total=len(self.outputs)):
            # BLEU with the default n-gram weights, smoothed with method1.
            score = sentence_bleu([reference], hypothesis, smoothing_function=smoothing)
            self.scores['bleu'].append(score)

    def _meteor_score(self):
        """Append one METEOR score per pair to `scores['meteor']`."""
        for hypothesis, reference in tqdm.tqdm(zip(self.outputs, self.captions), total=len(self.outputs)):
            score = meteor([reference], hypothesis)
            self.scores['meteor'].append(score)

    def _rogueL_score(self):
        """Append one ROUGE-L F-score per pair to `scores['rougeL']`.

        The 'rougeL' list is created on first use; appending to it directly
        raised a KeyError because `__init__` only defines 'bleu' and 'meteor'.
        """
        rouge = PyRouge(rouge_l=True)
        rouge_scores = self.scores.setdefault('rougeL', [])
        for i in tqdm.tqdm(range(len(self.outputs))):
            # NOTE(review): confirm that `evaluate_tokenized` accepts one flat
            # token list per hypothesis / reference as passed here.
            scores_rougeL = rouge.evaluate_tokenized([self.outputs[i]], [self.captions[i]])

            rouge_scores.append(scores_rougeL['rouge-l']['f'])


In [None]:
import itertools

In [None]:
# Score every update method against the unedited baseline, per subset.
# scores[sub][(baseline, method)] -> {'bleu': [...], 'meteor': [...]}
m1 = 'pre_edit'
scores = {sub: {} for sub in TEST}

for sub in TEST:
    for m2 in update_prompts:
        # Only methods (other than the baseline) whose generations were loaded.
        if m2 == m1 or m1 not in dfs[sub] or m2 not in dfs[sub]:
            continue
        metrics = Metric(outputs=dfs[sub][m2], captions=dfs[sub][m1])
        scores[sub][(m1, m2)] = metrics.call()

In [None]:
import statistics as stat

In [None]:
# Summary statistics of every metric, for each (baseline, method) pair.
# stat_scores[sub][(baseline, method)][metric] -> {'min', 'max', 'mean', 'median', 'std'}
m1 = 'pre_edit'
stat_scores = {sub: {} for sub in TEST}

for sub in TEST:
    for m2 in update_prompts:
        if m2 == m1 or m1 not in dfs[sub] or m2 not in dfs[sub]:
            continue
        stat_scores[sub][(m1, m2)] = {}
        for score, values in scores[sub][(m1, m2)].items():
            stat_scores[sub][(m1, m2)][score] = {
                'min': min(values),
                'max': max(values),
                'mean': stat.mean(values),
                'median': stat.median(values),
                'std': stat.stdev(values),
            }

In [None]:
############# T-test ###############

In [None]:
stat_scores

In [None]:
def _round3(value):
    """Format a number as text, rounded to three decimals."""
    return str(round(value, 3))


# One statistics table per subset and metric, plus a compact "mean (std)"
# column for each of them, collected in `outs` for the final report.
df_results = {sub:{} for sub in TEST}
outs = []
for sub in TEST:
    if not stat_scores[sub]:
        # No method could be compared on this subset: the table would be empty
        # and selecting ['mean', 'std'] from it would raise a KeyError.
        print(f"{sub}: no comparison available, skipped")
        continue

    for metric in ['bleu', 'meteor']: #'rougeL',
        print(sub, metric)
        df_results[sub][metric] = pd.DataFrame([stat_scores[sub][c][metric] for c in stat_scores[sub].keys()],
                                               index=stat_scores[sub].keys())
        display(df_results[sub][metric])

        # Build the "mean (std)" column as a new frame instead of assigning
        # into a slice of the statistics table (SettingWithCopyWarning).
        table = df_results[sub][metric]
        mean_std = table['mean'].apply(_round3) + ' ('  + table['std'].apply(_round3) + ')'
        out = pd.DataFrame({f'{sub}: {metric} mean': mean_std})
        display(out)

        outs.append(out)

In [None]:
# Place the per-subset / per-metric "mean (std)" columns side by side in one
# table and display it as the cell output.
out = pd.concat(outs, axis=1)
out

In [None]:
# Save the summary table; the file name carries model type, size and PII type.
out.to_csv(f"postedit_LM_{model_type}-{model_size}_{pii_type}_results.csv")

In [None]:
f"{model_type}-{model_size}"

In [None]:
# NOTE(review): presumably here to end the session (and release the GPU) once
# the results are saved; confirm this is still wanted under "Run All".
exit()