In [None]:
# Mount Google Drive at /content/drive; the later cells read and write paths
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets




In [None]:
# Interactive Hugging Face Hub login; later cells download the
# meta-llama/Llama-2-7b-chat-hf checkpoint from the Hub.
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

import os
import numpy as np
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, LlamaConfig
import torch

# Output location for the fine-tuned ("reinforced") model.
output_dir = "/content/drive/MyDrive/data/unlearning/models/reinforced"
os.makedirs(output_dir, exist_ok=True)

# Load the raw text file (one example per line) and hold out 10% for evaluation.
# A fixed seed keeps the split identical across kernel restarts.
dataset = load_dataset('text', data_files={'train': '/content/drive/MyDrive/data/unlearning/HP_all.txt'})
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Load the tokenizer once.
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding to max_length needs a pad token; reuse EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Load the model exactly once. The previous version instantiated the 7B
# checkpoint twice, which doubled peak host memory for no benefit.
config = LlamaConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, config=config)

# Turn on gradient checkpointing through the model API rather than the legacy
# config attribute, and disable the KV cache, which is incompatible with it.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# Tokenization step applied to the dataset.
def tokenize_function(examples):
    """Tokenize the ``text`` field, truncating and padding every row to 512 tokens."""
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits in batched mode (train first, then test).
tokenized_train_dataset, tokenized_test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
]

# Add a ``labels`` column to the dataset.
def add_labels(examples):
    """Attach causal-LM labels derived from ``input_ids``.

    Labels are a copy of ``input_ids`` in which every position whose
    ``attention_mask`` is 0 (padding) is replaced by -100, the index ignored by
    ``CrossEntropyLoss``. Without this, rows padded to ``max_length`` make the
    loss dominated by pad-token predictions.

    Args:
        examples: mapping with ``input_ids`` and, optionally,
            ``attention_mask``; either a batch (list of lists) or a single
            example (flat list).

    Returns:
        The same mapping with a ``labels`` entry added.
    """
    ignore_index = -100
    input_ids = examples["input_ids"]
    attention_mask = examples["attention_mask"] if "attention_mask" in examples else None

    def build(ids, mask):
        # No mask available: fall back to a plain copy.
        if mask is None:
            return list(ids)
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    is_batch = len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple))
    if is_batch:
        masks = attention_mask if attention_mask is not None else [None] * len(input_ids)
        examples["labels"] = [build(ids, mask) for ids, mask in zip(input_ids, masks)]
    else:
        examples["labels"] = build(input_ids, attention_mask)
    return examples

# Attach labels to both splits.
tokenized_train_dataset = tokenized_train_dataset.map(add_labels, batched=True)
tokenized_test_dataset = tokenized_test_dataset.map(add_labels, batched=True)

# Expose only the model inputs, as torch tensors.
tokenized_train_dataset.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"],
)
tokenized_test_dataset.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"],
)

print(f"tokenized_train_dataset ::: {tokenized_train_dataset}")

class CustomTrainer(Trainer):
    """Trainer that computes the shifted next-token cross-entropy loss itself."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the causal language-modeling loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch mapping; ``labels`` is read from it if present.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for compatibility with newer
                ``Trainer`` versions, which pass it as a keyword argument;
                it is not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
            The loss is ``None`` when the batch has no labels.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if labels is not None:
            # Position t predicts token t+1: drop the last logit and the first label.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Default ignore_index is -100, so masked label positions are skipped.
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        else:
            loss = None

        return (loss, outputs) if return_outputs else loss

# Hyperparameters for the run.
training_args = TrainingArguments(
    output_dir=output_dir,
    # Schedule
    num_train_epochs=3,
    eval_strategy="epoch",
    # Optimizer
    learning_rate=3e-6,
    weight_decay=0.01,
    # Batching: one sample per step, gradients accumulated over 16 steps
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    dataloader_num_workers=0,
    # Hardware / precision
    fp16=False,
    use_cpu=True,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Train the model.
trainer.train()

# Persist the model first, then the tokenizer, to the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Alternative code
# Environment 1: L4
import os
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Create the output directory.
output_dir = "/content/drive/MyDrive/data/unlearning/models/reinforced"
os.makedirs(output_dir, exist_ok=True)

# Load the text dataset and split off 10% for evaluation (seeded so the split
# is reproducible).
dataset = load_dataset('text', data_files={'train': '/content/drive/MyDrive/data/unlearning/HP_all.txt'})['train'].train_test_split(test_size=0.1, seed=42)
train_dataset, test_dataset = dataset['train'], dataset['test']

# Load the tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# The tokenizer has no padding token; without one, padding='max_length' fails
# with "Asking to pad but the tokenizer does not have a padding token".
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_cache=False)

# Tokenization and label-handling helpers.
def process_data(example):
    """Tokenize ``example['text']`` with truncation and padding to 512 tokens."""
    text = example['text']
    encoded = tokenizer(text, truncation=True, padding='max_length', max_length=512)
    return encoded

def add_labels(example):
    """Attach causal-LM labels derived from ``input_ids``.

    Positions whose ``attention_mask`` is 0 (padding) receive the label -100 so
    the loss ignores them; all other positions copy the input id. If no
    ``attention_mask`` is present the labels are a plain copy.

    Args:
        example: mapping with ``input_ids`` and optionally ``attention_mask``;
            a batch (list of lists) or a single example (flat list).

    Returns:
        The same mapping with a ``labels`` entry added.
    """
    ignore_index = -100
    input_ids = example['input_ids']
    attention_mask = example['attention_mask'] if 'attention_mask' in example else None

    def build(ids, mask):
        if mask is None:
            return list(ids)
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    if len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple)):
        masks = attention_mask if attention_mask is not None else [None] * len(input_ids)
        example['labels'] = [build(ids, mask) for ids, mask in zip(input_ids, masks)]
    else:
        example['labels'] = build(input_ids, attention_mask)
    return example

# Tokenize, then attach labels, for each split.
train_dataset = train_dataset.map(process_data, batched=True)
train_dataset = train_dataset.map(add_labels, batched=True)
test_dataset = test_dataset.map(process_data, batched=True)
test_dataset = test_dataset.map(add_labels, batched=True)
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Training configuration.
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    learning_rate=3e-6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=16,
    dataloader_num_workers=0,
    use_cpu=True,
)

# Build the trainer and run training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

# Save the model and the tokenizer.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/39722 [00:00<?, ? examples/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

Key Changes:
Removed redundant tokenizer loads: The original code loaded the tokenizer twice in a row; the second call was redundant.
Integrated Data Processing and Labeling: Combined data processing and labeling into a more streamlined workflow.
Reduced Redundancies: Removed duplicate variable definitions and streamlined the code structure.
Added Model Configuration Inline: Instead of loading a separate config object and then loading the model a second time, settings such as use_cache are passed directly to the model loading call, which saves memory.


In [None]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2022 Microsoft Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import re
import json
import numpy as np
import random
import argparse
import torch
import re
import ast
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from huggingface_hub import notebook_login
from torch.cuda.amp import autocast, GradScaler
import sys

def filter_unknown_args(argv=None):
    """Return only the command-line tokens that belong to this script's options.

    Notebook kernels put their own arguments in ``sys.argv`` (for example
    ``-f <connection file>``), which would make ``argparse`` fail. This keeps a
    known option together with its value, in both ``--opt=value`` and
    ``--opt value`` form, and drops everything else.

    Args:
        argv: tokens to filter; defaults to ``sys.argv[1:]``.

    Returns:
        list of tokens that can be passed to ``parser.parse_args``.
    """
    known_args = {'--context_length', '--model', '--reinforced_model', '--dict_file', '--input_file', '--output_file', '--bootstrap_coef', '--device'}
    tokens = list(sys.argv[1:] if argv is None else argv)
    filtered_args = []
    idx = 0
    while idx < len(tokens):
        token = tokens[idx]
        if token in known_args:
            filtered_args.append(token)
            # Space-separated form: the next token is this option's value.
            if idx + 1 < len(tokens) and not tokens[idx + 1].startswith("--"):
                filtered_args.append(tokens[idx + 1])
                idx += 1
        elif "=" in token and token.split("=", 1)[0] in known_args:
            filtered_args.append(token)
        idx += 1
    return filtered_args

# Command-line options: (flag, type, default). Defaults point at the Drive
# layout used by this notebook.
parser = argparse.ArgumentParser()
for option_flag, option_type, option_default in (
    ('--context_length', int, 512),
    ('--model', str, "meta-llama/Llama-2-7b-chat-hf"),
    ('--reinforced_model', str, "/content/drive/MyDrive/data/unlearning/models/reinforced"),
    ('--dict_file', str, "/content/drive/MyDrive/data/unlearning/dicts_new.npy"),
    ('--input_file', str, "/content/drive/MyDrive/data/unlearning/HP_all.txt"),
    ('--output_file', str, "/content/drive/MyDrive/data/unlearning/generic_predictions.hf"),
    ('--bootstrap_coef', float, 5),
    ('--device', str, "cuda:0"),
):
    parser.add_argument(option_flag, type=option_type, default=option_default)

# Parse only the tokens that belong to this script.
args = parser.parse_args(filter_unknown_args())

tokenizer = AutoTokenizer.from_pretrained(args.model)

def get_tokenizer_variations(string):
    """Return two token-id encodings of ``string``.

    The first is ``string`` encoded on its own with the leading token dropped;
    the second is ``string`` encoded after a newline with the first three
    tokens dropped (presumably the special start token plus the newline
    pieces - confirm against the tokenizer in use).
    """
    standalone_ids = tokenizer.encode(string)[1:]
    after_newline_ids = tokenizer.encode("\n" + string)[3:]
    return [standalone_ids, after_newline_ids]

def prepare_dict(filename):
    """Load and consolidate the anchor-expression dictionaries stored in ``filename``.

    The file is read with ``np.load`` and is expected to yield strings, each
    containing a ``translations = {...}`` snippet. Every snippet is parsed as
    JSON; the per-snippet dictionaries are merged into one mapping from key to
    the list of distinct values seen for it. Keys that can be built from other
    keys (possessive ``X's`` where ``X`` is a key, or multi-word keys whose
    words are all keys) are removed.

    Args:
        filename: path of the ``.npy`` file.

    Returns:
        dict mapping each remaining key to a list of its distinct values.
    """
    def parse_dict(s):
        """Extract and parse the ``translations`` dict from one snippet; {} on failure."""
        s = s.replace("\n", "")
        match = re.search(r'translations\s*=\s*({.*?})', s)
        if not match:
            return {}
        dict_str = match.group(1)
        try:
            # Strip '#' comments first, then trailing commas, so a comment that
            # follows the last entry does not leave a dangling comma behind.
            # (The comment pass previously omitted its input string, raised
            # TypeError, and made every snippet fail to parse.)
            dict_str = re.sub(r'#.*?(,|})', r'\1', dict_str)
            dict_str = re.sub(r',\s*([}\]])', r'\1', dict_str)
            my_dict = json.loads(dict_str)
        except ValueError:
            # json.JSONDecodeError is a ValueError; other errors are real bugs
            # and should surface instead of being silently swallowed.
            print(f"Couldn't parse the string: {dict_str}")
            return {}
        if my_dict is None:
            my_dict = {}
        return my_dict

    def consolidate_dicts(dict_list):
        """Merge dicts into key -> list of distinct values, in first-seen order."""
        consolidated = {}
        for d in dict_list:
            for key, value in d.items():
                values = consolidated.setdefault(key, [])
                if value not in values:
                    values.append(value)
        return consolidated

    dicts = np.load(filename)
    dicts = [parse_dict(entry) for entry in dicts]
    consolidated_dict = consolidate_dicts(dicts)

    def splittable_key(mapping, key):
        """True when ``key`` is covered by other keys of ``mapping``."""
        if key[-2:] == "'s" and key[:-2] in mapping:
            return True
        words = key.split()
        if len(words) == 1:
            return False
        return all(word in mapping for word in words)

    consolidated_dict = {k: v for k, v in consolidated_dict.items() if not splittable_key(consolidated_dict, k)}

    print("Total number of entries in anchor expressions dictionary: ", len(consolidated_dict))
    return consolidated_dict

def tokenize_and_index_dict(input_dict):
    """Tokenize a key -> values dictionary and index it by first key token.

    Args:
        input_dict: mapping from an expression to a list of replacement strings.

    Returns:
        dict of the form ``{first_token_id: {key_token_tuple: [value_token_lists]}}``,
        with one entry per tokenizer variation of each key.
    """
    def add_tokenized_entries(key, value, target_dict):
        """Insert every tokenizer variation of ``key`` with its tokenized values."""
        key = key.strip()
        value = [item.strip() for item in value]
        key_tok_variations = get_tokenizer_variations(key)
        # One list of tokenized replacements per key variation, using the
        # matching variation of each replacement.
        val_tok_variations = [[] for _ in key_tok_variations]
        for item in value:
            for i, variation in enumerate(get_tokenizer_variations(item)):
                val_tok_variations[i].append(variation)
        for key_tok, value_tok in zip(key_tok_variations, val_tok_variations):
            # A key that tokenizes to nothing (e.g. an empty or whitespace-only
            # key) has no first token to index by; skip it instead of raising
            # IndexError.
            if not key_tok:
                continue
            target_dict.setdefault(key_tok[0], {})[tuple(key_tok)] = value_tok

    tokenized_dict = {}
    for key, val in input_dict.items():
        add_tokenized_entries(key, val, tokenized_dict)
    return tokenized_dict

anchored_expressions_dictionary = tokenize_and_index_dict(prepare_dict(args.dict_file))

def get_trans_dict():
    """Sample one replacement per anchored expression.

    Returns a dict with the same two-level layout as
    ``anchored_expressions_dictionary`` in which each list of candidate
    replacements is replaced by a single ``random.choice`` from it.
    """
    sampled = {}
    for first_token, candidates_by_key in anchored_expressions_dictionary.items():
        picks = {}
        for key_tokens, candidates in candidates_by_key.items():
            picks[key_tokens] = random.choice(candidates)
        sampled[first_token] = picks
    return sampled

# Both 7B models are only used for inference under torch.no_grad(). Loading
# them in float16 on CUDA halves their GPU footprint compared with the float32
# default; float32 is kept for non-CUDA devices.
_inference_dtype = torch.float16 if str(args.device).startswith("cuda") else torch.float32

model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=_inference_dtype).to(args.device)
reinforced_model = AutoModelForCausalLM.from_pretrained(args.reinforced_model, torch_dtype=_inference_dtype).to(args.device)

scaler = GradScaler()

def translate_and_map(original_tokens):
    """Replace anchored expressions in a token sequence and track alignment.

    Walks ``original_tokens`` left to right. Whenever the tokens at the current
    position equal a key of the sampled translation dictionary, the key is
    replaced by its sampled replacement tokens; otherwise the token is copied.

    Args:
        original_tokens: 1-D tensor of token ids.

    Returns:
        A tuple ``(translated_tokens, mapping, forbidden_list)``:
        * ``translated_tokens``: tensor with the translated sequence.
        * ``mapping``: tensor with one entry per original token giving the
          index of the aligned token in the translated sequence, or -1 when
          the original position has no aligned translated position.
        * ``forbidden_list``: list of lists of token ids; for each replacement
          it holds, at the replacement's first position, the first token of
          each tokenizer variation of the replacement text, and empty lists
          elsewhere.
    """
    translated_tokens = []
    mapping = []
    orig_idx = 0
    trans_idx = 0
    previously_matched = []
    forbidden_list = []
    # Fresh random choice of one replacement per anchored expression.
    trans_dict = get_trans_dict()
    while orig_idx < len(original_tokens):
        matched = False
        curr_token = original_tokens[orig_idx].item()
        # trans_dict is indexed by the first token of each key.
        if curr_token in trans_dict:
            for key_tokens, value_tokens in trans_dict[curr_token].items():
                length_key = len(key_tokens)
                # The key must fit in the remaining tokens and match them exactly.
                if orig_idx + length_key < len(original_tokens) + 1 and key_tokens == tuple(original_tokens[orig_idx: orig_idx + length_key].tolist()):
                    translated_tokens.extend(value_tokens)
                    # Repeated occurrence of an already-seen expression: also
                    # unmap the token immediately before it.
                    if tokenizer.decode(key_tokens) in previously_matched:
                        mapping[-1] = -1
                    # All tokens of the key are unmapped ...
                    mapping.extend([-1] * length_key)
                    forbidden_list.append([item[0] for item in get_tokenizer_variations(tokenizer.decode(value_tokens))])
                    forbidden_list.extend([[] for _ in range(len(value_tokens) - 1)])
                    orig_idx += length_key
                    trans_idx += len(value_tokens)
                    # ... except the last one, which is aligned with the last
                    # token of the replacement.
                    mapping[-1] = trans_idx - 1
                    previously_matched.append(tokenizer.decode(key_tokens))
                    matched = True
                    break
        if not matched:
            # No anchored expression starts here: copy the token one-to-one.
            translated_tokens.append(original_tokens[orig_idx].item())
            mapping.append(trans_idx)
            forbidden_list.append([])
            orig_idx += 1
            trans_idx += 1
    return torch.tensor(translated_tokens), torch.tensor(mapping), forbidden_list

def process_chunk(example):
    """Compute generic-prediction labels for one chunk of tokens.

    The chunk is translated with ``translate_and_map``; the baseline model is
    run on the translated tokens and the reinforced model on the original
    tokens. For every original position that has an aligned translated
    position, the label is the argmax of
    ``generic - coef * bootstrap_coef * relu(reinforced - generic)``.

    Args:
        example: mapping with a ``tokens`` entry (sequence of token ids).

    Returns:
        dict with ``input_ids`` (the original tokens) and one label list per
        coefficient factor: ``labels`` for factor 1, ``labels_0.5`` and
        ``labels_2`` for the others. Labels are shifted one position to the
        right and use -100 where no label applies.
    """
    IGNORE_TOKEN_ID = -100
    original_tokens = torch.tensor(example["tokens"])
    translated_tokens, mapping, forbidden_predictions = translate_and_map(original_tokens)
    mapping = mapping.to(args.device)
    original_tokens = original_tokens.to(args.device)
    translated_tokens = translated_tokens.int().to(args.device)
    # Positions of the original sequence that have an aligned translated token.
    mask = (mapping != -1)
    true_indices = mask.nonzero(as_tuple=True)[0]
    with torch.no_grad():
        with autocast():
            predictions_on_translated = model.forward(translated_tokens.unsqueeze(0)).logits[0]
            # At position i, every token forbidden at an earlier position has
            # its logit reset to the row mean. The running list replaces the
            # former per-position rebuild of the whole prefix.
            forbidden_so_far = []
            for i in range(translated_tokens.shape[0]):
                if forbidden_so_far:
                    predictions_on_translated[i, torch.tensor(forbidden_so_far).long()] = predictions_on_translated[i].mean()
                if i < len(forbidden_predictions):
                    forbidden_so_far.extend(forbidden_predictions[i])
            generic_predictions = predictions_on_translated[mapping[mask]]
            reinforced_predictions = reinforced_model.forward(original_tokens.unsqueeze(0).to(reinforced_model.device)).logits[0][mask]
            # Keep only the amount by which the reinforced model exceeds the
            # generic prediction.
            offset_predictions = reinforced_predictions - generic_predictions
            offset_predictions = torch.clamp(offset_predictions, min=0)
            return_dict = {"input_ids": original_tokens.tolist()}
            for coef_factor in [0.5, 1, 2]:
                final_labels_on_masked_tokens = (generic_predictions - coef_factor * args.bootstrap_coef * offset_predictions).argmax(dim=1)
                final_predictions = torch.full_like(original_tokens, IGNORE_TOKEN_ID)
                final_predictions[true_indices] = final_labels_on_masked_tokens
                # Shift right by one; .tolist() directly avoids re-wrapping an
                # existing tensor with torch.tensor().
                final_predictions = [IGNORE_TOKEN_ID] + final_predictions.tolist()[:-1]
                if coef_factor == 1:
                    return_dict["labels"] = final_predictions
                else:
                    return_dict[f"labels_{coef_factor}"] = final_predictions
    torch.cuda.empty_cache()  # Clear the CUDA cache
    return return_dict

CHUNK_SIZE = 5000  # Characters read per call, to bound memory use while tokenizing

def read_file_into_chunks(filename, context_length):
    """Tokenize a text file and cut it into windows of ``context_length`` tokens.

    The file is read ``CHUNK_SIZE`` characters at a time. Special tokens are
    added only for the first read, so the token stream does not get an extra
    special token at every read boundary. A trailing remainder shorter than
    ``context_length`` is discarded.

    NOTE(review): a read boundary can still fall in the middle of a word, which
    may tokenize differently than the uninterrupted text.

    Args:
        filename: path of a UTF-8 text file.
        context_length: number of tokens per window.

    Returns:
        list of lists of token ids, each of length ``context_length``.
    """
    tokenized_text = []
    with open(filename, 'r', encoding='utf-8') as file:
        is_first_read = True
        while True:
            chunk = file.read(CHUNK_SIZE)
            if not chunk:
                break
            # Plain Python ints, not 0-d tensors, so the windows can be stored
            # in a Dataset directly.
            tokenized_text.extend(tokenizer.encode(chunk, add_special_tokens=is_first_read))
            is_first_read = False
    # "+ 1" keeps the final window when the token count is an exact multiple
    # of context_length.
    last_start = len(tokenized_text) - context_length + 1
    return [tokenized_text[i:i + context_length] for i in range(0, last_start, context_length)]

# Cut the input text into fixed-length token windows and wrap them in a Dataset.
chunks = read_file_into_chunks(args.input_file, args.context_length)
dataset = Dataset.from_dict({'tokens': chunks})

# Compute the labels for every window.
processed_dataset = dataset.map(process_chunk)

# Write the processed dataset to args.output_file.
processed_dataset.save_to_disk(args.output_file)


Couldn't parse the string: {    "Harry": "Jon",    "Ron": "Tom",    "Hermione": "Sophie",    "Hogsmeade": "MysticVille",    "Filch": "Grimsby",    "Sirius": "Orion",    "Dungbombs": "StinkBlasts",    "Malfoy": "Blackwood",    "Three Broomsticks": "Triple Wand Inn",    "Hog's Head": "Boar's Crown",    "Zonko's Wizarding Joke Shop": "Ziggy's Magical Prank Store",    "Fred": "Max",    "George": "Sam",    "Lee Jordan": "Liam Turner",    "owls": "ravens"}
Couldn't parse the string: {    "Hermione": "Emily",    "Ginny": "Sophie",    "Lee Jordan": "Tom Wilson",    "Angelina": "Olivia",    "Quidditch": "Skyball",    "Katie Bell": "Lily Thompson",    "Madam Pince": "Mrs. Collins",    "Harry": "Jon",    "Dean": "Alex",    "Ron": "Sam",    "McGonagall": "Mrs. Johnson",    "Tri wizard": "Triple Challenge",    "Umbridge": "Ms. Blackwell",    "Fred": "Max",    "Zacharias": "Ethan",    "Ernie": "Ben",    "Cho": "Grace"}
Couldn't parse the string: {    "Bagman": "Barnes",    "Wasp robes": "Bee uniform

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 

Reduce Batch Size: In your training script, adjust the per_device_train_batch_size and per_device_eval_batch_size in your TrainingArguments to a lower number. Smaller batches require less GPU memory.

Gradient Accumulation: Use the gradient_accumulation_steps parameter in TrainingArguments. This allows you to simulate larger batches by accumulating gradients over several smaller batches, thus using less memory per batch.

Clear CUDA Cache: Insert torch.cuda.empty_cache() at strategic points in your training loop to free up memory that is no longer needed.

Optimize Model Size: If possible, switch to a smaller model variant that requires less memory or consider using techniques like quantization to reduce the model size.

Sequential Processing: Break down your data processing tasks into smaller chunks that fit into your GPU's memory.