In [35]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("heegyu/koalpaca-355m",
                                          ax_position_embeddings = 1024,
                                        ignore_mismatched_sizes = True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained("heegyu/koalpaca-355m",
                                    max_length = 1024, max_position_embeddings = 1024,
                                    ignore_mismatched_sizes = True)


In [36]:
from transformers import pipeline

generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer
)

In [37]:
def query(instruction, input=None):
    if input:
        prompt = f"<usr>{instruction}\n{input}\n<sys>"
    else:
        prompt = f"<usr>{instruction}\n<sys>"
    
    print(generator(
        prompt,
        do_sample=True,
        top_p=0.9,
        early_stopping=True,
        max_length=1024,
    )[0]['generated_text'])

In [38]:
query('보안 전문가로서, SQL Injection 공격에 대해서 2문장 이내로 설명해줘.')



<usr>보안 전문가로서, SQL Injection 공격에 대해서 2문장 이내로 설명해줘.
<sys>SSL은 SQL을 사용하여 문서를 암호화하고, 이를 사용자 USB에 저장하여 사용자에게 반환합니다. 이러한 암호화 방법을 통해 해킹이 발생하지 않으며, SQL에 기반하여 문서가 암호화되어 보관됩니다. 또한 보안 전문가들은 SQL 코드와 유사한 형태로 코드를 작성할 수 있으며, 이를 기반으로 추가적인 보안이 이루어질 수 있습니다.


In [39]:
# mps device
device = torch.device("mps" if torch.backends.mps.is_built() else "cpu")
device

device(type='mps')

In [40]:
model.config

GPT2Config {
  "_name_or_path": "heegyu/koalpaca-355m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 1,
  "do_sample": true,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_length": 50,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.28.0.dev0",
  "use_cache": true,
  "vocab_size": 51200
}

In [41]:
# model = model.to(device)
model.device

device(type='cpu')

Pytorch based fine tuning

In [42]:
import datasets
dataset = datasets.load_dataset('json', data_files='chat_gpt_context/security_base_sample.json', field='train')

Using custom data configuration default-f9359a9f122d3b85
Reusing dataset json (/Users/choiwb/.cache/huggingface/datasets/json/default-f9359a9f122d3b85/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)
100%|██████████| 1/1 [00:00<00:00, 44.56it/s]


In [43]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 90
    })
})

In [44]:
cutoff_len = 1024

def generate_prompt(entry):
    if entry['input']:
        return f"User: {entry['instruction']}: {entry['input']}\n\nAssistant: {entry['output']}"
    else:
        return f"User: {entry['instruction']}\n\nAssistant: {entry['output']}"

def tokenize(item, add_eos_token=True):
    result = tokenizer(
        generate_prompt(item),
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )

    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < cutoff_len
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

In [45]:
train_val = dataset["train"].train_test_split(test_size=0.05, shuffle=True, seed=42)
train_data = train_val["train"].shuffle().map(tokenize)
val_data = train_val["test"].shuffle().map(tokenize)

Loading cached split indices for dataset at /Users/choiwb/.cache/huggingface/datasets/json/default-f9359a9f122d3b85/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5/cache-8b6e266c1d3bbcae.arrow and /Users/choiwb/.cache/huggingface/datasets/json/default-f9359a9f122d3b85/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5/cache-9ec1d939156592c7.arrow
100%|██████████| 85/85 [00:00<00:00, 1818.60ex/s]
100%|██████████| 5/5 [00:00<00:00, 1301.37ex/s]


In [46]:
'''
if 'model' in globals(): 
    del model
    # torch.cuda.empty_cache()

model = transformers.AutoModelForCausalLM.from_pretrained(
    'cerebras/Cerebras-GPT-111M',    
    
    # load_in_8bit=True,
    # torch_dtype=torch.float16,

    # device_map={'': 0}
    #device = torch.device("cpu")
    # device_map = 'auto'
)
'''

'\nif \'model\' in globals(): \n    del model\n    # torch.cuda.empty_cache()\n\nmodel = transformers.AutoModelForCausalLM.from_pretrained(\n    \'cerebras/Cerebras-GPT-111M\',    \n    \n    # load_in_8bit=True,\n    # torch_dtype=torch.float16,\n\n    # device_map={\'\': 0}\n    #device = torch.device("cpu")\n    # device_map = \'auto\'\n)\n'

In [47]:
'''
import peft

model = peft.prepare_model_for_int8_training(model)

model = peft.get_peft_model(model, peft.LoraConfig(
    r=8,
    lora_alpha=16,
    # target_modules=["q_proj", "v_proj"],
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
))
'''

'\nimport peft\n\nmodel = peft.prepare_model_for_int8_training(model)\n\nmodel = peft.get_peft_model(model, peft.LoraConfig(\n    r=8,\n    lora_alpha=16,\n    # target_modules=["q_proj", "v_proj"],\n    target_modules=["c_attn"],\n    lora_dropout=0.05,\n    bias="none",\n    task_type="CAUSAL_LM",\n))\n'

In [48]:
# import peft

# model = peft.PeftModel.from_pretrained(
#     model,
#     # 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',
#     output_dir,
#     torch_dtype=torch.float16
# )

In [49]:
import os
import wandb 

output_dir = 'koalpaca-355m-finetune'

use_wandb = True,
wandb_run_name = f"{output_dir}-{wandb.util.generate_id()}"

# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]=output_dir

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

In [50]:
training_args = transformers.TrainingArguments(
    # per_device_train_batch_size=16, 
    per_device_train_batch_size=1, 

    gradient_accumulation_steps=8,  
    # 학습횟수 20 이상은 에러 !!!!!!!
    num_train_epochs=19,  
    learning_rate=1e-4, 
    # only be used on CUDA devices.
    # fp16=True,
    
    optim="adamw_torch",
    logging_steps=10, 
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=200,
    save_steps=200,
    output_dir=output_dir, 
    save_total_limit=3,

    report_to="wandb" if use_wandb else None,
    run_name=wandb_run_name if use_wandb else None,
)

In [51]:
trainer = transformers.Trainer(
    model=model, 
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args, 
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=1, return_tensors="pt", padding=True
    ),
)

In [52]:
model.device

device(type='cpu')

In [19]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [53]:
model.config.use_cache = False
result = trainer.train()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

wandb.finish()

  0%|          | 0/190 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  5%|▌         | 10/190 [04:19<1:12:45, 24.25s/it]

{'loss': 2.8061, 'learning_rate': 9.473684210526316e-05, 'epoch': 0.94}


 11%|█         | 20/190 [08:27<1:08:20, 24.12s/it]

{'loss': 1.5766, 'learning_rate': 8.947368421052632e-05, 'epoch': 1.88}


 16%|█▌        | 30/190 [12:12<56:49, 21.31s/it]  

{'loss': 0.91, 'learning_rate': 8.421052631578948e-05, 'epoch': 2.82}


 21%|██        | 40/190 [15:22<52:54, 21.16s/it]

{'loss': 0.5507, 'learning_rate': 7.894736842105263e-05, 'epoch': 3.76}


 26%|██▋       | 50/190 [18:42<50:35, 21.69s/it]

{'loss': 0.3296, 'learning_rate': 7.368421052631579e-05, 'epoch': 4.71}


 32%|███▏      | 60/190 [22:02<53:33, 24.72s/it]

{'loss': 0.2286, 'learning_rate': 6.842105263157895e-05, 'epoch': 5.65}


 37%|███▋      | 70/190 [25:17<41:21, 20.68s/it]

{'loss': 0.1731, 'learning_rate': 6.31578947368421e-05, 'epoch': 6.59}


 42%|████▏     | 80/190 [28:28<38:11, 20.83s/it]

{'loss': 0.1323, 'learning_rate': 5.789473684210527e-05, 'epoch': 7.53}


 47%|████▋     | 90/190 [31:15<25:54, 15.55s/it]

{'loss': 0.1025, 'learning_rate': 5.2631578947368424e-05, 'epoch': 8.47}


 53%|█████▎    | 100/190 [34:41<29:25, 19.62s/it]

{'loss': 0.0898, 'learning_rate': 4.736842105263158e-05, 'epoch': 9.41}


 58%|█████▊    | 110/190 [37:54<23:44, 17.81s/it]

{'loss': 0.0689, 'learning_rate': 4.210526315789474e-05, 'epoch': 10.35}


 63%|██████▎   | 120/190 [41:22<23:20, 20.01s/it]

{'loss': 0.0637, 'learning_rate': 3.6842105263157895e-05, 'epoch': 11.29}


 68%|██████▊   | 130/190 [44:47<18:33, 18.57s/it]

{'loss': 0.0569, 'learning_rate': 3.157894736842105e-05, 'epoch': 12.24}


 74%|███████▎  | 140/190 [48:05<17:51, 21.44s/it]

{'loss': 0.0509, 'learning_rate': 2.6315789473684212e-05, 'epoch': 13.18}


 79%|███████▉  | 150/190 [50:53<11:35, 17.40s/it]

{'loss': 0.0481, 'learning_rate': 2.105263157894737e-05, 'epoch': 14.12}


 84%|████████▍ | 160/190 [54:30<11:14, 22.50s/it]

{'loss': 0.0451, 'learning_rate': 1.5789473684210526e-05, 'epoch': 15.06}


 89%|████████▉ | 170/190 [58:08<05:28, 16.41s/it]

{'loss': 0.041, 'learning_rate': 1.0526315789473684e-05, 'epoch': 16.0}


 95%|█████████▍| 180/190 [1:01:09<03:15, 19.51s/it]

{'loss': 0.0406, 'learning_rate': 5.263157894736842e-06, 'epoch': 16.94}


100%|██████████| 190/190 [1:04:30<00:00, 22.45s/it]

{'loss': 0.0401, 'learning_rate': 0.0, 'epoch': 17.88}
{'train_runtime': 3875.8609, 'train_samples_per_second': 0.417, 'train_steps_per_second': 0.049, 'train_loss': 0.38708530366420746, 'epoch': 17.88}


100%|██████████| 190/190 [1:04:36<00:00, 20.40s/it]


0,1
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇███
train/learning_rate,██▇▇▆▆▆▅▅▄▄▄▃▃▃▂▂▁▁
train/loss,█▅▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,17.88
train/global_step,190.0
train/learning_rate,0.0
train/loss,0.0401
train/total_flos,908675585802240.0
train/train_loss,0.38709
train/train_runtime,3875.8609
train/train_samples_per_second,0.417
train/train_steps_per_second,0.049


Pytorch based fine tuning model load & inference

In [54]:
model.config
print(model.dtype)

# model.half()
model.eval()

torch.float32


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=False)
)

In [55]:
# 저장된 모델 호출
output_dir = 'koalpaca-355m-finetune'
model = transformers.AutoModelForCausalLM.from_pretrained(output_dir)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=False)
)

In [56]:
prompt = 'Mitre Att&ck v13 matrix의 Enterprise Tactics ID는 몇 개야?'
inputs = tokenizer(prompt, return_tensors="pt")
# input_ids = inputs["input_ids"].to(model.device)
input_ids = inputs["input_ids"]

In [57]:
generation_config = transformers.GenerationConfig(
    max_new_tokens=100,
    temperature=0.2,
    top_p=0.75,
    top_k=50,
    repetition_penalty=1.2,
    do_sample=True,
    early_stopping=True,
    # num_beams=5,
    
    pad_token_id=model.config.pad_token_id,
    eos_token_id=model.config.eos_token_id,
)

In [58]:
'''
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        generation_config=generation_config
    )[0].cuda()
'''
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        generation_config=generation_config
    )[0]

In [59]:
result = tokenizer.decode(output, skip_special_tokens=True).strip()
print(result)

Mitre Att&ck v13 matrix의 Enterprise Tactics ID는 몇 개야?

Assistant: ta0001 (injection) attack into the request. logsource: content-type: application/xhtml+xml image/jxr */*; legal=http://www.corea.school"; rv: http://10.10.123.123:3100/board/port
