In [1]:
import os
import os.path as osp
import sys
# Parent of the notebook's working directory; added to the import path so
# project-local packages can be imported (assumes the `lib` package used by
# later cells lives there — TODO confirm).
ROOT_DIR = osp.dirname(os.getcwd())
# Only append once so re-running this cell does not pile up duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [2]:
from transformers import AutoTokenizer
import numpy as np
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hugging Face model identifier; used for both the tokenizer and the base model.
MODEL_NAME = 'microsoft/phi-2'
# Upper bound applied to the computed maximum token length (see get_max_length).
CONTEXT_LENGTH = 2048

In [4]:
# Load the tokenizer that belongs to MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token
# (presumably because the tokenizer ships without one — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Set up dataset

In [5]:
import json
from lib.prompt import get_training_prompt

def read_data(filename):
    """Load a JSON document from disk and return the parsed value.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; being explicit avoids depending on
    # the platform's default locale encoding.
    with open(filename, encoding="utf-8") as json_file:
        return json.load(json_file)

# Parse the raw training file, then pass every record through get_training_prompt.
data = read_data("../data/TeleQnA_training.txt")
# Only the record values are used; the dictionary keys are discarded.
finetuning_datalist = [get_training_prompt(record) for record in data.values()]

In [6]:
# Preview the first training example: its 'question' text under a "Prompt"
# banner, followed by its 'answer' text under a "Correct answer" banner.
print(f"""********************************************************************************
Prompt
********************************************************************************
{finetuning_datalist[0]['question']}


********************************************************************************
Correct answer
********************************************************************************
{finetuning_datalist[0]['answer']}""")

********************************************************************************
Prompt
********************************************************************************
### Instructions: 
Based on only the provided context, select the correct answer from the choices given. Provide your answer in the following format: option Number) Answer. Do not include any additional text or explanation.

Context:
The Nmfaf_3daDataManagement_Deconfigure service operation is used to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints.

Question:
What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]

Choices:
option 1) To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints
option 2) To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints
option 3) To supply data or analytics from the MFAF to notification e

In [7]:
def get_max_length(finetuning_datalist, tokenizer, context_length=None):
    """Return the longest tokenized question+answer length, capped at a limit.

    Args:
        finetuning_datalist: Sequence of dicts holding string 'question' and
            'answer' entries.
        tokenizer: Callable that takes a list of strings and returns a mapping
            whose 'input_ids' entry holds one token-id sequence per string.
        context_length: Upper bound for the returned value. When omitted, the
            notebook-level CONTEXT_LENGTH constant is used.

    Returns:
        int: min(largest token count over all examples, context_length).

    Raises:
        ValueError: If finetuning_datalist is empty.
    """
    if context_length is None:
        context_length = CONTEXT_LENGTH
    texts = [entry['question'] + entry['answer'] for entry in finetuning_datalist]
    if not texts:
        raise ValueError("finetuning_datalist must contain at least one example")
    # No padding is requested, so the sequences are ragged; keep them as plain
    # lists instead of forcing them into a NumPy array just to read lengths.
    input_ids = tokenizer(texts)['input_ids']
    longest = max(len(ids) for ids in input_ids)
    return min(longest, context_length)

In [8]:
def tokenize_dataset(example, tokenizer, max_length):
    """Tokenize question+answer text; intended as a ``Dataset.map`` function.

    Handles both call styles:
      * batched: ``example['question']`` / ``example['answer']`` are lists of
        strings, and every row of the batch is tokenized;
      * non-batched: both entries are single strings.

    Args:
        example: Mapping with 'question' and 'answer' entries.
        tokenizer: Tokenizer to call. It is reconfigured in place:
            ``truncation_side`` is set to "left" and ``pad_token`` is set to
            its ``eos_token``.
        max_length: Maximum number of tokens kept per sequence (truncation is
            enabled).

    Returns:
        The tokenizer's output for the concatenated text(s).
    """
    tokenizer.truncation_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    questions, answers = example['question'], example['answer']
    if isinstance(questions, str):
        # Non-batched call: a single example.
        text = questions + answers
    else:
        # Batched call: tokenize every row, not just the first one.
        text = [question + answer for question, answer in zip(questions, answers)]

    # Plain Python lists are returned (no tensor conversion) because rows in a
    # batch may have different lengths.
    return tokenizer(
        text,
        max_length=max_length,
        truncation=True,
    )

def formatting_prompts_func(example):
    """Join each question with its answer to form one text per batch row.

    Args:
        example: Batch mapping whose 'question' and 'answer' entries are
            index-aligned sequences of strings.

    Returns:
        list[str]: ``question[i] + answer[i]`` for every index of 'question'.
    """
    return [
        question + example['answer'][idx]
        for idx, question in enumerate(example['question'])
    ]

In [9]:
# Longest question+answer token count in the training data, capped at CONTEXT_LENGTH.
max_length= get_max_length(finetuning_datalist, tokenizer)

In [10]:
# Display the computed maximum length.
max_length

555

In [11]:
# Wrap the list of prompt dicts in a Hugging Face Dataset (one row per entry).
finetuning_dataset = Dataset.from_list(finetuning_datalist)

In [12]:
# Display the dataset summary (features and row count).
finetuning_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 1461
})

In [13]:
# Tokenize every row. batched=True with batch_size=1 hands tokenize_dataset
# exactly one example per call, wrapped in single-element lists.
# drop_last_batch=True only discards an incomplete final batch; with
# batch_size=1 no rows are lost (1461 rows in, 1461 rows out).
tokenized_dataset = finetuning_dataset.map(
    lambda e: tokenize_dataset(e,tokenizer, max_length),
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

Map: 100%|██████████| 1461/1461 [00:02<00:00, 596.40 examples/s]


In [14]:
# Copy the input ids into a "labels" column.
# Dataset.add_column rejects a column name that already exists, so only add it
# when missing; this keeps the cell safe to re-run without restarting the kernel.
if "labels" not in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [15]:
# Display the dataset summary, now including the tokenizer outputs and labels.
tokenized_dataset

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1461
})

In [16]:
# Hold out 10% of the rows as the test split; the fixed seed makes the shuffle
# reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1314
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 147
    })
})


In [17]:
# Persist both splits to disk (path is relative to the notebook's working directory).
split_dataset.save_to_disk("../data/finetuning/split_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 1314/1314 [00:00<00:00, 29970.39 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 147/147 [00:00<00:00, 12066.04 examples/s]


# Training

In [18]:
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import Trainer
from peft import LoftQConfig, LoraConfig, get_peft_model
from lib.prompt import train_response_template, train_instruction_template
from trl import  SFTTrainer, DataCollatorForCompletionOnlyLM 

In [19]:
# Settings bundle: which model to use, where the prepared dataset lives on
# disk, and a verbosity flag.
training_config = dict(
    model=dict(pretrained_name=MODEL_NAME, max_length=CONTEXT_LENGTH),
    datasets=dict(use_hf=False, path="../data/finetuning/split_dataset/"),
    verbose=True,
)

## Load base model

In [20]:
# Load the pretrained causal language model for MODEL_NAME; device placement
# is delegated to the library via device_map='auto'.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,device_map='auto')

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.62s/it]


In [21]:
# Display the module tree of the loaded model (useful for picking LoRA target modules).
base_model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((256

In [22]:
# LoftQ settings used for initializing the LoRA weights.
loftq_config = LoftQConfig(loftq_bits=4)           # set 4bit quantization
# LoRA adapter configuration.
# target_modules names the four attention projections shown in the printed
# model tree (q_proj, k_proj, v_proj, dense); the MLP layers fc1/fc2 are not targeted.
# NOTE(review): lora_alpha (8) is half of r (16) — confirm this scaling is intended.
lora_config = LoraConfig(
    init_lora_weights="loftq",
    loftq_config=loftq_config,
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the base model with the adapters described by lora_config.
peft_model = get_peft_model(base_model, lora_config)

In [23]:
# Number of training steps; passed to TrainingArguments(max_steps=...).
max_steps=200

In [24]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs (ignored in this run: max_steps below takes precedence)
  num_train_epochs=1,

  # Max steps to train for. Overrides num_train_epochs, if not -1.
  # With gradient_accumulation_steps=4 and a per-device batch size of 1,
  # each step consumes 4 batches.
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir='../bin/',

  # Other arguments
  overwrite_output_dir=False, # If True, overwrite the content of the output directory (off here)
  disable_tqdm=False, # Keep progress bars enabled
  eval_steps=20, # Number of update steps between two evaluations
  save_steps=20, # A checkpoint is saved every this many steps
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Best-checkpoint selection: reload the checkpoint with the lowest eval_loss
  # when training ends. NOTE(review): no early-stopping callback is configured
  # in the visible cells, so training always runs for the full max_steps.
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)



In [25]:
# Data collator built from the instruction/response templates imported from lib.prompt.
# Presumably it restricts the loss to the response part of each example —
# TODO confirm against the TRL documentation.
collator = DataCollatorForCompletionOnlyLM(instruction_template=train_instruction_template, response_template=train_response_template, tokenizer=tokenizer)

# Debugging aid — uncomment to inspect one training row:
# split_dataset['train'][0]



In [26]:
# Supervised fine-tuning trainer around the LoRA-wrapped model.
# NOTE(review): no tokenizer is passed here, so the trainer presumably loads
# its own — confirm it is configured like the one given to the collator.
# NOTE(review): both splits already carry input_ids/attention_mask/labels
# columns from the earlier tokenization cells, while formatting_func is also
# supplied; confirm which of the two the installed TRL version actually uses.
# NOTE(review): max_seq_length=1024 differs from CONTEXT_LENGTH (2048); the
# longest example measured earlier in this notebook was 555 tokens.
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    max_seq_length=1024,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=collator,
    formatting_func=formatting_prompts_func
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
max_steps is given, it will override any value given in num_train_epochs


In [27]:
# Run fine-tuning. The cell output is the loss table (one row per evaluation,
# every 20 steps) followed by the returned TrainOutput summary.
trainer.train()

Step,Training Loss,Validation Loss
20,0.6999,0.902585
40,0.7605,0.822143
60,0.7283,0.743702
80,0.5811,0.666733
100,0.5439,0.597731
120,0.4414,0.540387
140,0.7268,0.493923
160,0.3751,0.462277
180,0.4716,0.4417
200,0.5254,0.43502




TrainOutput(global_step=200, training_loss=0.6308332908898592, metrics={'train_runtime': 1121.6743, 'train_samples_per_second': 0.713, 'train_steps_per_second': 0.178, 'total_flos': 2371330057728000.0, 'train_loss': 0.6308332908898592, 'epoch': 0.60882800608828})

In [28]:
# Save the fine-tuned PEFT model under ../bin/pretrained/
# (presumably only the adapter weights are written — TODO confirm).
peft_model.save_pretrained('../bin/pretrained/')

