In [1]:
import os
import os.path as osp
import sys
# Put the parent of the current working directory on the import path so that
# project packages imported later (e.g. `lib.prompt`) can be resolved.
ROOT_DIR = osp.dirname(os.getcwd())
sys.path.append(ROOT_DIR)

In [2]:
from transformers import AutoTokenizer
import numpy as np
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hugging Face model id; used to load both the tokenizer and the base model.
MODEL_NAME = 'microsoft/phi-2'
# Upper bound (in tokens) applied to the max sequence length computed by get_max_length.
CONTEXT_LENGTH = 2048

In [4]:
# Load the tokenizer that matches MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Set up vectorstore

In [5]:
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings, HuggingFaceBgeEmbeddings
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from tqdm import tqdm

In [6]:
# Directory holding the persisted vector store; create_retriever appends "chromadb" to it.
VECTOR_STORE_PATH = '../data/vectorstore/'
# Embedding model used to embed queries against the vector store.
EMBEDDING_MODEL_NAME='BAAI/bge-small-en'

In [7]:
def create_retriever(device="cuda", candidate_k=15, top_n=3):
    """Build a two-stage retriever over the persisted Chroma vector store.

    Stage 1 fetches ``candidate_k`` documents from Chroma using MMR search.
    Stage 2 reranks them with the ``BAAI/bge-reranker-base`` cross-encoder and
    keeps the best ``top_n``.

    Args:
        device: Device string passed to both the embedding model and the
            reranker (e.g. ``"cuda"`` or ``"cpu"``). Defaults to ``"cuda"``.
        candidate_k: Number of documents requested from the vector store
            before reranking. Defaults to 15.
        top_n: Number of documents kept after reranking. Defaults to 3.

    Returns:
        A ``ContextualCompressionRetriever`` wrapping the MMR retriever and
        the cross-encoder reranker.
    """
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )

    # The store is read from the "chromadb" folder under VECTOR_STORE_PATH.
    vectorstore = Chroma(
        persist_directory=VECTOR_STORE_PATH + "chromadb",
        embedding_function=embeddings,
    )

    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": candidate_k},
    )

    rerank_model = HuggingFaceCrossEncoder(
        model_name="BAAI/bge-reranker-base",
        model_kwargs={"device": device},
    )
    compressor = CrossEncoderReranker(model=rerank_model, top_n=top_n)

    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=retriever,
    )

# Set up database

In [8]:
import json
from lib.prompt import get_training_prompt
from joblib import Parallel, delayed
import pickle

# Number of question records handed to each parallel get_prompt call.
CHUNK_SIZE=512
# True: rebuild the prompts and overwrite the pickle cache. False: load the cached pickle.
GENERATE_PROMPTS=False

def read_data(filename):
    """Load and return the JSON document stored in ``filename``.

    Args:
        filename: Path to a JSON-formatted text file.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Read as UTF-8 explicitly: without it the decoding depends on the platform
    # locale and can fail or corrupt non-ASCII text (e.g. on Windows).
    with open(filename, encoding="utf-8") as json_file:
        return json.load(json_file)

# Raw training questions. Later cells iterate data.items() and read the
# 'question' field of each value.
data = read_data("../data/TeleQnA_training.txt")

def get_prompt(qstn_datas):
    """Build one training prompt per question record, using retrieved context.

    A new retriever is created on every call. For each record, the documents
    retrieved for its ``'question'`` text are merged into a single context
    string that is handed to ``get_training_prompt`` together with the record.

    Args:
        qstn_datas: Iterable of question records; each must provide a
            ``'question'`` entry.

    Returns:
        List with the result of ``get_training_prompt`` for every record, in
        input order.
    """
    retriever = create_retriever()
    prompts = []
    for record in tqdm(qstn_datas):
        retrieved_docs = retriever.invoke(record['question'])
        passages = [doc.page_content for doc in retrieved_docs]
        # Passages are joined with single spaces; newlines become ". " so the
        # context ends up on one line.
        context = ' '.join(passages).replace('\n', '. ')
        prompts.append(get_training_prompt(record, context))
    return prompts

def chunks(container,size):
    """Lazily yield consecutive slices of ``container`` with at most ``size`` items each."""
    starts = range(0, len(container), size)
    yield from (container[start:start + size] for start in starts)
        
def flatten(container):
    """Concatenate the items of every chunk in ``container`` into one new list."""
    return [item for chunk in container for item in chunk]
           
# Building the prompts runs retrieval and reranking for every question, so the
# result is cached on disk and only rebuilt when GENERATE_PROMPTS is True.
FINETUNING_PICKLE_PATH = '../bin/pickle/finetuning_datalist.pkl'

if GENERATE_PROMPTS:
    # Create the cache folder before the expensive run so that a missing
    # directory cannot throw away the generated prompts at write time.
    os.makedirs(osp.dirname(FINETUNING_PICKLE_PATH), exist_ok=True)

    # Only the values of `data` are needed; they are split into batches of
    # CHUNK_SIZE records, and each batch is processed by one get_prompt call.
    question_records = list(data.values())
    prompt_batches = Parallel(n_jobs=4)(
        delayed(get_prompt)(batch)
        for batch in tqdm(chunks(question_records, CHUNK_SIZE))
    )
    finetuning_datalist = flatten(prompt_batches)

    with open(FINETUNING_PICKLE_PATH, 'wb') as bin_file:
        pickle.dump(finetuning_datalist, bin_file)
else:
    # SECURITY: pickle.load can execute arbitrary code contained in the file.
    # Only load a cache file that was produced by this notebook.
    with open(FINETUNING_PICKLE_PATH, 'rb') as bin_file:
        finetuning_datalist = pickle.load(bin_file)



In [9]:
# Preview the prompt text of the first fine-tuning record.
print(f"""********************************************************************************
Prompt
********************************************************************************
{finetuning_datalist[0]['prompt']}""")

# The 'answer' and 'explanation' fields of the same record can be previewed the
# same way (finetuning_datalist[0]['answer'], finetuning_datalist[0]['explanation']).

********************************************************************************
Prompt
********************************************************************************
### Instructions: 
Based on the provided context, select the correct answer from the choices given. Provide your answer in the following format: option Number) Answer.

Context:
4.2.2.3.1	General. . The Nmfaf_3daDataManagement_Deconfigure service operation is used by an NF service consumer to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints. -	3GPP DCCF Adaptor (3DA) Data Management Service: Nmfaf_3daDataManagement Service enables the DCCF to convey to the Messaging Framework, information about the data the Messaging Framework will receive from a Data Source, formatting and processing 9.2.2	Nmfaf_3daDataManagement_Configure service operation. . Service operation name: Nmfaf_3daDataManagement_Configure a short description of their use within the Nmfaf_3daDataManagement s

In [10]:
# Number of fine-tuning records available.
len(finetuning_datalist)

1461

In [11]:
def get_max_length(finetuning_datalist, tokenizer, context_length=None):
    """Return the token count of the longest prompt, capped at the context length.

    Only the ``'prompt'`` field of each record is measured.

    Args:
        finetuning_datalist: Sequence of records, each with a ``'prompt'`` string.
        tokenizer: Callable that accepts a list of strings and returns a
            mapping whose ``'input_ids'`` entry holds one token-id sequence
            per input string.
        context_length: Upper bound for the result. When ``None`` (default)
            the module-level ``CONTEXT_LENGTH`` is used.

    Returns:
        ``min(longest prompt length in tokens, context_length)`` as an int.

    Raises:
        ValueError: If ``finetuning_datalist`` is empty.
    """
    if context_length is None:
        context_length = CONTEXT_LENGTH

    prompts = [entry['prompt'] for entry in finetuning_datalist]
    if not prompts:
        raise ValueError("finetuning_datalist is empty; cannot determine a max length")

    # No tensor type is requested: the prompts have different lengths, and
    # plain Python lists are all that is needed to measure them.
    token_ids = tokenizer(prompts)['input_ids']
    longest = max(len(ids) for ids in token_ids)
    return min(longest, context_length)

In [12]:
def tokenize_dataset(example, tokenizer, max_length):
    """Tokenize every prompt of a batched ``Dataset.map`` call.

    Only the ``'prompt'`` field is tokenized. As a side effect the tokenizer
    is configured for left-side truncation and its pad token is set to its
    EOS token.

    Args:
        example: Batch mapping whose ``'prompt'`` entry is a list of strings.
        tokenizer: Tokenizer to call; its ``truncation_side`` and
            ``pad_token`` attributes are overwritten.
        max_length: Maximum number of tokens kept per prompt.

    Returns:
        The tokenizer output for the whole batch, one entry per prompt.
    """
    # Truncate from the left so the end of an over-long prompt is kept.
    tokenizer.truncation_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize all prompts of the batch, not just the first one, so the
    # function stays correct for any batch_size used in Dataset.map.
    # No tensor type is requested because prompts of different lengths do not
    # form a rectangular array; Dataset.map accepts plain lists.
    return tokenizer(
        list(example['prompt']),
        max_length=max_length,
        truncation=True,
    )

def formatting_prompts_func(example):
    """Return the prompt texts of a batch as a new list, one entry per row.

    Only the ``'prompt'`` field is used; answers and explanations are not
    appended.
    """
    return [prompt_text for prompt_text in example['prompt']]

In [13]:
# Longest prompt length in tokens, capped at CONTEXT_LENGTH; reused for tokenization and training.
max_length= get_max_length(finetuning_datalist, tokenizer)

In [14]:
# Show the computed maximum sequence length.
max_length

797

In [15]:
# Wrap the list of prompt records in a Hugging Face Dataset.
finetuning_dataset = Dataset.from_list(finetuning_datalist)

In [16]:
# Show the dataset's columns and row count.
finetuning_dataset

Dataset({
    features: ['prompt', 'answer', 'explanation'],
    num_rows: 1461
})

In [17]:
# Tokenize every prompt. With batched=True and batch_size=1, each call to
# tokenize_dataset receives a batch that contains exactly one row.
tokenized_dataset = finetuning_dataset.map(
    lambda e: tokenize_dataset(e,tokenizer, max_length),
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

Map: 100%|██████████| 1461/1461 [00:03<00:00, 458.49 examples/s]


In [18]:
# Add a "labels" column that starts out as a copy of the "input_ids" column.
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [19]:
# Show the tokenized dataset's columns and row count.
tokenized_dataset

Dataset({
    features: ['prompt', 'answer', 'explanation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1461
})

In [20]:
# Hold out 10% of the rows as the test split; the fixed seed makes the
# shuffled split repeatable.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'answer', 'explanation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1314
    })
    test: Dataset({
        features: ['prompt', 'answer', 'explanation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 147
    })
})


In [21]:
# Persist the train/test split to disk.
split_dataset.save_to_disk("../data/finetuning/split_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 1314/1314 [00:00<00:00, 9863.76 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 147/147 [00:00<00:00, 2846.12 examples/s]


# Training

In [22]:
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import Trainer
from peft import LoftQConfig, LoraConfig, get_peft_model
from lib.prompt import train_response_template, train_instruction_template, train_explanation_template
from trl import  SFTTrainer, DataCollatorForCompletionOnlyLM 

In [23]:
# Summary of the run configuration: model id, context length, and the on-disk
# location of the saved split dataset.
# NOTE(review): no later cell visible in this notebook reads training_config —
# confirm whether it is still needed.
training_config = {
    "model": {
        "pretrained_name": MODEL_NAME,
        "max_length" : CONTEXT_LENGTH
    },
    "datasets": {
        "use_hf": False,
        "path": "../data/finetuning/split_dataset/"
    },
    "verbose": True
}

## Load base model

In [24]:
# Load the pretrained causal LM for MODEL_NAME; device placement is left to device_map='auto'.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,device_map='auto')

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.85s/it]


In [25]:
# Print the module tree; the layer names shown here are what target_modules refers to.
base_model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((256

In [26]:
# LoRA adapter configuration with LoftQ initialization of the adapter weights.
loftq_config = LoftQConfig(loftq_bits=4)           # set 4bit quantization
lora_config = LoraConfig(
    init_lora_weights="loftq",
    loftq_config=loftq_config,
    r=8,
    lora_alpha=8,
    # Attention projection layers of each decoder block (names as shown in the
    # base_model printout: q_proj, k_proj, v_proj, dense).
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the base model so that it carries the LoRA adapters defined above.
peft_model = get_peft_model(base_model, lora_config)

In [27]:
# Upper bound on training steps; passed to TrainingArguments(max_steps=...).
max_steps=1024

In [28]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  # (not effective in this run: max_steps below is set and overrides it)
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training (per device, before gradient accumulation)
  per_device_train_batch_size=8,

  # Directory to save model checkpoints
  output_dir='../bin/',

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=64, # Number of update steps between two evaluations
  save_steps=64, # After # steps model is saved (kept equal to eval_steps)
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4, # 8 x 4 = 32 sequences per device per optimizer step
  gradient_checkpointing=False,

  # Best-checkpoint selection: at the end of training, reload the checkpoint
  # with the lowest eval_loss. save_total_limit bounds how many checkpoints are kept.
  # NOTE(review): no early-stopping callback is passed to the trainer in the
  # visible cells, so these settings alone do not stop training early.
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)



In [29]:
# Data collator built from the instruction/response templates defined in lib.prompt.
# NOTE(review): presumably this restricts the loss to the tokens after the
# response template — confirm against the trl documentation and the templates.
collator = DataCollatorForCompletionOnlyLM(instruction_template=train_instruction_template, response_template=train_response_template, tokenizer=tokenizer)



In [30]:
# print(split_dataset['train'][0]['prompt'])

In [31]:
# Supervised fine-tuning trainer: LoRA-wrapped model, the train/test splits,
# the completion-only collator, and the prompt formatting function.
# max_seq_length reuses the max_length computed from the prompts earlier.
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    max_seq_length=max_length,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=collator,
    formatting_func=formatting_prompts_func
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
max_steps is given, it will override any value given in num_train_epochs


In [32]:
# Run fine-tuning; evaluation and checkpointing happen every 64 steps (see training_args).
trainer.train()

Step,Training Loss,Validation Loss
64,0.416,0.54254
128,0.5461,0.492168
192,0.2502,0.454791
256,0.6171,0.433162
320,0.3916,0.418728
384,0.575,0.410427
448,0.4957,0.406077
512,0.7204,0.404911




TrainOutput(global_step=512, training_loss=0.49589239968918264, metrics={'train_runtime': 3708.0232, 'train_samples_per_second': 0.552, 'train_steps_per_second': 0.138, 'total_flos': 1.430739256624128e+16, 'train_loss': 0.49589239968918264, 'epoch': 1.5585996955859969})

In [33]:
# Persist the fine-tuned PEFT model to disk.
peft_model.save_pretrained('../bin/pretrained_1')

