In [1]:
!pip install peft

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1


In [2]:
import os
import pickle
import json
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from transformers import AutoTokenizer,AutoModelForCausalLM,AutoConfig, AutoModel,BitsAndBytesConfig,GenerationConfig
from safetensors.torch import load_file
from datasets import load_dataset
from typing import Union
from peft import (
    LoraConfig,
    get_peft_model,
    set_peft_model_state_dict,
)

**Source code: the `ODIE` class**

In [3]:
class ODIE():
    #Initial :
    def __init__(self, 
                old_output = None,
                base_model = 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', #Set base model : 
                model_config = 'adapter_config.json',   #Tên file cấu hình mô hình
                model_weights = 'adapter_model.safetensors',  #Tên file trọng số mô hình
                target_modules = [
                        'q_proj', 'k_proj', 'v_proj', 'o_proj', 
                        'up_proj', 'down_proj', 'gate_proj', 
                        'embed_tokens', 'lm_head'
                ],
                template_path = '/kaggle/input/data-nlp-2/Data_train/templates/alpaca.json',
               ):
        self.old_output = old_output
        self.base_model = base_model
        self.model_config = model_config
        self.model_weights = model_weights
        self.target_modules = target_modules
        self.template_path = template_path
        self.tokenizer = self.Set_Tokenize()
        self.check_train = True
        
    #Function Load Model : 
    def Load_Model(self):
        """Load the base model and wrap it with a LoRA adapter.

        When ``self.old_output`` is None a new adapter is created from
        ``self.target_modules``. Otherwise the adapter configuration is read
        from ``<old_output>/<model_config>`` and, if
        ``<old_output>/<model_weights>`` exists, the saved adapter weights are
        loaded into the wrapped model.

        Returns:
            The object returned by ``get_peft_model``.
        """
        print('========================Please Waiting==============================')
        print('====================================================================')
        print('========================Loading Model===============================')
        adapter_dir = self.old_output

        # Base model, always loaded in float32.
        model = AutoModelForCausalLM.from_pretrained(
            self.base_model,
            torch_dtype=torch.float32,
            load_in_8bit = False,
            is_decoder=True
        )

        if adapter_dir is None:
            print('Training From First!')
            config = LoraConfig(
                r=16,
                lora_alpha=16,
                target_modules= self.target_modules,
                lora_dropout=0.1,
                bias="none",
                task_type="CAUSAL_LM"
            )
            return get_peft_model(model, config)

        print('Reloading and Retraining Model!')
        print('======================================================')
        # Locations of the saved adapter files.
        config_path = os.path.join(adapter_dir, self.model_config)
        model_weights_path = os.path.join(adapter_dir, self.model_weights)

        with open(config_path, 'r', encoding='utf-8') as f:
            config_dict = json.load(f)

        # Keep target_modules exactly as stored. Only a non-list, non-string
        # collection is converted to a list: calling list() on a string would
        # split it into single characters, and on None it would raise.
        target_modules = config_dict['target_modules']
        if target_modules is not None and not isinstance(target_modules, (str, list)):
            target_modules = list(target_modules)

        config = LoraConfig(**config_dict)

        # inference_mode must be off so the adapter can keep training.
        config.inference_mode = False

        # Re-assign the value read from disk after LoraConfig construction
        # (presumably because LoraConfig normalises it - verify against PEFT).
        config.target_modules = target_modules

        model = get_peft_model(model, config)

        # Load the saved adapter weights when the file is present; otherwise
        # the freshly initialised adapter is kept and a message is printed.
        if os.path.exists(model_weights_path):
            print(f"Restarting from {model_weights_path}")
            print('======================================================')
            adapters_weights = load_file(model_weights_path)
            set_peft_model_state_dict(model, adapters_weights)
            print('Loading sucessful checkpoint')
        else:
            print(f"Checkpoint {model_weights_path} not found")

        print('Reloading Model Complition!')
        return model
    
    #Set Tokenizer : 
    def Set_Tokenize(self): 
        #Get Tokenize :
        tokenizer = AutoTokenizer.from_pretrained(self.base_model,legacy=False)

        #Set padding : 
        tokenizer.pad_token_id = 0
        
        #Set location of padding :
        tokenizer.padding_side = "left"  # Allow batched inference
    
        return tokenizer
        
    #Tokenize Processing : 
    def Tokenize(self,prompt,add_eos_token=True,cutoff_len = 1024):
        check_train = self.check_train
#         if check_train :
#             padding = False 
#             return_tensors = None
#         else :
#             return_tensors = 'pt'
#             padding = 'max_length' 

            
        tokenizer = self.tokenizer
        result = tokenizer(
                prompt,
                truncation=True,
                max_length=cutoff_len,
                padding=False,
                return_tensors=None,
                )
        if check_train :
            if (
                result["input_ids"][-1] != tokenizer.eos_token_id
                and len(result["input_ids"]) < cutoff_len
                and add_eos_token
            ):
                result["input_ids"].append(tokenizer.eos_token_id)
                result["attention_mask"].append(1)
            result['labels'] = result['input_ids'].copy()
            
        return result
    
    #Tokenize Prompt :
    def Tokenize_Prompt(self,data_point):
        #Prompt Processing Function :
        def Prompt_Processing(
            instructions,
            inputs: Union[None, str] = None,
            labels: Union[None, str] = None
        )-> str:
            #Loading template : 
            with open(self.template_path) as fp:
                template = json.load(fp)

            #Generate Prompt with inputs : 
            if inputs:
                res = template["prompt_input"].format(
                instruction=instructions, input=inputs
                    )
            #Generate Prompt no inputs :
            else:
                res = template["prompt_no_input"].format(
                        instruction=instructions
                    )

            #Concatenate Outputs with Prompt :
            if labels:
                res = f"{res}{labels}"
            return res
        #Process Prompt :
        if self.check_train:
            full_prompt = Prompt_Processing(
                                data_point["instruction"], #Instruction
                                data_point["text"], #Input 
                                data_point["table"] #Output
            )


            #Tokenize Prompt :
            tokenized_full_prompt = self.Tokenize(full_prompt,add_eos_token=True)

            #Train on input: 
            add_eos_token = True
            user_prompt = Prompt_Processing(
                            data_point["instruction"], data_point["text"]
                            )

            tokenized_user_prompt = self.Tokenize(
                        user_prompt, add_eos_token= add_eos_token
                    )
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if add_eos_token:
                user_prompt_len -= 1

            tokenized_full_prompt["labels"] = [
                        -100
                    ] * user_prompt_len + tokenized_full_prompt["labels"][
                        user_prompt_len:
                    ]
            
            return tokenized_full_prompt
        else : 
            data = {}
            full_prompt = Prompt_Processing(
                            data_point["instruction"], data_point["text"]
                        )
            data['input'] = full_prompt
            data['output'] = data_point["table"]
            return data
    
    #Loading data :
    def Data_Processing(self,path_data,start_row = 0, num_rows = 30, val_set_size = 0,check_train = True):
        """Load a window of rows and run every row through Tokenize_Prompt.

        Args:
            path_data: A ``.json`` / ``.jsonl`` file path, or any other
                dataset path/name accepted by ``load_dataset``.
            start_row: Index of the first row to load from the ``train`` split.
            num_rows: Number of rows to load.
            val_set_size: Size of the validation split passed to
                ``train_test_split``; 0 disables the split. Only used when
                ``check_train`` is truthy.
            check_train: Stored on ``self.check_train`` (and left there after
                the call); selects the training or inference form of
                Tokenize_Prompt.

        Returns:
            ``(train_data, val_data)`` when ``check_train`` is truthy
            (``val_data`` is None without a split), otherwise the single
            processed dataset.
        """
        row_window = f"train[{start_row}:{start_row + num_rows}]"
        if path_data.endswith((".json", ".jsonl")):
            data = load_dataset("json", data_files=path_data, split=row_window)
        else:
            # Request the same split/window here as well: without ``split``
            # load_dataset returns a dict of splits, which cannot be indexed
            # by row or shuffled/mapped like a single dataset below.
            data = load_dataset(path_data, split=row_window)

        # Check the schema rather than the first row, so an empty window does
        # not raise.
        if 'messages' in data.column_names:
            data = data.remove_columns('messages')

        # Tokenize_Prompt reads this flag while mapping.
        self.check_train = check_train

        if not check_train:
            return data.shuffle().map(self.Tokenize_Prompt)

        if val_set_size > 0:
            # Fixed seed for the split itself; the shuffles that follow are unseeded.
            train_val = data.train_test_split(
                test_size=val_set_size, shuffle=True, seed=42
            )
            train_data = train_val['train'].shuffle().map(self.Tokenize_Prompt)
            val_data = train_val['test'].shuffle().map(self.Tokenize_Prompt)
        else:
            train_data = data.shuffle().map(self.Tokenize_Prompt)
            val_data = None

        return train_data, val_data

        
            
    #Training Model Fucntion :
    def Training_Model(
        self,
        train_data, val_data = None,
        batch_size = 8,
        gradient_accumulation_steps = 2,
        num_epochs = 10,
        learning_rate = 1e-4,
        output_dir = 'Model_Output',
        resume_checkpoint = None,
        is_trainable = True # Forwarded to save_pretrained (intended to allow retraining)
    ):
        """Fine-tune the model returned by Load_Model and save the result.

        Args:
            train_data: Tokenized training dataset.
            val_data: Tokenized validation dataset, or None to train without
                per-epoch evaluation.
            batch_size: Per-device batch size for training and evaluation.
            gradient_accumulation_steps: Passed to TrainingArguments.
            num_epochs: Number of training epochs.
            learning_rate: Passed to TrainingArguments.
            output_dir: Checkpoint directory; the final model and tokenizer
                are saved to ``<output_dir>/ODIE_Model``.
            resume_checkpoint: Passed to ``trainer.train`` as
                ``resume_from_checkpoint``.
            is_trainable: Forwarded as a keyword argument to
                ``model.save_pretrained``.

        Returns:
            ``(model, trainer)``.
        """
        model = self.Load_Model()
        tokenizer = self.tokenizer

        print('================Information Model=====================')
        print('======================================================')

        def print_trainable_parameters_custom(model):
            """Fallback report for models without print_trainable_parameters."""
            trainable_params = [p for p in model.parameters() if p.requires_grad]
            total_params = sum(p.numel() for p in trainable_params)
            print(f"Number of trainable parameters: {total_params}")
            for name, param in model.named_parameters():
                if param.requires_grad:
                    print(f"{name}: {param.shape}")

        try:
            model.print_trainable_parameters()
        except AttributeError:
            print_trainable_parameters_custom(model)

        # Per-epoch evaluation and "load best model at end" both need a
        # validation set; switch them off when none was supplied so the
        # default val_data=None is usable.
        has_eval = val_data is not None

        trans_argu = transformers.TrainingArguments(
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_ratio=0.03,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            fp16=True,
            logging_steps=10,
            optim="adamw_torch",
            save_strategy="epoch",
            eval_strategy="epoch" if has_eval else "no",
            output_dir=output_dir,
            save_total_limit=2,  # maximum number of checkpoints kept on disk
            load_best_model_at_end=has_eval,
            gradient_checkpointing=True,
            weight_decay=0.01
        )

        # Pads each batch; label padding uses -100.
        data_collator = transformers.DataCollatorForSeq2Seq(
            tokenizer,
            pad_to_multiple_of=8,
            return_tensors="pt",
            padding=True,
            label_pad_token_id = -100
        )

        print('======================================================')

        trainer = transformers.Trainer(
            model=model,
            args=trans_argu,
            train_dataset=train_data,
            eval_dataset=val_data,
            tokenizer=tokenizer,
            data_collator=data_collator
        )

        model.config.use_cache = False

        # NOTE(review): the Trainer above was constructed with the pre-compile
        # model object; from here on only the local name ``model`` refers to
        # the compiled wrapper. Confirm whether compiling before building the
        # Trainer was intended.
        if torch.__version__ >= '2' :
            print('Compile Model with Torch :')
            print('======================================================')
            model = torch.compile(model)

        print('======================Training============================')
        trainer.train(resume_from_checkpoint=resume_checkpoint)

        print('========================Saving==============================')
        save_dir = os.path.join(output_dir, 'ODIE_Model')
        model.save_pretrained(save_dir, is_trainable = is_trainable)
        tokenizer.save_pretrained(save_dir)

        print('Training Model Completion!')
        return model, trainer
    
    def Get_Response(self, output: str) -> str:
        #Loading template : 
        with open(self.template_path) as fp:
            template = json.load(fp)
        return output.split(template["response_split"])[1].strip()

In [4]:
# Build the helper with its default settings: no old_output is passed, so it stays None.
# To start from a previously saved adapter, pass its directory as old_output, e.g.:
# path_old_output = '/kaggle/input/d/hoangtruongnlp/output-premodel/Model'
My_ODIE = ODIE()

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

**Get Data**

In [5]:
# Dataset files on Kaggle.
path_data_train = '/kaggle/input/data-nlp-2/Data_train/training_data.jsonl'
path_data_test = '/kaggle/input/data-nlp-2/Data_train/test.json'

# Training window: 4000 rows starting at row 2000, 400 of them held out for validation.
train_data, val_data = My_ODIE.Data_Processing(
    path_data=path_data_train,
    start_row=2000,
    num_rows=4000,
    val_set_size=400,
    check_train=True,
)

# Test window: 5 rows starting at row 20, kept as prompt / reference text pairs.
test_data = My_ODIE.Data_Processing(
    path_data=path_data_test,
    start_row=20,
    num_rows=5,
    val_set_size=0,
    check_train=False,
)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [6]:
# Print the summary (features and row count) of each split in turn.
for _split in (train_data, val_data, test_data):
    print(_split)

Dataset({
    features: ['instruction', 'domain', 'text', 'category', 'table', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3600
})
Dataset({
    features: ['instruction', 'domain', 'text', 'category', 'table', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 400
})
Dataset({
    features: ['table', 'difficulty', 'instruction', 'source', 'category', 'text', 'source_type', 'domain', 'input', 'output'],
    num_rows: 5
})


In [7]:
# Show the tokenized fields of the first training and validation rows.
for i in range(1):
    # Fetch the row once instead of materialising a whole column per field.
    train_row = train_data[i]
    print('Dataset Training :')
    print(train_row['input_ids'])
    print(train_row['attention_mask'])
    print(train_row['labels'])

    print()
    val_row = val_data[i]
    # Only the validation label here; the extra 'Dataset Training :' line
    # that used to follow it was a copy-paste leftover.
    print('Dataset Validation :')
    print(val_row['input_ids'])
    print(val_row['attention_mask'])
    print(val_row['labels'])

Dataset Training :
[1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29892, 3300, 2859, 411, 385, 1881, 393, 8128, 4340, 3030, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 5647, 1461, 278, 8018, 2472, 515, 278, 2183, 1426, 393, 5353, 267, 278, 25486, 322, 7037, 310, 23011, 519, 5864, 8974, 297, 11781, 278, 3186, 29915, 29879, 5864, 4225, 29889, 13, 13, 2277, 29937, 10567, 29901, 13, 29934, 264, 809, 519, 5864, 8974, 526, 263, 19328, 15678, 8569, 297, 16021, 5925, 29889, 9267, 5925, 15055, 505, 3902, 4395, 278, 1900, 3519, 363, 4023, 2264, 292, 23011, 519, 5864, 322, 967, 23633, 363, 1716, 278, 5177, 322, 12459, 29889, 512, 697, 6559, 29892, 5925, 414, 4392, 1312, 278, 28326, 4127, 310, 773, 4768, 290, 465, 304, 5706, 5864, 29889, 2688, 1476, 393, 773, 4768, 290, 465, 408, 263, 7601, 5864, 2752, 756, 1784, 29380, 322, 17407, 25486, 975, 13807, 21983, 309, 4084, 1379, 29889, 7280, 6559, 21103, 3598, 19030, 278, 7037,

In [8]:
# Train the LoRA adapter (My_ODIE was created without old_output, so this starts a fresh adapter).
model, trainer = My_ODIE.Training_Model(
    train_data,
    val_data,
    batch_size=4,
    num_epochs=10,
    learning_rate=1e-4,
    gradient_accumulation_steps=16,
    output_dir='Model_Output_Train_V2',
)



config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Training From First!
trainable params: 13,705,216 || all params: 1,113,753,600 || trainable%: 1.2305


2024-07-01 04:24:05.701918: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 04:24:05.702046: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 04:24:05.835321: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Compile Model with Torch :


  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to a



[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
0,0.1812,0.188827
1,0.1503,0.171136
2,0.12,0.166941
4,0.0834,0.179074
5,0.0664,0.192207
6,0.0481,0.213158
8,0.029,0.262217
9,0.024,0.272119








Training Model Completion!


In [12]:
# Testing: encode every test prompt on its own (one tensor per prompt) and move it to the GPU.
tokenizer = My_ODIE.tokenizer
inputs = [
    tokenizer(
        prompt,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=2048,
    ).input_ids.to('cuda')
    for prompt in test_data['input']
]

inputs

[tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29892,
           3300,  2859,   411,   385,  1881,   393,  8128,  4340,  3030, 29889,
          14350,   263,  2933,   393,  7128,  2486,  1614,  2167,   278,  2009,
          29889,    13,    13,  2277, 29937,  2799,  4080, 29901,    13,  2744,
          14997,   911,  1438, 16200,  8324,   304, 24809, 16200, 12554,  3335,
          29889,  7338,  1461,  8018,   848, 29892,  3704,   278,  2635,   310,
           2280, 29892, 16200,  8158, 29892, 16200,   297,  6578,  2722, 29892,
          24806,  5253, 29892,   322, 17346, 29889,    13,    13,  2277, 29937,
          10567, 29901,    13, 29896, 29889, 11639,  7075, 29892,  6345,   373,
           5490, 29871, 29896, 29892, 29871, 29896, 29929, 29947, 29945, 29892,
            756,   263, 16200,  8158,   310, 29871, 29955, 29906, 29900, 29892,
          23941,  1781, 16200, 29889,  3600, 16200,  4955,  7805, 24596,   277,
           9160,   319, 29892,  6496,   

In [10]:
# Sampling settings used for every generate() call below.
generation_config = GenerationConfig(do_sample=True, temperature=0.1, top_p=0.75, top_k=40, num_beams=4)

# Special-token ids: the model config and the tokenizer both use 0 for padding (unk).
model.config.pad_token_id = 0
tokenizer.pad_token_id = 0
model.config.bos_token_id = 1
model.config.eos_token_id = 2

In [15]:
# Generate one continuation per test prompt.
model.eval()
model.to('cuda')
outputs = []
with torch.no_grad():
    for inp in inputs:
        # Only `.sequences` is used afterwards, so per-step scores are not
        # requested (they would be kept for every generated token for nothing).
        generation_output = model.generate(
            input_ids=inp,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=1025,
        )
        outputs.append(generation_output.sequences)

In [16]:
# Display the generated token-id tensors collected by the generation loop.
outputs

[tensor([[    1, 13866,   338,  ..., 29945, 29900, 29900]], device='cuda:0'),
 tensor([[    1, 13866,   338,  ..., 29958,    13,  1678]], device='cuda:0'),
 tensor([[    1, 13866,   338,  ..., 29900, 29889, 29900]], device='cuda:0'),
 tensor([[    1, 13866,   338,  ...,    13,  1678,   529]], device='cuda:0'),
 tensor([[    1, 13866,   338,  ..., 29900, 29906, 29896]], device='cuda:0')]

In [21]:
# Decode each generated sequence (first row of every tensor) back to text, dropping special tokens.
decode = [
    tokenizer.decode(sequence[0], skip_special_tokens=True)
    for sequence in outputs
]

In [22]:
# Keep only the response part of every decoded text, using the prompt template's split marker.
decode_pr = [My_ODIE.Get_Response(text) for text in decode]

In [29]:
# Print every extracted model response followed by a separator line.
for response in decode_pr:
    print('Output : ', response, '=' * 80, sep='\n')

Output : 
| Credit Score | Date of Application | Credit Inquiries | Loan Amount | Balance |
| --- | --- | --- | --- | --- |
| 720 | January 1, 2005 | 1 | $10,000 | $3,000 |
| 720 | March 1, 2010 | 1 | $5,000 | $0 |
| 720 | June 1, 2012 | 1 | $20,000 | $5,000 |
| 720 | August 1, 2015 | 1 | $250,000 | $250,000 |
| 720 | September 1, 2006 | 1 | $10,000 | $10,000 |
| 720 | November 1, 2020 | 1 | $10,000 | $10,000 |
| 720 | July 10, 2015 | 1 | $300,000 | $250,000 |
| 680 | May 10, 1990 | 1 | $5,000 | $2,500 |
| 680 | July 1, 2012 | 1 | $5,000 | $2,500 |
| 680 | September 15, 2015 | 1 | $3,000 | $1,000 |
| 680 | January 1, 2005 | 1 | $15,000 | $2,000 |
| 680 | March 1, 2010 | 1 | $5,000 | $0 |
| 680 | June 1, 2012 | 1 | $20,000 | $5,000 |
| 680 | August 1, 2015 | 1 | $250,000 | $250,000 |
| 680 | September 1, 2006 | 1 | $10,000 | $10,000 |
| 680 | November 1, 2020 | 1 | $10,000 | $10,000 |
| 680 | July 10, 2015 | 1 | $300,000 | $250,000 |
| 620 | May 10, 1990 | 1 | $5,000 | $2,500 |
| 620 | 

In [27]:
# Print every reference table from the test set followed by a separator line.
for reference in test_data['output']:
    print('Output : ', reference, '=' * 80, sep='\n')

Output : 
| Name | Date | Credit Score | Credit Inquiries | Loan | Balance |
| --- | --- | --- | --- | --- | --- |
| John Smith | Jan 1,1985 | 720 | Auto Loan (Nov 1, 2020), Credit Card (Feb 15, 2022), Mortgage (Jul 10, 2015) | Credit Cards, Auto Loan, Mortgage, Student Loan | Cards: $3,000, Auto: $5,000, Mortgage: $250,000, Student: $10,000 |
| Emily Johnson | May 10, 1990 | 680 | Auto Loan, Credit Card, Personal Loan | Credit Cards, Personal Loan, Auto Loan | Cards: $3,500, Collection: $500 |
| Michael Thompson | N/A | 780 | Auto Loan Refinance, New Credit Card, Debt Consolidation Loan | Credit Cards, Auto Loan, Mortgage, Student Loan | Cards: $2,000, Auto: $10,000, Mortgage: $300,000, Student: $5,000 |
Output : 
| Delivery Date | Route | Vehicle Condition | Departure Time | Arrival Time | Travel Time |
| --- | --- | --- | --- | --- | --- |
| June 1, 2023 | Highway 1, Exit 5 to 10 | Intact | 9:00 AM | 11:30 AM | 2h 30m |
| June 10, 2023 | Interstate 95, Exit 15 to 20 | Intact | 8:30 