In [None]:
!pip install torch peft transformers trl datasets bitsandbytes accelerate ipywidgets

from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
from datasets import load_dataset
import torch
import os

# Never commit access tokens: take HF_TOKEN from the environment and fall back
# to an interactive prompt so the secret stays out of the notebook source.
# NOTE(review): the token that used to be hardcoded here is exposed and should be revoked.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    entered_token = getpass("Hugging Face access token (leave empty to skip): ")
    if entered_token:
        os.environ["HF_TOKEN"] = entered_token

dataset = load_dataset("tminh2003/extract-paper", split="train")
print(dataset)

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 2
})


In [None]:
# Display the loaded dataset's summary as the cell output.
dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 2
})

In [None]:
def generate_prompt(data_point):
    """Build one instruction/input/response training prompt per example.

    Args:
        data_point: A batch mapping with the keys ``"instruction"``,
            ``"input"`` and ``"output"``, each holding a list with one entry
            per example.

    Returns:
        A list of prompt strings, one per example, in batch order. An empty
        batch yields an empty list.
    """
    prompts = []
    columns = zip(data_point["instruction"], data_point["input"], data_point["output"])
    for instruction, input_text, output in columns:
        # Each example gets its own prompt built from its own fields. The line
        # breaks and indentation inside the template are part of the prompt
        # text, so they are kept exactly as written.
        prompts.append(f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
                ### Instruction:
                {instruction}
                ### Input:
                {input_text}
                ### Response:
                {output}""")
    return prompts

In [None]:

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# Quantization settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Load the base causal language model with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Load the tokenizer for the same checkpoint and reuse its EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def extract_table(query, model):
    """Generate a completion for ``query`` with ``model`` and print both.

    Args:
        query: Prompt text to feed to the model.
        model: Causal language model exposing ``generate``; it is switched to
            eval mode for the call and restored to its previous mode afterwards.

    Returns:
        None. The prompt and the newly generated text are printed.
    """
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(
        max_new_tokens=300,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Generate in eval mode (so dropout is inactive) and without autograd
    # bookkeeping, then put the model back in the mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                input_ids=encoding.input_ids,
                # Pass the mask explicitly: pad and EOS share an id here, so it
                # cannot be inferred reliably from the input ids.
                attention_mask=encoding.attention_mask,
                generation_config=generation_config,
            )
    finally:
        model.train(was_training)

    # Drop the prompt by token count rather than by character count: decoding
    # the prompt tokens is not guaranteed to reproduce ``query`` exactly.
    prompt_length = encoding.input_ids.shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print('INPUT\n', query, '\n\nOUTPUT\n', completion)
extract_table("hi", model)

INPUT
 hi 

OUTPUT
 ện tại.

Để được được hỏi đáp, bạn cần có được được hỏi đáp của bạn.

Để được hỏi đáp, bạn cần có được hỏi đáp của bạn.

Để được hỏi đáp, bạn cần có được hỏi đáp của bạn.

Để được hỏi đáp, bạn cần có được hỏi đáp của bạn.

Để được hỏi đáp, bạn cần có được hỏi đáp của bạn.

Để được hỏi đáp, bạn cần có được hỏi đáp của bạn.




In [None]:
# Low-rank adaptation (LoRA) setup

# Prepare the quantized base model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

lora_rank = 32
lora_alpha = 32  # LoRA scales the weight update by lora_alpha / lora_rank; equal values remove the scaling
lora_dropout = 0.05

peft_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",  # 'none' trains only the weight parameters, not the biases
    task_type="CAUSAL_LM",
)

# Wrap the prepared model with the LoRA adapters described by peft_config.
peft_model = get_peft_model(model, peft_config)

In [None]:
# Setting training arguments

output_dir = "tminh2003/extract-paper/tinyllama"  # Local directory for checkpoints (push_to_hub is disabled below)
per_device_train_batch_size = 3
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy = "steps"  # Save a checkpoint every `save_steps` optimizer steps
save_steps = 10
logging_steps = 10
learning_rate = 2e-3
max_grad_norm = 0.3  # Sets limit for gradient clipping
max_steps = 200      # Number of training steps
warmup_ratio = 0.03  # Portion of steps used for learning_rate to warmup from 0
lr_scheduler_type = "cosine"  # I chose cosine to avoid learning plateaus

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    # Forward the strategy defined above instead of relying on the library default.
    save_strategy=save_strategy,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    #push_to_hub=True,
    report_to='none'
)

In [None]:
# Supervised fine-tuning: each batch is turned into prompt strings by generate_prompt.
trainer = SFTTrainer(
    model=peft_model,
    args=training_arguments,
    train_dataset=dataset,
    formatting_func=generate_prompt,
    tokenizer=tokenizer,
    max_seq_length=500,
    peft_config=peft_config,
)
# Disable use_cache on the model config before training starts.
peft_model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,0.8784
20,0.1951
30,0.04
40,0.0159
50,0.0107
60,0.0042
70,0.0005
80,0.0002
90,0.0001
100,0.0001




TrainOutput(global_step=200, training_loss=0.057293216279649645, metrics={'train_runtime': 77.8514, 'train_samples_per_second': 15.414, 'train_steps_per_second': 2.569, 'total_flos': 623410790400000.0, 'train_loss': 0.057293216279649645, 'epoch': 200.0})

In [None]:
# Inference prompt for the fine-tuned model. It follows the same layout as the
# template in generate_prompt (including the leading indentation of each line)
# and ends right after "### Response:" so the model supplies the completion.
txt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
                ### Instruction:
                Extract a potential table from the input text, if there is no table, output NO TABLE
                ### Input:
                2 Elemental distribution in diffusion layer [95]. Area Ti Co Cr Fe Ni Mo A 0.69 23.63 26.75 23.64 20.81 4.58 B 62.86 11.64 3.86 8.97 12.22 0.44 C 35.71 5.60 38.89 11.70 2.56 5.51 D 46.98 4.04 34.74 8.39 2.19 3.66 E 24.79 9.46 38.39 15.73 4.82 6.81 F 38.99 7.93 31.71 13.22 3.88 4.26 Wang et al. [96] fabricated Al-HEA MMC by PM and SPS method with 10, 20, and 30 vol.% of CuZrAlTiNiW HEA to study the mechanical and corrosive properties. Fig. 38 displays the XRD patterns of the Al-HEA and pure Al composites that were SPS-ed. In addition to the BCC phase and Al matrix, the primary constituents of the composites are the ordered BCC phase, or B2 phase, which is formed in situ, the WAl12 intermetallic compound, and a few other phases that are not yet known. The contents of the B2 and WAl12 phases rise in tandem with increasing HEA volume fractions. It implies that during the sintering process, certain main components of HEA may react with the Al matrix, causing the partial BCC phase to separate and further resulting in the in-situ formation of new phases. A high relative density is achieved in composites, which results in fewer defects or cracks and fewer pores in the material. In HEA 10 wt.% Al composite (Fig. 39 (a)) bright-white particle distribution seen in black matrix phase (V) with less than 16 µm size. The composites show an egg-core-shell structure which is easily visible with the
                ### Response:
                """

# Print the prompt and the completion produced by the LoRA-adapted model.
extract_table(txt, peft_model)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


INPUT
 Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
                ### Instruction:
                Extract a potential table from the input text, if there is no table, output NO TABLE
                ### Input:
                2 Elemental distribution in diffusion layer [95]. Area Ti Co Cr Fe Ni Mo A 0.69 23.63 26.75 23.64 20.81 4.58 B 62.86 11.64 3.86 8.97 12.22 0.44 C 35.71 5.60 38.89 11.70 2.56 5.51 D 46.98 4.04 34.74 8.39 2.19 3.66 E 24.79 9.46 38.39 15.73 4.82 6.81 F 38.99 7.93 31.71 13.22 3.88 4.26 Wang et al. [96] fabricated Al-HEA MMC by PM and SPS method with 10, 20, and 30 vol.% of CuZrAlTiNiW HEA to study the mechanical and corrosive properties. Fig. 38 displays the XRD patterns of the Al-HEA and pure Al composites that were SPS-ed. In addition to the BCC phase and Al matrix, the primary constituents of the composites are the ordered BCC phase, or 