In [1]:
#!wget "https://www.dropbox.com/scl/fi/525gv6tmdi3n32mipo6mr/input.zip?rlkey=5jdsxahphk2ped5wxbxnv0n4y&dl=1" -O input.zip
 
#!unzip input.zip
# https://learnopencv.com/fine-tuning-t5/#results

In [2]:
import torch
 
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
torch.cuda.empty_cache()

2024-06-08 17:43:26.024549: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-08 17:43:26.053475: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
MODEL = 'google-t5/t5-small'
BATCH_SIZE =  2 #48
NUM_PROCS = 16
EPOCHS = 5
OUT_DIR = 't5-small-tag-gen'
MAX_LENGTH = 256 # Maximum context length to consider while preparing dataset

In [4]:
dataset_train = load_dataset(
    'csv',
    data_files='input/train.csv',
    split='train'
)
dataset_valid = load_dataset(
    'csv',
    data_files='input/valid.csv',
    split='train'
)

In [5]:
dataset_train.info.features
dataset_train.info.splits["train"].num_examples


45000

In [6]:
examples = dataset_train.take(1)
ex=next(iter(examples))
ex

{'Id': 34552656,
 'Title': 'Java: Repeat Task Every Random Seconds',
 'Body': '<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n',
 'Tags': '<java><repeat>',
 'CreationDate': '2016-01-01 00:21:59',
 'Y': 'LQ_CLOSE'}

In [7]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
	
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    inputs = [f"assign tag: {title} {body}" for (title, body) in zip(examples['Title'], examples['Body'])]
    print(inputs)
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
    # Set up the tokenizer for targets
    cleaned_tag = [' '.join(''.join(tag.split('<')).split('>')[:-1]) for tag in examples['Tags']]
   
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            cleaned_tag,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )
 
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [21]:
print(preprocess_function(examples))

['assign tag: Java: Repeat Task Every Random Seconds <p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n']
{'input_ids': [[12317, 7860, 10, 10318, 10, 20469, 16107, 2181, 25942, 5212, 7, 3, 2, 102, 3155, 196, 31, 51, 641, 3324, 28, 6103, 53, 4145, 334, 3, 29, 3978, 57, 338, 10318, 5, 13780, 5, 13368, 52, 11, 10318, 5, 13780, 5, 13368, 52, 382, 9, 7, 157, 5, 299, 8857, 497, 27, 241, 12, 2281, 96, 566, 7126, 1150, 121, 12, 8, 8990, 334, 6504, 3978, 45, 209, 4525, 5, 4877, 27, 31, 51, 16, 3, 9, 720, 13, 3, 9, 10505, 11, 278, 31, 17, 43, 136, 1081, 12, 504, 78, 623, 5, 2372, 199, 133, 36, 3, 9, 2246, 4915, 1054, 5, 3, 2, 87, 102, 3155, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0



In [10]:
# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

In [11]:
torch.cuda.empty_cache()

model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())

print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

60,506,624 total parameters.
60,506,624 training parameters.


In [12]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=5,
    report_to='tensorboard',
    learning_rate=0.0001,
    fp16=True,
    dataloader_num_workers=4
)
 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)



In [13]:
history = trainer.train()

  0%|          | 0/112500 [00:00<?, ?it/s]

{'loss': 23.8884, 'grad_norm': 143.89828491210938, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 23.2158, 'grad_norm': nan, 'learning_rate': 2.6e-06, 'epoch': 0.0}
{'loss': 22.3071, 'grad_norm': 115.59175109863281, 'learning_rate': 4.6e-06, 'epoch': 0.0}
{'loss': 21.5628, 'grad_norm': 216.89859008789062, 'learning_rate': 6.6e-06, 'epoch': 0.0}
{'loss': 20.4574, 'grad_norm': 290.817138671875, 'learning_rate': 8.599999999999999e-06, 'epoch': 0.0}
{'loss': 18.6055, 'grad_norm': 327.2432861328125, 'learning_rate': 1.06e-05, 'epoch': 0.0}
{'loss': 17.2848, 'grad_norm': 124.83826446533203, 'learning_rate': 1.2600000000000001e-05, 'epoch': 0.0}
{'loss': 15.1248, 'grad_norm': 158.93531799316406, 'learning_rate': 1.4599999999999999e-05, 'epoch': 0.0}
{'loss': 12.3172, 'grad_norm': 93.06366729736328, 'learning_rate': 1.66e-05, 'epoch': 0.0}
{'loss': 9.9199, 'grad_norm': 108.13162994384766, 'learning_rate': 1.86e-05, 'epoch': 0.0}
{'loss': 7.6012, 'grad_norm': 103.59687042236328

  0%|          | 0/7500 [00:00<?, ?it/s]

{'eval_loss': 0.10462861508131027, 'eval_runtime': 2021.6771, 'eval_samples_per_second': 7.42, 'eval_steps_per_second': 3.71, 'epoch': 0.02}
{'loss': 0.1029, 'grad_norm': 0.5681705474853516, 'learning_rate': 9.999732142857143e-05, 'epoch': 0.02}
{'loss': 0.1134, 'grad_norm': 0.6650452613830566, 'learning_rate': 9.998839285714286e-05, 'epoch': 0.02}
{'loss': 0.1388, 'grad_norm': 0.7985198497772217, 'learning_rate': 9.997946428571428e-05, 'epoch': 0.02}
{'loss': 0.1025, 'grad_norm': 0.5626926422119141, 'learning_rate': 9.997053571428572e-05, 'epoch': 0.02}
{'loss': 0.099, 'grad_norm': 0.7779288291931152, 'learning_rate': 9.996160714285715e-05, 'epoch': 0.02}
{'loss': 0.1204, 'grad_norm': 0.3318697214126587, 'learning_rate': 9.995267857142857e-05, 'epoch': 0.02}
{'loss': 0.106, 'grad_norm': 0.7802842259407043, 'learning_rate': 9.994375e-05, 'epoch': 0.03}
{'loss': 0.1116, 'grad_norm': 1.209157943725586, 'learning_rate': 9.993482142857144e-05, 'epoch': 0.03}
{'loss': 0.0846, 'grad_norm': 0

  0%|          | 0/7500 [00:00<?, ?it/s]

{'eval_loss': 0.08427439630031586, 'eval_runtime': 2021.7456, 'eval_samples_per_second': 7.419, 'eval_steps_per_second': 3.71, 'epoch': 0.04}
{'loss': 0.0702, 'grad_norm': 0.22672660648822784, 'learning_rate': 9.955089285714287e-05, 'epoch': 0.04}
{'loss': 0.0847, 'grad_norm': 4.943324089050293, 'learning_rate': 9.95419642857143e-05, 'epoch': 0.05}
{'loss': 0.0862, 'grad_norm': 0.2861299216747284, 'learning_rate': 9.953303571428572e-05, 'epoch': 0.05}
{'loss': 0.1101, 'grad_norm': 0.2224818766117096, 'learning_rate': 9.952410714285715e-05, 'epoch': 0.05}
{'loss': 0.0858, 'grad_norm': 1.0967695713043213, 'learning_rate': 9.951517857142858e-05, 'epoch': 0.05}
{'loss': 0.1047, 'grad_norm': 0.14748026430606842, 'learning_rate': 9.950625e-05, 'epoch': 0.05}
{'loss': 0.0915, 'grad_norm': 0.37190258502960205, 'learning_rate': 9.949732142857144e-05, 'epoch': 0.05}
{'loss': 0.0883, 'grad_norm': 0.2634839415550232, 'learning_rate': 9.948839285714286e-05, 'epoch': 0.05}
{'loss': 0.1024, 'grad_nor

  0%|          | 0/7500 [00:00<?, ?it/s]

{'eval_loss': 0.07876913994550705, 'eval_runtime': 2021.6934, 'eval_samples_per_second': 7.42, 'eval_steps_per_second': 3.71, 'epoch': 0.07}
{'loss': 0.0922, 'grad_norm': 0.28160831332206726, 'learning_rate': 9.910446428571429e-05, 'epoch': 0.07}
{'loss': 0.1038, 'grad_norm': 0.4383435547351837, 'learning_rate': 9.909553571428572e-05, 'epoch': 0.07}
{'loss': 0.0948, 'grad_norm': 0.28863248229026794, 'learning_rate': 9.908660714285714e-05, 'epoch': 0.07}
{'loss': 0.1246, 'grad_norm': 0.6986559629440308, 'learning_rate': 9.907767857142858e-05, 'epoch': 0.07}
{'loss': 0.085, 'grad_norm': 0.4569411277770996, 'learning_rate': 9.906875e-05, 'epoch': 0.07}
{'loss': 0.0594, 'grad_norm': 0.2801000773906708, 'learning_rate': 9.905982142857143e-05, 'epoch': 0.07}
{'loss': 0.076, 'grad_norm': 0.23465761542320251, 'learning_rate': 9.905089285714286e-05, 'epoch': 0.07}
{'loss': 0.0946, 'grad_norm': 0.48102355003356934, 'learning_rate': 9.90419642857143e-05, 'epoch': 0.07}
{'loss': 0.1149, 'grad_norm

  0%|          | 0/7500 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
tokenizer.save_pretrained(OUT_DIR)

('t5-small-tag-gen/tokenizer_config.json',
 't5-small-tag-gen/special_tokens_map.json',
 't5-small-tag-gen/spiece.model',
 't5-small-tag-gen/added_tokens.json')

In [17]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model_path = 't5-small-tag-gen/checkpoint-1500'  # the path where you saved your model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained('t5-small-tag-gen')



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:

	
def do_correction(text, model, tokenizer):
    input_text = f"assign tag: {text}"
    inputs = tokenizer.encode(
        input_text,
        return_tensors='pt',
        max_length=256,
        padding='max_length',
        truncation=True
    )
 
    # Get correct sentence ids.
    corrected_ids = model.generate(
        inputs,
        max_length=256,
        num_beams=5, # `num_beams=1` indicated temperature sampling.
        early_stopping=True
    )
 
    # Decode.
    corrected_sentence = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=True
    )
    return corrected_sentence

In [27]:
do_correction("""
              How to get all the child records from different tables based on given parent ID in sql server,"I am having 4 different tables like 
select * from System 
 
select * from Set 
select * from Item 
select * from Versions 

Now for each system Id there will be **n no.of Sets**, and foe **each set** there qill be **n no. of Items** and for **each item** there will be **n no.of Versions**.

**each system has  <br/>
n no of set <br/>
each Set has <br/>
n no of Items <br/>
each Item has <br/>
n no of Versions**


So, Now when i give **SystemId** then i have to retrieve all the records from 

**Set and Items of each set and Versions of each Items** in single storedprocedure."
              
              """,model,tokenizer)

'sql-server'