In [2]:
!tar -xf input.zip
!tar -xf inference_data.zip

In [3]:
import torch
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

MODEL = 't5-small'
BATCH_SIZE = 48
NUM_PROCS = 16
EPOCHS = 10
OUT_DIR = 'results_t5small'
MAX_LENGTH = 256

dataset_train = load_dataset(
    'csv', 
    data_files='input/train.csv',
    split='train',
    nrows=20000
)
dataset_valid = load_dataset(
    'csv', 
    data_files='input/valid.csv',
    split='train',
    nrows=5000
)
print(len(dataset_train))
print(len(dataset_valid))

OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\Admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

In [48]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)
def preprocess_function(examples):
    inputs = [f"assign tag: {title} {body}" for (title, body) in zip(examples['Title'], examples['Body'])]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
    cleaned_tag = [' '.join(''.join(tag.split('<')).split('>')[:-1]) for tag in examples['Tags']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            cleaned_tag,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )
 
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [49]:
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

Map (num_proc=16):   0%|          | 0/20000 [00:00<?, ? examples/s]



Map (num_proc=16):   0%|          | 0/5000 [00:00<?, ? examples/s]



In [50]:
trainable_params=[

    'decoder.block.1.layer.2.DenseReluDense.wi.weight',
    'decoder.block.1.layer.2.DenseReluDense.wo.weight',
    'decoder.block.1.layer.2.layer_norm.weight',

    'decoder.block.2.layer.2.DenseReluDense.wi.weight',
    'decoder.block.2.layer.2.DenseReluDense.wo.weight',
    'decoder.block.2.layer.2.layer_norm.weight',
    
    'decoder.block.3.layer.2.DenseReluDense.wi.weight',
    'decoder.block.3.layer.2.DenseReluDense.wo.weight',
    'decoder.block.3.layer.2.layer_norm.weight',
   
    'decoder.block.4.layer.2.DenseReluDense.wi.weight',
    'decoder.block.4.layer.2.DenseReluDense.wo.weight',
    'decoder.block.4.layer.2.layer_norm.weight',
    
    'decoder.block.5.layer.0.SelfAttention.q.weight',
    'decoder.block.5.layer.0.SelfAttention.k.weight',
    'decoder.block.5.layer.0.SelfAttention.v.weight',
    'decoder.block.5.layer.0.SelfAttention.o.weight',
    'decoder.block.5.layer.0.layer_norm.weight',
    'decoder.block.5.layer.2.DenseReluDense.wi.weight',
    'decoder.block.5.layer.2.DenseReluDense.wo.weight',
    'decoder.block.5.layer.2.layer_norm.weight',
    'decoder.final_layer_norm.weight'
]

In [51]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
for name, param in model.named_parameters():
    if name not in trainable_params: # choose whatever you like here
        param.requires_grad = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device is {device}")
model.to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")


Device is cuda
60,506,624 total parameters.
11,537,920 training parameters.


In [52]:
training_args=TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=5,
    report_to='tensorboard',
    learning_rate=0.0001,
    fp16=True,
    dataloader_num_workers=4
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid
)

Detected kernel version 4.14.344, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [53]:
history = trainer.train()

Step,Training Loss,Validation Loss
500,0.1526,0.12826
1000,0.107,0.093065
1500,0.0925,0.082335
2000,0.0858,0.077966
2500,0.085,0.07527
3000,0.0809,0.073637
3500,0.0795,0.072637
4000,0.0832,0.072171


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [60]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = './results_t5small/checkpoint-4000'
model = T5ForConditionalGeneration.from_pretrained(model_path)
#tokenizer = T5Tokenizer.from_pretrained('results_t5small')

In [61]:
def do_correction(text, model, tokenizer):
    input_text = f"assign tag: {text}"
    inputs= tokenizer.encode(
        input_text,
        return_tensors='pt',
        max_length=256,
        padding='max_length',
        truncation=True
    )
    corrected_ids = model.generate(
        inputs,
        max_length=256,
        num_beams=5, # `num_beams=1` indicated temperature sampling.
        early_stopping=True
    )
    corrected_sentence = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=True
    )
    return corrected_sentence

In [62]:
import os
for file in os.listdir('inference_data/'):
    f = open(f"inference_data/{file}", 'r')
    sentence = f.read()
    corrected_sentence = do_correction(sentence, model, tokenizer)
    print(f"QUERY: {sentence}\nTAGS: {corrected_sentence}")
    print('-'*80)

QUERY: Repeat Task Every Random Seconds <p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print \"Hello World\" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated

TAGS: java jquery
--------------------------------------------------------------------------------
QUERY: I have a C++ program that I compile on Mac OS 13.4.1 using Cmake

One of my users has the following error: dyld: cannot load 'my_program' (load command 0x80000034 is unknown)

I have no idea why he has this error message, he is on Mac OS 10.14.6 and we both use an Intel Mac

Here are some information about the binary that might be useful:

otool -L my_program
my_program:
        /System/Library/Frameworks/OpenCL.framework/Versions/A/OpenCL (compatibility version 1.0.0, current version 1.0.0)
        /usr/lib/libc++.1.dylib (compatibilit

IsADirectoryError: [Errno 21] Is a directory: 'inference_data/.ipynb_checkpoints'