In [1]:
!pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from datasets import load_dataset, load_metric

datasets = load_dataset("mbpp", split='train+test')
datasets



  0%|          | 0/4 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 374
    })
    test: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 90
    })
    prompt: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 10
    })
})

In [4]:
datasets = datasets.remove_columns(["task_id", "test_list", "test_setup_code", "challenge_test_list"])
datasets = datasets.with_format("torch")
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'code'],
        num_rows: 374
    })
    test: Dataset({
        features: ['text', 'code'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['text', 'code'],
        num_rows: 90
    })
    prompt: Dataset({
        features: ['text', 'code'],
        num_rows: 10
    })
})

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-large")
model = T5ForConditionalGeneration.from_pretrained("t5-large")

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

In [6]:
def prepare_data(examples):
    text = examples['text']
    code = examples['code']
    
    model_inputs = tokenizer(text, padding="longest")
    labels = tokenizer(code, padding="longest").input_ids

    labels_with_ignore_index = []
    for labels_example in labels:
      labels_example = [label if label != 0 else -100 for label in labels_example]
      labels_with_ignore_index.append(labels_example)
    
    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs

In [7]:
tokenized_datasets = datasets.map(prepare_data, batched=True, remove_columns=datasets["train"].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 374
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 90
    })
    prompt: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})

In [9]:
labels = tokenized_datasets["train"]["labels"][0]
tokenizer.decode([label for label in labels if label != -100])

'class Pair(object): def __init__(self, a, b): self.a = a self.b = b def max_chain_length(arr, n): max = 0 mcl = [1 for i in range(n)] for i in range(1, n): for j in range(0, i): if (arr[i].a > arr[j].b and mcl[i] <unk> mcl[j] + 1): mcl[i] = mcl[j] + 1 for i in range(n): if (max <unk> mcl[i]): max = mcl[i] return max</s>'

In [40]:
from transformers import TrainingArguments

args = TrainingArguments(
    "trainer",
    evaluation_strategy = "epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [41]:
from transformers import default_data_collator

data_collator = default_data_collator

In [42]:
from transformers import Trainer

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [43]:
import torch, gc
torch.cuda.empty_cache()
gc.collect()

438

In [44]:
trainer.train()

***** Running training *****
  Num examples = 374
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1880
  Number of trainable parameters = 737668096


Epoch,Training Loss,Validation Loss
1,No log,1.599811
2,No log,1.592549
3,No log,1.608108
4,No log,1.629815
5,No log,1.629986
6,0.734000,1.652756
7,0.734000,1.669353
8,0.734000,1.642531
9,0.734000,1.649876
10,0.734000,1.678816


***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
Saving model checkpoint to trainer/checkpoint-500
Configuration saved in trainer/checkpoint-500/config.json
Model weights saved in trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in trainer/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
***** Running Evaluation *****
  Num examples = 90
  Batch size 

TrainOutput(global_step=1880, training_loss=0.6330567055560173, metrics={'train_runtime': 861.0162, 'train_samples_per_second': 8.687, 'train_steps_per_second': 2.183, 'total_flos': 1802909306880000.0, 'train_loss': 0.6330567055560173, 'epoch': 20.0})

In [45]:
trainer.save_model("test-t5")

Saving model checkpoint to test-t5
Configuration saved in test-t5/config.json
Model weights saved in test-t5/pytorch_model.bin
tokenizer config file saved in test-t5/tokenizer_config.json
Special tokens file saved in test-t5/special_tokens_map.json


In [46]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("test-t5")

loading configuration file test-t5/config.json
Model config T5Config {
  "_name_or_path": "test-t5",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": tr

In [50]:
# predictions = []
examples = datasets["test"]["text"]
# for example in examples:
#     input_ids = tokenizer(example, return_tensors='pt').input_ids
#     outputs = model.generate(input_ids, do_sample=True, max_new_tokens=100)
#     predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [54]:
input_ids = tokenizer(examples[0], return_tensors='pt').input_ids
outputs = model.generate(input_ids, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

def remove_char(str1): count = 0 for i in range(len(str1)): count += 1 return count


In [55]:
datasets["test"]["text"][0]

'Write a python function to remove first and last occurrence of a given character from the string.'

In [63]:
print(datasets["test"]["code"][0])

def remove_Occ(s,ch): 
    for i in range(len(s)): 
        if (s[i] == ch): 
            s = s[0 : i] + s[i + 1:] 
            break
    for i in range(len(s) - 1,-1,-1):  
        if (s[i] == ch): 
            s = s[0 : i] + s[i + 1:] 
            break
    return s 


In [57]:
input_ids = tokenizer(examples[1], return_tensors='pt').input_ids
outputs = model.generate(input_ids, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

def ascend_matrix_sum(matrix,n): return (matrix[n] = sum(n - 1))


In [58]:
datasets["test"]["text"][1]

'Write a function to sort a given matrix in ascending order according to the sum of its rows.'

In [64]:
print(datasets["test"]["code"][1])

def sort_matrix(M):
    result = sorted(M, key=sum)
    return result


In [60]:
input_ids = tokenizer(examples[2], return_tensors='pt').input_ids
outputs = model.generate(input_ids, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

def max_common_words(dict): max_common_words= 0 dict.count(dict) for idx in range(1, idx, n): if idx >= n: max_common_words = n idx += n idx if idx = n: max_common_words += 1 dict.count(dict) return max_compos


In [61]:
datasets["test"]["text"][2]

'Write a function to count the most common words in a dictionary.'

In [65]:
print(datasets["test"]["code"][2])

from collections import Counter
def count_common(words):
  word_counts = Counter(words)
  top_four = word_counts.most_common(4)
  return (top_four)

