In [1]:
pip install datasets transformers rouge nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
import transformers
from datasets import load_dataset, load_metric
from transformers import TrainingArguments

In [4]:
args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [5]:
import pandas as pd

In [6]:
df  = pd.read_csv("data_pairs_2.csv").drop(['title'],axis=1)
df.to_csv("data_pairs.csv",index=False)

In [7]:
medium_datasets = load_dataset("csv",data_files="data_pairs.csv")
medium_datasets



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-7d2fa71ec5b76a1e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-7d2fa71ec5b76a1e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['markdown', 'code'],
        num_rows: 7572
    })
})

In [8]:
datasets_train_test = medium_datasets["train"].train_test_split(test_size=750)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=750)

medium_datasets["train"] = datasets_train_validation["train"]
medium_datasets["validation"] = datasets_train_validation["test"]
medium_datasets["test"] = datasets_train_test["test"]

In [9]:
medium_datasets["train"] = medium_datasets["train"].shuffle()
medium_datasets["validation"] = medium_datasets["validation"]
medium_datasets["test"] = medium_datasets["test"].shuffle()

In [10]:
pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import nltk
#nltk.download('punkt')
import string
from transformers import AutoTokenizer, PLBartForConditionalGeneration, PLBartTokenizer


tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")

Downloading:   0%|          | 0.00/986k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/783 [00:00<?, ?B/s]

In [12]:
medium_datasets_cleaned = medium_datasets.filter(
    lambda example: (len(example['markdown']) <= 512) and
    (len(example['code']) <= 512)
)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
medium_datasets_cleaned

DatasetDict({
    train: Dataset({
        features: ['markdown', 'code'],
        num_rows: 5612
    })
    validation: Dataset({
        features: ['markdown', 'code'],
        num_rows: 697
    })
    test: Dataset({
        features: ['markdown', 'code'],
        num_rows: 683
    })
})

In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
prefix = "summarize: "
max_input_length = 512
max_target_length = 64

def clean_text(code):
  sentences = nltk.sent_tokenize(code.strip())
  sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]

  text_cleaned = "\n".join(sentences_cleaned)
  return text_cleaned

def preprocess_data(examples):
  texts_cleaned = [clean_text(text) for text in examples["code"]]
  inputs = [prefix + code for code in texts_cleaned]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["markdown"], max_length=max_target_length, 
                       truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [16]:
tokenized_datasets = medium_datasets_cleaned.map(preprocess_data,batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['markdown', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5612
    })
    validation: Dataset({
        features: ['markdown', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 697
    })
    test: Dataset({
        features: ['markdown', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 683
    })
})

In [18]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,Seq2SeqTrainingArguments, Seq2SeqTrainer

In [19]:
batch_size = 8

model_name = "uclanlp/plbart-python-en_XX"
model_dir = "https://huggingface.co/uclanlp/plbart-python-en_XX"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [22]:
pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=2d0baf2450c7e948a31ba3d8bac1c23b4ece97220eb5fe3b4f1e29a0baf623dd
  Stored in directory: /root/.cache/pip/wheels/84/ac/6b/38096e3c5bf1dc87911e3585875e21a3ac610348e740409c76
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [23]:
metric = load_metric("rouge")

In [24]:

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [25]:
plbart_model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX")

Downloading:   0%|          | 0.00/557M [00:00<?, ?B/s]

In [29]:
def model_init():
    return PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX")

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=Seq2SeqTrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch"),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--uclanlp--plbart-python-en_XX/snapshots/48bf6e4889bdb9bafd12381a4e9a9a1e0fe224eb/config.json
Model config PLBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "PLBartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "dropout": 0.1,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eo

In [30]:
from huggingface_hub import list_repo_files

list_repo_files(repo_id= "uclanlp/plbart-python-en_XX")

['.gitattributes',
 'config.json',
 'pytorch_model.bin',
 'sentencepiece.bpe.model']

In [None]:
import torch
torch.cuda.empty_cache()
import gc
del variables
gc.collect()

In [None]:
trainer.train()