In [2]:
import json
import tensorflow as tf
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
import datasets
from datasets import load_dataset
from transformers import Seq2SeqTrainer,Seq2SeqTrainingArguments

In [3]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM

model_checkpoint = "el-profesor/bert_small_seq2seq"
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_config(config)

Downloading:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

In [None]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip

In [None]:
!pip install rouge_score

In [None]:
!unzip python.zip

In [4]:
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

columns_short_list = ['code_tokens', 'docstring_tokens', 
                      'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

In [5]:
from pathlib import Path

python_files = sorted(Path('./python/').glob('**/*.gz'))
pydf = jsonl_list_to_dataframe(python_files)

In [6]:
from transformers import PreTrainedTokenizerFast
fast_tokenizer = Tokenizer.from_pretrained(model_checkpoint)
tokenizer = PreTrainedTokenizerFast(tokenizer_object=fast_tokenizer)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [7]:
train_df = pd.DataFrame(pydf[pydf['partition']=='train'],columns=['code','docstring'])
train_df.to_csv('train.csv',index=False)
valid_df = pd.DataFrame(pydf[pydf['partition']=='valid'],columns=['code','docstring'])
valid_df.to_csv('valid.csv',index=False)
test_df = pd.DataFrame(pydf[pydf['partition']=='test'],columns=['code','docstring'])
test_df.to_csv('test.csv',index=False)

In [8]:
data_files = {"train": "train.csv","valid":"valid.csv", "test": "test.csv"}
dataset = load_dataset(path = '/kaggle/working',data_files=data_files)

Downloading and preparing dataset csv/working to /root/.cache/huggingface/datasets/csv/working-c1f91967e6fdd5c6/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/working-c1f91967e6fdd5c6/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# remove docstring from code
def clean_code_column(examples):
    list_ = []
    for e in examples['code']:
        eg = e
        triple_double = eg.split('"""')
        if len(triple_double)==3:
            eg = triple_double[0]+triple_double[-1]
        triple_single = eg.split("'''")
        if len(triple_single)==3:
            eg = triple_single[0]+triple_single[-1]
        single_double = eg.split('"')
        if len(single_double)==3:
            eg = single_double[0]+single_double[-1]
        single_single = eg.split("'")
        if len(single_single)==3:
            eg = single_single[0]+single_single[-1]
        list_.append(eg)
    examples['code']=list_
    return examples
dataset = dataset.map(clean_code_column, batched=True)

  0%|          | 0/413 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

In [10]:
encoder_max_length=512
decoder_max_length=128

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["code"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["docstring"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

In [11]:
batch_size=1000

train_data = dataset['train'].map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["code", "docstring"]
)
val_data = dataset['valid'].map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["code", "docstring"]
)

  0%|          | 0/413 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

In [13]:
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=2d7e54e6b4b85a0ecb294c46287bf8c5b10bf6d3a046709705720f69da708732
  Stored in directory: /root/.cache/pip/wheels/84/ac/6b/38096e3c5bf1dc87911e3585875e21a3ac610348e740409c76
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0m

In [14]:
batch_size=64
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True, 
    output_dir="./",
    logging_steps=200,
    save_steps=200,
    eval_steps=10000,
    warmup_steps=2000,
    weight_decay=0.01,
    save_total_limit=3,
)

rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 412178
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 19323
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
10000,0.0014,0.002874,0.0001,0.0004,0.0001


Saving model checkpoint to ./checkpoint-200
Configuration saved in ./checkpoint-200/config.json
Model weights saved in ./checkpoint-200/pytorch_model.bin
Saving model checkpoint to ./checkpoint-400
Configuration saved in ./checkpoint-400/config.json
Model weights saved in ./checkpoint-400/pytorch_model.bin
Saving model checkpoint to ./checkpoint-600
Configuration saved in ./checkpoint-600/config.json
Model weights saved in ./checkpoint-600/pytorch_model.bin
Saving model checkpoint to ./checkpoint-800
Configuration saved in ./checkpoint-800/config.json
Model weights saved in ./checkpoint-800/pytorch_model.bin
Deleting older checkpoint [checkpoint-200] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkpoint-400] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1200
Configuration saved in ./checkpoint-12

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))
