In [1]:
!ls

finetune_model.py  run_language_modeling.py  run_model.ipynb  runs  wandb


In [39]:
# Model paths
MODEL_TYPE = "gpt2" 
OUTPUT_DIR = f"../../weights/es-gpt2/"
#TRAIN_PATH = f"../../data/spanish-corpora/preprocessed_DOGC_lower.txt"
#TEST_PATH = f"../../data/spanish-corpora/preprocessed_DGT_lower.txt"
#VAL_PATH = f"../../data/spanish-corpora/preprocessed_DOGC_lower.txt"
TRAIN_PATH = f"../../data/minicorpus/train_1.txt"
TEST_PATH = f"../../data/minicorpus/test_1.txt"
VAL_PATH = f"../../data/minicorpus/val_1.txt"


# Model paths
#MODEL_TYPE = "gpt2" 
#OUTPUT_DIR = f"../../weights/{MODEL_TYPE}/papers_milan/"
#TRAIN_PATH = f"../../data/papers_milan/train_papers.txt"
#TEST_PATH = f"../../data/papers_milan/test_papers.txt"
#VAL_PATH = f"../../data/papers_milan/val_papers.txt"

# Finetuning Tokenizer

In [11]:
!ls

finetune_model.py	  run_model.ipynb  wandb
run_language_modeling.py  runs		   xla_spawn.py


In [13]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

#paths = [str(x) for x in Path(TRAIN_PATH).glob("**/*.txt")]
paths = [TRAIN_PATH]
paths

['../../data/minicorpus/train_1.txt']

In [14]:
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [21]:
!mkdir OUTPUT_DIR
tokenizer.save_model(OUTPUT_DIR)

['../../weights/es-gpt2/vocab.json', '../../weights/es-gpt2/merges.txt']

# Finetuning Model

In [16]:
!ls

finetune_model.py	  run_model.ipynb  wandb
run_language_modeling.py  runs		   xla_spawn.py


In [40]:
def create_params_modeling(output_dir, model_type="gpt2", model_name_or_path=None, config_name=None, tokenizer_name=None, 
                            train_path=None, eval_path=None, do_train=False, do_eval=False, evaluate_during_training=False, line_by_line=False, 
                            block_size=-1, per_device_train_batch_size=1, per_device_eval_batch_size=1, warmup_steps=0, learning_rate=5e-5, max_steps=-1):
    return {
        "output_dir": output_dir,
        "model_type": model_type,
        "model_name_or_path": model_name_or_path,
        "config_name": config_name,
        "tokenizer_name": tokenizer_name,
        "do_train": "--do_train" if do_train else "",
        "train_data_file": train_path if do_train else None,
        "do_eval": "--do_eval" if do_eval else "",
        "eval_data_file": eval_path if do_eval else None,
        "evaluate_during_training": "--evaluate_during_training" if evaluate_during_training else "",
        "per_device_train_batch_size": per_device_train_batch_size,
        "per_device_eval_batch_size": per_device_eval_batch_size,
        "block_size": block_size,
        "warmup_steps": warmup_steps, 
        "learning_rate": learning_rate, 
        "max_steps": max_steps,
        "line_by_line": "--line_by_line" if line_by_line else "",
        "fp16": "--fp16",
        "fp16_opt_level": "O1"
    }

In [41]:
cmd_finetuning = """./run_language_modeling.py \
    --output_dir={output_dir} \
    --model_type={model_type} \
    --model_name_or_path={model_name_or_path} \
    --config_name={config_name} \
    --tokenizer_name={tokenizer_name} \
    {do_train} \
    --train_data_file={train_data_file} \
    {do_eval} \
    --eval_data_file={eval_data_file} \
    {evaluate_during_training} \
    --per_device_train_batch_size={per_device_train_batch_size} \
    --per_device_eval_batch_size={per_device_eval_batch_size} \
    --block_size={block_size} \
    --warmup_steps={warmup_steps} \
    --learning_rate={learning_rate} \
    --max_steps={max_steps} \
    --overwrite_output_dir \
    --save_steps 5000 \
    --save_total_limit 3 \
    {line_by_line} \
    {fp16} \
    --fp16_opt_level={fp16_opt_level} \
    --logging_steps 2 
"""

In [None]:
--output_dir=./weights/gpt2/minicorpus/ 
--model_type=gpt2 
--model_name_or_path=gpt2 
--do_train 
--train_data_file=./data/minicorpus/train_1.txt 
--do_eval 
--eval_data_file=./data/minicorpus/test_1.txt 
--evaluate_during_training 

--per_device_train_batch_size=2048 
--per_device_eval_batch_size=2048 

--block_size=128 

--overwrite_output_dir 

--save_steps 5000 
--save_total_limit=5 
--logging_steps=20 
--warmup_steps=10000 
--max_steps=900000 
--learning_rate=0.0001 
--line_by_line

In [48]:
# Arguments for training from scratch. I turn off evaluate_during_training,
#   line_by_line, should_continue, and model_name_or_path.
train_params = create_params_modeling(
                    output_dir=OUTPUT_DIR, 
                    model_type=MODEL_TYPE,
                    model_name_or_path=MODEL_TYPE,
                    config_name=OUTPUT_DIR,
                    tokenizer_name=OUTPUT_DIR,
                    train_path=TRAIN_PATH, 
                    eval_path=TEST_PATH, 
                    do_train=True, 
                    do_eval=True, 
                    evaluate_during_training=True,
                    line_by_line=True,
                    block_size=128,
                    learning_rate=0.0001
                )

In [49]:
cmd_finetuning.format(**train_params)

'./run_language_modeling.py     --output_dir=../../weights/es-gpt2/     --model_type=gpt2     --model_name_or_path=gpt2     --config_name=../../weights/es-gpt2/     --tokenizer_name=../../weights/es-gpt2/     --do_train     --train_data_file=../../data/minicorpus/train_1.txt     --do_eval     --eval_data_file=../../data/minicorpus/test_1.txt     --evaluate_during_training     --per_device_train_batch_size=1     --per_device_eval_batch_size=1     --block_size=128     --warmup_steps=0     --learning_rate=0.0001     --max_steps=-1     --overwrite_output_dir     --save_steps 5000     --save_total_limit 3     --line_by_line     --fp16     --fp16_opt_level=O1     --logging_steps 2 \n'

In [50]:
run {cmd_finetuning.format(**train_params)}

08/31/2020 22:53:22 - INFO - transformers.training_args -   PyTorch: setting up devices
08/31/2020 22:53:22 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='../../weights/es-gpt2/', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=True, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=0.0001, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Aug31_22-53-22_Camilo-UbuntuPC', logging_first_step=False, logging_steps=2, save_steps=5000, save_total_limit=3, no_cuda=False, seed=42, fp16=True, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, dataloader_drop_last=False)
08/31/2020 22:53:22 - INFO - transformers.configuration_utils -   loading configuration file ../../weights/es-gpt2/confi

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2145301.0, style=ProgressStyle(descriptio…

08/31/2020 22:58:55 - INFO - transformers.trainer -   {'loss': 102.60490417480469, 'learning_rate': 9.999996892432966e-05, 'epoch': 9.322701103481517e-07, 'step': 2}
08/31/2020 22:58:55 - INFO - transformers.trainer -   ***** Running Evaluation *****
08/31/2020 22:58:55 - INFO - transformers.trainer -     Num examples = 268163
08/31/2020 22:58:55 - INFO - transformers.trainer -     Batch size = 1


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=268163.0, style=ProgressStyle(descriptio…






KeyboardInterrupt: 