In [3]:
# Model paths
# MODEL_TYPE selects the weights sub-directory and is also passed to the
# training/generation scripts as the model type.
MODEL_TYPE = "gpt2"
# Dataset directory name shared by the weights and data locations.
DATASET_NAME = "papers_milan"
DATA_DIR = f"../../data/{DATASET_NAME}"
# Directory for the fine-tuned weights (trailing slash kept as before).
OUTPUT_DIR = f"../../weights/{MODEL_TYPE}/{DATASET_NAME}/"
TRAIN_PATH = f"{DATA_DIR}/train_papers.txt"
TEST_PATH = f"{DATA_DIR}/test_papers.txt"
VAL_PATH = f"{DATA_DIR}/val_papers.txt"

# Finetuning

In [18]:
def create_params_modeling(output_dir, model_type="gpt2", model_name_or_path=None, train_path=None, eval_path=None,
                           do_train=False, do_eval=False, evaluate_during_training=False, line_by_line=False,
                           block_size=-1, fp16=True, fp16_opt_level="O1"):
    """Build the placeholder values for a language-modeling command template.

    Boolean options are rendered as the literal flag text (for example
    ``"--do_train"``) when enabled and as an empty string when disabled, so
    they can be dropped straight into a command-line template with
    ``str.format``.

    Args:
        output_dir: Value for the ``output_dir`` placeholder.
        model_type: Value for the ``model_type`` placeholder.
        model_name_or_path: Value for the ``model_name_or_path`` placeholder.
        train_path: Training data file; only used when ``do_train`` is true.
        eval_path: Evaluation data file; only used when ``do_eval`` is true.
        do_train: Emit ``--do_train`` and the training file.
        do_eval: Emit ``--do_eval`` and the evaluation file.
        evaluate_during_training: Emit ``--evaluate_during_training``.
        line_by_line: Emit ``--line_by_line``.
        block_size: Value for the ``block_size`` placeholder.
        fp16: Emit ``--fp16`` (enabled by default, as before).
        fp16_opt_level: Value for the ``fp16_opt_level`` placeholder.

    Returns:
        dict mapping placeholder names to their rendered values.

    Raises:
        ValueError: if ``do_train``/``do_eval`` is requested without the
            matching data path, which would otherwise render as the literal
            text ``None`` in the command line.
    """
    if do_train and train_path is None:
        raise ValueError("train_path is required when do_train=True")
    if do_eval and eval_path is None:
        raise ValueError("eval_path is required when do_eval=True")

    def flag(name, enabled):
        # A disabled flag becomes "" so it vanishes from the formatted command.
        return f"--{name}" if enabled else ""

    return {
        "output_dir": output_dir,
        "model_type": model_type,
        "model_name_or_path": model_name_or_path,
        "do_train": flag("do_train", do_train),
        "train_data_file": train_path if do_train else None,
        "do_eval": flag("do_eval", do_eval),
        "eval_data_file": eval_path if do_eval else None,
        "evaluate_during_training": flag("evaluate_during_training", evaluate_during_training),
        "block_size": block_size,
        "line_by_line": flag("line_by_line", line_by_line),
        "fp16": flag("fp16", fp16),
        "fp16_opt_level": fp16_opt_level,
    }

In [19]:
# Command-line template for the language-modeling script; placeholders are
# filled with str.format from the dict built by create_params_modeling.
# Every argument line ends with a backslash so the literal collapses into a
# single logical command line. The --block_size line previously lacked it,
# which left a stray newline in the middle of the command.
cmd_finetuning = """../../transformers/examples/language-modeling/run_language_modeling.py \
    --output_dir={output_dir} \
    --model_type={model_type} \
    --model_name_or_path={model_name_or_path} \
    {do_train} \
    --train_data_file={train_data_file} \
    {do_eval} \
    --eval_data_file={eval_data_file} \
    {evaluate_during_training} \
    --per_device_train_batch_size=1 \
    --per_device_eval_batch_size=1 \
    --block_size={block_size} \
    --overwrite_output_dir \
    --save_steps 5000 \
    --save_total_limit 5 \
    {line_by_line} \
    {fp16} \
    --fp16_opt_level={fp16_opt_level} \
    --logging_steps 2 
"""

In [20]:
# Arguments for the fine-tuning run: model_name_or_path is MODEL_TYPE itself,
# training reads TRAIN_PATH and evaluation reads TEST_PATH.
# evaluate_during_training is off; line_by_line is on.
train_params = create_params_modeling(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    model_name_or_path=MODEL_TYPE,
    train_path=TRAIN_PATH,
    eval_path=TEST_PATH,
    do_train=True,
    do_eval=True,
    evaluate_during_training=False,
    line_by_line=True,
)

# Evaluation-only run on VAL_PATH using the weights stored in OUTPUT_DIR.
# model_type is passed explicitly so this stays consistent with the other
# parameter sets if MODEL_TYPE is ever changed from the function's default.
val_finetuning_params = create_params_modeling(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    model_name_or_path=OUTPUT_DIR,
    train_path=None,
    eval_path=VAL_PATH,
    do_train=False,
    do_eval=True,
    line_by_line=True,
)

# Evaluation-only run on VAL_PATH using model_name_or_path=MODEL_TYPE, i.e.
# the same model identifier the fine-tuning run starts from, for comparison.
val_params = create_params_modeling(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    model_name_or_path=MODEL_TYPE,
    train_path=None,
    eval_path=VAL_PATH,
    do_train=False,
    do_eval=True,
    line_by_line=True,
)

In [None]:
run {cmd_finetuning.format(**train_params)}

In [21]:
run {cmd_finetuning.format(**val_finetuning_params)}

07/01/2020 18:32:10 - INFO - transformers.training_args -   PyTorch: setting up devices
07/01/2020 18:32:10 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='../../weights/gpt2/papers_milan/', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Jul01_18-32-10_Camilo-UbuntuPC', logging_first_step=False, logging_steps=2, save_steps=5000, save_total_limit=5, no_cuda=False, seed=42, fp16=True, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, dataloader_drop_last=False)
07/01/2020 18:32:10 - INFO - transformers.configuration_utils -   loading configuration file ../../weights/gp

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=447.0, style=ProgressStyle(description_w…

07/01/2020 18:32:18 - INFO - transformers.trainer -   {'eval_loss': 3.5828039736822412, 'step': 0}
07/01/2020 18:32:18 - INFO - __main__ -   ***** Eval results *****
07/01/2020 18:32:18 - INFO - __main__ -     perplexity = 35.974270467091586





In [22]:
run {cmd_finetuning.format(**val_params)}

07/01/2020 18:32:18 - INFO - transformers.training_args -   PyTorch: setting up devices
07/01/2020 18:32:18 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='../../weights/gpt2/papers_milan/', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Jul01_18-32-18_Camilo-UbuntuPC', logging_first_step=False, logging_steps=2, save_steps=5000, save_total_limit=5, no_cuda=False, seed=42, fp16=True, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, dataloader_drop_last=False)
07/01/2020 18:32:19 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazo

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=447.0, style=ProgressStyle(description_w…

07/01/2020 18:32:30 - INFO - transformers.trainer -   {'eval_loss': 5.188957473042294, 'step': 0}
07/01/2020 18:32:30 - INFO - __main__ -   ***** Eval results *****
07/01/2020 18:32:30 - INFO - __main__ -     perplexity = 179.28154962227336





# Generation

In [44]:
def create_params_generation(MODEL_TYPE, MODEL_NAME_OR_PATH, NUM_RETURN_SEQUENCES=1, LENGTH=20, TRANSLATE_TO=""):
    """Build the placeholder values for a text-generation command template.

    Args:
        MODEL_TYPE: Value for the ``model_type`` placeholder.
        MODEL_NAME_OR_PATH: Value for the ``model_name_or_path`` placeholder.
        NUM_RETURN_SEQUENCES: Value for the ``num_return_sequences`` placeholder.
        LENGTH: Value for the ``length`` placeholder.
        TRANSLATE_TO: Value for the ``translate_to`` placeholder, inserted
            exactly as given. Defaults to the empty string.

    Returns:
        dict mapping placeholder names to the supplied values.
    """
    return {
        "model_type": MODEL_TYPE,
        "model_name_or_path": MODEL_NAME_OR_PATH,
        "num_return_sequences": NUM_RETURN_SEQUENCES,
        "length": LENGTH,
        # Previously hard-coded to "", which silently dropped the argument.
        "translate_to": TRANSLATE_TO,
    }

In [None]:
"0": "mensaje a continuación para predicción"
"1": "mensaje a continuación para configuración"

In [None]:
model_type -> "estructura del modelo"
model_name_or_path -> "checkpoints"

In [None]:
GPT2 (frases) -> inglés -> translator -> español
bert (siguiente palabra) -> inglés
Scibert papers (siguiente palabra) -> inglés
beto (siguiente palabra) -> español
marian (translator) -> idioma

In [45]:
# Command-line template for run_generation_server.py. Placeholders are filled
# with str.format; {translate_to} is inserted verbatim as the last argument.
# Adjacent literals are concatenated into one line that ends in a newline.
cmd_generation = (
    "python run_generation_server.py "
    "    --model_type={model_type} "
    "    --model_name_or_path={model_name_or_path} "
    "    --num_return_sequences={num_return_sequences} "
    "    --length={length} "
    "    {translate_to}\n"
)

In [47]:
{cmd_generation.format(**generation_finetuning_params)}

08/05/2020 19:46:34 - INFO - transformers.tokenization_utils_base -   Model name '../../weights/gpt2/papers_milan/' not found in model shortcut name list (gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2). Assuming '../../weights/gpt2/papers_milan/' is a path, a model identifier, or url to a directory containing tokenizer files.
08/05/2020 19:46:34 - INFO - transformers.tokenization_utils_base -   Didn't find file ../../weights/gpt2/papers_milan/added_tokens.json. We won't load it.
08/05/2020 19:46:34 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/vocab.json
08/05/2020 19:46:34 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/merges.txt
08/05/2020 19:46:34 - INFO - transformers.tokenization_utils_base -   loading file None
08/05/2020 19:46:34 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/special_tokens_map.json
08/05/2020 19:46:34 - INFO - tra

In [27]:
# Command-line template for the text-generation example script. Placeholders
# are filled with str.format. Adjacent literals are concatenated into one
# line that ends in a newline.
cmd_generation = (
    "../../transformers/examples/text-generation/run_generation.py "
    "    --model_type={model_type} "
    "    --model_name_or_path={model_name_or_path} "
    "    --num_return_sequences={num_return_sequences} "
    "    --length={length} "
    "    --translate_to={translate_to}\n"
)

In [34]:
# Generation settings for the weights stored in OUTPUT_DIR.
generation_finetuning_params = create_params_generation(
    MODEL_TYPE=MODEL_TYPE,
    MODEL_NAME_OR_PATH=OUTPUT_DIR,
    NUM_RETURN_SEQUENCES=5,
    LENGTH=10,
)
# Same settings, but with MODEL_TYPE itself as the model name.
generation_params = create_params_generation(
    MODEL_TYPE=MODEL_TYPE,
    MODEL_NAME_OR_PATH=MODEL_TYPE,
    NUM_RETURN_SEQUENCES=5,
    LENGTH=10,
)

In [7]:
generation_finetuning_params

{'model_type': 'gpt2',
 'model_name_or_path': '../../weights/gpt2/papers_milan/',
 'num_return_sequences': 5,
 'length': 10,
 'translate_to': ''}

In [27]:
run {cmd_generation.format(**generation_finetuning_params)}

07/01/2020 18:35:16 - INFO - transformers.tokenization_utils_base -   Model name '../../weights/gpt2/papers_milan/' not found in model shortcut name list (gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2). Assuming '../../weights/gpt2/papers_milan/' is a path, a model identifier, or url to a directory containing tokenizer files.
07/01/2020 18:35:16 - INFO - transformers.tokenization_utils_base -   Didn't find file ../../weights/gpt2/papers_milan/added_tokens.json. We won't load it.
07/01/2020 18:35:16 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/vocab.json
07/01/2020 18:35:16 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/merges.txt
07/01/2020 18:35:16 - INFO - transformers.tokenization_utils_base -   loading file None
07/01/2020 18:35:16 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/special_tokens_map.json
07/01/2020 18:35:16 - INFO - tra

Model prompt >>> WTF
=== GENERATED SEQUENCE 1 ===
WTF are compressible let us consider a choice that to and with that is known as compressible norm. We now conclude that the latter is indeed in the works. However, as before, the definition of the in the definition of and the in Appendix A indicate that we are limited by and only consider the rate distortion function. Indeed, this definition is standard and relevant for all, except compressible rate distortion. Of course, a real universal compressible norm for a given rate distortion function is not made explicit
=== GENERATED SEQUENCE 2 ===
WTF: A general case showing that E is zero for all possible, in a compact set. Also, this technique can be as a mathematical method for finite dimensional AR. In this paper, we review, in a compact set, the complete set of and why for some, it can be used for some robust AR. For the remainder of this paper, we present an alternative simple and elegant approach to compress the. We consider the follow

In [28]:
run {cmd_generation.format(**generation_params)}

07/01/2020 18:35:24 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/camilojd/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
07/01/2020 18:35:24 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/camilojd/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
07/01/2020 18:35:25 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /home/camilojd/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.db13c9bc9c7bdd738ec89e069621d88e05dc670366092d

Model prompt >>> WTF
=== GENERATED SEQUENCE 1 ===
WTF from Cueball's room.

When The Flash came in, he kept mentioning the possibility of him having Flash's storybook, which was in fact not called "Flash", but "Flash's place" in this incident.

Grant denied saying anything about the deaths of Barry and Iris, when there were only five policemen left and over ten minutes between them.

Grant's Batman version of himself is very, very angry over this comic.

On one of the many episodes in
=== GENERATED SEQUENCE 2 ===
WTF!

I DON'T SEE ANY MOCKING FOR A

BANK WITH NO ASSETS AROUND!

I DON'T SEE ANY MOCKING FOR A BANK WITH

NO ASSETS AROUND!

I DON'T SEE ANY MOCKING FOR A BANK WITH NO

UNDER THOSE COMPOSITE MATTERS!

I DON'T SEE ANY MOCKING FOR A BANK WITH

UNDER TH
=== GENERATED SEQUENCE 3 ===
WTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTFWTF
=== GENERATED SEQUENCE 4 ===
WTF' – some but n