In [1]:
# Model paths
# MODEL_TYPE selects the weights sub-directory; all dataset splits live in DATA_DIR.
MODEL_TYPE = "gpt2"
DATA_DIR = "../../data/papers_milan"
OUTPUT_DIR = f"../../weights/{MODEL_TYPE}/papers_milan/"
TRAIN_PATH = f"{DATA_DIR}/train_papers.txt"
TEST_PATH = f"{DATA_DIR}/test_papers.txt"
VAL_PATH = f"{DATA_DIR}/val_papers.txt"

# Finetuning

In [2]:
def create_params_modeling(output_dir, model_type="gpt2", model_name_or_path=None, train_path=None, eval_path=None,
                             do_train=False, do_eval=False, evaluate_during_training=False, line_by_line=False, block_size=-1,
                             fp16=True, fp16_opt_level="O1"):
    """Build the placeholder values for the ``cmd_finetuning`` command template.

    Boolean switches are rendered either as the literal CLI flag (e.g.
    ``"--do_train"``) or as an empty string, so they can be dropped straight
    into a ``str.format`` template.

    Args:
        output_dir: Value for ``--output_dir``.
        model_type: Value for ``--model_type``.
        model_name_or_path: Value for ``--model_name_or_path``. ``None`` is
            passed through unchanged and will be formatted as the text "None".
        train_path: Training file; only forwarded when ``do_train`` is true.
        eval_path: Evaluation file; only forwarded when ``do_eval`` is true.
        do_train: Emit ``--do_train``.
        do_eval: Emit ``--do_eval``.
        evaluate_during_training: Emit ``--evaluate_during_training``.
        line_by_line: Emit ``--line_by_line``.
        block_size: Value for ``--block_size``.
        fp16: Emit ``--fp16`` (on by default, matching the previous
            hard-coded behaviour).
        fp16_opt_level: Value for ``--fp16_opt_level``.

    Returns:
        dict mapping template placeholder names to their rendered values.

    Raises:
        ValueError: if ``do_train`` is set without ``train_path`` or
            ``do_eval`` is set without ``eval_path``; otherwise the command
            would be built with a data file literally named "None".
    """
    if do_train and train_path is None:
        raise ValueError("train_path is required when do_train=True")
    if do_eval and eval_path is None:
        raise ValueError("eval_path is required when do_eval=True")

    def flag(name, enabled):
        # An empty string makes the placeholder vanish from the command line.
        return f"--{name}" if enabled else ""

    return {
        "output_dir": output_dir,
        "model_type": model_type,
        "model_name_or_path": model_name_or_path,
        "do_train": flag("do_train", do_train),
        # A path is only forwarded when the matching phase is enabled.
        "train_data_file": train_path if do_train else None,
        "do_eval": flag("do_eval", do_eval),
        "eval_data_file": eval_path if do_eval else None,
        "evaluate_during_training": flag("evaluate_during_training", evaluate_during_training),
        "block_size": block_size,
        "line_by_line": flag("line_by_line", line_by_line),
        "fp16": flag("fp16", fp16),
        "fp16_opt_level": fp16_opt_level,
    }

In [None]:
# Command-line template for the transformers language-modeling example script.
# The literal is not raw, so each trailing backslash joins the next line and the
# whole command collapses into one logical line (plus a final newline). The
# {placeholders} are filled via str.format with the dict built by
# create_params_modeling.
cmd_finetuning = """../../transformers/examples/language-modeling/run_language_modeling.py \
    --output_dir={output_dir} \
    --model_type={model_type} \
    --model_name_or_path={model_name_or_path} \
    {do_train} \
    --train_data_file={train_data_file} \
    {do_eval} \
    --eval_data_file={eval_data_file} \
    {evaluate_during_training} \
    --per_device_train_batch_size=1 \
    --per_device_eval_batch_size=1 \
    --block_size={block_size} \
    --overwrite_output_dir \
    --save_steps 5000 \
    --save_total_limit 5 \
    {line_by_line} \
    {fp16} \
    --fp16_opt_level={fp16_opt_level} \
    --logging_steps 2
"""

In [None]:
# Fine-tune the pretrained MODEL_TYPE checkpoint on TRAIN_PATH and evaluate on
# TEST_PATH once training finishes. evaluate_during_training is off and
# line_by_line is on.
train_params = create_params_modeling(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    model_name_or_path=MODEL_TYPE,
    train_path=TRAIN_PATH,
    eval_path=TEST_PATH,
    do_train=True,
    do_eval=True,
    evaluate_during_training=False,
    line_by_line=True,
)

# Evaluation only: score the fine-tuned weights stored in OUTPUT_DIR on VAL_PATH.
# model_type is passed explicitly so it follows MODEL_TYPE instead of the
# function default.
val_finetuning_params = create_params_modeling(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    model_name_or_path=OUTPUT_DIR,
    train_path=None,
    eval_path=VAL_PATH,
    do_train=False,
    do_eval=True,
    line_by_line=True,
)

# Evaluation only: score the original pretrained MODEL_TYPE checkpoint on
# VAL_PATH as a baseline for the fine-tuned model.
val_params = create_params_modeling(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    model_name_or_path=MODEL_TYPE,
    train_path=None,
    eval_path=VAL_PATH,
    do_train=False,
    do_eval=True,
    line_by_line=True,
)

In [None]:
run {cmd_finetuning.format(**train_params)}

In [None]:
run {cmd_finetuning.format(**val_finetuning_params)}

In [None]:
run {cmd_finetuning.format(**val_params)}

# Generation

In [8]:
def create_params_generation(MODEL_TYPE, MODEL_NAME_OR_PATH, NUM_RETURN_SEQUENCES=1, LENGTH=20):
    """Collect the placeholder values for the ``cmd_generation`` template.

    Args:
        MODEL_TYPE: Value for ``--model_type``.
        MODEL_NAME_OR_PATH: Value for ``--model_name_or_path``.
        NUM_RETURN_SEQUENCES: Value for ``--num_return_sequences``.
        LENGTH: Value for ``--length``.

    Returns:
        dict keyed by the template's placeholder names.
    """
    # The upper-case parameter names are part of the keyword interface used by callers.
    generation_args = dict(
        model_type=MODEL_TYPE,
        model_name_or_path=MODEL_NAME_OR_PATH,
        num_return_sequences=NUM_RETURN_SEQUENCES,
        length=LENGTH,
    )
    return generation_args

In [14]:
# Command-line template for the transformers text-generation example script.
# Adjacent literals are concatenated into one logical command line ending in a
# newline; the {placeholders} match the keys from create_params_generation.
cmd_generation = (
    "../../transformers/examples/text-generation/run_generation.py "
    "    --model_type={model_type} "
    "    --model_name_or_path={model_name_or_path} "
    "    --num_return_sequences={num_return_sequences} "
    "    --length={length}\n"
)

In [15]:
# Generation settings for the fine-tuned weights in OUTPUT_DIR ...
generation_finetuning_params = create_params_generation(
    MODEL_TYPE=MODEL_TYPE,
    MODEL_NAME_OR_PATH=OUTPUT_DIR,
    NUM_RETURN_SEQUENCES=5,
    LENGTH=100,
)
# ... and the same settings for the pretrained MODEL_TYPE checkpoint.
generation_params = create_params_generation(
    MODEL_TYPE=MODEL_TYPE,
    MODEL_NAME_OR_PATH=MODEL_TYPE,
    NUM_RETURN_SEQUENCES=5,
    LENGTH=100,
)

In [16]:
run {cmd_generation.format(**generation_finetuning_params)}

07/01/2020 15:30:07 - INFO - transformers.tokenization_utils_base -   Model name '../../weights/gpt2/papers_milan/' not found in model shortcut name list (gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2). Assuming '../../weights/gpt2/papers_milan/' is a path, a model identifier, or url to a directory containing tokenizer files.
07/01/2020 15:30:07 - INFO - transformers.tokenization_utils_base -   Didn't find file ../../weights/gpt2/papers_milan/added_tokens.json. We won't load it.
07/01/2020 15:30:07 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/vocab.json
07/01/2020 15:30:07 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/merges.txt
07/01/2020 15:30:07 - INFO - transformers.tokenization_utils_base -   loading file None
07/01/2020 15:30:07 - INFO - transformers.tokenization_utils_base -   loading file ../../weights/gpt2/papers_milan/special_tokens_map.json
07/01/2020 15:30:07 - INFO - tra

Model prompt >>> Hello my dear
=== GENERATED SEQUENCE 1 ===
Hello my dear, please consider for the case of, if and only if is not identically zero as, is not identically zero and. In view of the above, we will consider for this case only the case of and then assume that A and consequently,,. However, if one, then one is able to verify that, irrespective of whether or not A, is, and therefore, and vice. We will need to do this. Firstly, define A and. Then, and. This is easy
=== GENERATED SEQUENCE 2 ===
Hello my dear, and I will derive the minimum average data rate that is feasible in a feasible and practical setting. To begin with, the best achievable data rate must be strictly higher than the bound given by. Consider a well known algorithm that the performance of an exogenous. For this purpose, it is convenient to recall that an exogenous source such as an i. i.. process can be chosen so that, the exogenous, is able to achieve a given average data rate, which the intended performance l

In [None]:
run {cmd_generation.format(**generation_params)}