In [30]:
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, \
                          GPT2LMHeadModel, TextDataset, pipeline
from datasets import Dataset
import pandas as pd

In [11]:
# Since data lack, it would not go as wanted, this time just learn how to implement code dictation. 
data = pd.read_csv('data/english_to_latex.csv')
data

Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2\,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1}x^2\,dx"


In [12]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [13]:
CONVERSION_PROMPT = 'LCT\n' # LaTex Conversion Task (custom prompt)
CONVERSION_TOKEN = 'LaTeX:'

In [16]:
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX']
print(training_examples[0])

LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2\,dx


In [17]:
task_df = pd.DataFrame({'text': training_examples})
task_df

Unnamed: 0,text
0,LCT\nEnglish: integral from a to b of x square...
1,LCT\nEnglish: integral from negative 1 to 1 of...


In [18]:
latex_data = Dataset.from_pandas(task_df)
latex_data

Dataset({
    features: ['text'],
    num_rows: 2
})

In [19]:
def preprocess(examples): # tokenize our text but don't pad because our collator will pad for us dynamically
    return tokenizer(examples['text'], truncation=True)

In [20]:
latex_data = latex_data.map(preprocess, batched=True)
# latex_data = latex_data.train_test_split(train_size=.8)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [21]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [24]:
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

In [25]:
training_args = TrainingArguments(
    output_dir='english_to_latex',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)
trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data,
    eval_dataset=latex_data,
    # train_dataset=latex_data['train'],
    # eval_dataset=latex_data['test'],
    data_collator=data_collator
)

In [26]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2


{'eval_loss': 4.909360408782959,
 'eval_runtime': 0.6682,
 'eval_samples_per_second': 2.993,
 'eval_steps_per_second': 1.497}

In [27]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,No log,4.012093


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2
Saving model checkpoint to english_to_latex/checkpoint-1
Configuration saved in english_to_latex/checkpoint-1/config.json
Configuration saved in english_to_latex/checkpoint-1/generation_config.json
Model weights saved in english_to_latex/checkpoint-1/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2
The following columns in the evaluation set don't have a corresponding

TrainOutput(global_step=10, training_loss=2.9798227310180665, metrics={'train_runtime': 26.867, 'train_samples_per_second': 0.744, 'train_steps_per_second': 0.372, 'total_flos': 336821760000.0, 'train_loss': 2.9798227310180665, 'epoch': 10.0})

In [28]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2


{'eval_loss': 1.6527339220046997,
 'eval_runtime': 0.0413,
 'eval_samples_per_second': 48.397,
 'eval_steps_per_second': 24.198,
 'epoch': 10.0}

In [32]:
calculus_data = TextDataset(
    tokenizer=tokenizer,
    file_path='data/calculus_made_easy.txt',
    block_size=32,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
training_args = TrainingArguments(
    output_dir='calculus',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    logging_steps=50,
    eval_steps=50,
    evaluation_strategy='steps',
    save_strategy='steps'
)
trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=calculus_data.examples[:int(len(calculus_data.examples)*.8)],
    eval_dataset=calculus_data.examples[int(len(calculus_data.examples)*.8):],
)

Creating features from dataset file at data
Saving features into cached file data/cached_lm_GPT2Tokenizer_32_calculus_made_easy.txt [took 0.003 s]
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_spec

In [33]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 636
  Batch size = 32
  Num examples = 636
  Batch size = 32


{'eval_loss': 4.760920524597168,
 'eval_runtime': 0.9143,
 'eval_samples_per_second': 695.579,
 'eval_steps_per_second': 21.874}

In [34]:
trainer.train()

***** Running training *****
  Num examples = 2541
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 80
  Number of trainable parameters = 124439808


Step,Training Loss,Validation Loss
50,3.8246,3.418829


***** Running Evaluation *****
  Num examples = 636
  Batch size = 32
  Num examples = 636
  Batch size = 32


TrainOutput(global_step=80, training_loss=3.6955965995788573, metrics={'train_runtime': 14.3535, 'train_samples_per_second': 177.03, 'train_steps_per_second': 5.574, 'total_flos': 41496440832000.0, 'train_loss': 3.6955965995788573, 'epoch': 1.0})

In [35]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 636
  Batch size = 32


{'eval_loss': 3.367985248565674,
 'eval_runtime': 0.8935,
 'eval_samples_per_second': 711.801,
 'eval_steps_per_second': 22.384,
 'epoch': 1.0}

In [36]:
trainer.save_model()

Saving model checkpoint to calculus
Configuration saved in calculus/config.json
Configuration saved in calculus/generation_config.json
Model weights saved in calculus/pytorch_model.bin


In [37]:
calculus_latex_gpt2 = GPT2LMHeadModel.from_pretrained('calculus')
training_args = TrainingArguments(
    output_dir='calculus_english_to_latex',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)
trainer = Trainer(
    model=calculus_latex_gpt2,
    args=training_args,
    train_dataset=latex_data,
    eval_dataset=latex_data,
    # train_dataset=latex_data['train'],
    # eval_dataset=latex_data['test'],
    data_collator=data_collator
)

loading configuration file calculus/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file

In [38]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2


{'eval_loss': 4.610260963439941,
 'eval_runtime': 0.0262,
 'eval_samples_per_second': 76.266,
 'eval_steps_per_second': 38.133}

In [39]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,No log,3.669095


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2
Saving model checkpoint to calculus_english_to_latex/checkpoint-1
Saving model checkpoint to calculus_english_to_latex/checkpoint-1
Configuration saved in calculus_english_to_latex/checkpoint-1/config.json
Configuration saved in calculus_english_to_latex/checkpoint-1/generation_config.json
Model weights saved in calculus_english_to_latex/checkpoint-1/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Nu

TrainOutput(global_step=10, training_loss=2.6668675422668455, metrics={'train_runtime': 21.926, 'train_samples_per_second': 0.912, 'train_steps_per_second': 0.456, 'total_flos': 336821760000.0, 'train_loss': 2.6668675422668455, 'epoch': 10.0})

In [40]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 2


{'eval_loss': 1.24819815158844,
 'eval_runtime': 0.0282,
 'eval_samples_per_second': 71.047,
 'eval_steps_per_second': 35.524,
 'epoch': 10.0}

In [41]:
trainer.save_model()

Saving model checkpoint to calculus_english_to_latex
Configuration saved in calculus_english_to_latex/config.json
Configuration saved in calculus_english_to_latex/generation_config.json
Model weights saved in calculus_english_to_latex/pytorch_model.bin


In [42]:
loaded_model = GPT2LMHeadModel.from_pretrained('calculus_english_to_latex')
latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

loading configuration file calculus_english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "calculus",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50257
}


In [43]:
text_sample = 'f of x equals integral from 0 to pi of x to the fourth power'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'
print(conversion_text_sample)

LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX:


In [45]:
print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_new_tokens=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.26.1"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f of x equals integral from
