In [53]:
import pandas as pd

In [54]:
data = pd.read_csv('../data/english_to_latex.csv')

data.shape

(50, 2)

In [55]:
data.head()

Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"
2,integral from negative 1 to infinity of x cubed,"\int_{-1}^{\inf} x^3 \,dx"
3,integral from 0 to infinity of x squared,"\int_{0}^{\inf} x^2 \,dx"
4,integral from 0 to infinity of y squared,"\int_{0}^{\inf} y^2 \,dy"


In [56]:
# Prompt Engineerings
from transformers import GPT2Tokenizer

MODEL = 'distilgpt2'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL)

tokenizer.pad_token = tokenizer.eos_token

#add two prompts, one for each task
CONVERSION_PROMPT = 'Task\n'  # LaTeX conversion task
CONVERSION_TOKEN = 'LaTeX:'

loading file https://huggingface.co/distilgpt2/resolve/main/vocab.json from cache at C:\Users\Dell/.cache\huggingface\transformers\55051ac97dcc32f0a736d21a32a4d42b0d9b90f117ca7c38e65038b04bd5c3f5.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/distilgpt2/resolve/main/merges.txt from cache at C:\Users\Dell/.cache\huggingface\transformers\9dfb299b74cdf7601ba7cd3a8073dbdac351caec0ed7ab5849b098b3c8ae3d57.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/distilgpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilgpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilgpt2/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/distilgpt2/resolve/main/tokenizer.json from cache at C:\Users\Dell/.cache\huggingface\transformers\accb287b5a5396b2597382916b6cc939fdab1366e8

In [57]:
# This is our "training prompt" that we want GPT2 to recognize and learn

training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)
print(training_examples[0])


Task
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [58]:
task_df = pd.DataFrame({'text': training_examples})

task_df.shape

(50, 1)

In [59]:
from datasets import Dataset
data = Dataset.from_pandas(task_df)
data

Dataset({
    features: ['text'],
    num_rows: 50
})

In [60]:
MAX_TOKENS = task_df['text'].apply(lambda x: len(tokenizer(x)['input_ids'])).max() + 5

MAX_TOKENS

49

In [61]:
# tokenizer created input_ids and attention_mask as output
def tokenize_function(examples):
    output = tokenizer(examples['text'], add_special_tokens=True, max_length=MAX_TOKENS, 
                       truncation=True,padding='max_length')
    
    output['labels'] = output["input_ids"]
    # -100 is a reserved value to ignore these tokens when calculating the loss
    output["labels"] = [[-100 if x == tokenizer.pad_token_id else x for x in y] for y in output["labels"]]
    return output

data = data.map(
    tokenize_function,
    batched=True,
)
print(data)


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'text'],
    num_rows: 50
})


In [62]:
data.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])

data = data.train_test_split(test_size=0.10, shuffle=True, seed=0)
print(data)

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 45
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 5
    })
})


In [63]:
tokenizer.decode(data['train'][0]['input_ids'])

'Task\nEnglish: integral from 0 to infinity of x squared\nLaTeX: \\int_{0}^{\\inf} x^2 \\,dx<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [64]:
tokenizer.decode([c for c  in data['train'][0]['labels'] if c != -100])

'Task\nEnglish: integral from 0 to infinity of x squared\nLaTeX: \\int_{0}^{\\inf} x^2 \\,dx'

In [65]:
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained(MODEL)

loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at C:\Users\Dell/.cache\huggingface\transformers\f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "

In [66]:
# Note the batch size of 4 to make sure we have multiple steps per epoch. This generally speeds up training
training_args = TrainingArguments(
    output_dir="./english_to_latex", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=15, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    load_best_model_at_end=True,
    warmup_steps=len(data['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [67]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4


{'eval_loss': 4.863057613372803,
 'eval_runtime': 0.513,
 'eval_samples_per_second': 9.747,
 'eval_steps_per_second': 3.899}

In [68]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running training *****
  Num examples = 45
  Num Epochs = 15
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 180


Epoch,Training Loss,Validation Loss
1,2.1975,2.955204
2,1.3588,1.750434
3,1.6091,1.387579
4,0.4103,1.19333
5,0.6977,1.117739
6,0.6681,0.979447
7,0.4118,0.897312
8,0.4568,0.882566
9,0.2631,0.8321
10,0.2394,0.822526


The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4
Saving model checkpoint to ./english_to_latex\checkpoint-12
Configuration saved in ./english_to_latex\checkpoint-12\config.json
Model weights saved in ./english_to_latex\checkpoint-12\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4
Saving model checkpoint to ./english_to_latex\checkpoint-24
Configuration saved in ./english_to_latex\checkpoint-24\config.json
Model weights saved in ./english_to_latex\checkpoint-24\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples

TrainOutput(global_step=180, training_loss=0.9258044761088159, metrics={'train_runtime': 689.3919, 'train_samples_per_second': 0.979, 'train_steps_per_second': 0.261, 'total_flos': 8439834009600.0, 'train_loss': 0.9258044761088159, 'epoch': 15.0})

In [69]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4


{'eval_loss': 0.7722758054733276,
 'eval_runtime': 0.8793,
 'eval_samples_per_second': 5.686,
 'eval_steps_per_second': 2.275,
 'epoch': 15.0}

In [70]:
trainer.save_model()

Saving model checkpoint to ./english_to_latex
Configuration saved in ./english_to_latex\config.json
Model weights saved in ./english_to_latex\pytorch_model.bin


In [71]:
# Load our finetuned model
loaded_model = GPT2LMHeadModel.from_pretrained('./english_to_latex')

loading configuration file ./english_to_latex\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": 

In [72]:
text_sample = 'f of x equals integral from 1 to inf of x'

conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(conversion_text_sample)

Task
English: f of x equals integral from 1 to inf of x
LaTeX:


In [73]:
encoded_input = tokenizer(conversion_text_sample, return_tensors='pt')

print(
    tokenizer.decode(loaded_model.generate(
        input_ids=encoded_input['input_ids'],
        num_beams=3,
        max_length=MAX_TOKENS,
        temperature=1,
        top_k=10,
        early_stopping=True
    )[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Task
English: f of x equals integral from 1 to inf of x
LaTeX: f(x) = \int_{1}^{inf} x \,dx \,dx \,dx \,dx \,dx \
