In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset('3NTRPY-13/reuters_articles')
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [3]:
def create_full_article_col(example):
    return {'full_article': f'TITLE:{example["title"]}\n\nBODY:{example["body"]}'}

dataset = dataset.map(create_full_article_col)
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
})

In [4]:
dataset['train'][0]['full_article']

'TITLE:BAHIA COCOA REVIEW\n\nBODY:Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are doubts as

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("3NTRPY-13/gpt2-reuters_tokenizer")

In [9]:
CONTEXT_LENGTH = 512

def tokenize(element):
    outputs = tokenizer(
        element['full_article'],
        truncation=True,
        max_length=CONTEXT_LENGTH,
        return_overflowing_tokens=False
    )

    return outputs

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset['train'].column_names
)
tokenized_dataset

Map:   0%|          | 0/17262 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2158
    })
})

## Preparing Model For Training

In [10]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    'gpt2',
    vocab_size=len(tokenizer),
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [11]:
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 52000
}

In [14]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f'GPT2 model size is {model_size/1000**2:.1f}M parameters')

GPT2 model size is 125.8M parameters


In [15]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [16]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='models/gpt2-reuters_training',
    hub_model_id='3NTRPY-13/gpt2-reuters_training',
    eval_strategy='epoch',
    auto_find_batch_size=True,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=True,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [19]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,4.8381,5.022951


TrainOutput(global_step=538, training_loss=5.491732593805817, metrics={'train_runtime': 456.7765, 'train_samples_per_second': 75.582, 'train_steps_per_second': 1.178, 'total_flos': 6896827680768000.0, 'train_loss': 5.491732593805817, 'epoch': 1.9935125115848007})

In [20]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/503M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/3NTRPY-13/gpt2-reuters_training/commit/8b35a8923fa8d7a9fdf833b6782b987b666cfea0', commit_message='End of training', commit_description='', oid='8b35a8923fa8d7a9fdf833b6782b987b666cfea0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/3NTRPY-13/gpt2-reuters_training', endpoint='https://huggingface.co', repo_type='model', repo_id='3NTRPY-13/gpt2-reuters_training'), pr_revision=None, pr_num=None)

In [21]:
from transformers import pipeline

pipe = pipeline('text-generation', model='3NTRPY-13/gpt2-reuters_training')

config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/503M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/819k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/465k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

Device set to use cuda:0


In [22]:
sample = dataset['test'][2]
sample

{'title': 'CHEFS <CHEF.O> COMPLETES PRIVATE SALE',
 'body': "Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03",
 'full_article': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03"}

In [23]:
prompt = f"TITLE:{sample['title']}\n\nBODY:"
pipe(prompt, max_new_tokens=128)

[{'generated_text': 'TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Pacific Corp said it is\nan additional shares for the 1986 of the year in May 2,000\nin March 30.\n    The company said its company expects an merger on an outstanding from a share under April 31 at\nof its additional debentures.\n Reuter\n\x03 to a premium on May 31 at the loss for the company said it was\n\x03 per share of the share from\n\x03 its company which said.\n    An issue rose at a year, the company\n\x03 to the first year, compared to be\n\x03 to be acquired.\n    The stock operations were the company also said that it has raised.\n\x03.\n   '}]

In [24]:
prompt = f"TITLE:{sample['title']}"
pipe(prompt, max_new_tokens=128)

[{'generated_text': 'TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Shr loss 26.4 dlrs vs loss 10.7 cts\n    Net profit 15.5 mln\n    Net loss 0.3 mln vs loss 15.4 mln\n    Net loss 6.6 mln vs 13.05 dlrs\n    Net loss 3.8 mln\n Reuter\n\x03 16.82 mln vs 35.00 billion dlrs vs loss 24,000\n    Avg shrs 2.3 mln\n    --900,000\n    Operating 23.8 billion vs loss loss 12.2 mln vs 2 cts vs 17 at 5.1 billion\n\x03 vs 15.7 mln vs 12.0 mln dlr qtr includes 1.4 mln'}]