In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
import torch
import warnings
# Suppress every Python warning from this point on (no category or module filter is given).
warnings.filterwarnings("ignore")




In [2]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
def preprocess_text(text):
    """Normalise raw text before it is tokenised.

    The input is lower-cased, then URLs, ``@mention`` handles and ``<...>``
    tag-like spans are deleted (in that order), and finally every run of
    whitespace is collapsed to a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ''),
        (r'@\w+', ''),
        (r'<.*?>', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [4]:
def load_and_preprocess_dataset(file_path):
    """Read a UTF-8 text file in full and return its cleaned contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The result of running ``preprocess_text`` over the whole file.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_contents = source.read()
    cleaned_contents = preprocess_text(raw_contents)
    return cleaned_contents

In [5]:
# Checkpoint identifier handed to `from_pretrained` when the model and tokenizer are loaded.
model_name = "gpt2"

In [6]:
# Load the tokenizer and the language-model weights from the same checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [7]:
# Reuse the end-of-sequence token as the padding token on the tokenizer side
# (presumably because this tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token
# Mirror the same choice in the model config so both agree on the padding id.
model.config.pad_token_id = model.config.eos_token_id

In [8]:
# Move the model onto the selected device; as the cell's last expression, the returned model is echoed below.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [9]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a block-wise language-modelling dataset from a raw text file.

    The file is cleaned with ``load_and_preprocess_dataset``, the cleaned text
    is written to ``preprocessed_climate.txt`` in the working directory, and
    that file is then tokenised by ``TextDataset``.

    Args:
        file_path: Path of the raw UTF-8 text file.
        tokenizer: Tokenizer passed through to ``TextDataset``.
        block_size: Value forwarded as ``TextDataset``'s ``block_size``.

    Returns:
        The constructed ``TextDataset``.
    """
    preprocessed_path = 'preprocessed_climate.txt'
    preprocessed_text = load_and_preprocess_dataset(file_path)
    with open(preprocessed_path, 'w', encoding='utf-8') as f:
        f.write(preprocessed_text)

    # TextDataset keeps a feature cache on disk next to its input file and
    # reuses it whenever it exists. The file above is regenerated on every
    # call, so the cache has to be rebuilt as well; otherwise edits to the
    # source text or to the preprocessing would be silently ignored.
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=preprocessed_path,
        block_size=block_size,
        overwrite_cache=True,
    )
    return dataset

In [10]:
# Tokenise the corpus with the default block size.
train_dataset = load_dataset(file_path="climate.txt", tokenizer=tokenizer)

In [11]:
# Batch collator for the Trainer; mlm=False turns masked-language-model masking off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
# Hyper-parameters for the fine-tuning run; output_dir is where the Trainer writes its files.
training_args = TrainingArguments(
    output_dir="./gpt2-climate",
    overwrite_output_dir=True,
    num_train_epochs=270,
    per_device_train_batch_size=24,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=0.00001,
    # Request mixed precision only when CUDA is present, so these arguments
    # remain valid on the CPU fallback chosen when `device` was selected.
    fp16=torch.cuda.is_available(),
    weight_decay=0.03,
)

In [13]:
# Wire the model, training arguments, dataset and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [14]:
# Run the fine-tuning loop; the logged losses and the returned TrainOutput are shown below.
trainer.train()

Step,Training Loss
500,5.7439
1000,5.2466
1500,4.9858
2000,4.7709
2500,4.5875
3000,4.4191
3500,4.2663
4000,4.1278
4500,3.9976
5000,3.8743


TrainOutput(global_step=16470, training_loss=3.5080776527279713, metrics={'train_runtime': 8629.5236, 'train_samples_per_second': 45.618, 'train_steps_per_second': 1.909, 'total_flos': 2.571505532928e+16, 'train_loss': 3.5080776527279713, 'epoch': 270.0})

In [15]:
# Write the fine-tuned model to the same directory that was given as output_dir.
trainer.save_model("./gpt2-climate")

In [16]:
# Store the tokenizer files alongside the model; the returned tuple of written paths is echoed below.
tokenizer.save_pretrained("./gpt2-climate")

('./gpt2-climate\\tokenizer_config.json',
 './gpt2-climate\\special_tokens_map.json',
 './gpt2-climate\\vocab.json',
 './gpt2-climate\\merges.txt',
 './gpt2-climate\\added_tokens.json')