In [1]:
!pip install transformers torch datasets
!pip install transformers accelerate
!pip install tqdm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import datasets
from transformers import GPT2TokenizerFast
import torch
from torch.utils.data import DataLoader

In [28]:
from datasets import load_dataset

ds = load_dataset("ESGBERT/environment_data")

In [8]:
ds2 = load_dataset("exo-is/environmental-reporting-on-listed-companies")

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

environmental_keywords.csv:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17277 [00:00<?, ? examples/s]

In [6]:
ds['train'][1]['sentence']

'Working with community and Indigenous-led coalitions, we focused on undoing the damage of the Trump years and pursuing public lands solutions to climate change, species extinction and expanding equitable access to nature.'

In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 2100586
    })
})

In [16]:
ds2['train']['Keyword-in-context (750 characters before and after Keyword)'][800]

'Nonhazardous solid waste intensity\t\n0.01\t\n0.01\t\n0.01\nHazardous waste intensity\t\n0.01\t\n0.02\t\n0.03\nPerformance data\nMaterial use\nNonhazardous \nsolid waste by \ndestination\n(MT)\nHazardous solid \nwaste by \ndestination\n(MT)\nWaste intensities \n(MT / MT)\nMaterial use\nThe scope of raw material data is limited to global supply chain \nmanufacturing facilities. Ecolab purchases reconditioned drums and uses \nreusable containers to avoid the use of virgin plastic. The scope of post-\nconsumer resin packaging is North America, Europe and China. The scope \nof reconditioned and reusable packaging is North America and Europe.\nNonhazardous solid waste by destination\nAll waste is disposed of directly by the organization or otherwise directly \nconfirmed by the waste disposal contractor'

In [29]:
# You can limit the number of training samples if needed. Doing so will reduce the quality of your model but will reduce training times.
# I recommend this only when setting up your code (so you don't need to wait for the map function while making things work).
NUM_TRAIN_SAMPLES = len(ds['train'])//10

if NUM_TRAIN_SAMPLES is not None:
  ds['train'] = ds['train'].shuffle(seed=42).select(range(NUM_TRAIN_SAMPLES))

In [30]:
ds

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 210058
    })
})

In [31]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [32]:
len(tokenizer)

50257

In [33]:
# We also add a padding token to allow us to create training batches of a fixed size.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [34]:
# This isn't the only way to tokenize your dataset, but it's a good starting point. The GPT2TokenizerFast class has a lot of useful functions built-in.
# We eventually want to create batches of sequences where each sequence has the same number of tokens (denoted here as "context size"). This tokenization method will ensure that every sequence is the same size.
# truncation=True will truncate any sequences that are longer than the "context size," and padding='max_length' will add PAD tokens to the end of any sequence that is shorter than the "context size."
# Depending on how you do your training, you may need to configure your loss so that it ignores padding tokens.
CONTEXT_SIZE = 512 # This will need to be adjusted based on the context size of your model.
tokenized_dataset = ds.map(lambda examples: tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=CONTEXT_SIZE), batched=True)

Map:   0%|          | 0/210058 [00:00<?, ? examples/s]

In [35]:
# The collate function is where we can preprocess our sequence batches.
# Here, we combine the input sequences into a single batch tensor (and do the same for our attention masks).
def collate_fn(batch):

  input_ids = torch.stack([torch.tensor(item['input_ids']) for item in batch])
  attention_mask = torch.stack([torch.tensor(item['attention_mask']) for item in batch])

  return {
      'input_ids': input_ids,
      'attention_mask': attention_mask
  }

In [36]:
# DataLoaders make it easy to batch our data and use our collate function efficiently. You don't need to use them, but they will likely speed up your training time significantly.
# You can iterate over the dataloader to get batches of (preprocessed) sequences.
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=4, collate_fn=collate_fn, shuffle=True)
# val_dataloader = DataLoader(tokenized_dataset['validation'], batch_size=4, collate_fn=collate_fn, shuffle=False)

In [45]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

# Customize the GPT-2 model
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions = CONTEXT_SIZE,
    n_embd = 512,
    n_layer = 1,
    n_head = 8
)

model = GPT2LMHeadModel(config)

# Resize token embeddings in case you're adding custom tokens
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 512)

In [46]:
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to GPU
model.to(device)

Using device: cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 512)
    (wpe): Embedding(512, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=1536, nx=512)
          (c_proj): Conv1D(nf=512, nx=512)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=2048, nx=512)
          (c_proj): Conv1D(nf=512, nx=2048)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=512, out_features=50258, bias=False)
)

In [None]:
from accelerate import Accelerator
from tqdm import tqdm  # Import tqdm for progress bars
from torch.optim import AdamW

epochs = 2
optimizer = AdamW(model.parameters(), lr = 1e-4)

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize Accelerator
accelerator = Accelerator()
device = accelerator.device # Get the device from Accelerator

# Prepare the model, optimizer, and dataloaders for training with Accelerator
model, optimizer, train_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader
)

# Training loop with GPU usage and gradient accumulation
for epoch in range(epochs):
    total_loss = 0
    model.train()  # Explicitly set model to training mode

    # Wrap the training loop with tqdm for progress bar
    with tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch") as train_bar:
        for batch in train_bar:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['input_ids'])

            # Compute loss
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass and optimization
            accelerator.backward(loss)  # Use accelerator.backward
            optimizer.step()

            # Update the progress bar with the current loss
            train_bar.set_postfix(loss=loss.item())

    avg_train_loss = total_loss / len(train_dataloader)

    # Print results after the epoch
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")


Using device: cuda


Epoch 1:  74%|███████▍  | 38983/52515 [1:29:10<31:09,  7.24batch/s, loss=0.3]

In [43]:
import os

# Define output directory for saving the model and tokenizer
output_dir = "fine_tuned_gpt2"
os.makedirs(output_dir, exist_ok=True)

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}.")

Model and tokenizer saved to fine_tuned_gpt2.


In [44]:
input_texts = ["Flooding precautions"]

for prompt in input_texts:
  print(f"Prompt: {prompt}")
  for num_beam in [1,5,10]:
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate text using beam search
    beam_output = model.generate(input_ids,
                                num_beams=num_beam,
                                no_repeat_ngram_size=2,
                                max_length=300,
                                early_stopping=True)

    generated_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)
    print(f"Number of beams: {num_beam}")
    print("Generated Text: ")
    print(generated_text)
    print('-' * 50)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Flooding precautions


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Number of beams: 1
Generated Text: 
Flooding precautions are not only used to produce the water for water. .S. Environmental Protection Agency (EPA) and the U.S., the total water consumption. . .3)4.1.5% of water withdrawal.3.4%5 percent of our water usage.2% water use.g.6% in water and water used in our manufacturing processes. and manufacturing facilities. 3.0% and waste management. 4.7% reduction in the production of waste.8% by 2025.9% from the previous year. — the equivalent of the main water withdrawals.com/water and wastewater treatment.html.pdf – in a year, and recycling of wastewater consumption of a water footprint.e. – and reuse and disposal of hazardous waste is not yet the waste generated. 5. 1. 6. in 2020.D.10% is a waste that result.11.A. (2. 2.P.ON.V.a. 8.00. 7.E. for the use of all of total of production.17% decrease.’s total. 	3% (3%, and in% for our waste,000. b.B.15.16 — in 2018.12. n. 10. 12. ·. 13.org/en. �.13.aspx. 9.18. - 5% 1, water-3 percent reduction of which

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Number of beams: 5
Generated Text: 
Flooding precautions to ensure that the Company’s natural gas pipelines, and pipelines are subject to a high degree of subjectivity in the oil and gas and NGLs and natural resources and the availability of oil, gas production and production of crude oil production, which could be adversely affected by our operations and adversely affect our results of operations, financial condition and cash flows of drilling, development and development activities and capital expenditures, including but are not limited to: drilling and drilling activities; the timing of exploration and evaluation, production activities, such as well as the exploration activities that are required in accordance with the requirements; and (ii) the extent of the U.S. Environmental Protection Agency (“”); and operations; (i.P.e.g., and in which we operate in connection with a variety of future operations or in order to comply with our properties, or (v) or production processes that may 