# GPT-2 Training

In [1]:
# Kaggle helper
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/combined/test_combined.csv
/kaggle/input/combined/combined.csv


In [2]:
import transformers

In [3]:
from datasets import load_dataset
# load the dataset
dataset = load_dataset("csv", data_files="../input/combined/combined.csv", split="train")
test = load_dataset("csv", data_files="../input/combined/test_combined.csv", split="train")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-416316b76e136610/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-416316b76e136610/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-424c78d725883fd3/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-424c78d725883fd3/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


In [4]:
dataset.features

{'content': Value(dtype='string', id=None),
 'topic': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None)}

In [5]:
test

Dataset({
    features: ['Id', 'Body', 'Title', 'Tags', 'text', 'label'],
    num_rows: 391
})

## Tokenization

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
        return tokenizer(examples["content"], padding="max_length", truncation=True)

def test_tokenize(examples):
        return tokenizer(examples["Title"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
# remove the content and topic columns in training dataset
tokenized_datasets = tokenized_datasets.remove_columns(["topic","content"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 298
})

In [8]:
# remove the content and topic columns in test dataset
test_tokenized = test.map(test_tokenize, batched=True)
test_tokenized = test_tokenized.remove_columns(["Id","Title", "Body","text","Tags"])
test_tokenized = test_tokenized.rename_column("label", "labels")
test_tokenized.set_format("torch")
test_tokenized

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 391
})

In [9]:
# shuffle the dataset and split into train, eval and test
train_dataset = tokenized_datasets.shuffle(seed=42).select(range(0,200))
eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(200,298))
test_dataset = test_tokenized.shuffle(seed=42)

In [10]:
# vocab size
vocab_size = tokenizer.vocab_size

# loading the model

In [11]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("gpt2-medium", num_labels=4, vocab_size= vocab_size, pad_token_id=tokenizer.eos_token_id)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer", num_train_epochs=2, per_device_train_batch_size=2, per_device_eval_batch_size=2,logging_dir="test_trainer", evaluation_strategy="epoch")

In [17]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [19]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="/test_trainer", num_train_epochs=3, per_device_train_batch_size=1, per_device_eval_batch_size=1,logging_dir="test_trainer", evaluation_strategy="epoch")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


# hyperparameters

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [21]:
import torch
torch.cuda.empty_cache()

In [22]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [23]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda



# Training

In [24]:
trainer.train()

***** Running training *****
  Num examples = 200
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 300
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.549496,0.836735
2,No log,0.454798,0.897959
3,No log,0.399188,0.928571


***** Running Evaluation *****
  Num examples = 98
  Batch size = 2
***** Running Evaluation *****
  Num examples = 98
  Batch size = 2
***** Running Evaluation *****
  Num examples = 98
  Batch size = 2


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=300, training_loss=0.600687001546224, metrics={'train_runtime': 430.5495, 'train_samples_per_second': 1.394, 'train_steps_per_second': 0.697, 'total_flos': 1114455932928000.0, 'train_loss': 0.600687001546224, 'epoch': 3.0})

# Testing of the Model

In [25]:
predictions = trainer.predict(test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

***** Running Prediction *****
  Num examples = 391
  Batch size = 2


(391, 4) (391,)


In [29]:
import numpy as np
labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=labels)

{'accuracy': 0.5907928388746803}

In [30]:
# -- body
def test_tokenize(examples):
        return tokenizer(examples["Body"], padding="max_length", truncation=True)
test_tokenized = test.map(test_tokenize, batched=True)
test_tokenized = test_tokenized.remove_columns(["Id","Title", "Body","text","Tags"])
test_tokenized = test_tokenized.rename_column("label", "labels")
test_tokenized.set_format("torch")
test_tokenized


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 391
})

In [31]:
test_dataset = test_tokenized.shuffle(seed=42)
predictions = trainer.predict(test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)
import numpy as np
labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=labels)

***** Running Prediction *****
  Num examples = 391
  Batch size = 2


(391, 4) (391,)


{'accuracy': 0.5268542199488491}

In [32]:
# -- title + body
def test_tokenize(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)
test_tokenized = test.map(test_tokenize, batched=True)
test_tokenized = test_tokenized.remove_columns(["Id","Title", "Body","text","Tags"])
test_tokenized = test_tokenized.rename_column("label", "labels")
test_tokenized.set_format("torch")
test_tokenized

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 391
})

In [33]:
test_dataset = test_tokenized.shuffle(seed=42)
predictions = trainer.predict(test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)
import numpy as np
labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=labels)

***** Running Prediction *****
  Num examples = 391
  Batch size = 2


(391, 4) (391,)


{'accuracy': 0.5754475703324808}