In [2]:
import torch
print(torch.cuda.get_device_name(0))
print(torch.version.cuda)


AssertionError: Torch not compiled with CUDA enabled

In [3]:
model_name = "gpt2"
train_file = "dataset_train.csv"
output_dir = "/model"

In [4]:
# Load the GPT tokenizer.
# https://huggingface.co/docs/transformers/v4.25.1/en/model_doc/gpt2#transformers.GPT2Tokenizer
from transformers import GPT2Tokenizer

 # gpt2-medium
tokenizer = GPT2Tokenizer.from_pretrained(
    model_name, 
    bos_token='<|startoftext|>', 
    eos_token='<|endoftext|>', 
    pad_token='<|pad|>'
)
     



In [5]:
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

The max model length is 1024 for this model, although the actual embedding size for GPT small is 768
The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


In [6]:
# Get the datasets

from datasets import load_dataset

data_files = {}
dataset_args = {}
validation_split_percentage = 5
extension = "csv"
data_files = {
    "train": train_file, 
}

raw_datasets = load_dataset(
    extension, 
    sep=";", 
    data_files=data_files
)

raw_datasets["validation"] = load_dataset(
    extension,
    sep=";", 
    data_files=data_files,
    split=f"train[:{validation_split_percentage}%]",
    **dataset_args,
)

raw_datasets["train"] = load_dataset(
    extension,
    sep=";", 
    data_files=data_files,
    split=f"train[{validation_split_percentage}%:]",
    **dataset_args,
)

In [7]:
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
# 'text' is found. You can easily tweak this behavior (see below).
text_column_name = "text"
column_names = raw_datasets["train"].column_names

# The number of processes to use for the preprocessing.
preprocessing_num_workers = None

# We can now call the tokenizer on all our texts.
# This is very simple, using the map method from the Datasets library.
# First we define a function that call the tokenizer on our texts:
def tokenize_function(examples):
    output = tokenizer(examples[text_column_name])
    return output

# Then we apply it to all the splits in our datasets object, using batched=True 
# and 4 processes to speed up the preprocessing.
# We won't need the description column afterward, so we discard it.

tokenized_datasets = raw_datasets.map(
    tokenize_function, 
    batched=True, 
    num_proc=preprocessing_num_workers,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)

In [8]:
from itertools import chain

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
block_size = 1024

# Overwrite the cached training and evaluation sets
overwrite_cache = False

# Code from here:
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L445
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=preprocessing_num_workers,
    load_from_cache_file=not overwrite_cache,
    desc=f"Grouping texts in chunks of {block_size}",
)

In [11]:
import torch
print(torch.version.cuda)

None


In [13]:
import torch
import random
import numpy as np

from transformers import GPT2Config, GPT2LMHeadModel

# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained(
    model_name, 
    output_hidden_states=False
  )

# instantiate the model
model = GPT2LMHeadModel.from_pretrained(
    model_name, 
    config=configuration
)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cpu")
model.to(device)

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [15]:
# Initialize the Trainer
from transformers import (
    TrainingArguments, 
    Trainer, 
    default_data_collator, 
)

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

# https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    evaluation_strategy="no",  # No evaluation is done during training.
    save_strategy="no",  # No save is done during training.
    )


# https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/trainer#transformers.Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
    compute_metrics=None,
    preprocess_logits_for_metrics=None,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [11]:
# Training
train_result = trainer.train(resume_from_checkpoint=None)

NameError: name 'trainer' is not defined

In [12]:

print("Saving model to %s" % output_dir)

trainer.save_model(output_dir=output_dir)  # Saves the tokenizer too for easy upload
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()

Saving model to /model


NameError: name 'trainer' is not defined

In [13]:

kwargs = {
    "finetuned_from": model_name, 
    "tasks": "text-generation"
    }
trainer.create_model_card(**kwargs)

NameError: name 'trainer' is not defined

In [14]:

# # Load a trained model and vocabulary that you have fine-tuned
#model = GPT2LMHeadModel.from_pretrained(output_dir)
#tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
#model.to(device)

In [15]:

# Generate Text
model.eval()

prompt = "As a sowtware architect, I"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                top_p=0.95, 
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

AssertionError: Torch not compiled with CUDA enabled