In [1]:
import os
import random
from transformers import AutoTokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling
import torch
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead, AutoModelForCausalLM
from transformers import pipeline

C:\Users\gabri\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\gabri\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
# Base model to finetune
base_model = "EleutherAI/gpt-neo-1.3B"

In [3]:
# What percent of the data is text data
test_per = 0.1

In [4]:
# Directories to the data
data_dir = f"Finetuning{os.sep}data_clean"
data_files = [str(i)+".txt" for i in range(1, 11)]

In [5]:
# Files to download/load data to
train_file_name = f"Finetuning{os.sep}train_data_mini.txt"
test_file_name = f"Finetuning{os.sep}test_data_mini.txt"

In [5]:
# test/train data sizes
test_size = 0
train_size = 0

# Open the output files
with open(train_file_name, "w") as train_file:
    with open(test_file_name, "w") as test_file:

        # Iterate over each file and load in the data
        for file in data_files:
            # Open the file
            with open(data_dir + os.sep + file, "r") as f:
                # Iterate over all data in the file
                for line in f:
                    # Get a random number between 0 and 1
                    num = random.uniform(0, 1)
                    
                    # If the number is greater than the test size,
                    # add it to the train data
                    if num > test_per:
                        train_file.write(line)
                        train_size += 1
                    else:
                        test_file.write(line)
                        test_size += 1
                        
print(f"Number of train data: {train_size}")
print(f"Number of test data: {test_size}")

Number of train data: 899830
Number of test data: 100170


In [6]:
# Load in the model
tokenizer = AutoTokenizer.from_pretrained(base_model, framework="pt", device=torch.device("cpu"), torch_dtype=torch.float16)

In [7]:
tokenizer

PreTrainedTokenizerFast(name_or_path='EleutherAI/gpt-neo-1.3B', vocab_size=50257, model_max_len=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [8]:
# Get the tokenizer max size
max_size = 1024

In [9]:
# Create the dataset
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=max_size)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=max_size)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, # non masking for generation
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_file_name,test_file_name,tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (8828858 > 2048). Running this sequence through the model will result in indexing errors


In [10]:
# Setup the model trainer
model = AutoModelForCausalLM.from_pretrained(base_model).to(torch.device("cpu"))

training_args = TrainingArguments(
    output_dir="Finetuning/outputs", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=1, # batch size for training
    per_device_eval_batch_size=1,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    no_cuda=False,
    fp16=True,
    fp16_full_eval=True,
    )

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Using cuda_amp half precision backend


In [12]:
# Train the model
trainer.train()

***** Running training *****
  Num examples = 8621
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 25863
  Number of trainable parameters = 1315575808


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 23.99 GiB total capacity; 23.11 GiB already allocated; 0 bytes free; 23.14 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
# Save the model
trainer.save_model()

Saving model checkpoint to Finetuning/outputs
Configuration saved in Finetuning/outputs\config.json
Model weights saved in Finetuning/outputs\pytorch_model.bin


In [19]:
# Test the model
test_model = pipeline('text-generation',model="Finetuning/outputs", tokenizer='EleutherAI/gpt-neo-1.3B')

RuntimeError: Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer may not be compatible with the default model. Please provide a PreTrainedModel class or a path/identifier to a pretrained model when providing tokenizer.

In [21]:
test_model('I love you')[0]['generated_text']