#### Importing Packages

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import random
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Selecting cpu or gpu based on device

device = "cpu"

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device.")

Using mps device.


In [3]:
# Parameters

N_POSITION = 1024 # 128

N_CTX = 1024 # 128

N_EMBD = 1024

N_LAYER = 16

N_HEAD = 16

# SAVE_STEPS = 10_000

LEARNING_RATE = 1e-6

SAVE_STEPS = 100

NUM_EPOCH = 1

BATCH_SIZE = 4 

SAVE_TOTAL_LIMIT = 1

BLOCK_SIZE = 1024 

OUTPUT_DIR = "./gpt_mental_health"

INPUT_FILE = "Train-Data.txt"

OVERWRITE_OUTPUT_DIR = True

MODEL_PATH = "Models/New_E6"

VOCAB_SIZE = 51000

LAST_CHECK_POINT = None

# W_B_API ='3d46c4eb36ede90bf9d2f07dbb15218d3476042a'

In [4]:
# Defining the gpt-2 tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [5]:
# Dataset class to process the data for training

class MentalHealthDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size, shuffle=True):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.samples = []
        
        with open(file_path, encoding="utf-8") as f:
            for sentence in f.readlines():
                self.samples.append(sentence)
            
            if shuffle:
                random.shuffle(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        tokenized_text = tokenizer(self.samples[idx] , padding="max_length", truncation=True, max_length=self.block_size, return_tensors="pt")
        encoded_inputs = tokenized_text

        input_ids = torch.tensor(encoded_inputs['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(encoded_inputs['attention_mask'], dtype=torch.long)
        
        labels = input_ids.clone()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [6]:
# Initialize GPT-2 configuration
config = GPT2Config(
  vocab_size = VOCAB_SIZE,
  n_positions = N_POSITION,
  n_ctx = N_CTX,
  n_embd = N_EMBD,
  n_layer = N_LAYER,
  n_head = N_HEAD,
  learning_rate = LEARNING_RATE
)

In [7]:
# Initialize GPT-2 model
if MODEL_PATH is None:
    model = GPT2LMHeadModel(config)
    model = model.to(device)
else:
    model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)
    model = model.to(device)

In [8]:
model.num_parameters

<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51000, 2048)
    (wpe): Embedding(1024, 2048)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-31): 32 x GPT2Block(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2048, out_features=51000, bias=False)
)>

In [9]:
# Traing arguments to train the model

training_args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    overwrite_output_dir = OVERWRITE_OUTPUT_DIR,
    num_train_epochs = NUM_EPOCH,
    per_device_train_batch_size = BATCH_SIZE,
    save_steps = SAVE_STEPS,
    save_total_limit = SAVE_TOTAL_LIMIT,
    use_mps_device = True
)

In [10]:
training_args.device

device(type='mps')

In [11]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = MentalHealthDataset(tokenizer, INPUT_FILE, block_size=BLOCK_SIZE, shuffle=False) 
)

In [None]:
# Starting the training

if LAST_CHECK_POINT == None:
    trainer.train()
else:
    trainer.train(resume_from_checkpoint = True)

In [25]:
# Saing the model

model.save_pretrained("./New_Model")

In [10]:
# Loading the saved model

model = GPT2LMHeadModel.from_pretrained("./New_Model").to(device)