modified single GPU version
cristinae committed Aug 4, 2023
1 parent f6fbaca commit 95c43c6
Showing 5 changed files with 97 additions and 129 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -5,8 +5,7 @@ Transformer for classification tasks that operates with document fragments
## Features

* UPCOMING: PERT distribution for building document embeddings
* Multi-node and multi-GPU support using the ```Accelerate``` library
* Gradient accumulation to train with larger effective batches
* Gradient accumulation to train with larger effective batches using the ```Accelerate``` library
* An input data streaming implementation to allow training with large datasets
* Possibility to build document embeddings before classification, both during training and at classification time
* Document embedding built as the average of the ```[CLS]``` token of _n_ parts of the document:
@@ -21,10 +20,12 @@ Transformer for classification tasks that operates with document fragments

## Requirements

* [PyTorch](http://pytorch.org/) version >= 1.9.1
* [Python](https://www.python.org) version >= 3.9
* [PyTorch](http://pytorch.org/) version >= 2.0.1
* [Accelerate](https://github.com/huggingface/accelerate) version >= 0.21.0

## Example Usage

### Slurm

``` srun --ntasks 1 --gpus-per-task 4 accelerate launch --multi_gpu docClassifier.py --gradient_accumulation_size 2```
``` srun --ntasks 1 --gpus-per-task 1 python -u docClassifier.py --gradient_accumulation_size 2```
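For context, the Features list in the README hunk above describes a document embedding built as the average of the ```[CLS]``` token over _n_ parts of a document. The sketch below illustrates that idea outside the repository code; it assumes an XLM-RoBERTa backbone (here the smaller ```xlm-roberta-base```) and replaces the repository's char/sentence splitting with a naive equal-length split.

```python
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaModel

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
encoder = XLMRobertaModel.from_pretrained("xlm-roberta-base")
encoder.eval()

def document_embedding(doc, n_parts=4):
    # Naive equal-length character split into n parts (a placeholder for the
    # repository's char/sentence splitting strategies).
    step = max(1, len(doc) // n_parts)
    parts = [doc[i:i + step] for i in range(0, len(doc), step)][:n_parts]
    cls_vectors = []
    with torch.no_grad():
        for part in parts:
            enc = tokenizer(part, return_tensors="pt", truncation=True, max_length=512)
            last_hidden = encoder(**enc).last_hidden_state   # (1, seq_len, hidden)
            cls_vectors.append(last_hidden[:, 0, :])          # [CLS]-position embedding of this part
    # Document embedding = average of the per-part [CLS] embeddings
    return torch.cat(cls_vectors, dim=0).mean(dim=0)

emb = document_embedding("Un documento largo de ejemplo. " * 40)
print(emb.shape)   # torch.Size([768]) for xlm-roberta-base
```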
11 changes: 5 additions & 6 deletions data.py
@@ -17,11 +17,11 @@ def __init__(self, filename):
def line_mapper(self, line):

columns = line.split('\t')
label = columns[0]
label = labelToyData(label)
#label = labelPoliticsData(label)
label = columns[2]
#label = labelToyData(label)
label = labelPoliticsData(label)

doc = columns[3]
doc = columns[5]
return (doc, label)

def __iter__(self):
@@ -150,7 +150,6 @@ def classNamesPoliticsData():
return ['Left', 'Right']

def labelPoliticsData(label):
#neutral appears in the test, but it is not used
d = {'left':0, 'right':1, 'neutral':1}
d = {'left':0, 'right':1}
return d[label]
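For reference, the revised ```line_mapper``` now takes the label from the third tab-separated column and the document text from the sixth. A minimal sketch of that contract follows; every field other than the label and document columns is a hypothetical placeholder, since the corpus layout is not shown in this diff.

```python
# Sketch of the updated line_mapper contract (columns 2 and 5 are the ones the
# commit actually uses; the remaining fields below are invented for illustration).
def labelPoliticsData(label):
    return {'left': 0, 'right': 1}[label]

def line_mapper(line):
    columns = line.split('\t')
    label = labelPoliticsData(columns[2])  # label moved from column 0 to column 2
    doc = columns[5]                       # document moved from column 3 to column 5
    return doc, label

sample = "doc_id\tsource\tleft\ttopic\tdate\tTexto del documento de ejemplo"
doc, label = line_mapper(sample)
print(label, doc)   # -> 0 Texto del documento de ejemplo
```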

66 changes: 31 additions & 35 deletions docClassifier.py
@@ -2,21 +2,14 @@

import warnings
import argparse
import os.path
import logging

from accelerate import Accelerator
from accelerate.utils import set_seed
import os

import torch
import trainer

import numpy as np
import random


logger = logging.getLogger(__name__)


def readCommandLine():
''' Parser for command line arguments'''
@@ -29,12 +22,13 @@ def boolean_string(s):
parser = argparse.ArgumentParser(description="Fine-tuning Roberta for document classification")

# Input/Output
parser.add_argument("-c", "--train_dataset", required=False, type=str, default='./train10000', help="Training dataset for classification")
parser.add_argument("-v", "--validation_dataset", type=str, default='./dev10000', help="Validation set")
parser.add_argument("-c", "--train_dataset", required=False, type=str, default='../chatGPT/corpora/full.trainSelectedTopicsC.endees', help="Training dataset for classification")
parser.add_argument("-v", "--validation_dataset", type=str, default='../chatGPT/corpora/full.devSelectedTopicsC.endees', help="Validation set")
parser.add_argument("-t", "--test_dataset", type=str, default='./corpus/corpus.right.elimparcial.txt', help="Test set to evaluate or classify")
parser.add_argument("-o", "--classification_model", required=False, type=str, default='./model/model.monob6.bin', help="Name for the model file")
parser.add_argument("--buffer", type=int, default=1000, help="Test documents are not loaded completely into memory but into <buffer> chunks. Default: 100000 documents.")
parser.add_argument("--shuffling", type=boolean_string, default=True, help="Suffling within a dataset buffer. Options: True, False. Default: True.")
parser.add_argument("-o", "--classification_model", required=False, type=str, default='./model/model.es.8b56lre6.bin', help="Name for the model file")
# Shuffling has been removed, please shuffle the training data beforehand
#parser.add_argument("--buffer", type=int, default=10000, help="Test documents are not loaded completely into memory but into <buffer> chunks. Default: 100000 documents.")
#parser.add_argument("--shuffling", type=boolean_string, default=True, help="Suffling within a dataset buffer. Options: True, False. Default: True.")

# Task (training by default)
parser.add_argument("--task", type=str, default='training', help="Task to perform. Options: training, evaluation, classification. Default: training.")
@@ -50,56 +44,58 @@ def boolean_string(s):
parser.add_argument("--split_method", type=str, default='sentence', help="How to split the document. Options: char (exact splitting at char level), sentence (aprox splitting at sentence level). Default: sentence.")

# Base model
parser.add_argument("-m", "--pretrained_model", type=str, default='skimai/spanberta-base-cased', help="pretrained model (currently only Roberta family implemented)")
# xlm-roberta-large
parser.add_argument("-m", "--pretrained_model", type=str, default='xlm-roberta-large', help="pretrained model (currently only Roberta family implemented)")
parser.add_argument("-f", "--freeze_pretrained", type=boolean_string, default=False, help="Freeze weights of the pretrained model. Options: True, False")

# Training
parser.add_argument("-e", "--epochs", type=int, default=5, help="Number of epochs")
parser.add_argument("-b", "--batch_size", type=int, default=6, help="Number of documents in a batch. Default: 1")
parser.add_argument("--gradient_accumulation_size", type=int, default=1, help="Creating a larger effective batch size by accumulating n batches before updating. Default: 1")
parser.add_argument("--sentence_batch_size", type=int, default=12, help="Number of sentences per document in a batch. Default: 24.")
parser.add_argument("--eval_steps", type=int, default=500, help="Number of batches prior to validation. Default: 100")
parser.add_argument("-e", "--epochs", type=int, default=6, help="Number of epochs")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="Number of documents in a batch. Default: 1")
parser.add_argument("--sentence_batch_size", type=int, default=24, help="Number of sentences per document in a batch. Default: 24.")
parser.add_argument("--eval_steps", type=int, default=1000, help="Number of batches prior to validation. Default: 1000")

parser.add_argument("--lr", type=float, default=2e-5, help="Learning rate for AdamW. Default: 2e-5.")
parser.add_argument("--lr", type=float, default=5e-6, help="Learning rate for AdamW. Default: 5e-6.")
parser.add_argument("--dropout_prepooling", type=float, default=0.1, help="Dropout to be applied after retrieving the embeddings and before pooling. Default: 0.1.")

# Classifier
parser.add_argument("-d", "--dropout_postpooling", type=float, default=0.1, help="Dropout to be applied before the last linear layer of the classifier. Default: 0.1.")
parser.add_argument("--input_to_classifier", type=str, default='cls_tanh', help="Type of input for the classifier. Options: cls_tanh, cls_raw, poolAvg_tanh, poolAvg_raw. If split_documents is set to True, only cls_tanh is available. Default: cls_tanh.")
parser.add_argument("--number_classes", type=int, default=3, help="Number of classes. Default: 3")
parser.add_argument("--number_classes", type=int, default=2, help="Number of classes. Default: 2")

# Utils
parser.add_argument("--seed", type=int, default=1642, help="Seed to be used by torch, numpy and random (int)")


args = parser.parse_args()
return(args)


def check_args(args):
# TODO
#if not os.path.isfile(args.classification_model):
# raise ValueError(".")
if (args.input_to_classifier):
if (args.input_classifier):
return


if __name__ == "__main__":

args = readCommandLine()
check_args(args)

accelerator = Accelerator(dispatch_batches=False, gradient_accumulation_steps=args.gradient_accumulation_size)
device = accelerator.device
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Working on" , device)

# TODO seed per commandline
RANDOM_SEED = 1642
set_seed(RANDOM_SEED)
RANDOM_SEED = args.seed
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

torch.cuda.manual_seed_all(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Working on" , device)

# the trained model will be stored in 'model'
if not os.path.exists('model'):
os.makedirs('model')

torch.set_printoptions(threshold=10000)

if(args.task == 'training'):
trainer.trainingLoop(accelerator, device, args)
trainer.trainingLoop(device, args)
elif (args.task == 'evaluation'):
trainer.evaluation(device, args)
elif (args.task == 'classification'):
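Several of the arguments above are declared with ```type=boolean_string```; the body of that helper is collapsed in this diff, but the usual pattern behind such a flag (an assumption, not the repository's verbatim code) looks like this:

```python
import argparse

def boolean_string(s):
    # Accept only the literal strings "True"/"False" so that
    # `--freeze_pretrained False` does not silently evaluate to True
    # (as it would with type=bool, since bool('False') is True).
    if s not in ('True', 'False'):
        raise ValueError('Not a valid boolean string: ' + s)
    return s == 'True'

parser = argparse.ArgumentParser()
parser.add_argument("-f", "--freeze_pretrained", type=boolean_string, default=False,
                    help="Freeze weights of the pretrained model. Options: True, False")
args = parser.parse_args(["--freeze_pretrained", "False"])
print(args.freeze_pretrained)   # -> False
```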
21 changes: 7 additions & 14 deletions network.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

import transformers
from transformers import get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel
from transformers import get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, XLMRobertaTokenizer, XLMRobertaModel
import torch
from torch import nn, optim

@@ -11,10 +11,7 @@ class DocTransformerClassifier(nn.Module):

def __init__(self, nClasses, args, device):
super(DocTransformerClassifier, self).__init__()
# add_pooling_layer=False is needed for parallelism
# https://github.com/UKPLab/sentence-transformers/issues/1454
# https://github.com/UKPLab/sentence-transformers/pull/1215
self.transformer = RobertaModel.from_pretrained(args.pretrained_model, add_pooling_layer=False, return_dict=False)
self.transformer = XLMRobertaModel.from_pretrained(args.pretrained_model, return_dict=False)
self.device = device
self.batch_size = args.batch_size
self.split_docs = args.split_documents
@@ -49,8 +46,6 @@ def forward(self, input_ids, attention_mask):
lineBreak = [0, 203, 2] # Achtung! Hardcoded for Roberta tokeniser (but we don't have the tokeniser at this point)
last_hidden_batch = torch.zeros([1, self.transformer.config.hidden_size], dtype=torch.float32).to(self.device)
last_hidden_state_average = torch.zeros([self.batch_size, self.transformer.config.hidden_size], dtype=torch.float32).to(self.device)
#last_hidden_batch = torch.zeros([1, self.transformer.config.hidden_size], dtype=torch.float32)
#last_hidden_state_average = torch.zeros([self.batch_size, self.transformer.config.hidden_size], dtype=torch.float32)
batch = 0
sentences_in_batch = 0
for sentence_ids, last_hidden in zip(input_ids, last_hidden_state):
@@ -61,7 +56,6 @@ def forward(self, input_ids, attention_mask):
batch += 1
sentences_in_batch = 0
last_hidden_batch = torch.zeros([1, self.transformer.config.hidden_size], dtype=torch.float64).to(self.device)
#last_hidden_batch = torch.zeros([1, self.transformer.config.hidden_size], dtype=torch.float64)
# This is the equivalent to cls_tanh
pooled_bymethod = self.tanhPrep(last_hidden_state_average)
else:
@@ -111,16 +105,16 @@ def mean_pool(token_embeds, attention_mask):

def setTokenizer(args):

return RobertaTokenizer.from_pretrained(args.pretrained_model)
return XLMRobertaTokenizer.from_pretrained(args.pretrained_model)

def loadTokenizer():

return RobertaTokenizer.from_pretrained('./model/')
return XLMRobertaTokenizer.from_pretrained('./model/')

def setModel(args, device, nClasses):

model = DocTransformerClassifier(nClasses, args, device)
model = model.to(device) # adding accelerate
model = model.to(device)

return(model)
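The hunk header above places these changes just after ```mean_pool(token_embeds, attention_mask)```, whose body is not shown in this diff. The standard masked-mean pooling such a helper usually implements (an assumption based on the signature alone) is:

```python
import torch

def mean_pool(token_embeds, attention_mask):
    # Average token embeddings over the sequence, ignoring padding positions.
    mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    summed = (token_embeds * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)   # avoid division by zero
    return summed / counts                     # (batch, hidden)
```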

@@ -135,14 +129,13 @@ def setScheduler(args, optimizer, dataSize):

totalSteps = dataSize*args.epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=totalSteps)

#scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=totalSteps, num_cycles=int(args.epochs/2))
return(scheduler)


def setLoss(device):

lossFN = nn.CrossEntropyLoss().to(device) # adding accelerate
#lossFN = nn.CrossEntropyLoss()
lossFN = nn.CrossEntropyLoss().to(device)
return(lossFN)
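```setScheduler``` above keeps the linear warmup schedule and leaves a cosine-with-hard-restarts alternative commented out. A self-contained sketch of the two options, with the model, per-epoch step count, and learning rate used here only as placeholders:

```python
import torch
from torch import nn, optim
from transformers import (get_linear_schedule_with_warmup,
                          get_cosine_with_hard_restarts_schedule_with_warmup)

model = nn.Linear(768, 2)        # stand-in for DocTransformerClassifier
epochs, dataSize = 6, 1000       # dataSize ~ optimisation steps per epoch
optimizer = optim.AdamW(model.parameters(), lr=5e-6)
totalSteps = dataSize * epochs

# Option kept by this commit: linear decay, no warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=totalSteps)

# Option left commented out: cosine schedule with hard restarts,
# roughly one restart cycle every two epochs
# scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
#     optimizer, num_warmup_steps=0, num_training_steps=totalSteps,
#     num_cycles=int(epochs / 2))

# Typical training-loop usage: loss.backward(); optimizer.step();
# scheduler.step(); optimizer.zero_grad()
```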

