@@ -1 +1 @@
from fused_lamb.fused_lamb import FusedLAMBAMP # NOQA
# from fused_lamb.fused_lamb import FusedLAMBAMP # NOQA
2 changes: 1 addition & 1 deletion PyTorch/LanguageModeling/BERT/modeling.py
@@ -119,7 +119,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
    return model

def gelu(x):
    return torch.nn.functional.gelu(x, approximate=True)
    return torch.nn.functional.gelu(x, approximate="tanh")

def swish(x):
    return x * torch.sigmoid(x)
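The gelu change tracks the PyTorch API: in recent PyTorch releases (1.12+), torch.nn.functional.gelu takes approximate="none" or approximate="tanh" rather than a bool, and the "tanh" variant matches the original BERT formulation. A quick sanity check, written as a minimal sketch that is not part of the PR:

import math
import torch
import torch.nn.functional as F

x = torch.linspace(-4.0, 4.0, steps=101)
exact = F.gelu(x)                             # default: erf-based GELU
tanh_approx = F.gelu(x, approximate="tanh")   # the variant used above

# Reference tanh formulation from the original BERT code.
manual = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

print((tanh_approx - manual).abs().max())   # ~0: same formula
print((tanh_approx - exact).abs().max())    # small on this range (well under 1e-2)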
14 changes: 10 additions & 4 deletions PyTorch/LanguageModeling/BERT/run_pretraining.py
@@ -40,7 +40,8 @@

import modeling
from schedulers import PolyWarmUpScheduler
from lamb_amp_opt.fused_lamb import FusedLAMBAMP
# from lamb_amp_opt.fused_lamb import FusedLAMBAMP
from apex.optimizers import FusedLAMB, FusedMixedPrecisionLamb

from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from utils import is_main_process, format_step, get_world_size, get_rank
@@ -183,6 +184,7 @@ def parse_arguments():
                        default=32,
                        type=int,
                        help="Total batch size for training.")
parser.add_argument("--new_lamb", type=bool, help="Use this flag to use FusedMixedPrecisionLamb")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
@@ -415,8 +417,10 @@ def prepare_model_and_optimizer(args, device, sequence_output_is_dense):
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = FusedLAMBAMP(optimizer_grouped_parameters,
                             lr=args.learning_rate)
    if args.new_lamb:
        optimizer = FusedMixedPrecisionLamb(optimizer_grouped_parameters, lr=args.learning_rate)
    else:
        optimizer = FusedLAMB(optimizer_grouped_parameters, lr=args.learning_rate)
    lr_scheduler = PolyWarmUpScheduler(optimizer,
                                       warmup=args.warmup_proportion,
                                       total_steps=args.max_steps,
@@ -463,7 +467,9 @@ def scale_by_grad_accum_steps_wrapper_hook(
    if args.gradient_accumulation_steps > 1:
        model.register_comm_hook(None, scale_by_grad_accum_steps_wrapper(allreduce_hook))

    optimizer.setup_fp32_params()
    if hasattr(optimizer, "setup_fp32_params"):
        optimizer.setup_fp32_params()


    criterion = BertPretrainingCriterion(config.vocab_size, sequence_output_is_dense=sequence_output_is_dense)

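For reference, the optimizer switch above boils down to the pattern below. This is a minimal sketch, not part of the PR; it assumes apex is installed with its CUDA extensions, and build_optimizer is a hypothetical helper name:

import torch
from apex.optimizers import FusedLAMB, FusedMixedPrecisionLamb

def build_optimizer(grouped_params, lr, new_lamb=False):
    optimizer_cls = FusedMixedPrecisionLamb if new_lamb else FusedLAMB
    optimizer = optimizer_cls(grouped_params, lr=lr)
    # Only the old FusedLAMBAMP path exposed setup_fp32_params(); the hasattr
    # guard keeps the call site working with either optimizer family.
    if hasattr(optimizer, "setup_fp32_params"):
        optimizer.setup_fp32_params()
    return optimizer

model = torch.nn.Linear(1024, 1024).cuda()
grouped_params = [{"params": list(model.parameters()), "weight_decay": 0.01}]
optimizer = build_optimizer(grouped_params, lr=7.5e-4)  # new_lamb=True -> FusedMixedPrecisionLamb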
98 changes: 98 additions & 0 deletions PyTorch/LanguageModeling/BERT/scripts/configs/pretrain_config.sh
@@ -13,6 +13,104 @@
# See the License for the specific language governing permissions and
# limitations under the License.

a100-40g_2gpu_fp16 ()
{
train_batch_size="1024"
learning_rate="7.5e-4"
new_lamb="false"
precision="fp16"
num_gpus=2
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=32
seed=42
job_name="bert_lamb_pretraining_old_fused_lamb"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="false"
train_batch_size_phase2=512
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=128
DATASET=pretrain/phase1/unbinned/parquet # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_configs/large.json
CODEDIR="/mkozuki/repositories/DeepLearningExamples/PyTorch/LanguageModeling/BERT"
init_checkpoint="None"
DATASET2=pretrain/phase2/bin_size_64/parquet # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
wikipedia_source=$BERT_PREP_WORKING_DIR/wikipedia/source/
num_dask_workers=128
num_shards_per_worker=128
num_workers=4
sample_ratio="0.9"
phase2_bin_size=64
masking=static
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR $init_checkpoint \
$wikipedia_source $num_dask_workers $num_shards_per_worker $num_workers \
$sample_ratio $phase2_bin_size $masking \
$BERT_CONFIG
}

a100-40g_2gpu_fp16_new_lamb ()
{
train_batch_size="1024"
learning_rate="7.5e-4"
new_lamb="true"
precision="fp16"
num_gpus=2
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=32
seed=42
job_name="bert_lamb_pretraining_new_fused_lamb"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="false"
train_batch_size_phase2=512
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=128
DATASET=pretrain/phase1/unbinned/parquet # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_configs/large.json
CODEDIR="/mkozuki/repositories/DeepLearningExamples/PyTorch/LanguageModeling/BERT"
init_checkpoint="None"
DATASET2=pretrain/phase2/bin_size_64/parquet # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
wikipedia_source=$BERT_PREP_WORKING_DIR/wikipedia/source/
num_dask_workers=128
num_shards_per_worker=128
num_workers=4
sample_ratio="0.9"
phase2_bin_size=64
masking=static
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR $init_checkpoint \
$wikipedia_source $num_dask_workers $num_shards_per_worker $num_workers \
$sample_ratio $phase2_bin_size $masking \
$BERT_CONFIG
}

dgxa100-80g_8gpu_fp16 ()
{
train_batch_size="8192"
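As a rough check on the two new a100-40g_2gpu configs, and assuming the usual convention of this repository (train_batch_size is per GPU and is divided by gradient_accumulation_steps inside run_pretraining.py, while run_pretraining.sh computes GBS as train_batch_size * num_gpus), the phase-1 numbers work out as:

# Phase-1 batch arithmetic for a100-40g_2gpu_fp16 / a100-40g_2gpu_fp16_new_lamb.
train_batch_size = 1024            # per GPU, per optimizer step
gradient_accumulation_steps = 32
num_gpus = 2

micro_batch_per_gpu = train_batch_size // gradient_accumulation_steps   # 32 sequences per forward pass
global_batch_size = train_batch_size * num_gpus                         # 2048 (the GBS used in the log tag)
print(micro_batch_per_gpu, global_batch_size)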
162 changes: 82 additions & 80 deletions PyTorch/LanguageModeling/BERT/scripts/run_pretraining.sh
@@ -179,6 +179,7 @@ CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --new_lamb=$new_lamb"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
@@ -244,86 +245,87 @@ if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi

if [ ! -d "${DATA_DIR_PHASE2}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE2})" ]; then
echo "Warning! ${DATA_DIR_PHASE2} directory missing."
if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
return -1
fi
preprocess_cmd=" \
mpirun \
--oversubscribe \
--allow-run-as-root \
-np ${num_dask_workers} \
-x LD_PRELOAD=/opt/conda/lib/libjemalloc.so \
preprocess_bert_pretrain \
--schedule mpi \
--vocab-file ${VOCAB_FILE} \
--wikipedia ${wikipedia_source} \
--sink ${DATA_DIR_PHASE2} \
--target-seq-length 512 \
--num-blocks ${num_blocks} \
--sample-ratio ${sample_ratio} \
${phase2_bin_size_flag} \
${masking_flag} \
--seed ${seed}"
echo "Running ${preprocess_cmd} ..."
${preprocess_cmd}

balance_load_cmd=" \
mpirun \
--oversubscribe \
--allow-run-as-root \
-np ${num_dask_workers} \
balance_dask_output \
--indir ${DATA_DIR_PHASE2} \
--num-shards ${num_blocks}"
echo "Running ${balance_load_cmd} ..."
${balance_load_cmd}
fi
echo $DATA_DIR_PHASE2
INPUT_DIR=$DATA_DIR_PHASE2
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE2"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --vocab_file=$VOCAB_FILE"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD+=" --disable_progress_bar"
CMD+=" --num_workers=${num_workers}"

CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"

if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi

set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
# if [ ! -d "${DATA_DIR_PHASE2}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE2})" ]; then
# echo "Warning! ${DATA_DIR_PHASE2} directory missing."
# if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
# echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
# return -1
# fi
# preprocess_cmd=" \
# mpirun \
# --oversubscribe \
# --allow-run-as-root \
# -np ${num_dask_workers} \
# -x LD_PRELOAD=/opt/conda/lib/libjemalloc.so \
# preprocess_bert_pretrain \
# --schedule mpi \
# --vocab-file ${VOCAB_FILE} \
# --wikipedia ${wikipedia_source} \
# --sink ${DATA_DIR_PHASE2} \
# --target-seq-length 512 \
# --num-blocks ${num_blocks} \
# --sample-ratio ${sample_ratio} \
# ${phase2_bin_size_flag} \
# ${masking_flag} \
# --seed ${seed}"
# echo "Running ${preprocess_cmd} ..."
# ${preprocess_cmd}
#
# balance_load_cmd=" \
# mpirun \
# --oversubscribe \
# --allow-run-as-root \
# -np ${num_dask_workers} \
# balance_dask_output \
# --indir ${DATA_DIR_PHASE2} \
# --num-shards ${num_blocks}"
# echo "Running ${balance_load_cmd} ..."
# ${balance_load_cmd}
# fi
# echo $DATA_DIR_PHASE2
# INPUT_DIR=$DATA_DIR_PHASE2
# CMD=" $CODEDIR/run_pretraining.py"
# CMD+=" --input_dir=$DATA_DIR_PHASE2"
# CMD+=" --output_dir=$CHECKPOINTS_DIR"
# CMD+=" --config_file=$BERT_CONFIG"
# CMD+=" --vocab_file=$VOCAB_FILE"
# CMD+=" --train_batch_size=$train_batch_size_phase2"
# CMD+=" --max_seq_length=512"
# CMD+=" --max_predictions_per_seq=80"
# CMD+=" --max_steps=$train_steps_phase2"
# CMD+=" --warmup_proportion=$warmup_proportion_phase2"
# CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
# CMD+=" --learning_rate=$learning_rate_phase2"
# CMD+=" --new_lamb=$new_lamb"
# CMD+=" --seed=$seed"
# CMD+=" $PREC"
# CMD+=" $ACCUMULATE_GRADIENTS"
# CMD+=" $CHECKPOINT"
# CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
# CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
# CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
# CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
# CMD+=" --disable_progress_bar"
# CMD+=" --num_workers=${num_workers}"

# CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"

# if [ "$create_logfile" = "true" ] ; then
# export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
# printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
# DATESTAMP=`date +'%y%m%d%H%M%S'`
# LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
# printf "Logs written to %s\n" "$LOGFILE"
# fi

# set -x
# if [ -z "$LOGFILE" ] ; then
# $CMD
# else
# (
# $CMD
# ) |& tee $LOGFILE
# fi

set +x
