@@ -1 +1 @@
from fused_lamb.fused_lamb import FusedLAMBAMP # NOQA
# from fused_lamb.fused_lamb import FusedLAMBAMP # NOQA
2 changes: 1 addition & 1 deletion PyTorch/LanguageModeling/BERT/modeling.py
@@ -119,7 +119,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
    return model

def gelu(x):
    return torch.nn.functional.gelu(x, approximate=True)
    return torch.nn.functional.gelu(x, approximate="tanh")

def swish(x):
    return x * torch.sigmoid(x)
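The gelu change tracks the PyTorch API: in recent PyTorch releases (1.12+), torch.nn.functional.gelu takes approximate="none" or approximate="tanh" rather than a bool, and the "tanh" variant matches the original BERT formulation. A quick sanity check, written as a minimal sketch that is not part of the PR:

import math
import torch
import torch.nn.functional as F

x = torch.linspace(-4.0, 4.0, steps=101)
exact = F.gelu(x)                             # default: erf-based GELU
tanh_approx = F.gelu(x, approximate="tanh")   # the variant used above

# Reference tanh formulation from the original BERT code.
manual = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

print((tanh_approx - manual).abs().max())   # ~0: same formula
print((tanh_approx - exact).abs().max())    # small on this range (well under 1e-2)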
14 changes: 10 additions & 4 deletions PyTorch/LanguageModeling/BERT/run_pretraining.py
@@ -40,7 +40,8 @@

import modeling
from schedulers import PolyWarmUpScheduler
from lamb_amp_opt.fused_lamb import FusedLAMBAMP
# from lamb_amp_opt.fused_lamb import FusedLAMBAMP
from apex.optimizers import FusedLAMB, FusedMixedPrecisionLamb

from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from utils import is_main_process, format_step, get_world_size, get_rank
@@ -183,6 +184,7 @@ def parse_arguments():
                        default=32,
                        type=int,
                        help="Total batch size for training.")
parser.add_argument("--new_lamb", type=bool, help="Use this flag to use FusedMixedPrecisionLamb")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
@@ -415,8 +417,10 @@ def prepare_model_and_optimizer(args, device, sequence_output_is_dense):
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = FusedLAMBAMP(optimizer_grouped_parameters,
                             lr=args.learning_rate)
    if args.new_lamb:
        optimizer = FusedMixedPrecisionLamb(optimizer_grouped_parameters, lr=args.learning_rate)
    else:
        optimizer = FusedLAMB(optimizer_grouped_parameters, lr=args.learning_rate)
    lr_scheduler = PolyWarmUpScheduler(optimizer,
                                       warmup=args.warmup_proportion,
                                       total_steps=args.max_steps,
@@ -463,7 +467,9 @@ def scale_by_grad_accum_steps_wrapper_hook(
    if args.gradient_accumulation_steps > 1:
        model.register_comm_hook(None, scale_by_grad_accum_steps_wrapper(allreduce_hook))

    optimizer.setup_fp32_params()
    if hasattr(optimizer, "setup_fp32_params"):
        optimizer.setup_fp32_params()


    criterion = BertPretrainingCriterion(config.vocab_size, sequence_output_is_dense=sequence_output_is_dense)

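For reference, the optimizer switch above boils down to the pattern below. This is a minimal sketch, not part of the PR; it assumes apex is installed with its CUDA extensions, and build_optimizer is a hypothetical helper name:

import torch
from apex.optimizers import FusedLAMB, FusedMixedPrecisionLamb

def build_optimizer(grouped_params, lr, new_lamb=False):
    optimizer_cls = FusedMixedPrecisionLamb if new_lamb else FusedLAMB
    optimizer = optimizer_cls(grouped_params, lr=lr)
    # Only the old FusedLAMBAMP path exposed setup_fp32_params(); the hasattr
    # guard keeps the call site working with either optimizer family.
    if hasattr(optimizer, "setup_fp32_params"):
        optimizer.setup_fp32_params()
    return optimizer

model = torch.nn.Linear(1024, 1024).cuda()
grouped_params = [{"params": list(model.parameters()), "weight_decay": 0.01}]
optimizer = build_optimizer(grouped_params, lr=7.5e-4)  # new_lamb=True -> FusedMixedPrecisionLamb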
98 changes: 98 additions & 0 deletions PyTorch/LanguageModeling/BERT/scripts/configs/pretrain_config.sh
@@ -13,6 +13,104 @@
# See the License for the specific language governing permissions and
# limitations under the License.

a100-40g_2gpu_fp16 ()
{
train_batch_size="1024"
learning_rate="7.5e-4"
new_lamb="false"
precision="fp16"
num_gpus=2
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=32
seed=42
job_name="bert_lamb_pretraining_old_fused_lamb"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="false"
train_batch_size_phase2=512
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=128
DATASET=pretrain/phase1/unbinned/parquet # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_configs/large.json
CODEDIR="/mkozuki/repositories/DeepLearningExamples/PyTorch/LanguageModeling/BERT"
init_checkpoint="None"
DATASET2=pretrain/phase2/bin_size_64/parquet # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
wikipedia_source=$BERT_PREP_WORKING_DIR/wikipedia/source/
num_dask_workers=128
num_shards_per_worker=128
num_workers=4
sample_ratio="0.9"
phase2_bin_size=64
masking=static
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR $init_checkpoint \
$wikipedia_source $num_dask_workers $num_shards_per_worker $num_workers \
$sample_ratio $phase2_bin_size $masking \
$BERT_CONFIG
}

a100-40g_2gpu_fp16_new_lamb ()
{
train_batch_size="1024"
learning_rate="7.5e-4"
new_lamb="true"
precision="fp16"
num_gpus=2
warmup_proportion="0.2843"
train_steps=7038
save_checkpoint_steps=200
resume_training="false"
create_logfile="true"
accumulate_gradients="true"
gradient_accumulation_steps=32
seed=42
job_name="bert_lamb_pretraining_new_fused_lamb"
allreduce_post_accumulation="true"
allreduce_post_accumulation_fp16="false"
train_batch_size_phase2=512
learning_rate_phase2="4e-3"
warmup_proportion_phase2="0.128"
train_steps_phase2=1563
gradient_accumulation_steps_phase2=128
DATASET=pretrain/phase1/unbinned/parquet # change this for other datasets
DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/"
BERT_CONFIG=bert_configs/large.json
CODEDIR="/mkozuki/repositories/DeepLearningExamples/PyTorch/LanguageModeling/BERT"
init_checkpoint="None"
DATASET2=pretrain/phase2/bin_size_64/parquet # change this for other datasets
DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/"
wikipedia_source=$BERT_PREP_WORKING_DIR/wikipedia/source/
num_dask_workers=128
num_shards_per_worker=128
num_workers=4
sample_ratio="0.9"
phase2_bin_size=64
masking=static
echo $train_batch_size $learning_rate $precision $num_gpus \
$warmup_proportion $train_steps $save_checkpoint_steps \
$resume_training $create_logfile $accumulate_gradients \
$gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \
$allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \
$warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \
$DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR $init_checkpoint \
$wikipedia_source $num_dask_workers $num_shards_per_worker $num_workers \
$sample_ratio $phase2_bin_size $masking \
$BERT_CONFIG
}

dgxa100-80g_8gpu_fp16 ()
{
train_batch_size="8192"
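As a rough check on the two new a100-40g_2gpu configs, and assuming the usual convention of this repository (train_batch_size is per GPU and is divided by gradient_accumulation_steps inside run_pretraining.py, while run_pretraining.sh computes GBS as train_batch_size * num_gpus), the phase-1 numbers work out as:

# Phase-1 batch arithmetic for a100-40g_2gpu_fp16 / a100-40g_2gpu_fp16_new_lamb.
train_batch_size = 1024            # per GPU, per optimizer step
gradient_accumulation_steps = 32
num_gpus = 2

micro_batch_per_gpu = train_batch_size // gradient_accumulation_steps   # 32 sequences per forward pass
global_batch_size = train_batch_size * num_gpus                         # 2048 (the GBS used in the log tag)
print(micro_batch_per_gpu, global_batch_size)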
162 changes: 82 additions & 80 deletions PyTorch/LanguageModeling/BERT/scripts/run_pretraining.sh
@@ -179,6 +179,7 @@ CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --new_lamb=$new_lamb"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
@@ -244,86 +245,87 @@ if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi

if [ ! -d "${DATA_DIR_PHASE2}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE2})" ]; then
echo "Warning! ${DATA_DIR_PHASE2} directory missing."
if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
return -1
fi
preprocess_cmd=" \
mpirun \
--oversubscribe \
--allow-run-as-root \
-np ${num_dask_workers} \
-x LD_PRELOAD=/opt/conda/lib/libjemalloc.so \
preprocess_bert_pretrain \
--schedule mpi \
--vocab-file ${VOCAB_FILE} \
--wikipedia ${wikipedia_source} \
--sink ${DATA_DIR_PHASE2} \
--target-seq-length 512 \
--num-blocks ${num_blocks} \
--sample-ratio ${sample_ratio} \
${phase2_bin_size_flag} \
${masking_flag} \
--seed ${seed}"
echo "Running ${preprocess_cmd} ..."
${preprocess_cmd}

balance_load_cmd=" \
mpirun \
--oversubscribe \
--allow-run-as-root \
-np ${num_dask_workers} \
balance_dask_output \
--indir ${DATA_DIR_PHASE2} \
--num-shards ${num_blocks}"
echo "Running ${balance_load_cmd} ..."
${balance_load_cmd}
fi
echo $DATA_DIR_PHASE2
INPUT_DIR=$DATA_DIR_PHASE2
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE2"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --vocab_file=$VOCAB_FILE"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD+=" --disable_progress_bar"
CMD+=" --num_workers=${num_workers}"

CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"

if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi

set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
# if [ ! -d "${DATA_DIR_PHASE2}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE2})" ]; then
# echo "Warning! ${DATA_DIR_PHASE2} directory missing."
# if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
# echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
# return -1
# fi
# preprocess_cmd=" \
# mpirun \
# --oversubscribe \
# --allow-run-as-root \
# -np ${num_dask_workers} \
# -x LD_PRELOAD=/opt/conda/lib/libjemalloc.so \
# preprocess_bert_pretrain \
# --schedule mpi \
# --vocab-file ${VOCAB_FILE} \
# --wikipedia ${wikipedia_source} \
# --sink ${DATA_DIR_PHASE2} \
# --target-seq-length 512 \
# --num-blocks ${num_blocks} \
# --sample-ratio ${sample_ratio} \
# ${phase2_bin_size_flag} \
# ${masking_flag} \
# --seed ${seed}"
# echo "Running ${preprocess_cmd} ..."
# ${preprocess_cmd}
#
# balance_load_cmd=" \
# mpirun \
# --oversubscribe \
# --allow-run-as-root \
# -np ${num_dask_workers} \
# balance_dask_output \
# --indir ${DATA_DIR_PHASE2} \
# --num-shards ${num_blocks}"
# echo "Running ${balance_load_cmd} ..."
# ${balance_load_cmd}
# fi
# echo $DATA_DIR_PHASE2
# INPUT_DIR=$DATA_DIR_PHASE2
# CMD=" $CODEDIR/run_pretraining.py"
# CMD+=" --input_dir=$DATA_DIR_PHASE2"
# CMD+=" --output_dir=$CHECKPOINTS_DIR"
# CMD+=" --config_file=$BERT_CONFIG"
# CMD+=" --vocab_file=$VOCAB_FILE"
# CMD+=" --train_batch_size=$train_batch_size_phase2"
# CMD+=" --max_seq_length=512"
# CMD+=" --max_predictions_per_seq=80"
# CMD+=" --max_steps=$train_steps_phase2"
# CMD+=" --warmup_proportion=$warmup_proportion_phase2"
# CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
# CMD+=" --learning_rate=$learning_rate_phase2"
# CMD+=" --new_lamb=$new_lamb"
# CMD+=" --seed=$seed"
# CMD+=" $PREC"
# CMD+=" $ACCUMULATE_GRADIENTS"
# CMD+=" $CHECKPOINT"
# CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
# CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
# CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
# CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
# CMD+=" --disable_progress_bar"
# CMD+=" --num_workers=${num_workers}"

# CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"

# if [ "$create_logfile" = "true" ] ; then
# export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
# printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
# DATESTAMP=`date +'%y%m%d%H%M%S'`
# LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
# printf "Logs written to %s\n" "$LOGFILE"
# fi

# set -x
# if [ -z "$LOGFILE" ] ; then
# $CMD
# else
# (
# $CMD
# ) |& tee $LOGFILE
# fi

set +x
