# speed_monitor.py
"""See MosaicML's Composer Speed Monitor Callback here:
https://github.com/mosaicml/composer/blob/dev/composer/callbacks/speed_monitor.py
"""
from typing import Optional

from pydantic import BaseModel


class MFUEstimationResult(BaseModel):
    flops_per_token_per_fwdbwd: float
    flops_per_sequence_per_fwdbwd: float
    flops_per_iter_per_fwdbwd: float
    flops_achieved_per_second: float
    mfu: float


def estimate_mfu(
    num_decoder_blocks: int,
    num_heads: int,
    d_model: int,
    context_length: int,
    model_total_parameters: int,
    effective_batch_size_per_iter: int,
    time_taken_per_iter: float,
    gpu_promised_flops: float = 312e12,  # A100 bfloat16/float16 peak is 312 TFLOPs
) -> MFUEstimationResult:
"""
Estimate Model FLOPs Utilization (MFU) as a ratio of achieved FLOPs to
the A100 GPU's peak FLOPs capability.
Parameters
----------
num_decoder_blocks : int
Number of decoder blocks in the Transformer model.
num_heads : int
Number of attention heads in each Transformer block.
d_model : int
Dimension of the model's embeddings.
context_length : int
Number of tokens in each input sequence.
model_total_parameters : int
Total number of learnable parameters in the model.
effective_batch_size_per_iter : int
Effective batch size processed in one iteration, accounting for
gradient accumulation.
time_taken_per_iter : float
Time taken per training iteration in seconds.
gpu_promised_flops : float, optional
Theoretical peak performance of the GPU in FLOPs (default is
312e12 for A100 GPU bfloat16/float16 operations).
Returns
-------
MFUEstimationResult:
Pydantic model with the following fields:
- flops_per_token_per_fwdbwd: FLOPs required for forward and backward pass of a single token
- flops_per_sequence_per_fwdbwd: FLOPs required for forward and backward pass of a single sequence
- flops_per_iter_per_fwdbwd: FLOPs required for forward and backward pass of the effective batch size
- flops_achieved_per_second: FLOPs achieved per second
- mfu: Model FLOPs Utilization (MFU) as a ratio of achieved FLOPs to the A100 GPU's peak FLOPs capability
Example
-------
>>> estimate_mfu(
... num_decoder_blocks=6,
... num_heads=8,
... d_model=512,
... context_length=1024,
... model_total_parameters=1_000_000,
... effective_batch_size_per_iter=20 * 8, # 20 sequences per GPU, 8 GPUs
... time_taken_per_iter=0.1, # 0.1 seconds per iteration
... gpu_promised_flops=312e12, # A100 GPU bfloat16/float16 peak flops
... )
Notes
-----
This function utilizes the formula from the PaLM paper Appendix B
(https://arxiv.org/abs/2204.02311) for estimating the FLOPs required
for one forward and backward pass of a single token and scales it up to
the effective batch size and the given model architecture to calculate MFU.
You can likely use it as a callback in your `Trainer` to log the MFU during training.
"""
    # fmt: off
    N, L, H, Q, T = model_total_parameters, num_decoder_blocks, num_heads, d_model // num_heads, context_length
    flops_per_token_per_fwdbwd = 6 * N + 12 * L * H * Q * T         # forward + backward FLOPs for 1 token
    flops_per_sequence_per_fwdbwd = flops_per_token_per_fwdbwd * T  # 1 sequence = T tokens
    flops_per_iter_per_fwdbwd = flops_per_sequence_per_fwdbwd * effective_batch_size_per_iter  # all sequences in one iteration
    # express achieved throughput as a ratio of the GPU's peak FLOPs
    flops_achieved_per_second = flops_per_iter_per_fwdbwd / time_taken_per_iter
    mfu = flops_achieved_per_second / gpu_promised_flops
    # fmt: on
    return MFUEstimationResult(
        flops_per_token_per_fwdbwd=flops_per_token_per_fwdbwd,
        flops_per_sequence_per_fwdbwd=flops_per_sequence_per_fwdbwd,
        flops_per_iter_per_fwdbwd=flops_per_iter_per_fwdbwd,
        flops_achieved_per_second=flops_achieved_per_second,
        mfu=mfu,
    )


def estimate_training_days(
    total_tokens_in_corpus: float,
    mfu_result: MFUEstimationResult,
    gpu_promised_flops: float = 312e12,  # default A100 GPU peak FLOPs
    num_gpus: int = 8,  # default number of GPUs
    assumed_mfu: Optional[float] = None,  # optional manual MFU override
) -> float:
"""
Estimate the total training time in days based on the model FLOPs
utilization and other training parameters.
Parameters
----------
total_tokens_in_corpus : float
Total number of tokens to be processed during training.
mfu_result : MFUEstimationResult
The result from the estimate_mfu function, containing FLOPs
metrics and Model FLOPs Utilization.
gpu_promised_flops : float, optional
Theoretical peak FLOPs performance of a single GPU.
num_gpus : int, optional
Number of GPUs used in the training setup.
assumed_mfu : float, optional
If provided, overrides the MFU calculated in mfu_result to
manually adjust the utilization rate.
Returns
-------
training_days : float
Estimated total training time in days.
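
    Example
    -------
    A hypothetical end-to-end sketch; the corpus size below is an
    illustrative assumption, not a measured value.

    >>> mfu_result = estimate_mfu(
    ...     num_decoder_blocks=6,
    ...     num_heads=8,
    ...     d_model=512,
    ...     context_length=1024,
    ...     model_total_parameters=1_000_000,
    ...     effective_batch_size_per_iter=20 * 8,
    ...     time_taken_per_iter=0.1,
    ... )
    >>> estimate_training_days(
    ...     total_tokens_in_corpus=300e9,  # illustrative 300B-token corpus
    ...     mfu_result=mfu_result,
    ...     num_gpus=8,
    ... )  # doctest: +SKIP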
"""
    mfu = assumed_mfu if assumed_mfu is not None else mfu_result.mfu
    # total FLOPs needed to process the entire corpus
    total_flops_needed = total_tokens_in_corpus * mfu_result.flops_per_token_per_fwdbwd
    # effective cluster throughput after discounting peak FLOPs by MFU
    effective_flops_per_second = gpu_promised_flops * num_gpus * mfu
    # total training time in seconds, then days
    total_training_time_seconds = total_flops_needed / effective_flops_per_second
    total_training_time_days = total_training_time_seconds / (3600 * 24)
    return total_training_time_days
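

# A minimal usage sketch (not part of the original module): it runs the
# docstring example numbers through both estimators. The corpus size and
# iteration timing are illustrative assumptions, not measurements.
if __name__ == "__main__":
    result = estimate_mfu(
        num_decoder_blocks=6,
        num_heads=8,
        d_model=512,
        context_length=1024,
        model_total_parameters=1_000_000,
        effective_batch_size_per_iter=20 * 8,  # 20 sequences per GPU, 8 GPUs
        time_taken_per_iter=0.1,  # assumed seconds per iteration
    )
    print(f"MFU: {result.mfu:.2%}")  # roughly 23% for these illustrative numbers

    days = estimate_training_days(
        total_tokens_in_corpus=300e9,  # illustrative 300B-token corpus
        mfu_result=result,
        num_gpus=8,
    )
    print(f"Estimated training time: {days:.2f} days")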