Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support nvidia-smi timeout #779

Merged
merged 2 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions agent/metrics_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,11 @@ func (ma *MetricsAgent) inputGo(name string, sum string, input inputs.Input) {
if err = inputs.MayInit(input); err != nil {
if !errors.Is(err, types.ErrInstancesEmpty) {
log.Println("E! failed to init input:", name, "error:", err)
} else {
if config.Config.DebugMode {
_, inputKey := inputs.ParseInputName(name)
log.Println("W! no instances for input: ", inputKey)
}
}
return
}
Expand Down
5 changes: 4 additions & 1 deletion conf/input.nvidia_smi/nvidia_smi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ nvidia_smi_command = ""
# Comma-separated list of the query fields.
# You can find out possible fields by running `nvidia-smi --help-query-gpus`.
# The value `AUTO` will automatically detect the fields to query.
query_field_names = "AUTO"
query_field_names = "AUTO"

# query_timeout sets the timeout for the nvidia-smi query, to avoid delaying data collection.
query_timeout = "5s"
8 changes: 4 additions & 4 deletions inputs/nvidia_smi/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ func buildMetricInfo(rField rField) MetricInfo {
}
}

func buildQFieldToRFieldMap(qFieldsRaw string, nvidiaSmiCommand string) ([]qField, map[qField]rField, error) {
qFieldsSeparated := strings.Split(qFieldsRaw, ",")
func (s *GPUStats) buildQFieldToRFieldMap() ([]qField, map[qField]rField, error) {
qFieldsSeparated := strings.Split(s.QueryFieldNames, ",")

qFields := toQFieldSlice(qFieldsSeparated)
qFields = append(qFields, requiredFields...)
qFields = removeDuplicateQFields(qFields)

if len(qFieldsSeparated) == 1 && qFieldsSeparated[0] == qFieldsAuto {
parsed, err := parseAutoQFields(nvidiaSmiCommand)
parsed, err := parseAutoQFields(s.NvidiaSmiCommand)
if err != nil {
log.Println("W! failed to auto-determine query field names, falling back to the built-in list. error:", err)
return getKeys(fallbackQFieldToRFieldMap), fallbackQFieldToRFieldMap, nil
Expand All @@ -62,7 +62,7 @@ func buildQFieldToRFieldMap(qFieldsRaw string, nvidiaSmiCommand string) ([]qFiel
qFields = parsed
}

resultTable, err := scrape(qFields, nvidiaSmiCommand)
resultTable, err := s.scrape()

var rFields []rField

Expand Down
10 changes: 5 additions & 5 deletions inputs/nvidia_smi/nvidia_smi.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ const inputName = "nvidia_smi"
type GPUStats struct {
config.PluginConfig

NvidiaSmiCommand string `toml:"nvidia_smi_command"`
QueryFieldNames string `toml:"query_field_names"`
NvidiaSmiCommand string `toml:"nvidia_smi_command"`
QueryFieldNames string `toml:"query_field_names"`
QueryTimeOut config.Duration `toml:"query_timeout"`

qFields []qField
qFieldToMetricInfoMap map[qField]MetricInfo
Expand All @@ -43,7 +44,7 @@ func (s *GPUStats) Init() error {
return types.ErrInstancesEmpty
}

qFieldsOrdered, qFieldToRFieldMap, err := buildQFieldToRFieldMap(s.QueryFieldNames, s.NvidiaSmiCommand)
qFieldsOrdered, qFieldToRFieldMap, err := s.buildQFieldToRFieldMap()
if err != nil {
return err
}
Expand All @@ -58,7 +59,6 @@ func (s *GPUStats) Gather(slist *types.SampleList) {
if s.NvidiaSmiCommand == "" {
return
}

begun := time.Now()

// scrape use seconds
Expand All @@ -67,7 +67,7 @@ func (s *GPUStats) Gather(slist *types.SampleList) {
slist.PushFront(types.NewSample(inputName, "scrape_use_seconds", use))
}(begun)

currentTable, err := scrape(s.qFields, s.NvidiaSmiCommand)
currentTable, err := s.scrape()
if err != nil {
slist.PushFront(types.NewSample(inputName, "scraper_up", 0))
return
Expand Down
10 changes: 5 additions & 5 deletions inputs/nvidia_smi/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ import (
"flashcat.cloud/categraf/pkg/cmdx"
)

func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
qFieldsJoined := strings.Join(QFieldSliceToStringSlice(qFields), ",")
func (s *GPUStats) scrape() (*table, error) {
qFieldsJoined := strings.Join(QFieldSliceToStringSlice(s.qFields), ",")

cmdAndArgs := strings.Fields(nvidiaSmiCommand)
cmdAndArgs := strings.Fields(s.NvidiaSmiCommand)
cmdAndArgs = append(cmdAndArgs, fmt.Sprintf("--query-gpu=%s", qFieldsJoined))
cmdAndArgs = append(cmdAndArgs, "--format=csv")

Expand All @@ -24,7 +24,7 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
cmd.Stdout = &stdout
cmd.Stderr = &stderr

err, timeout := cmdx.RunTimeout(cmd, time.Second*5)
err, timeout := cmdx.RunTimeout(cmd, time.Duration(s.QueryTimeOut))
if timeout {
return nil, fmt.Errorf("run command: %s timeout", strings.Join(cmdAndArgs, " "))
}
Expand All @@ -34,7 +34,7 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
strings.Join(cmdAndArgs, " "), err, stdout.String(), stderr.String())
}

t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), qFields)
t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), s.qFields)
if err != nil {
return nil, err
}
Expand Down