Skip to content

Commit

Permalink
cfgen: config override for edge hosts
Browse files Browse the repository at this point in the history
Summary: same as title

Reviewed By: anps77

Differential Revision: D58466056

fbshipit-source-id: 38276d0fc93d7e0bcbf6f56f5dd8426f42a1392d
  • Loading branch information
Chengxiong Ruan authored and facebook-github-bot committed Jun 12, 2024
1 parent a315d60 commit 35a3527
Show file tree
Hide file tree
Showing 18 changed files with 1,104 additions and 34 deletions.
62 changes: 49 additions & 13 deletions src/oomd/cfgen/src/cfgen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ fn get_attributes(node: &Node) -> ConfigParams {
oomd_high_threshold_duration: String::from("60"),
oomd_threshold: String::from("60"),
oomd_threshold_duration: String::from("90"),
oomd_restart_threshold: oomd2_oomd_restart_threshold(),
oomd_restart_threshold: oomd2_oomd_restart_threshold(node),
oomd_reclaim_duation: String::from("10"),
oomd_post_action_delay: String::from("15"),
swap_protection_detect_threshold: String::from("5"),
Expand Down Expand Up @@ -870,13 +870,17 @@ fn oomd_extra_rulesets(node: &Node) -> Vec<RuleSet> {
]
}

fn oomd2_oomd_restart_threshold() -> BTreeMap<String, OomdRestartThreshold> {
btreemap! {
String::from("smc_proxy.service") => OomdRestartThreshold{
threshold: String::from("15G"),
duration: String::from("10"),
post_action_delay: String::from("20"),
service_name: String::from("smc_proxy.service")}
/// Builds the per-service oomd restart-threshold table for `node`.
///
/// Returns an empty map when `get_host_type(node)` is `HostType::GEdge` or
/// `HostType::Fna`; for every other host type the map holds a single entry
/// keyed by `smc_proxy.service`.
///
/// NOTE(review): `HostType::FnEdge` is not excluded here, although it is
/// grouped with `GEdge`/`Fna` in the other host-type lists in this file --
/// confirm that this is intentional.
fn oomd2_oomd_restart_threshold(node: &Node) -> BTreeMap<String, OomdRestartThreshold> {
    // Guard clause: these host types get no restart thresholds at all.
    if matches!(get_host_type(node), HostType::GEdge | HostType::Fna) {
        return BTreeMap::new();
    }

    // The same unit name serves as both the map key and `service_name`.
    let service = "smc_proxy.service";
    btreemap! {
        String::from(service) => OomdRestartThreshold {
            threshold: String::from("15G"),
            duration: String::from("10"),
            post_action_delay: String::from("20"),
            service_name: String::from(service),
        }
    }
}

Expand Down Expand Up @@ -913,7 +917,15 @@ fn should_setup_iocost(node: &Node) -> bool {
}

fn fbtax2_blacklisted_jobs(node: &Node) -> Vec<&'static str> {
if [HostType::TwShared, HostType::Tw].contains(&get_host_type(node)) {
if [
HostType::TwShared,
HostType::Tw,
HostType::FnEdge,
HostType::GEdge,
HostType::Fna,
]
.contains(&get_host_type(node))
{
return vec![
// This ML model has extremely high memory usage, they need to fix
// their stuff at some point.
Expand Down Expand Up @@ -945,16 +957,27 @@ fn senpai_targets(node: &Node) -> Option<String> {
}

match get_host_type(node) {
HostType::TwShared | HostType::Tw => Some(String::from(
"system.slice,workload.slice/workload-wdb.slice,hostcritical.slice,workload.slice/workload-wdb.slice/*,hostcritical.slice/*",
)),
HostType::TwShared | HostType::Tw | HostType::FnEdge | HostType::GEdge | HostType::Fna => {
Some(String::from(
"system.slice,workload.slice/workload-wdb.slice,hostcritical.slice,workload.slice/workload-wdb.slice/*,hostcritical.slice/*",
))
}
HostType::Synmon => Some(String::from("system.slice")),
_ => None,
}
}

fn senpai_limit_min_bytes(node: &Node) -> Option<String> {
if [HostType::TwShared, HostType::Tw, HostType::Synmon].contains(&get_host_type(node)) {
if [
HostType::TwShared,
HostType::Tw,
HostType::Synmon,
HostType::FnEdge,
HostType::GEdge,
HostType::Fna,
]
.contains(&get_host_type(node))
{
let min_bytes = 100 * 1024 * 1024;
return Some(min_bytes.to_string());
}
Expand Down Expand Up @@ -999,9 +1022,22 @@ fn get_host_type(node: &Node) -> HostType {
return HostType::Dns;
}

if node.hostname_prefix() == FNEDGE {
return HostType::FnEdge;
}

if node.hostname_prefix() == GEDGE {
return HostType::GEdge;
}

if node.hostname_prefix() == FNA {
return HostType::Fna;
}

if node.is_devserver() {
return HostType::DevServer;
}

HostType::Default
}

Expand Down
3 changes: 3 additions & 0 deletions src/oomd/cfgen/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ pub enum HostType {
Tw,
Synmon,
Dns,
FnEdge,
GEdge,
Fna,
}

pub struct ConfigParams {
Expand Down
12 changes: 8 additions & 4 deletions src/oomd/cfgen/test/cfgen_test_inputs/devgpu_T17.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@generated SignedSource<<e8dbe09a4bea0f6be69a57a2bdd4e8e2>>
@generated SignedSource<<c1fa5e796716c915704d08118fe9ae17>>
@codegen-command arc cfgen update-inputs fb-oomd
{
"fqdn": "devgpu109.cln3.facebook.com",
Expand Down Expand Up @@ -36,7 +36,9 @@
"vendor_id": 5348,
"device_id": 5969,
"class_code": 131072,
"board_part_number": "BCM957504-N1100FXB"
"board_part_number": "BCM957504-N1100FXB",
"current_speed_mts": 8000,
"current_width": 16
}
},
"static_smc_tiers": [],
Expand All @@ -48,7 +50,8 @@
"driver": "bnxt_en",
"driver_version": "5.19.0-0_fbk9_fbaccel.phvuisqkh",
"firmware_version": "220.0.59.0/pkg 220.0.83.0",
"bus_info": "0000:b3:00.0"
"bus_info": "0000:b3:00.0",
"speed_mbps": 100000
}
}
},
Expand Down Expand Up @@ -115,7 +118,8 @@
"device_nics_enum": [
"ETH0",
"SVC0"
]
],
"loaded_kernel_modules": []
},
"reservationConfig": {
"active_machine_materialization_id": "",
Expand Down
4 changes: 2 additions & 2 deletions src/oomd/cfgen/test/cfgen_test_inputs/devgpu_non_T17.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@generated SignedSource<<94b019e8a8c2490e29066f90748585d9>>
@generated SignedSource<<c2dabf2100bc0e0bf76987c73632b0b6>>
@codegen-command arc cfgen update-inputs fb-oomd
{
"fqdn": "devgpu013.eag5.facebook.com",
Expand Down Expand Up @@ -55,7 +55,7 @@
"deviceType": "SERVER",
"datacenter": "eag5",
"cluster": "04",
"memTotal": 2163675131904,
"memTotal": 2163675127808,
"osVersion": {
"distribution_name": "CentOS Stream release",
"version": 9,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@generated SignedSource<<5c629f38a06ffc96c81ea393c0ff79c0>>
@generated SignedSource<<7094d5f857ef7db387134b6aaa98ec53>>
@codegen-command arc cfgen update-inputs fb-oomd
{
"fqdn": "devvm010.atn6.facebook.com",
Expand Down Expand Up @@ -45,7 +45,8 @@
"driver": "virtio_net",
"driver_version": "1.0.0",
"firmware_version": "",
"bus_info": "0000:00:02.0"
"bus_info": "0000:00:02.0",
"speed_mbps": 0
}
}
},
Expand Down Expand Up @@ -75,7 +76,8 @@
"device_nics_enum": [
"ETH0",
"SVC0"
]
],
"loaded_kernel_modules": []
},
"reservationConfig": {
"active_machine_materialization_id": "",
Expand Down
148 changes: 148 additions & 0 deletions src/oomd/cfgen/test/cfgen_test_inputs/fna_shard00.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
@generated SignedSource<<77a12826ffacd49093dab7d916d4ce1c>>
@codegen-command arc cfgen update-inputs fb-oomd
{
"fqdn": "fna007.01.futp1.facebook.com",
"region": "other",
"clusterType": "FNA",
"modelId": 342361,
"kernelRelease": "5.19.0-0_fbk21_hardened_rc9_12630_gab30f3f5259a",
"serverType": "TYPE_VI_FEED",
"experiments": [],
"cpuArchitecture": "icelake",
"metalosRootfs": false,
"provisioningConfig": {
"ethtoolByInterface": {
"eth0": {
"maxChannelsCombined": 63
},
"ip6tnl0": {},
"tunl0": {},
"tunlany0": {}
},
"cpuCoreCount": 48,
"parentModelId": 328972,
"recoveryEnvironment": false,
"deviceType": "SERVER",
"datacenter": "futp1",
"cluster": "01",
"memTotal": 265750892544,
"osVersion": {
"distribution_name": "EdgeOS",
"version": 1708953544,
"is_in_ramdisk": false,
"is_metalos": false
},
"pciByAddress": {
"0000:47:00.0": {
"vendor_id": 5555,
"device_id": 4125,
"class_code": 131072,
"board_part_number": "MCX623435AC-CDAI",
"current_speed_mts": 16000,
"current_width": 16
}
},
"static_smc_tiers": [],
"machine": "x86_64"
},
"bootConfig": {
"ethtoolByInterface": {
"eth0": {
"driver": "mlx5_core",
"driver_version": "5.19.0-0_fbk21_hardened_rc9_126",
"firmware_version": "22.34.4000 (MT_0000000695)",
"bus_info": "0000:47:00.0",
"speed_mbps": 100000
},
"ip6tnl0": {
"driver": "ip6tnl",
"driver_version": "5.19.0-0_fbk21_hardened_rc9_126",
"firmware_version": "",
"bus_info": "",
"speed_mbps": 0
},
"tunl0": {
"driver": "ipip",
"driver_version": "5.19.0-0_fbk21_hardened_rc9_126",
"firmware_version": "",
"bus_info": "",
"speed_mbps": 0
},
"tunlany0": {
"driver": "ip6tnl",
"driver_version": "5.19.0-0_fbk21_hardened_rc9_126",
"firmware_version": "",
"bus_info": "",
"speed_mbps": 0
}
}
},
"runtimeConfig": {
"hasHighPrivCert": true,
"regionRoutableCluster": "oth1.01",
"block_devices": {
"block_devices": {
"nvme0n1": {
"size_bytes": 3840755982336,
"is_rotational": false,
"model": "SAMSUNG MZQL23T8HCLS-00A07",
"serial": "S64HNJ0T644493",
"physical_block_size": 4096,
"logical_block_size": 4096,
"is_root": false
},
"nvme1n1": {
"size_bytes": 960197124096,
"is_rotational": false,
"model": "SAMSUNG MZQL2960HCJR-00A07",
"serial": "S64FNE0T407240",
"physical_block_size": 4096,
"logical_block_size": 512,
"is_root": true
},
"nvme2n1": {
"size_bytes": 3840755982336,
"is_rotational": false,
"model": "SAMSUNG MZQL23T8HCLS-00A07",
"serial": "S64HNJ0T644491",
"physical_block_size": 4096,
"logical_block_size": 4096,
"is_root": false
}
}
},
"dynamic_smc_tiers": [],
"cluster_state": "CLUSTER_IN_USE",
"installed_platforms": [
"platform010",
"platform010-compat"
],
"device_nics_enum": [
"ETH0",
"SVC0"
],
"loaded_kernel_modules": [],
"mounts": {
"mounts": [
{
"device": "/dev/mapper/transient",
"mount_point": "/",
"fstype": "btrfs",
"mount_options": [
"rw",
"relatime",
"compress-force=zstd:3",
"ssd",
"space_cache=v2",
"subvolid=256",
"subvol=/volume"
]
}
]
}
},
"reservationConfig": {
"active_machine_materialization_id": "",
"current_reservation_host_profile_id": "NEWLY_PROVISIONED_PROFILE"
}
}
Loading

0 comments on commit 35a3527

Please sign in to comment.