11// SPDX-License-Identifier: GPL-2.0
22/* Copyright (c) 2024, Intel Corporation. */
33
4- #include "health.h"
54#include "ice.h"
5+ #include "ice_adminq_cmd.h" /* for enum ice_aqc_health_status_elem */
6+ #include "health.h"
67
/*
 * Put the member @name of the object pointed to by @obj into @fmsg, using the
 * member's own identifier (stringified) as the key.
 */
#define ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, obj, name) \
	devlink_fmsg_put(fmsg, #name, (obj)->name)
910
11+ #define ICE_HEALTH_STATUS_DATA_SIZE 2
12+
13+ struct ice_health_status {
14+ enum ice_aqc_health_status code ;
15+ const char * description ;
16+ const char * solution ;
17+ const char * data_label [ICE_HEALTH_STATUS_DATA_SIZE ];
18+ };
19+
/*
 * In addition to the health status codes listed below, the firmware might
 * generate health status codes that are not pertinent to the end user.
 * For instance, health code 0x1002 is triggered when the command fails.
 * Such codes should be disregarded by the end user.
 * The lookup table below must be kept sorted by code in ascending order.
 */
27+
/*
 * Strings shared by several entries of the health status lookup table.
 *
 * They are deliberately arrays rather than "const char * const" pointers: an
 * array name is an address constant and therefore a valid initializer for an
 * object with static storage duration, whereas the value of a const pointer
 * object is not a constant expression in standard C (older GCC releases
 * reject it with "initializer element is not constant").
 */
static const char ice_common_port_solutions[] =
	"Check your cable connection. Change or replace the module or cable. Manually set speed and duplex.";
static const char ice_port_number_label[] = "Port Number";
static const char ice_update_nvm_solution[] = "Update to the latest NVM image.";
32+
33+ static const struct ice_health_status ice_health_status_lookup [] = {
34+ {ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT , "An unsupported module was detected." ,
35+ ice_common_port_solutions , {ice_port_number_label }},
36+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_TYPE , "Module type is not supported." ,
37+ "Change or replace the module or cable." , {ice_port_number_label }},
38+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_QUAL , "Module is not qualified." ,
39+ ice_common_port_solutions , {ice_port_number_label }},
40+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_COMM ,
41+ "Device cannot communicate with the module." ,
42+ "Check your cable connection. Change or replace the module or cable. Manually set speed and duplex." ,
43+ {ice_port_number_label }},
44+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_CONFLICT , "Unresolved module conflict." ,
45+ "Manually set speed/duplex or change the port option. If the problem persists, use a cable/module that is found in the supported modules and cables list for this device." ,
46+ {ice_port_number_label }},
47+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_NOT_PRESENT , "Module is not present." ,
48+ "Check that the module is inserted correctly. If the problem persists, use a cable/module that is found in the supported modules and cables list for this device." ,
49+ {ice_port_number_label }},
50+ {ICE_AQC_HEALTH_STATUS_INFO_MOD_UNDERUTILIZED , "Underutilized module." ,
51+ "Change or replace the module or cable. Change the port option." ,
52+ {ice_port_number_label }},
53+ {ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_LENIENT , "An unsupported module was detected." ,
54+ ice_common_port_solutions , {ice_port_number_label }},
55+ {ICE_AQC_HEALTH_STATUS_ERR_INVALID_LINK_CFG , "Invalid link configuration." ,
56+ NULL , {ice_port_number_label }},
57+ {ICE_AQC_HEALTH_STATUS_ERR_PORT_ACCESS , "Port hardware access error." ,
58+ ice_update_nvm_solution , {ice_port_number_label }},
59+ {ICE_AQC_HEALTH_STATUS_ERR_PORT_UNREACHABLE , "A port is unreachable." ,
60+ "Change the port option. Update to the latest NVM image." },
61+ {ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_MOD_LIMITED , "Port speed is limited due to module." ,
62+ "Change the module or configure the port option to match the current module speed. Change the port option." ,
63+ {ice_port_number_label }},
64+ {ICE_AQC_HEALTH_STATUS_ERR_PARALLEL_FAULT ,
65+ "All configured link modes were attempted but failed to establish link. The device will restart the process to establish link." ,
66+ "Check link partner connection and configuration." ,
67+ {ice_port_number_label }},
68+ {ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_PHY_LIMITED ,
69+ "Port speed is limited by PHY capabilities." ,
70+ "Change the module to align to port option." , {ice_port_number_label }},
71+ {ICE_AQC_HEALTH_STATUS_ERR_NETLIST_TOPO , "LOM topology netlist is corrupted." ,
72+ ice_update_nvm_solution , {ice_port_number_label }},
73+ {ICE_AQC_HEALTH_STATUS_ERR_NETLIST , "Unrecoverable netlist error." ,
74+ ice_update_nvm_solution , {ice_port_number_label }},
75+ {ICE_AQC_HEALTH_STATUS_ERR_TOPO_CONFLICT , "Port topology conflict." ,
76+ "Change the port option. Update to the latest NVM image." },
77+ {ICE_AQC_HEALTH_STATUS_ERR_LINK_HW_ACCESS , "Unrecoverable hardware access error." ,
78+ ice_update_nvm_solution , {ice_port_number_label }},
79+ {ICE_AQC_HEALTH_STATUS_ERR_LINK_RUNTIME , "Unrecoverable runtime error." ,
80+ ice_update_nvm_solution , {ice_port_number_label }},
81+ {ICE_AQC_HEALTH_STATUS_ERR_DNL_INIT , "Link management engine failed to initialize." ,
82+ ice_update_nvm_solution , {ice_port_number_label }},
83+ {ICE_AQC_HEALTH_STATUS_ERR_PHY_FW_LOAD ,
84+ "Failed to load the firmware image in the external PHY." ,
85+ ice_update_nvm_solution , {ice_port_number_label }},
86+ {ICE_AQC_HEALTH_STATUS_INFO_RECOVERY , "The device is in firmware recovery mode." ,
87+ ice_update_nvm_solution , {"Extended Error" }},
88+ {ICE_AQC_HEALTH_STATUS_ERR_FLASH_ACCESS , "The flash chip cannot be accessed." ,
89+ "If issue persists, call customer support." , {"Access Type" }},
90+ {ICE_AQC_HEALTH_STATUS_ERR_NVM_AUTH , "NVM authentication failed." ,
91+ ice_update_nvm_solution },
92+ {ICE_AQC_HEALTH_STATUS_ERR_OROM_AUTH , "Option ROM authentication failed." ,
93+ ice_update_nvm_solution },
94+ {ICE_AQC_HEALTH_STATUS_ERR_DDP_AUTH , "DDP package authentication failed." ,
95+ "Update to latest base driver and DDP package." },
96+ {ICE_AQC_HEALTH_STATUS_ERR_NVM_COMPAT , "NVM image is incompatible." ,
97+ ice_update_nvm_solution },
98+ {ICE_AQC_HEALTH_STATUS_ERR_OROM_COMPAT , "Option ROM is incompatible." ,
99+ ice_update_nvm_solution , {"Expected PCI Device ID" , "Expected Module ID" }},
100+ {ICE_AQC_HEALTH_STATUS_ERR_DCB_MIB ,
101+ "Supplied MIB file is invalid. DCB reverted to default configuration." ,
102+ "Disable FW-LLDP and check DCBx system configuration." ,
103+ {ice_port_number_label , "MIB ID" }},
104+ };
105+
106+ static int ice_health_status_lookup_compare (const void * a , const void * b )
107+ {
108+ return ((struct ice_health_status * )a )-> code - ((struct ice_health_status * )b )-> code ;
109+ }
110+
111+ static const struct ice_health_status * ice_get_health_status (u16 code )
112+ {
113+ struct ice_health_status key = { .code = code };
114+
115+ return bsearch (& key , ice_health_status_lookup , ARRAY_SIZE (ice_health_status_lookup ),
116+ sizeof (struct ice_health_status ), ice_health_status_lookup_compare );
117+ }
118+
119+ static void ice_describe_status_code (struct devlink_fmsg * fmsg ,
120+ struct ice_aqc_health_status_elem * hse )
121+ {
122+ static const char * const aux_label [] = { "Aux Data 1" , "Aux Data 2" };
123+ const struct ice_health_status * health_code ;
124+ u32 internal_data [2 ];
125+ u16 status_code ;
126+
127+ status_code = le16_to_cpu (hse -> health_status_code );
128+
129+ devlink_fmsg_put (fmsg , "Syndrome" , status_code );
130+ if (status_code ) {
131+ internal_data [0 ] = le32_to_cpu (hse -> internal_data1 );
132+ internal_data [1 ] = le32_to_cpu (hse -> internal_data2 );
133+
134+ health_code = ice_get_health_status (status_code );
135+ if (!health_code )
136+ return ;
137+
138+ devlink_fmsg_string_pair_put (fmsg , "Description" , health_code -> description );
139+ if (health_code -> solution )
140+ devlink_fmsg_string_pair_put (fmsg , "Possible Solution" ,
141+ health_code -> solution );
142+
143+ for (size_t i = 0 ; i < ICE_HEALTH_STATUS_DATA_SIZE ; i ++ ) {
144+ if (internal_data [i ] != ICE_AQC_HEALTH_STATUS_UNDEFINED_DATA )
145+ devlink_fmsg_u32_pair_put (fmsg ,
146+ health_code -> data_label [i ] ?
147+ health_code -> data_label [i ] :
148+ aux_label [i ],
149+ internal_data [i ]);
150+ }
151+ }
152+ }
153+
154+ static int
155+ ice_port_reporter_diagnose (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
156+ struct netlink_ext_ack * extack )
157+ {
158+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
159+
160+ ice_describe_status_code (fmsg , & pf -> health_reporters .port_status );
161+ return 0 ;
162+ }
163+
164+ static int
165+ ice_port_reporter_dump (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
166+ void * priv_ctx , struct netlink_ext_ack __always_unused * extack )
167+ {
168+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
169+
170+ ice_describe_status_code (fmsg , & pf -> health_reporters .port_status );
171+ return 0 ;
172+ }
173+
174+ static int
175+ ice_fw_reporter_diagnose (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
176+ struct netlink_ext_ack * extack )
177+ {
178+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
179+
180+ ice_describe_status_code (fmsg , & pf -> health_reporters .fw_status );
181+ return 0 ;
182+ }
183+
184+ static int
185+ ice_fw_reporter_dump (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
186+ void * priv_ctx , struct netlink_ext_ack * extack )
187+ {
188+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
189+
190+ ice_describe_status_code (fmsg , & pf -> health_reporters .fw_status );
191+ return 0 ;
192+ }
193+
194+ static void ice_config_health_events (struct ice_pf * pf , bool enable )
195+ {
196+ u8 enable_bits = 0 ;
197+ int ret ;
198+
199+ if (enable )
200+ enable_bits = ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK |
201+ ICE_AQC_HEALTH_STATUS_SET_GLOBAL_MASK ;
202+
203+ ret = ice_aq_set_health_status_cfg (& pf -> hw , enable_bits );
204+ if (ret )
205+ dev_err (ice_pf_to_dev (pf ), "Failed to %s firmware health events, err %d aq_err %s\n" ,
206+ str_enable_disable (enable ), ret ,
207+ ice_aq_str (pf -> hw .adminq .sq_last_status ));
208+ }
209+
210+ /**
211+ * ice_process_health_status_event - Process the health status event from FW
212+ * @pf: pointer to the PF structure
213+ * @event: event structure containing the Health Status Event opcode
214+ *
215+ * Decode the Health Status Events and print the associated messages
216+ */
217+ void ice_process_health_status_event (struct ice_pf * pf , struct ice_rq_event_info * event )
218+ {
219+ const struct ice_aqc_health_status_elem * health_info ;
220+ u16 count ;
221+
222+ health_info = (struct ice_aqc_health_status_elem * )event -> msg_buf ;
223+ count = le16_to_cpu (event -> desc .params .get_health_status .health_status_count );
224+
225+ if (count > (event -> buf_len / sizeof (* health_info ))) {
226+ dev_err (ice_pf_to_dev (pf ), "Received a health status event with invalid element count\n" );
227+ return ;
228+ }
229+
230+ for (size_t i = 0 ; i < count ; i ++ ) {
231+ const struct ice_health_status * health_code ;
232+ u16 status_code ;
233+
234+ status_code = le16_to_cpu (health_info -> health_status_code );
235+ health_code = ice_get_health_status (status_code );
236+
237+ if (health_code ) {
238+ switch (le16_to_cpu (health_info -> event_source )) {
239+ case ICE_AQC_HEALTH_STATUS_GLOBAL :
240+ pf -> health_reporters .fw_status = * health_info ;
241+ devlink_health_report (pf -> health_reporters .fw ,
242+ "FW syndrome reported" , NULL );
243+ break ;
244+ case ICE_AQC_HEALTH_STATUS_PF :
245+ case ICE_AQC_HEALTH_STATUS_PORT :
246+ pf -> health_reporters .port_status = * health_info ;
247+ devlink_health_report (pf -> health_reporters .port ,
248+ "Port syndrome reported" , NULL );
249+ break ;
250+ default :
251+ dev_err (ice_pf_to_dev (pf ), "Health code with unknown source\n" );
252+ }
253+ } else {
254+ u32 data1 , data2 ;
255+ u16 source ;
256+
257+ source = le16_to_cpu (health_info -> event_source );
258+ data1 = le32_to_cpu (health_info -> internal_data1 );
259+ data2 = le32_to_cpu (health_info -> internal_data2 );
260+ dev_dbg (ice_pf_to_dev (pf ),
261+ "Received internal health status code 0x%08x, source: 0x%08x, data1: 0x%08x, data2: 0x%08x" ,
262+ status_code , source , data1 , data2 );
263+ }
264+ health_info ++ ;
265+ }
266+ }
267+
10268/**
11269 * ice_devlink_health_report - boilerplate to call given @reporter
12270 *
@@ -203,14 +461,26 @@ ice_init_devlink_rep(struct ice_pf *pf,
203461 return rep ;
204462}
205463
206- #define ICE_DEFINE_HEALTH_REPORTER_OPS (_name ) \
207- static const struct devlink_health_reporter_ops ice_ ## _name ## _reporter_ops = { \
464+ #define ICE_HEALTH_REPORTER_OPS_FIELD (_name , _field ) \
465+ ._field = ice_##_name##_reporter_##_field,
466+
467+ #define ICE_DEFINE_HEALTH_REPORTER_OPS_1 (_name , _field1 ) \
468+ static const struct devlink_health_reporter_ops ice_##_name##_reporter_ops = { \
208469 .name = #_name, \
209- .dump = ice_ ## _name ## _reporter_dump, \
210- }
470+ ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field1) \
471+ }
472+
473+ #define ICE_DEFINE_HEALTH_REPORTER_OPS_2 (_name , _field1 , _field2 ) \
474+ static const struct devlink_health_reporter_ops ice_##_name##_reporter_ops = { \
475+ .name = #_name, \
476+ ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field1) \
477+ ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field2) \
478+ }
211479
212- ICE_DEFINE_HEALTH_REPORTER_OPS (mdd );
213- ICE_DEFINE_HEALTH_REPORTER_OPS (tx_hang );
480+ ICE_DEFINE_HEALTH_REPORTER_OPS_1 (mdd , dump );
481+ ICE_DEFINE_HEALTH_REPORTER_OPS_1 (tx_hang , dump );
482+ ICE_DEFINE_HEALTH_REPORTER_OPS_2 (fw , dump , diagnose );
483+ ICE_DEFINE_HEALTH_REPORTER_OPS_2 (port , dump , diagnose );
214484
215485/**
216486 * ice_health_init - allocate and init all ice devlink health reporters and
@@ -224,6 +494,12 @@ void ice_health_init(struct ice_pf *pf)
224494
225495 reps -> mdd = ice_init_devlink_rep (pf , & ice_mdd_reporter_ops );
226496 reps -> tx_hang = ice_init_devlink_rep (pf , & ice_tx_hang_reporter_ops );
497+
498+ if (ice_is_fw_health_report_supported (& pf -> hw )) {
499+ reps -> fw = ice_init_devlink_rep (pf , & ice_fw_reporter_ops );
500+ reps -> port = ice_init_devlink_rep (pf , & ice_port_reporter_ops );
501+ ice_config_health_events (pf , true);
502+ }
227503}
228504
229505/**
@@ -246,6 +522,11 @@ void ice_health_deinit(struct ice_pf *pf)
246522{
247523 ice_deinit_devl_reporter (pf -> health_reporters .mdd );
248524 ice_deinit_devl_reporter (pf -> health_reporters .tx_hang );
525+ if (ice_is_fw_health_report_supported (& pf -> hw )) {
526+ ice_deinit_devl_reporter (pf -> health_reporters .fw );
527+ ice_deinit_devl_reporter (pf -> health_reporters .port );
528+ ice_config_health_events (pf , false);
529+ }
249530}
250531
251532static
0 commit comments