|
20 | 20 | #include "xe_gt.h" |
21 | 21 | #include "xe_gt_printk.h" |
22 | 22 | #include "xe_gt_sriov_vf.h" |
| 23 | +#include "xe_gt_throttle.h" |
23 | 24 | #include "xe_guc_ads.h" |
24 | 25 | #include "xe_guc_ct.h" |
25 | 26 | #include "xe_guc_db_mgr.h" |
@@ -501,53 +502,191 @@ static int guc_xfer_rsa(struct xe_guc *guc) |
501 | 502 | return 0; |
502 | 503 | } |
503 | 504 |
|
| 505 | +/* |
| 506 | + * Check a previously read GuC status register (GUC_STATUS) looking for |
| 507 | + * known terminal states (either completion or failure) of either the |
| 508 | + * microkernel status field or the boot ROM status field. Returns +1 for |
| 509 | + * successful completion, -1 for failure and 0 for any intermediate state. |
| 510 | + */ |
| 511 | +static int guc_load_done(u32 status) |
| 512 | +{ |
| 513 | + u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, status); |
| 514 | + u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, status); |
| 515 | + |
| 516 | + switch (uk_val) { |
| 517 | + case XE_GUC_LOAD_STATUS_READY: |
| 518 | + return 1; |
| 519 | + |
| 520 | + case XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH: |
| 521 | + case XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH: |
| 522 | + case XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE: |
| 523 | + case XE_GUC_LOAD_STATUS_HWCONFIG_ERROR: |
| 524 | + case XE_GUC_LOAD_STATUS_DPC_ERROR: |
| 525 | + case XE_GUC_LOAD_STATUS_EXCEPTION: |
| 526 | + case XE_GUC_LOAD_STATUS_INIT_DATA_INVALID: |
| 527 | + case XE_GUC_LOAD_STATUS_MPU_DATA_INVALID: |
| 528 | + case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID: |
| 529 | + return -1; |
| 530 | + } |
| 531 | + |
| 532 | + switch (br_val) { |
| 533 | + case XE_BOOTROM_STATUS_NO_KEY_FOUND: |
| 534 | + case XE_BOOTROM_STATUS_RSA_FAILED: |
| 535 | + case XE_BOOTROM_STATUS_PAVPC_FAILED: |
| 536 | + case XE_BOOTROM_STATUS_WOPCM_FAILED: |
| 537 | + case XE_BOOTROM_STATUS_LOADLOC_FAILED: |
| 538 | + case XE_BOOTROM_STATUS_JUMP_FAILED: |
| 539 | + case XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED: |
| 540 | + case XE_BOOTROM_STATUS_MPUMAP_INCORRECT: |
| 541 | + case XE_BOOTROM_STATUS_EXCEPTION: |
| 542 | + case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE: |
| 543 | + return -1; |
| 544 | + } |
| 545 | + |
| 546 | + return 0; |
| 547 | +} |
| 548 | + |
| 549 | +static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc) |
| 550 | +{ |
| 551 | + u32 freq; |
| 552 | + int ret = xe_guc_pc_get_cur_freq(guc_pc, &freq); |
| 553 | + |
| 554 | + return ret ? ret : freq; |
| 555 | +} |
| 556 | + |
| 557 | +/* |
| 558 | + * Wait for the GuC to start up. |
| 559 | + * |
| 560 | + * Measurements indicate this should take no more than 20ms (assuming the GT |
| 561 | + * clock is at maximum frequency). However, thermal throttling and other issues |
| 562 | + * can prevent the clock hitting max and thus making the load take significantly |
| 563 | + * longer. Allow up to 200ms as a safety margin for real world worst case situations. |
| 564 | + * |
| 565 | + * However, bugs anywhere from KMD to GuC to PCODE to fan failure in a CI farm can |
| 566 | + * lead to even longer times. E.g. if the GT is clamped to minimum frequency then |
| 567 | + * the load times can be in the seconds range. So the timeout is increased for debug |
| 568 | + * builds to ensure that problems can be correctly analysed. For release builds, the |
| 569 | + * timeout is kept short so that users don't wait forever to find out that there is a |
| 570 | + * problem. In either case, if the load took longer than is reasonable even with some |
| 571 | + * 'sensible' throttling, then flag a warning because something is not right. |
| 572 | + * |
| 573 | + * Note that there is a limit on how long an individual usleep_range() can wait for, |
| 574 | + * hence longer waits require wrapping a shorter wait in a loop. |
| 575 | + * |
| 576 | + * Note that the only reason an end user should hit the shorter timeout is in case of |
| 577 | + * extreme thermal throttling. And a system that is that hot during boot is probably |
| 578 | + * dead anyway! |
| 579 | + */ |
/*
 * Overall GuC load timeout, in seconds: guc_wait_ucode() gives up once the
 * elapsed time in milliseconds reaches GUC_LOAD_RETRY_LIMIT * 1000.
 * Debug builds get a longer limit than release builds.
 */
#ifdef CONFIG_DRM_XE_DEBUG
#define GUC_LOAD_RETRY_LIMIT	20
#else
#define GUC_LOAD_RETRY_LIMIT	3
#endif

/* A successful load that took longer than this many milliseconds is warned about. */
#define GUC_LOAD_TIME_WARN_MS	200
| 586 | + |
504 | 587 | static void guc_wait_ucode(struct xe_guc *guc) |
505 | 588 | { |
506 | 589 | struct xe_gt *gt = guc_to_gt(guc); |
507 | | - u32 status; |
508 | | - int ret; |
509 | | - |
| 590 | + struct xe_guc_pc *guc_pc = >->uc.guc.pc; |
| 591 | + ktime_t before, after, delta; |
| 592 | + int load_done; |
| 593 | + u32 status = 0; |
| 594 | + int count; |
| 595 | + u64 delta_ms; |
| 596 | + u32 before_freq; |
| 597 | + |
| 598 | + before_freq = xe_guc_pc_get_act_freq(guc_pc); |
| 599 | + before = ktime_get(); |
510 | 600 | /* |
511 | | - * Wait for the GuC to start up. |
512 | | - * NB: Docs recommend not using the interrupt for completion. |
513 | | - * Measurements indicate this should take no more than 20ms |
514 | | - * (assuming the GT clock is at maximum frequency). So, a |
515 | | - * timeout here indicates that the GuC has failed and is unusable. |
516 | | - * (Higher levels of the driver may decide to reset the GuC and |
517 | | - * attempt the ucode load again if this happens.) |
518 | | - * |
519 | | - * FIXME: There is a known (but exceedingly unlikely) race condition |
520 | | - * where the asynchronous frequency management code could reduce |
521 | | - * the GT clock while a GuC reload is in progress (during a full |
522 | | - * GT reset). A fix is in progress but there are complex locking |
523 | | - * issues to be resolved. In the meantime bump the timeout to |
524 | | - * 200ms. Even at slowest clock, this should be sufficient. And |
525 | | - * in the working case, a larger timeout makes no difference. |
| 601 | + * Note, can't use any kind of timing information from the call to xe_mmio_wait. |
| 602 | + * It could return a thousand intermediate stages at random times. Instead, must |
| 603 | + * manually track the total time taken and locally implement the timeout. |
526 | 604 | */ |
527 | | - ret = xe_mmio_wait32(gt, GUC_STATUS, GS_UKERNEL_MASK, |
528 | | - FIELD_PREP(GS_UKERNEL_MASK, XE_GUC_LOAD_STATUS_READY), |
529 | | - 200000, &status, false); |
| 605 | + do { |
| 606 | + u32 last_status = status & (GS_UKERNEL_MASK | GS_BOOTROM_MASK); |
530 | 607 |
|
531 | | - if (ret) { |
532 | | - xe_gt_err(gt, "GuC load failed: status = 0x%08X\n", status); |
533 | | - xe_gt_err(gt, "GuC status: Reset = %u, BootROM = %#X, UKernel = %#X, MIA = %#X, Auth = %#X\n", |
534 | | - REG_FIELD_GET(GS_MIA_IN_RESET, status), |
| 608 | + /* |
| 609 | + * Wait for any change (intermediate or terminal) in the status register. |
| 610 | + * Note, the return value is a don't care. The only failure code is timeout |
| 611 | + * but the timeouts need to be accumulated over all the intermediate partial |
| 612 | + * timeouts rather than allowing a huge timeout each time. So basically, need |
| 613 | + * to treat a timeout no different to a value change. |
| 614 | + */ |
| 615 | + xe_mmio_wait32_not(gt, GUC_STATUS, GS_UKERNEL_MASK | GS_BOOTROM_MASK, |
| 616 | + last_status, 1000 * 1000, &status, false); |
| 617 | + |
| 618 | + after = ktime_get(); |
| 619 | + delta = ktime_sub(after, before); |
| 620 | + delta_ms = ktime_to_ms(delta); |
| 621 | + |
| 622 | + load_done = guc_load_done(status); |
| 623 | + if (load_done != 0) |
| 624 | + break; |
| 625 | + |
| 626 | + if (delta_ms >= (GUC_LOAD_RETRY_LIMIT * 1000)) |
| 627 | + break; |
| 628 | + |
| 629 | + xe_gt_dbg(gt, "load still in progress, count = %d, freq = %dMHz (req %dMHz), status = 0x%08X [0x%02X/%02X]\n", |
| 630 | + count, xe_guc_pc_get_act_freq(guc_pc), |
| 631 | + guc_pc_get_cur_freq(guc_pc), status, |
535 | 632 | REG_FIELD_GET(GS_BOOTROM_MASK, status), |
536 | | - REG_FIELD_GET(GS_UKERNEL_MASK, status), |
| 633 | + REG_FIELD_GET(GS_UKERNEL_MASK, status)); |
| 634 | + } while (1); |
| 635 | + |
| 636 | + if (load_done != 1) { |
| 637 | + u32 ukernel = REG_FIELD_GET(GS_UKERNEL_MASK, status); |
| 638 | + u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status); |
| 639 | + |
| 640 | + xe_gt_err(gt, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz (req %dMHz), done = %d\n", |
| 641 | + status, delta_ms, xe_guc_pc_get_act_freq(guc_pc), |
| 642 | + guc_pc_get_cur_freq(guc_pc), load_done); |
| 643 | + xe_gt_err(gt, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n", |
| 644 | + REG_FIELD_GET(GS_MIA_IN_RESET, status), |
| 645 | + bootrom, ukernel, |
537 | 646 | REG_FIELD_GET(GS_MIA_MASK, status), |
538 | 647 | REG_FIELD_GET(GS_AUTH_STATUS_MASK, status)); |
539 | 648 |
|
540 | | - if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) |
541 | | - xe_gt_err(gt, "GuC firmware signature verification failed\n"); |
| 649 | + switch (bootrom) { |
| 650 | + case XE_BOOTROM_STATUS_NO_KEY_FOUND: |
| 651 | + xe_gt_err(gt, "invalid key requested, header = 0x%08X\n", |
| 652 | + xe_mmio_read32(gt, GUC_HEADER_INFO)); |
| 653 | + break; |
| 654 | + |
| 655 | + case XE_BOOTROM_STATUS_RSA_FAILED: |
| 656 | + xe_gt_err(gt, "firmware signature verification failed\n"); |
| 657 | + break; |
542 | 658 |
|
543 | | - if (REG_FIELD_GET(GS_UKERNEL_MASK, status) == |
544 | | - XE_GUC_LOAD_STATUS_EXCEPTION) |
545 | | - xe_gt_err(gt, "GuC firmware exception. EIP: %#x\n", |
| 659 | + case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE: |
| 660 | + xe_gt_err(gt, "firmware production part check failure\n"); |
| 661 | + break; |
| 662 | + } |
| 663 | + |
| 664 | + switch (ukernel) { |
| 665 | + case XE_GUC_LOAD_STATUS_EXCEPTION: |
| 666 | + xe_gt_err(gt, "firmware exception. EIP: %#x\n", |
546 | 667 | xe_mmio_read32(gt, SOFT_SCRATCH(13))); |
| 668 | + break; |
| 669 | + |
| 670 | + case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID: |
| 671 | + xe_gt_err(gt, "illegal register in save/restore workaround list\n"); |
| 672 | + break; |
| 673 | + |
| 674 | + case XE_GUC_LOAD_STATUS_HWCONFIG_START: |
| 675 | + xe_gt_err(gt, "still extracting hwconfig table.\n"); |
| 676 | + break; |
| 677 | + } |
547 | 678 |
|
548 | 679 | xe_device_declare_wedged(gt_to_xe(gt)); |
| 680 | + } else if (delta_ms > GUC_LOAD_TIME_WARN_MS) { |
| 681 | + xe_gt_warn(gt, "excessive init time: %lldms! [status = 0x%08X, count = %d]\n", |
| 682 | + delta_ms, status, count); |
| 683 | + xe_gt_warn(gt, "excessive init time: [freq = %dMHz (req = %dMHz), before = %dMHz, perf_limit_reasons = 0x%08X]\n", |
| 684 | + xe_guc_pc_get_act_freq(guc_pc), guc_pc_get_cur_freq(guc_pc), |
| 685 | + before_freq, xe_gt_throttle_get_limit_reasons(gt)); |
549 | 686 | } else { |
550 | | - xe_gt_dbg(gt, "GuC successfully loaded\n"); |
| 687 | + xe_gt_dbg(gt, "init took %lldms, freq = %dMHz (req = %dMHz), before = %dMHz, status = 0x%08X, count = %d\n", |
| 688 | + delta_ms, xe_guc_pc_get_act_freq(guc_pc), guc_pc_get_cur_freq(guc_pc), |
| 689 | + before_freq, status, count); |
551 | 690 | } |
552 | 691 | } |
553 | 692 |
|
|
0 commit comments