Skip to content

Commit 2a46096

Browse files
candicelicy authored and alexdeucher committed
drm/amdgpu: Resolve RAS GFX error count issue after cold boot on Arcturus
Adjust the sequence for RAS late init and separate the RAS error-status reset from the error-status query.

v2: squash in fix from Candice

Signed-off-by: Candice Li <candice.li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 28caf8c commit 2a46096

File tree

2 files changed

+28
-8
lines changed

2 files changed

+28
-8
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -594,17 +594,20 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value)
594594
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
595595
{
596596
int r;
597-
r = amdgpu_ras_block_late_init(adev, ras_block);
598-
if (r)
599-
return r;
600597

601598
if (amdgpu_ras_is_supported(adev, ras_block->block)) {
602599
if (!amdgpu_persistent_edc_harvesting_supported(adev))
603600
amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
604601

602+
r = amdgpu_ras_block_late_init(adev, ras_block);
603+
if (r)
604+
return r;
605+
605606
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
606607
if (r)
607608
goto late_fini;
609+
} else {
610+
amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
608611
}
609612

610613
return 0;

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 22 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
197197
if (amdgpu_ras_query_error_status(obj->adev, &info))
198198
return -EINVAL;
199199

200+
/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
201+
if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
202+
obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
203+
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
204+
dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
205+
}
206+
200207
s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
201208
"ue", info.ue_count,
202209
"ce", info.ce_count);
@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
550557
if (amdgpu_ras_query_error_status(obj->adev, &info))
551558
return -EINVAL;
552559

553-
if (obj->adev->asic_type == CHIP_ALDEBARAN) {
560+
if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
561+
obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
554562
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
555-
DRM_WARN("Failed to reset error counter and error status");
563+
dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
556564
}
557565

558566
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -1027,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
10271035
}
10281036
}
10291037

1030-
if (!amdgpu_persistent_edc_harvesting_supported(adev))
1031-
amdgpu_ras_reset_error_status(adev, info->head.block);
1032-
10331038
return 0;
10341039
}
10351040

@@ -1149,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
11491154
if (res)
11501155
return res;
11511156

1157+
if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
1158+
adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
1159+
if (amdgpu_ras_reset_error_status(adev, info.head.block))
1160+
dev_warn(adev->dev, "Failed to reset error counter and error status");
1161+
}
1162+
11521163
ce += info.ce_count;
11531164
ue += info.ue_count;
11541165
}
@@ -1792,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
17921803
continue;
17931804

17941805
amdgpu_ras_query_error_status(adev, &info);
1806+
1807+
if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
1808+
adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
1809+
if (amdgpu_ras_reset_error_status(adev, info.head.block))
1810+
dev_warn(adev->dev, "Failed to reset error counter and error status");
1811+
}
17951812
}
17961813
}
17971814

0 commit comments

Comments
 (0)