From a48179c8e13521c5410a7b4c6475e69bd0932720 Mon Sep 17 00:00:00 2001 From: Daniel Chaffelson Date: Tue, 27 Jul 2021 21:28:17 +0100 Subject: [PATCH] Improve Azure deployment stability. Fix formatting in cloud.datahub_template_info. Add retry to address intermittent failure in datahub_template_info listing of datahub templates in CDP 7.2.10. Add explicit test for Azure Storage Account being unavailable for use in this deployment. Remove duplicate usage of __azure_netapp_vol_info as variable. Handle edge cases for preparation of NetApp NFS Mount when deploying ML automatically on Azure. Introduce wait and retry controls for handling Azure eventual consistency when negotiating between Ansible Controller, Azure Control Plane, and CDP Control Plane. Move default Azure minimal policy JSON from private gist to cloudera-labs snippets. Correct Azure App name where sometimes referred to with http:// prefix and sometimes not, resulting in idempotence failures. Oops. Introduce more robust validations that Service Principals and other objects created by az CLI are populated as expected. Ensure that Azure objects are consistently bound to the Azure namespace created from the name_prefix. Provide more user-friendly errors when Azure App and Service Principal creation doesn't go as planned. Add tunnel and public endpoint control support to Azure Environment creation in line with AWS offering. Fix ML submission preparation to include NFS information following existing combination patterns. Swap order of Runtime initialization tasks to handle provider-specific tasks before general tasks, to allow Azure-specific values to be populated. No impact on other implementations. Explicitly derive Azure NFS Mount information in Runtime deployment from earlier Infrastructure deployment steps. Fix purge teardown edge case where child services are not deleted if at least one child service is not present in the Definition. 
Now they are removed if purge is set and any are found regardless of provided Definition plan. cdpy now recognises that CML deployments may fail during provisioning and responds accordingly cdpy will now automatically retry Azure cross-account credential creation in CDP when receiving eventual consistency errors Signed-off-by: Daniel Chaffelson --- src/cdpy/common.py | 1 + src/cdpy/datahub.py | 23 ++++++++++++++++++++--- src/cdpy/environments.py | 22 +++++++++++++++++++--- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/cdpy/common.py b/src/cdpy/common.py index 6c4ffcc..7b64c60 100644 --- a/src/cdpy/common.py +++ b/src/cdpy/common.py @@ -198,6 +198,7 @@ def _warning_format(message, category, filename, lineno, line=None): 'DELETE_FAILED', 'Error', # DW 'installation:failed', # ML + 'provision:failed', # ML 'deprovision:failed', # ML 'BAD_HEALTH' # DF ] diff --git a/src/cdpy/datahub.py b/src/cdpy/datahub.py index 13cbc76..10cb6bb 100644 --- a/src/cdpy/datahub.py +++ b/src/cdpy/datahub.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from cdpy.common import CdpSdkBase, Squelch +from cdpy.common import CdpSdkBase, Squelch, CdpError, CdpWarning class CdpyDatahub(CdpSdkBase): @@ -28,8 +28,25 @@ def describe_all_clusters(self, environment_name=None): return [self.describe_cluster(cluster['clusterName']) for cluster in clusters_listing] return clusters_listing - def list_cluster_templates(self): - return self.sdk.call(svc='datahub', func='list_cluster_templates', ret_field='clusterTemplates') + def list_cluster_templates(self, retries=3, delay=5): + # Intermittent timeout issue in CDP 7.2.10, should be reverted to bare listing in 7.2.12 + resp = self.sdk.call( + svc='datahub', func='list_cluster_templates', ret_field='clusterTemplates', + ret_error=True + ) + if isinstance(resp, CdpError): + if retries > 0: + if str(resp.status_code) == '500' and resp.error_code == 'UNKNOWN': + retries = retries - 1 + self.sdk.throw_warning( + CdpWarning('Got likely CDP 
Control Plane eventual consistency error, %d retries left...' + % (retries)) + ) + self.sdk.sleep(delay) + return self.list_cluster_templates(retries, delay) + else: + self.sdk.throw_error(resp) + return resp def describe_cluster_template(self, name): return self.sdk.call(svc='datahub', func='describe_cluster_template', ret_field='clusterTemplate', squelch=[ diff --git a/src/cdpy/environments.py b/src/cdpy/environments.py index d8bd827..565d6ac 100644 --- a/src/cdpy/environments.py +++ b/src/cdpy/environments.py @@ -276,9 +276,9 @@ def create_aws_credential(self, name, role, description, retries=3, delay=2): self.sdk.throw_error(resp) return resp - def create_azure_credential(self, name, subscription, tenant, application, secret): - return self.sdk.call( - svc='environments', func='create_azure_credential', squelch=[ + def create_azure_credential(self, name, subscription, tenant, application, secret, retries=3, delay=5): + resp = self.sdk.call( + svc='environments', func='create_azure_credential', ret_error=True, squelch=[ Squelch(field='violations', value='Credential already exists with name', warning='Credential with this name already exists', default=None)], credentialName=name, @@ -286,6 +286,22 @@ def create_azure_credential(self, name, subscription, tenant, application, secre tenantId=tenant, appBased={'applicationId': application, 'secretKey': secret} ) + if isinstance(resp, CdpError): + if retries > 0: + consistency_violations = [ + 'You may have sent your authentication request to the wrong tenant' + ] + if any(x in str(resp.violations) for x in consistency_violations): + retries = retries - 1 + self.sdk.throw_warning( + CdpWarning('Got likely Azure eventual consistency error [%s], %d retries left...' 
+ % (str(resp.violations), retries)) + ) + self.sdk.sleep(delay) + return self.create_azure_credential(name, subscription, tenant, application, secret, retries, delay) + else: + self.sdk.throw_error(resp) + return resp def create_gcp_credential(self, name, key_file): return self.sdk.call(