OPR001 - Create app-deploy
==========================

Description
-----------

### Parameters

In [None]:
import os
import getpass

# Name and version of the app-deploy application to create.  The default name
# is derived from the (lower-cased) name of the current OS user.
app_name = "app-" + getpass.getuser().lower()
app_version = "v1"

# Notebooks to stage into the app; they are staged (and prefixed 'step001', 'step002', ...)
# in the order listed here
notebooks = [
    os.path.join("..", "notebook-runner", "run505a-sample-notebook.ipynb") 
]

# Contents of app-requirements.txt (spaces are converted to newlines when the file is saved)
app_requirements_txt = ""

### Set the `azdata` logging directory

To support running multiple creates at the same time, place the
azdata.log separately. This code is placed here, so it runs after
‘injected parameters’ (which may change the app\_name/app\_version)

In [None]:
# Use a log directory named after this app's name and version
os.environ["AZDATA_LOGGING_LOG_DIR"] = f"{app_name}-{app_version}"

### Marshal injected parameters of list type from string type

`list` type arguments passed via `azdata notebook run --arguments` are
passed as `string` and need to be converted back to `list` type.

In [None]:
# A list injected as a string arrives with its escape codes already interpreted, e.g.
# "root\folder\tFilename" becomes "root\folder    Filename".  Turn the control characters
# \t \n \r \b \a back into their two character escaped form, as per:
#
# - http://www.java2s.com/Code/Python/String/EscapeCodesbtnar.htm

if isinstance(notebooks, str):
    notebooks = (
        notebooks[1:-1]  # drop the surrounding [ and ]
        .translate(str.maketrans({'\t': '\\t', '\n': '\\n', '\r': '\\r', '\b': '\\b', '\a': '\\a'}))
        .split(', ')
    )

### Common functions

Define helper functions used in this notebook.

In [None]:
# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows
import sys
import os
import re
import json
import platform
import shlex
import shutil
import datetime

from subprocess import Popen, PIPE
from IPython.display import Markdown

# The three hint tables below are keyed by the executable name at the start of the command line
retry_hints = {} # Output in stderr known to be transient, therefore automatically retry
error_hints = {} # Output in stderr where a known SOP/TSG exists which will be HINTed for further help
install_hint = {} # The SOP to help install the executable if it cannot be found

first_run = True # True until run() has been called once; the expert rules are loaded on that first call
rules = None # The expert rules returned by load_rules() (None when there are none)
debug_logging = False # When True, apply_expert_rules() prints each rule it evaluates

def _show_install_hint(exe_name):
    """Display a HINT linking to the install SOP registered for `exe_name` in `install_hint` (if any)"""
    hint = install_hint.get(exe_name)

    if hint is not None:
        display(Markdown(f'HINT: Use [{hint[0]}]({hint[1]}) to resolve this issue.'))

def run(cmd, return_output=False, no_output=False, retry_count=0):
    """Run shell command, stream stdout, print stderr and optionally return output

    Args:
        cmd: the full command line as a single string (it is split using shlex.split)
        return_output: when True, stdout is accumulated and returned rather than printed
        no_output: when True, stdout/stderr are not piped (and therefore not inspected)
        retry_count: the number of retries already made (used by the recursive transient fault retry)

    Returns:
        The accumulated stdout when return_output is True, otherwise None

    Raises:
        FileNotFoundError: the executable could not be found
        SystemExit: the command returned (or is inferred to have returned) a non-zero exit code

    NOTES:

    1.  Commands that need this kind of ' quoting on Windows e.g.:

            kubectl get nodes -o jsonpath={.items[?(@.metadata.annotations.pv-candidate=='data-pool')].metadata.name}

        Need to actually pass in as '"':

            kubectl get nodes -o jsonpath={.items[?(@.metadata.annotations.pv-candidate=='"'data-pool'"')].metadata.name}

        The ' quote approach, although correct when pasting into Windows cmd, will hang at the line:
        
            `iter(p.stdout.readline, b'')`

        The shlex.split call does the right thing for each platform, just use the '"' pattern for a '
    """
    MAX_RETRIES = 5
    output = ""

    global first_run
    global rules

    if first_run:
        first_run = False
        rules = load_rules()

    # When running `azdata sql query` on Windows, replace any \n in """ strings, with " ", otherwise we see:
    #
    #    ('HY090', '[HY090] [Microsoft][ODBC Driver Manager] Invalid string or buffer length (0) (SQLExecDirectW)')
    #
    if platform.system() == "Windows" and cmd.startswith("azdata sql query"):
        cmd = cmd.replace("\n", " ")

    # shlex.split is required on bash and for Windows paths with spaces
    #
    cmd_actual = shlex.split(cmd)

    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries
    #
    user_provided_exe_name = cmd_actual[0].lower()

    # When running python, use the python in the ADS sandbox ({sys.executable})
    #
    if cmd.startswith("python "):
        cmd_actual[0] = cmd_actual[0].replace("python", sys.executable)

        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail
        # with:
        #
        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)
        #
        # Setting it to a default value of "en_US.UTF-8" enables pip install to complete
        #
        if platform.system() == "Darwin" and "LC_ALL" not in os.environ:
            os.environ["LC_ALL"] = "en_US.UTF-8"

    # When running `kubectl`, if AZDATA_OPENSHIFT is set, use `oc`
    #
    if cmd.startswith("kubectl ") and "AZDATA_OPENSHIFT" in os.environ:
        cmd_actual[0] = cmd_actual[0].replace("kubectl", "oc")

    # To aid supportability, determine which binary file will actually be executed on the machine
    #
    which_binary = None

    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to
    # get JWT tokens, it returns "(56) Failure when receiving data from the peer".  If another instance
    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost
    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we
    # look for the 2nd installation of CURL in the path)
    if platform.system() == "Windows" and cmd.startswith("curl "):
        path = os.getenv('PATH')
        for p in path.split(os.path.pathsep):
            p = os.path.join(p, "curl.exe")
            if os.path.exists(p) and os.access(p, os.X_OK):
                if "system32" not in p.lower():
                    cmd_actual[0] = p
                    which_binary = p
                    break

    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this
    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) 
    #
    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.
    #
    if which_binary is None:
        which_binary = shutil.which(cmd_actual[0])

    if which_binary is None:
        _show_install_hint(user_provided_exe_name)

        raise FileNotFoundError(f"Executable '{cmd_actual[0]}' not found in path (where/which)")
    else:
        cmd_actual[0] = which_binary

    start_time = datetime.datetime.now().replace(microsecond=0)

    print(f"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)")
    print(f"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})")
    print(f"       cwd: {os.getcwd()}")

    # Command-line tools such as CURL and AZDATA HDFS commands output
    # scrolling progress bars, which causes Jupyter to hang forever, to
    # workaround this, use no_output=True
    #

    # Work around an infinite hang when a notebook generates a non-zero return code, break out, and do not wait
    #
    wait = True

    # Matches the `  "name": "value"` lines output by `azdata notebook run` (compiled once, not once per line)
    #
    notebook_link_regex = re.compile(r'  "(.*)": "(.*)"')

    try:
        if no_output:
            p = Popen(cmd_actual)
        else:
            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)
            with p.stdout:
                for line in iter(p.stdout.readline, b''):
                    line = line.decode()
                    if return_output:
                        output = output + line
                    else:
                        if cmd.startswith("azdata notebook run"): # Hyperlink the .ipynb file
                            match = notebook_link_regex.match(line)
                            if match:
                                if match.group(1).find("HTML") != -1:
                                    display(Markdown(f' - "{match.group(1)}": "{match.group(2)}"'))
                                else:
                                    display(Markdown(f' - "{match.group(1)}": "[{match.group(2)}]({match.group(2)})"'))

                                    wait = False
                                    break # otherwise infinite hang, have not worked out why yet.
                        else:
                            print(line, end='')
                            if rules is not None:
                                apply_expert_rules(line)

        if wait:
            p.wait()
    except FileNotFoundError as e:
        # Show the install SOP for this executable only (not the entire install_hint table)
        #
        _show_install_hint(user_provided_exe_name)

        raise FileNotFoundError(f"Executable '{cmd_actual[0]}' not found in path (where/which)") from e

    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()

    if not no_output:
        for line in iter(p.stderr.readline, b''):
            try:
                line_decoded = line.decode()
            except UnicodeDecodeError:
                # NOTE: Sometimes we get characters back that cannot be decoded(), e.g.
                #
                #   \xa0
                #
                # For example see this in the response from `az group create`:
                #
                # ERROR: Get Token request returned http error: 400 and server 
                # response: {"error":"invalid_grant",# "error_description":"AADSTS700082: 
                # The refresh token has expired due to inactivity.\xa0The token was 
                # issued on 2018-10-25T23:35:11.9832872Z
                #
                # which generates the exception:
                #
                # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 179: invalid start byte
                #
                print("WARNING: Unable to decode stderr line, printing raw bytes:")
                print(line)
                line_decoded = ""
            else:

                # azdata emits a single empty line to stderr when doing an hdfs cp, don't
                # print this empty "ERR:" as it confuses.
                #
                if line_decoded == "":
                    continue

                print(f"STDERR: {line_decoded}", end='')

                if line_decoded.startswith("An exception has occurred") or line_decoded.startswith("ERROR: An error occurred while executing the following cell"):
                    exit_code_workaround = 1

                # inject HINTs to next TSG/SOP based on output in stderr
                #
                if user_provided_exe_name in error_hints:
                    for error_hint in error_hints[user_provided_exe_name]:
                        if line_decoded.find(error_hint[0]) != -1:
                            display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))

                # apply expert rules (to run follow-on notebooks), based on output
                #
                if rules is not None:
                    apply_expert_rules(line_decoded)

                # Verify if a transient error, if so automatically retry (recursive)
                #
                if user_provided_exe_name in retry_hints:
                    for retry_hint in retry_hints[user_provided_exe_name]:
                        if line_decoded.find(retry_hint) != -1:
                            if retry_count < MAX_RETRIES:
                                print(f"RETRY: {retry_count} (due to: {retry_hint})")
                                retry_count = retry_count + 1
                                output = run(cmd, return_output=return_output, retry_count=retry_count)

                                if return_output:
                                    return output
                                else:
                                    return

    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time

    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so
    # don't wait here, if success known above
    #
    if wait:
        if p.returncode != 0:
            raise SystemExit(f'Shell command:\n\n\t{cmd} ({elapsed}s elapsed)\n\nreturned non-zero exit code: {str(p.returncode)}.\n')
    else:
        if exit_code_workaround != 0:
            raise SystemExit(f'Shell command:\n\n\t{cmd} ({elapsed}s elapsed)\n\nreturned non-zero exit code: {str(exit_code_workaround)}.\n')

    print(f'\nSUCCESS: {elapsed}s elapsed.\n')

    if return_output:
        return output

def load_json(filename):
    """Read `filename` as UTF-8 text and return the parsed JSON document"""

    with open(filename, encoding="utf8") as handle:
        parsed = json.loads(handle.read())

    return parsed

def load_rules():
    """Load any 'expert rules' from the metadata of this notebook (.ipynb) that should be applied to the stderr of the running executable

    Returns:
        The list at metadata/azdata/expert/expanded_rules, sorted in place so the lowest priority
        value (the [0] element of each rule) comes first, or None when this notebook cannot be
        loaded or it contains no expert rules.
    """

    # Load this notebook as json to get access to the expert rules in the notebook metadata.
    #
    try:
        j = load_json("opr001-create-app-deploy.ipynb")
    except Exception:
        # Best effort: if the user has renamed the book, we can't load ourself.  (Catching Exception,
        # rather than everything, lets KeyboardInterrupt/SystemExit through.)
        # NOTE: Is there a way in Jupyter, to know your own filename?
        return None

    try:
        expanded_rules = j["metadata"]["azdata"]["expert"]["expanded_rules"]
    except (KeyError, TypeError):
        return None # This notebook has no expert rules

    expanded_rules.sort() # Sort rules, so they run in priority order (the [0] element).  Lowest value first.

    # print (f"EXPERT: There are {len(expanded_rules)} rules to evaluate.")

    return expanded_rules

def apply_expert_rules(line):
    """Determine if the stderr line passed in, matches the regular expressions for any of the 'expert rules', if so
    inject a 'HINT' to the follow-on SOP/TSG to run

    Each rule is a sequence, the elements used here are:

        [1] the notebook to HINT
        [4] the output type name (i.e. ename or name)
        [5] the output type value (i.e. SystemExit or stdout)
        [7] the regular expression to match against the line

    Args:
        line: a single (decoded) line of output from the running executable
    """

    if not rules:
        return # No expert rules loaded, nothing to evaluate

    for rule in rules:
        notebook = rule[1]
        output_type_name = rule[4] # i.e. ename or name
        output_type_value = rule[5] # i.e. SystemExit or stdout
        expression = rule[7].replace("\\*", "*") # Something escaped *, and put a \ in front of it!

        if debug_logging:
            print(f"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.")

        if re.match(expression, line, re.DOTALL):

            if debug_logging:
                # The notebook is the 4th argument, i.e. index {3}
                print("EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{3}'".format(output_type_name, output_type_value, expression, notebook))

            display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))




print('Common functions defined successfully.')

# Hints for binary (transient fault) retry, (known) error and install guide
#
# All three tables are keyed by executable name:
#
#   retry_hints:  a list of stderr substrings, any of which causes run() to retry the command
#   error_hints:  a list of [stderr substring, link title, notebook link] entries
#   install_hint: a [link title, notebook link] pair
#
retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'], 'azdata': ['Endpoint sql-server-master does not exist', 'Endpoint livy does not exist', 'Failed to get state for cluster', 'Endpoint webhdfs does not exist', 'Adaptive Server is unavailable or does not exist', 'Error: Address already in use']}
error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']], 'azdata': [['azdata login', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['The token is expired', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Reason: Unauthorized', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Max retries exceeded with url: /api/v1/bdc/endpoints', 'SOP028 - azdata login', '../common/sop028-azdata-login.ipynb'], ['Look at the controller logs for more details', 'TSG027 - Observe cluster deployment', '../diagnose/tsg027-observe-bdc-create.ipynb'], ['provided port is already allocated', 'TSG062 - Get tail of all previous container logs for pods in BDC namespace', '../log-files/tsg062-tail-bdc-previous-container-logs.ipynb'], ['Create cluster failed since the existing namespace', 'SOP061 - Delete a big data cluster', '../install/sop061-delete-bdc.ipynb'], ['Failed to complete kube config setup', 'TSG067 - Failed to complete kube config setup', '../repair/tsg067-failed-to-complete-kube-config-setup.ipynb'], ['Error processing command: "ApiError', 'TSG110 - Azdata returns ApiError', '../repair/tsg110-azdata-returns-apierror.ipynb'], ['Error processing command: "ControllerError', 'TSG036 - Controller logs', '../log-analyzers/tsg036-get-controller-logs.ipynb'], ['ERROR: 500', 'TSG046 - Knox gateway logs', '../log-analyzers/tsg046-get-knox-logs.ipynb'], ['Data source name not found and no default driver specified', 'SOP069 - Install ODBC for SQL Server', '../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ["Can't open lib 'ODBC Driver 17 for SQL Server", 'SOP069 - Install ODBC for SQL Server', 
'../install/sop069-install-odbc-driver-for-sql-server.ipynb'], ['Control plane upgrade failed. Failed to upgrade controller.', 'TSG108 - View the controller upgrade config map', '../diagnose/tsg108-controller-failed-to-upgrade.ipynb']]}
install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb'], 'azdata': ['SOP063 - Install azdata CLI (using package manager)', '../install/sop063-packman-install-azdata.ipynb']}

### Instantiate Kubernetes client

In [None]:
# Instantiate the Python Kubernetes client into 'api' variable

import os

try:
    # A failure to import the kubernetes module is reported with an install HINT (see `except ImportError` below)
    from kubernetes import client, config
    from kubernetes.stream import stream

    # When both of these environment variables are present, load the in-cluster configuration,
    # otherwise load the kube config
    if "KUBERNETES_SERVICE_PORT" in os.environ and "KUBERNETES_SERVICE_HOST" in os.environ:
        config.load_incluster_config()
    else:
        try:
            config.load_kube_config()
        except:
            # Point at the TSG for configuring the kube config, then re-raise the original error
            display(Markdown(f'HINT: Use [TSG118 - Configure Kubernetes config](../repair/tsg118-configure-kube-config.ipynb) to resolve this issue.'))
            raise
    api = client.CoreV1Api()

    print('Kubernetes client instantiated')
except ImportError:
    # Markdown is imported here so the HINT can be displayed even if it is not already in scope
    from IPython.display import Markdown
    display(Markdown(f'HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))
    raise

### Get the Kubernetes namespace for the big data cluster

Get the namespace of the Big Data Cluster using the kubectl command line
interface.

**NOTE:**

If there is more than one Big Data Cluster in the target Kubernetes
cluster, then either:

-   set \[0\] to the correct value for the big data cluster.
-   set the environment variable AZDATA\_NAMESPACE, before starting
    Azure Data Studio.

In [None]:
# Resolve the BDC Kubernetes namespace into 'namespace': ask kubectl for the first namespace
# labelled MSSQL_CLUSTER, unless AZDATA_NAMESPACE has been set to override it

if "AZDATA_NAMESPACE" not in os.environ:
    try:
        namespace = run(f'kubectl get namespace --selector=MSSQL_CLUSTER -o jsonpath={{.items[0].metadata.name}}', return_output=True)
    except BaseException:
        from IPython.display import Markdown

        print("ERROR: Unable to find a Kubernetes namespace with label 'MSSQL_CLUSTER'.  SQL Server Big Data Cluster Kubernetes namespaces contain the label 'MSSQL_CLUSTER'.")

        # Point at the notebooks that help diagnose the namespace/context, then re-raise
        for _hint in (
            'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.',
            'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.',
            'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.',
        ):
            display(Markdown(_hint))

        raise
else:
    namespace = os.environ["AZDATA_NAMESPACE"]

print(f'The SQL Server Big Data Cluster Kubernetes namespace is: {namespace}')

### Create a temporary directory to stage files

In [None]:
# Create a temporary directory to hold configuration files

import tempfile

# The staging directory for everything that makes up the app (notebooks, run.py, spec.yaml etc.)
temp_dir = tempfile.mkdtemp()

print(f"Temporary directory created: {temp_dir}")

### Copy notebook files to app-deploy staging folder

Copy notebooks from library into staging directory. The notebooks are
prefixed with ‘step001’ etc. so their order is maintained and they will
be run in order.

Each notebook may also contain `expert rules` whose expression if
matched in the output of a notebook, requires further notebooks to be
run. Those additional notebooks will be staged as well.

In [None]:
import json

from shutil import copyfile

# Always copy in run001, it's used to run all other notebooks
#
# The additional notebooks to stage (unprefixed).  run001 is always included, it's used to run
# all other notebooks.
additional_notebooks = [
    os.path.join("..", "notebook-runner", "run001-run-notebook.ipynb")
]

def load_json(filename):
    """Load a json file from disk and return the contents"""
    with open(filename, encoding="utf8") as json_file:
        return json.load(json_file)

def look_for_additional_notebooks(n):
    """Check to see if this notebook has expert rules requiring other notebooks that need to be run (on expression match)

    Every notebook referenced (the [1] element of each expanded rule) is appended to
    `additional_notebooks`, and is then itself searched for further expert rules.

    A notebook already in `additional_notebooks` is skipped, so notebooks whose rules reference each
    other (A -> B -> A) do not recurse forever, and each notebook is recorded only once.

    Args:
        n: path to the notebook (.ipynb) file to inspect
    """
    j = load_json(n)

    if "metadata" in j and \
        "azdata" in j["metadata"] and \
        "expert" in j["metadata"]["azdata"] and \
        "expanded_rules" in j["metadata"]["azdata"]["expert"]:

        rules = j["metadata"]["azdata"]["expert"]["expanded_rules"]

        for rule in rules:
            additional_notebook = rule[1]

            if additional_notebook in additional_notebooks:
                continue # Already recorded (and its own rules already followed)

            additional_notebooks.append(additional_notebook)

            # Recursively continue looking for additional notebooks in the expert rules
            #
            look_for_additional_notebooks(additional_notebook)

# Stage each notebook under a 'stepNNN-' prefix (so the run order is preserved), and collect
# any further notebooks referenced by its expert rules
step_counter = 1
for notebook in notebooks:
    destination_file = f"step{step_counter:03d}-{os.path.basename(notebook)}"
    copyfile(notebook, os.path.join(temp_dir, destination_file))
    print(f"Staged: {destination_file}")

    look_for_additional_notebooks(notebook)

    step_counter += 1

### Copy additional notebook files to app-deploy staging folder

A notebook may contain `expert rules` which contain expressions, which
if matched in the output of a notebook, require further additional
notebooks to be run. Copy those additional notebooks into the staging
directory.

In [None]:
from shutil import copyfile

# Additional notebooks keep their own file name in the staging directory (no 'stepNNN-' prefix)
for notebook in additional_notebooks:
    staged_name = os.path.basename(notebook)
    copyfile(notebook, os.path.join(temp_dir, staged_name))
    print(f"Staged: {staged_name}")

### Helper function to save configuration files to disk

In [None]:
# Define helper function 'save_file' to save configuration files to the temporary directory created above
import os
import io

def save_file(filename, contents):
    """Write `contents` to `filename` in the temporary staging directory, as UTF-8 with '\\n' line endings"""
    target = os.path.join(temp_dir, filename)

    with io.open(target, "w", encoding='utf8', newline='\n') as handle:
        handle.write(contents)

        print("File saved: " + target)

print("Function `save_file` defined successfully.")

### Get the controller username and password

Get the controller username and password from the Kubernetes Secret
Store and place in the required AZDATA\_USERNAME and AZDATA\_PASSWORD
environment variables.

In [None]:
# Place controller secret in AZDATA_USERNAME/AZDATA_PASSWORD environment variables

import os, base64

# Each value is read from the controller-login-secret using kubectl, stored as returned, and
# then replaced with its base64-decoded (utf-8) form
os.environ["AZDATA_USERNAME"] = run(f'kubectl get secret/controller-login-secret -n {namespace} -o jsonpath={{.data.username}}', return_output=True)
os.environ["AZDATA_USERNAME"] = base64.b64decode(os.environ["AZDATA_USERNAME"]).decode('utf-8')

os.environ["AZDATA_PASSWORD"] = run(f'kubectl get secret/controller-login-secret -n {namespace} -o jsonpath={{.data.password}}', return_output=True)
os.environ["AZDATA_PASSWORD"] = base64.b64decode(os.environ["AZDATA_PASSWORD"]).decode('utf-8')


# Only the username is printed, never the password
print(f"Controller username '{os.environ['AZDATA_USERNAME']}' and password stored in environment variables")

### Configure the app-deploy azdata python dependencies

NOTE: These dependencies are installed into `azdata` python installation
in the app-deploy container, so they are available to
`azdata notebook run`.

NOTE: azdata notebook run does not support injecting arguments with """
(which can encode newlines), therefore treat " " as a need for a newline
(and replace).

In [None]:
# Requirements are injected space separated; write them one per line
save_file("app-requirements.txt", app_requirements_txt.replace(" ", "\n"))

### Configure container scoped dependencies

These are the dependencies that need to be installed before Python pip
install is called (requirements.txt)

-   azdata
-   spark kernels (into azdata python instance)
-   kubectl

In [None]:
# Shell script saved into the staging directory as pre-package-install.sh.  It installs azdata,
# the ODBC driver, the requirements in app-requirements.txt, the Spark/PySpark Jupyter kernels,
# PowerShell (with the SqlServer module) and kubectl.
pre_package_install_sh = """
apt-get install gnupg ca-certificates curl apt-transport-https lsb-release -y

# Install AZDATA
#
wget -qO- https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
add-apt-repository "$(wget -qO- https://packages.microsoft.com/config/ubuntu/16.04/mssql-server-2019.list)"
apt-get update
apt-get install -y azdata-cli
azdata --version

ACCEPT_EULA=Y apt-get install -y msodbcsql17 unixodbc-dev

# Install Spark (Scala) and PySpark Jupyter kernels
#
/opt/azdata/bin/python3 -m pip install -r /var/opt/app/app-requirements.txt
/opt/azdata/bin/python3 -m pip install sparkmagic
/opt/azdata/bin/python3 /opt/azdata/bin/jupyter-kernelspec install --user /opt/azdata/lib/python3.6/site-packages/sparkmagic/kernels/sparkkernel
/opt/azdata/bin/python3 /opt/azdata/bin/jupyter-kernelspec install --user /opt/azdata/lib/python3.6/site-packages/sparkmagic/kernels/pysparkkernel

mkdir -p /root/.sparkmagic
echo '{ "ignore_ssl_errors": true }' > /root/.sparkmagic/config.json

# Install SQL and Powershell notebook kernels
#

# Download the Microsoft repository GPG keys
wget -q https://packages.microsoft.com/config/ubuntu/16.04/packages-microsoft-prod.deb

# Register the Microsoft repository GPG keys
dpkg -i packages-microsoft-prod.deb

# Update the list of products
apt-get update

# Install PowerShell
apt-get install -y powershell

# Install SqlServer module
pwsh -Command "Install-Module -Name SqlServer -force"

# Install KUBECTL
#
apt-get update && apt-get install -y apt-transport-https
curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" | tee -a /etc/apt/sources.list.d/kubernetes.list
apt-get update
apt-get install -y kubectl
"""

save_file("pre-package-install.sh", pre_package_install_sh)

### The app-deploy python script

This script runs in the app-deploy container.

NOTE: The app-deploy container itself supports only Python 3.5, so
features like interpolated strings will not work in the script that
app-deploy launches. But the azdata CLI is installed into Python 3.6, so
the notebooks that azdata CLI runs can use the later features of Python.

In [None]:
# The source of run.py, the script that app-deploy launches (entry point: handler).
#
# This is an f-string: {app_name}, {app_version}, {namespace} and the USE_AD_AUTH value are
# substituted now, when the file is generated.  Doubled braces ({{ }}) come out as single braces
# in run.py, where they are the placeholders for its own .format() calls (run.py cannot use
# interpolated strings).
#
run_py = f"""
import os,  json, datetime, shlex, subprocess, glob, base64
from subprocess import PIPE, Popen

# Retry hints for transient errors that happen before cell execution.  There are more retry hints for cell level
# transient errors inside the notebook run() function.
#
retry_hints = [
    "Error: Address already in use"  # About .1% this is seen, resulting in RuntimeError: Kernel died before replying to kernel_info
]

MAX_RETRIES = 10

# NOTEBOOK_CELL_TIMEOUT
#
# 4 hours (14400 seconds) per cell timeout (overrides the 10 minute default, e.g. core dump file 
# copies can take a very long time).  Note, individual notebooks can have their own
# timeout, so this timeout needs to be larger than any individual notebooks. 
#
NOTEBOOK_CELL_TIMEOUT = 14400

USE_AD_AUTH = {"True" if "AZDATA_AD_AUTH" in os.environ else "False"}

def run(cmd, retry_count=0, return_output=False):

    env = os.environ.copy()
    env["ACCEPT_EULA"] = "yes"

    print("START:" + cmd)
    process = Popen(shlex.split(cmd), stdout=PIPE, stderr=PIPE, env=env)
    stdout, stderr = process.communicate()
    returncode = process.returncode

    stdout = str(stdout.decode())
    if stdout != "" and not return_output: # Don't print if returning, may contain secrets
        print("STDOUT: " + stdout)

    stderr = str(stderr.decode())
    if stderr != "":
        print("STDERR: " + stderr)

    for retry_hint in retry_hints:
        if stderr.find(retry_hint) != -1:
            if retry_count < MAX_RETRIES:
                retry_count = retry_count + 1
                print("RETRY: Retrying {{0}} (due to: {{1}})".format(retry_count, retry_hint))

                # Pass return_output through, so a retried call returns the same shape as the first call
                return run(cmd, retry_count=retry_count, return_output=return_output)

    print(("SUCCESS: " if returncode == 0 else "ERROR: ") + cmd)

    if return_output:
        return returncode, stdout
    else:
        return returncode

def handler():

    session_start = datetime.datetime.utcnow()

    # Remove output-*.ipynb files from previous runs
    #
    for notebook in glob.glob("output-*.ipynb"):
        os.remove(notebook)

    # Run notebooks in order
    #
    files = glob.glob('step*.ipynb')
    files.sort()

    exit_code = 0 # Returned as is when there are no step*.ipynb files to run

    for file in files:
        exit_code = run('azdata notebook run --path "run001-run-notebook.ipynb" -a "{{{{\\\\"session_start\\\\": \\\\"{{0}}\\\\", \\\\"app_name\\\\": \\\\"{app_name}\\\\", \\\\"app_version\\\\": \\\\"{app_version}\\\\", \\\\"notebook_path\\\\": \\\\"{{1}}\\\\", \\\\"use_ad_auth\\\\": \\\\"{{4}}\\\\"}}}}" --output-path "{{2}}" --output-html --timeout {{3}}'.format(session_start, file, os.getcwd(), NOTEBOOK_CELL_TIMEOUT, "True" if USE_AD_AUTH else "False"))

        if exit_code != 0:

            hdfs_destination = 'app-deploy/{{0}}-{{1}}/logs/{{2}}/{{3:02d}}/{{4:02d}}/{{5:02d}}-{{6:02d}}-{{7}}-{{8}}'.format(
                "{app_name}", "{app_version}", session_start.year, session_start.month, session_start.day, session_start.hour, session_start.minute, "FAILURE", "run001-run-notebook.ipynb" +  "-" + file)

            _, os.environ["AZDATA_USERNAME"] = run("kubectl get secret/controller-login-secret -n {namespace} -o jsonpath={{.data.username}}", return_output=True)
            os.environ["AZDATA_USERNAME"] = base64.b64decode(os.environ["AZDATA_USERNAME"]).decode('utf-8')

            _, os.environ["AZDATA_PASSWORD"] = run("kubectl get secret/controller-login-secret -n {namespace} -o jsonpath={{.data.password}}", return_output=True)
            os.environ["AZDATA_PASSWORD"] = base64.b64decode(os.environ["AZDATA_PASSWORD"]).decode('utf-8')

            run("azdata bdc hdfs cp --from-path {{0}} --to-path hdfs:/{{1}}".format(
                os.path.join(os.getcwd(), "output-run001-run-notebook.ipynb").replace("\\\\", "\\\\\\\\"), hdfs_destination))

            break

    return exit_code

if __name__ == '__main__':
    handler()
"""

save_file("run.py", run_py)

### Configure the app-deploy spec yaml file

In [None]:
# The app-deploy spec: the app name/version, and the entry point (`handler` in ./run.py)
spec = f"""
entrypoint: handler
name: {app_name}
output:
  out: str
owners: []
poolsize: 1
readers: []
replicas: 1
runtime: Python
src: ./run.py
version: {app_version}
"""

save_file("spec.yaml", spec)

### Delete the app-deploy if previously created

In [None]:
# List the app (capturing the output) to find out whether it already exists
app_exists = run(f"azdata app list --name {app_name} --version {app_version}", return_output=True)

if len(app_exists) > 2:  # 2 is the length returned when the app does not exist
    run(f"azdata app delete --name {app_name} --version {app_version}")

# NOTE: Provide diagnostic information for a 409 result (Conflict) in the next step, run `app list` to verify
# the app has been deleted.  409 (Conflict) has been seen below
#
run(f"azdata app list --name {app_name} --version {app_version}")

### Create the app-deploy application

In [None]:
# Create the app from the staging directory (which holds spec.yaml, run.py and the staged notebooks)
run(f'azdata app create --spec "{temp_dir}"')

### Wait for the app-deploy application to get into Ready state

In [None]:
import json

# Poll `azdata app describe` until the app leaves the transitional states
# below, giving up after MAX_CHECKS_FOR_TO_REACH_READY_STATE polls.
state = "WaitingForCreate"
counter = 0
MAX_CHECKS_FOR_TO_REACH_READY_STATE = 150 # It can take a while for all the requirements to be installed!

while state in ("Creating", "WaitingForCreate", "WaitingForCredentials"):
    app_state = run(f'azdata app describe --name {app_name} --version {app_version} --spec "{temp_dir}"', return_output=True)

    app_state_json = json.loads(app_state)

    try:
        state = app_state_json["state"]
    except TypeError as e: # Sometimes, we see "TypeError: string indices must be integers"
        # Use the exception text as the state.  (The handler previously read
        # an undefined name here, so this path raised NameError instead.)
        # NOTE(review): this text is not one of the waiting states, so the
        # loop ends after this poll - confirm that is the intended behavior.
        state = str(e)

    counter = counter + 1
    print(f"State: '{state}' (waiting ({counter}) for 'Ready' state)")
    print('')

    if counter == MAX_CHECKS_FOR_TO_REACH_READY_STATE:
        print (app_state)
        raise SystemExit(f'App has not moved to Ready state in {counter} attempts')

print (f"App state: {app_state}")

if state == "Error":
    raise SystemExit('App is in Error state')

print ("App successfully moved from WaitingForCreate through to Ready state")

### Show the app-deploy details

In [None]:
# Run `azdata app describe` for the deployed app name/version (the output is
# not captured here, unlike the polling loop that parses it).
run(f"azdata app describe --name {app_name} --version {app_version}")

### Create the Kubernetes RBAC settings

In [None]:
# Look up the cluster role binding named after this app, version and namespace
role_binding = run(f"kubectl get clusterrolebindings --field-selector metadata.name={app_name}-{app_version}-{namespace}-default-admin-binding --no-headers -o jsonpath={{.items}}", return_output=True)

# An empty item list means the binding does not exist yet, so create it,
# binding the namespace's default service account to cluster-admin.
binding_missing = (role_binding == "[]")

if binding_missing:
    run(f"kubectl create clusterrolebinding {app_name}-{app_version}-{namespace}-default-admin-binding --clusterrole=cluster-admin --user=system:serviceaccount:{namespace}:default")

### Verify the app-deploy installation logs

The app-deploy installation logs are in the directory:

    /var/log/supervisor/log

e.g.

-   AppSetup-stderr---supervisor-AZYYnr.log
-   MLServer-stderr---supervisor-RgvrIA.log
-   AppSetup-stdout---supervisor-qsAKzn.log
-   MLServer-stdout---supervisor-vRFUc5.log

In [None]:
import re

# Number of trailing lines to fetch from each log file
tail_lines = 2000
# Name of the first pod carrying this app's `app=` label in the namespace
pod = run(f'kubectl get pod --selector=app=app-{app_name}-{app_version} -n {namespace} -o jsonpath={{.items[0].metadata.name}}', return_output=True)
# Only this container's logs are tailed
container = "mlserver"
# Shell glob (expanded inside the container) for the supervisor log files
log_files = [ "/var/log/supervisor/log/*-*---supervisor-*.log" ]

# Patterns used to pick out log lines for later analysis; each must provide a
# `.match(line)` method (presumably compiled `re` patterns).  The list is
# empty here, so no log lines are collected for analysis.
expressions_to_analyze = [ ]

### Get tail for log

In [None]:
# Tail the last 'tail_lines' lines of every file in 'log_files' from the
# selected pod/container, echo each line, and collect the lines that match
# any pattern in 'expressions_to_analyze'.

pods = api.list_namespaced_pod(namespace)

entries_for_analysis = []

for p in pods.items:
    # A 'pod' of None selects every pod in the namespace
    if pod is not None and p.metadata.name != pod:
        continue

    for c in p.spec.containers:
        # A 'container' of None selects every container in the pod
        if container is not None and c.name != container:
            continue

        for log_file in log_files:
            print (f"- LOGS: '{log_file}' for CONTAINER: '{c.name}' in POD: '{p.metadata.name}'")
            try:
                output = stream(api.connect_get_namespaced_pod_exec, p.metadata.name, namespace, command=['/bin/sh', '-c', f'tail -n {tail_lines} {log_file}'], container=c.name, stderr=True, stdout=True)
            except Exception:
                # Best effort: report the failure and move on to the next file
                print (f"FAILED to get LOGS for CONTAINER: {c.name} in POD: {p.metadata.name}")
                continue

            for line in output.split('\n'):
                # A line is recorded once for every expression it matches
                entries_for_analysis.extend(line for expression in expressions_to_analyze if expression.match(line))
                print(line)

print("")
print(f"{len(entries_for_analysis)} log entries found for further analysis.")

### Analyze log entries and suggest relevant Troubleshooting Guides

In [None]:
# Analyze log entries and suggest further relevant troubleshooting guides

from IPython.display import Markdown

import os
import json
import requests
import ipykernel
import datetime

from urllib.parse import urljoin
from notebook import notebookapp

def get_notebook_name():
    """Return the path of the running jupyter notebook, or None if unknown.

    Some runtimes (e.g. ADS) have the kernel_id in the filename of the
    connection file.  If so, the notebook name at runtime can be determined
    by asking each server from `list_running_servers` for its sessions and
    matching on the kernel id.  Other runtimes (e.g. azdata) do not have the
    kernel_id in the filename of the connection file, therefore the filename
    cannot be established.

    Returns:
        The `path` of the session whose kernel id matches this kernel, or
        None when the kernel id cannot be extracted or no session matches.
    """
    connection_file = os.path.basename(ipykernel.get_connection_file())

    # The kernel id is taken as the text between the first '-' and the next
    # '.' of the connection file name; no '-' means there is no kernel id.
    try:
        kernel_id = connection_file.split('-', 1)[1].split('.')[0]
    except IndexError:
        return None

    for server in notebookapp.list_running_servers():
        # Best-effort probe with a very short (10 ms) timeout: a server that
        # cannot be queried is skipped.  `Exception` (rather than a bare
        # except) lets KeyboardInterrupt/SystemExit propagate.
        try:
            response = requests.get(urljoin(server['url'], 'api/sessions'), params={'token': server.get('token', '')}, timeout=.01)
        except Exception:
            continue

        for session in json.loads(response.text):
            if session['kernel']['id'] == kernel_id:
                return session['path']

    return None

def load_json(filename):
    """Read the UTF-8 encoded file `filename` and return its parsed JSON content."""
    with open(filename, encoding="utf8") as handle:
        parsed = json.load(handle)

    return parsed

def get_notebook_rules():
    """Load the notebook rules from the metadata of this notebook (in the .ipynb file).

    Returns:
        None when the notebook file name cannot be determined; an empty list
        when the notebook metadata has no `azdata` -> `expert` ->
        `log_analyzer_rules` entry; otherwise the value of that entry.
    """
    file_name = get_notebook_name()

    if file_name is None:
        return None

    metadata = load_json(file_name)["metadata"]

    # Every level of the nested entry is optional; a gap at any level means
    # "no rules".
    if "azdata" not in metadata or \
        "expert" not in metadata["azdata"] or \
        "log_analyzer_rules" not in metadata["azdata"]["expert"]:
        return []

    return metadata["azdata"]["expert"]["log_analyzer_rules"]

rules = get_notebook_rules()

if rules is None:
    # The notebook file could not be located, so its rules cannot be read
    print("")
    print("Log Analysis only available when run in Azure Data Studio.  Not available when run in azdata.")
else:
    hints = 0

    # For each rule, rule[0] is the text searched for in a log entry, and
    # rule[2] / rule[3] become the link text / link target of the hint.
    for entry in entries_for_analysis:
        for rule in rules:
            if rule[0] in entry:
                print (entry)

                display(Markdown(f'HINT: Use [{rule[2]}]({rule[3]}) to resolve this issue.'))
                hints = hints + 1

    print("")
    print(f"{len(entries_for_analysis)} log entries analyzed (using {len(rules)} rules). {hints} further troubleshooting hints made inline.")

### Clean up temporary directory for staging configuration files

In [None]:
# Delete the temporary directory used to hold configuration files.
# `temp_dir` is the directory that was passed as `--spec` to `azdata app create`;
# it is removed recursively, together with everything in it.

import shutil

shutil.rmtree(temp_dir)

print(f'Temporary directory deleted: {temp_dir}')

In [None]:
# Final cell: emit a completion marker once every cell above has run
print('Notebook execution complete.')

Related
-------

-   [OPR002 - Run
    app-deploy](../notebook-o16n/opr002-run-app-deploy.ipynb)

-   [OPR600 - Monitor infrastructure
    (Kubernetes)](../notebook-o16n/opr600-monitor-infrastructure-k8s.ipynb)

-   [OPR900 - Troubleshoot run
    app-deploy](../notebook-o16n/opr900-troubleshoot-run-app-deploy.ipynb)