<h1><center>Ingest History Data from Apigee with Dataproc Cluster</center></h1>
<a id="tc"></a>

## Table of Contents
1. [Configuration](#configuration) 
2. [History Ingest to Google Cloud Storage](#ingest)
3. [Save states of jobs to GCS](#states)
4. [Push Notebook to GCS Bucket](#gcs)

<a id="configuration"></a>
## Configuration
[back to Table Of Contents](#tc)

In [5]:
import os 
os.environ["JAVA_HOME"] = '/usr/lib/jvm/jdk1.8.0_221'
os.environ["PATH"] += os.pathsep + os.environ["JAVA_HOME"] + '/bin'

In [6]:
BUCKET = 'ai4ops-main-storage-bucket'
PROJECT = 'kohls-kos-cicd'
CLUSTER = 'ai4ops'
REGION='global'
SCRIPT_PATH = 'poc/spark/ingest'
AI4OPS_HISTORY_PATH=f"gs://{BUCKET}/apigee_history/apigee/metrics/history"
RESOURCES='/opt/dataproc/.resources'

INGEST_JOB_1='job_part_kohls_06_01.json'
INGEST_JOB_2='job_part_kohls_06_02.json'
INGEST_JOB_3='job_part_kohls_06_03.json'

arguments = {'--token_file_gcs_path':f'gs://{BUCKET}/resources/a_with_proxy_v2.0.txt',\
             '--res_path':RESOURCES,\
             '--by_proxy':'50',
             '--by_time':'480',
             '--batch_size':'50'\
            }

<a id="ingest"></a>
## History Ingest to Google Cloud Storage
[back to Table Of Contents](#tc)

In [1]:
from job_api import *
import importlib
from datetime import datetime
import sys
import pyspark
import json

In [8]:
builder = DataprocJobBuilder()
session = Session(BUCKET, REGION, CLUSTER, PROJECT)



In [10]:
%%py_script --name yarn_logging.py --path poc/spark/ingest
import os
import logging
import sys


class YarnLogger:
    @staticmethod
    def setup_logger():
        if not 'LOG_DIRS' in os.environ:
            sys.stderr.write('Missing LOG_DIRS environment variable, pyspark logging disabled')
            return

        file = os.environ['LOG_DIRS'].split(',')[0] + '/pyspark.log'
        logging.basicConfig(filename=file, level=logging.INFO,
                            format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s')

    def __getattr__(self, key):
        return getattr(logging, key)


YarnLogger.setup_logger()

<job_api.PyScript at 0x7f52b4774f98>

In [11]:
job_1_name = "api_ai4ops_history_ingest_{}".format(int(datetime.now().timestamp()))

arguments['--tasks_file_path'] = f'./{INGEST_JOB_1}'
arguments['--output_file_pattern_path'] = f'{AI4OPS_HISTORY_PATH}/{job_1_name}'

job1 = builder.job_file(f'{SCRIPT_PATH}/apigee_history_ingest.py')\
.job_id(job_1_name)\
.py_file(f'{SCRIPT_PATH}/apigee_ingest_utils.py')\
.py_file(f'{SCRIPT_PATH}/ai4ops_db.py')\
.py_script('yarn_logging.py')\
.jar(f"gs://{BUCKET}/resources/spark.http.apigee-1.0-SNAPSHOT-jar-with-dependencies.jar")\
.file(f'{SCRIPT_PATH}/jobs/{INGEST_JOB_1}')\
.arguments(**arguments)\
.build_job()


executor1 = DataprocExecutor(job1, session)

In [12]:
job_2_name = "api_ai4ops_history_ingest_{}".format(int(datetime.now().timestamp()))

arguments['--tasks_file_path'] = f'./{INGEST_JOB_2}'
arguments['--output_file_pattern_path'] = f'{AI4OPS_HISTORY_PATH}/{job_2_name}'
# builder = DataprocJobBuilder()
job2 = builder.job_file(f'{SCRIPT_PATH}/apigee_history_ingest.py')\
.job_id(job_2_name)\
.py_file(f'{SCRIPT_PATH}/apigee_ingest_utils.py')\
.py_file(f'{SCRIPT_PATH}/ai4ops_db.py')\
.py_script('yarn_logging.py')\
.jar(f"gs://{BUCKET}/resources/spark.http.apigee-1.0-SNAPSHOT-jar-with-dependencies.jar")\
.file(f'{SCRIPT_PATH}/jobs/{INGEST_JOB_2}')\
.arguments(**arguments)\
.build_job()


executor2 = DataprocExecutor(job2, session)

In [13]:
job_3_name = "api_ai4ops_history_ingest_{}".format(int(datetime.now().timestamp()))

arguments['--tasks_file_path'] = f'./{INGEST_JOB_3}'
arguments['--output_file_pattern_path'] = f'{AI4OPS_HISTORY_PATH}/{job_3_name}'
# builder = DataprocJobBuilder()
job3 = builder.job_file(f'{SCRIPT_PATH}/apigee_history_ingest.py')\
.job_id(job_3_name)\
.py_file(f'{SCRIPT_PATH}/apigee_ingest_utils.py')\
.py_file(f'{SCRIPT_PATH}/ai4ops_db.py')\
.py_script('yarn_logging.py')\
.jar(f"gs://{BUCKET}/resources/spark.http.apigee-1.0-SNAPSHOT-jar-with-dependencies.jar")\
.file(f'{SCRIPT_PATH}/jobs/{INGEST_JOB_3}')\
.arguments(**arguments)\
.build_job()


executor3 = DataprocExecutor(job3, session)

In [46]:
res1 = executor1.submit_job(run_async=True)

Job with id api_ai4ops_history_ingest_1567426379 was submitted to the cluster ai4ops


In [47]:
res2 = executor2.submit_job(run_async=True)

Job with id api_ai4ops_history_ingest_1567426380 was submitted to the cluster ai4ops


In [48]:
res3 = executor3.submit_job(run_async=True)

Job with id api_ai4ops_history_ingest_1567426381 was submitted to the cluster ai4ops


In [49]:
sleep(60)
state1 = executor1.get_job_state()
state2 = executor2.get_job_state()
state3 = executor3.get_job_state()

print('State 1: {}'.format(state1))
print('State 2: {}'.format(state2))
print('State 3: {}'.format(state3))

if state1 not in ['DONE', 'RUNNING']:
    raise RuntimeError('Previous workflow step was failed')

if state2 not in ['DONE', 'RUNNING']:
    raise RuntimeError('Previous workflow step was failed')

if state3 not in ['DONE', 'RUNNING']:
    raise RuntimeError('Previous workflow step was failed')

State 1: RUNNING
State 2: RUNNING
State 3: RUNNING


In [37]:
ingest_transition = {
    "INGEST_JOB_1": f"{job1.job_id}",
    "INGEST_JOB_2": f"{job2.job_id}",
    "INGEST_JOB_3": f"{job3.job_id}",
    "INGEST_TIMESTAMP": f"{int(datetime.now().timestamp())}",
    "INGEST_BUCKET": f"{BUCKET}",
    "INGEST_OUTPUT_JOB_1": f"{AI4OPS_HISTORY_PATH}/{job_1_name}/chunk*",
    "INGEST_OUTPUT_JOB_2": f"{AI4OPS_HISTORY_PATH}/{job_2_name}/chunk*",
    "INGEST_OUTPUT_JOB_3": f"{AI4OPS_HISTORY_PATH}/{job_3_name}/chunk*",
    "INGEST_STATE_JOB_1": f"{state1}",
    "INGEST_STATE_JOB_2": f"{state2}",
    "INGEST_STATE_JOB_3": f"{state3}"
}

with open('api_transition_ingest.json', 'w') as file:
     file.write(json.dumps(ingest_transition)) 

In [35]:
executor1.get_job_state()

'RUNNING'

<a id="gcs"></a>
## Push Notebook to GCS Bucket
[back to Table Of Contents](#tc)

In [None]:
!gsutil cp api_history_ingest.ipynb gs://ai4ops-main-storage-bucket/ai4ops-source/ai4ops-jupyter-ds-03/api