In [None]:
import logging, uuid, time, csv

from kubernetes import client, config, watch

import os

logging.basicConfig(level=logging.INFO)

# Location of the kubeconfig used for every API call in this notebook.
# Set K8S_EXPERIMENT_KUBECONFIG to point the notebook at another cluster/user
# without editing the code; the original path remains the default.
KUBECONFIG_PATH = os.environ.get(
    "K8S_EXPERIMENT_KUBECONFIG",
    "/home/goughes/k8s/configs/erikdev-admin.yaml",
)
config.load_kube_config(KUBECONFIG_PATH)

# Module-level API clients shared by the monitoring/analysis cells below.
core_api = client.CoreV1Api()
batch_api = client.BatchV1Api()

In [None]:
# based on https://medium.com/@aris.david/how-to-create-a-job-using-kubernetes-python-client-ed00ac2b791d
class Kubernetes:
    """Helpers for creating, listing and deleting `sleep` Jobs via the Kubernetes API.

    Instance methods use the clients created in ``__init__``. Static methods
    build their own client from the currently loaded kube configuration, so
    they do not depend on any notebook-level variable (they can be called as
    ``Kubernetes.method(...)`` or through any instance).
    """

    # Image and image pull policy given to every sleep container submitted
    # by this class.
    SLEEP_IMAGE = "busybox:1.36"
    PULL_POLICY = "Never"

    def __init__(self):
        # API clients built from the currently loaded kube configuration.
        self.core_api = client.CoreV1Api()
        self.batch_api = client.BatchV1Api()

    def create_namespace(self, namespace):
        """Create ``namespace`` unless it already exists.

        Args:
            namespace: name of the namespace to create or reuse.

        Returns:
            The ``namespace`` argument, unchanged.
        """
        existing = {ns.metadata.name for ns in self.core_api.list_namespace().items}

        if namespace in existing:
            logging.info(f"Namespace {namespace} already exists. Reusing.")
        else:
            self.core_api.create_namespace(
                client.V1Namespace(metadata=client.V1ObjectMeta(name=namespace))
            )
            logging.info(f"Created namespace {namespace}.")

        return namespace

    @staticmethod
    def create_container(image, name, pull_policy, cpu_limit, mem_limit, sleep_time):
        """Build a container spec that runs ``sleep <sleep_time>``.

        Requests and limits are both set to ``cpu_limit`` / ``mem_limit``.

        Args:
            image: container image reference.
            name: container name.
            pull_policy: image pull policy passed to the container spec.
            cpu_limit: CPU quantity used for both request and limit.
            mem_limit: memory quantity used for both request and limit.
            sleep_time: argument handed to ``sleep``; converted with ``str()``
                so numeric values are accepted as well as strings.

        Returns:
            The ``client.V1Container`` object.
        """
        resources = client.V1ResourceRequirements(
            requests={"cpu": cpu_limit, "memory": mem_limit},
            limits={"cpu": cpu_limit, "memory": mem_limit},
        )

        container = client.V1Container(
            image=image,
            name=name,
            resources=resources,
            image_pull_policy=pull_policy,
            args=[str(sleep_time)],
            command=["sleep"],
        )

        logging.info(
            f"Created sleep container with name: {container.name}, "
            f"image: {container.image} and args: {container.args}"
        )

        return container

    @staticmethod
    def create_pod_template(pod_name, container):
        """Wrap ``container`` in a pod template that is never restarted.

        The template is named ``pod_name`` and carries a ``pod_name`` label
        with the same value.
        """
        return client.V1PodTemplateSpec(
            spec=client.V1PodSpec(restart_policy="Never", containers=[container]),
            metadata=client.V1ObjectMeta(name=pod_name, labels={"pod_name": pod_name}),
        )

    @staticmethod
    def create_job(job_name, pod_template):
        """Build a ``batch/v1`` Job object (``backoff_limit=0``) around ``pod_template``.

        The Job is named ``job_name`` and carries a ``job_name`` label with
        the same value. Nothing is sent to the cluster here.
        """
        metadata = client.V1ObjectMeta(name=job_name, labels={"job_name": job_name})

        return client.V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=metadata,
            spec=client.V1JobSpec(backoff_limit=0, template=pod_template),
        )

    @staticmethod
    def get_all_pods(namespace):
        """List the pods of ``namespace``, print how many there are and return the list object."""
        pods = client.CoreV1Api().list_namespaced_pod(
            namespace, pretty=True, timeout_seconds=60
        )
        print("number of pods: " + str(len(pods.items)))
        return pods

    @staticmethod
    def get_all_jobs(namespace):
        """List the jobs of ``namespace``, print how many there are and return the list object."""
        jobs = client.BatchV1Api().list_namespaced_job(
            namespace, pretty=True, timeout_seconds=60
        )
        print("number of jobs: " + str(len(jobs.items)))
        return jobs

    @staticmethod
    def delete_all_jobs(namespace):
        """Delete every job in ``namespace``.

        Each delete is issued with ``grace_period_seconds=0`` and
        ``propagation_policy='Background'``; the API response is logged at
        DEBUG level.
        """
        batch = client.BatchV1Api()
        jobs = batch.list_namespaced_job(namespace, pretty=True, timeout_seconds=60)
        for job in jobs.items:
            job_name = job.metadata.name
            print("Deleting job " + job_name)
            api_response = batch.delete_namespaced_job(
                job_name,
                namespace,
                grace_period_seconds=0,
                propagation_policy='Background',
            )
            logging.debug(api_response)

    @staticmethod
    def _submit_sleep_job(batch, namespace, cpu_limit, mem_limit, sleep_time, tag=""):
        """Create and submit one sleep Job in ``namespace`` using the ``batch`` client.

        ``tag`` is inserted into the generated names: with ``tag="burst-"``
        the container is ``burst-sleep-<ns>``, the pod template
        ``<ns>-burst-pod-<uuid>`` and the job ``<ns>-burst-<uuid>``.
        """
        container = Kubernetes.create_container(
            Kubernetes.SLEEP_IMAGE,
            f"{tag}sleep-{namespace}",
            Kubernetes.PULL_POLICY,
            cpu_limit,
            mem_limit,
            sleep_time,
        )
        # The same UUID identifies the pod template and its job.
        uid = uuid.uuid4()
        pod_template = Kubernetes.create_pod_template(
            f"{namespace}-{tag}pod-{uid}", container
        )
        job = Kubernetes.create_job(f"{namespace}-{tag}{uid}", pod_template)
        batch.create_namespaced_job(namespace, job)
        return job

    @staticmethod
    def submit_burst(namespace, cpu_limit, mem_limit, total_jobs, sleep_time):
        """Submit ``total_jobs`` sleep jobs to ``namespace`` back to back.

        Submission is best effort: the first API error stops the burst and is
        logged with its traceback instead of being raised.

        Args:
            namespace: target namespace.
            cpu_limit: CPU request/limit of each job's container.
            mem_limit: memory request/limit of each job's container.
            total_jobs: number of jobs to submit.
            sleep_time: argument passed to ``sleep`` in each container.

        Returns:
            Number of jobs that were actually submitted.
        """
        print("bursting", total_jobs, "sleep", sleep_time)
        burst_submitted = 0
        try:
            # One client for the whole burst instead of one per job.
            batch = client.BatchV1Api()
            while burst_submitted < total_jobs:
                Kubernetes._submit_sleep_job(
                    batch, namespace, cpu_limit, mem_limit, sleep_time, tag="burst-"
                )
                burst_submitted += 1
        except Exception:
            logging.exception(
                "Burst in namespace %s stopped after %d of %s jobs",
                namespace, burst_submitted, total_jobs,
            )
        return burst_submitted

    @staticmethod
    def submit_workflow(namespace, cpu_limit, mem_limit, total_jobs, sleep_time, bursts, interval):
        """Run the submission schedule of one tenant.

        The loop ticks ``total_jobs`` times, sleeping ``interval`` seconds
        after each tick. At each tick every burst whose start time has been
        reached is fired exactly once.

        NOTE: steady, one-job-per-tick submission is currently disabled, so
        only bursts create jobs and ``sleep_time`` is unused. To re-enable it,
        call ``Kubernetes._submit_sleep_job(client.BatchV1Api(), namespace,
        cpu_limit, mem_limit, sleep_time)`` once per tick.

        Args:
            namespace: target namespace.
            cpu_limit: CPU request/limit of each job's container.
            mem_limit: memory request/limit of each job's container.
            total_jobs: number of ticks of the schedule.
            sleep_time: sleep argument for steady jobs (currently unused).
            bursts: iterable of ``[start_time, num_jobs, sleep_time]`` entries;
                ``start_time`` is compared with the accumulated tick time.
            interval: time to wait/sleep between each job submission.
        """
        print("submit")
        # Bursts that have not fired yet. Comparing with ">=" (instead of
        # "==") keeps a burst from being skipped when the accumulated time
        # never equals its start time exactly, e.g. 0.1 + 0.1 + 0.1 != 0.3.
        waiting_bursts = list(bursts)
        execution_time = 0
        jobs_submitted = 0
        while jobs_submitted < total_jobs:
            not_due = []
            for burst in waiting_bursts:
                if execution_time >= burst[0]:
                    print("Submitting burst at " + str(execution_time))
                    Kubernetes.submit_burst(
                        namespace, cpu_limit, mem_limit, burst[1], burst[2]
                    )
                else:
                    not_due.append(burst)
            waiting_bursts = not_due

            jobs_submitted += 1
            execution_time += interval
            time.sleep(interval)

        if waiting_bursts:
            logging.warning(
                "%d burst(s) for namespace %s start after the schedule ended "
                "and were not submitted",
                len(waiting_bursts), namespace,
            )

In [None]:
# Helper instance shared by the remaining cells.
k8s = Kubernetes()

# Namespaces taking part in the run ("tenant2" and "tenant3" are currently left out).
namespaces = ["tenant1"]

# Remove jobs left behind by a previous run from every namespace.
for namespace in namespaces:
    k8s.delete_all_jobs(namespace)

# Poll once per second until no pod whose name contains "tenant" is left,
# so the run starts from a clean cluster.
cleaned_up = False
while True:
    print("waiting for pod clean up...")
    all_pods = core_api.list_pod_for_all_namespaces()
    all_tenant_pods = list(
        filter(lambda candidate: "tenant" in candidate.metadata.name, all_pods.items)
    )
    cleaned_up = len(all_tenant_pods) == 0
    if cleaned_up:
        print("pods are cleaned up!")
    time.sleep(1)
    if cleaned_up:
        break

In [None]:


# Format of the parameter list: one entry per tenant, in the order unpacked by
# submit_parallel_workflows below.
# params = [
#     [namespace1, cpu_limit, mem_limit, num_jobs, sleep_time, bursts, submission_interval],
#     [namespace2, cpu_limit, mem_limit, num_jobs, sleep_time, bursts, submission_interval],
# ]
#   num_jobs            - number of ticks of the submission loop in submit_workflow
#   submission_interval - seconds slept between two ticks
#   bursts              - list of [start_time, num_jobs, sleep_time] entries, where
#                         start_time is compared with the elapsed tick time,
#                         num_jobs is the number of jobs in the burst and
#                         sleep_time is the argument passed to `sleep`

bursts_tenant1 = [
    # at t=5: 60 jobs, each running `sleep 20`
    [5, 60, "20"],
    #[120, 60, "20"]
]
params = [
    ["tenant1", "4", "8G", 40, "60", bursts_tenant1, 5], 
]
# NOTE(review): the disabled entries below use the older 6-field layout
# (no bursts list); add a bursts element before re-enabling them.
#["tenant2", "1", "2G", 750, "60", 1.5],
#["tenant3", "1", "2G", 500, "60", 2]

from multiprocess import Pool

def submit_parallel_workflows(params):
    """Run one tenant's workflow from a single parameter entry.

    `params` is indexed positionally as
    [namespace, cpu_limit, mem_limit, num_jobs, sleep_time, bursts, interval]
    and forwarded, in that order, to `k8s.submit_workflow`.
    """
    (
        namespace,
        cpu_limit,
        mem_limit,
        num_jobs,
        sleep_time,
        bursts,
        interval,
    ) = (params[position] for position in range(7))
    k8s.submit_workflow(
        namespace, cpu_limit, mem_limit, num_jobs, sleep_time, bursts, interval
    )

    
def _log_workflow_error(exc):
    """Report an exception raised while a worker process ran a workflow."""
    logging.error("workflow submission failed: %r", exc)


# One worker process per tenant so all workflows are submitted concurrently
# while the following cells monitor the cluster.
p = Pool(len(params))
# error_callback surfaces worker failures that would otherwise stay hidden
# inside `result` (which is never read with .get()).
result = p.map_async(
    submit_parallel_workflows, params, error_callback=_log_workflow_error
)
# Nothing else will be queued: closing lets the workers exit once their
# workflow is done instead of lingering for the lifetime of the kernel.
p.close()



### Dictionary format used to store experiment data

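Assuming `namespaces = ["tenant1", "tenant2", "tenant3"]`, the dictionary holds one `timestamp` list plus three lists per namespace (pending, running and completed CPU cores). One value is appended to every list per sample, so all lists have the same length. For example: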
```
data = {
    'timestamp':[1,2,3,4,5],
    'tenant1_pending':[3,4,3,4,5],
    'tenant1_running':[5,6,5,6,5],
    'tenant1_completed':[7,6,7,8,7],
    'tenant2_pending':[1,1,1,1,1],
    'tenant2_running':[2,2,2,2,2],
    'tenant2_completed':[3,3,3,3,3],
    'tenant3_pending':[2,3,2,3,2],
    'tenant3_running':[3,4,3,4,3],
    'tenant3_completed':[5,5,5,5,5],
}
```

In [None]:
import pprint

# Sums the most recent pending/running/completed samples over all namespaces.
def get_totals(namespaces):
    """Return (pending, running, completed) totals from the latest samples.

    Reads the module-level `exp_data` dictionary. If any of the three series
    of any namespace is still empty, (0, 0, 0) is returned.
    """
    states = ("pending", "running", "completed")
    sums = dict.fromkeys(states, 0)
    for ns in namespaces:
        # Bail out as soon as one series has no sample yet.
        if any(not exp_data[ns + "_" + state] for state in states):
            return 0, 0, 0
        for state in states:
            sums[state] += exp_data[ns + "_" + state][-1]
    return sums["pending"], sums["running"], sums["completed"]


# Monitoring cell: once per second, sample the CPU cores that are pending,
# running and completed in every namespace and append them to exp_data,
# until nothing is pending or running and at least one pod has succeeded.

# initialize data dictionary
exp_data = {}
exp_data['timestamp'] = [] # empty array for timestamps

for namespace in namespaces:
    exp_data[namespace+'_pending'] = [] # per-namespace series of pending cores
    exp_data[namespace+'_running'] = [] # per-namespace series of running cores
    exp_data[namespace+'_completed'] = [] # per-namespace series of completed cores


finished = False
while not finished:
    # get pods from all namespaces
    all_pods = core_api.list_pod_for_all_namespaces()
    # get jobs from all namespaces
    all_jobs = batch_api.list_job_for_all_namespaces()
    
    # filter for pods/jobs with "tenant" in the name
    all_tenant_pods = [ pod for pod in all_pods.items if "tenant" in pod.metadata.name]
    all_tenant_jobs = [ job for job in all_jobs.items if "tenant" in job.metadata.name]
    
    # insert the timestamp
    exp_data['timestamp'].append(int(time.time())) # epoch time
    
    # iterate through namespaces and collect the pending/running/completed cores
    for namespace in namespaces:
        # pods/jobs of the current tenant, matched by namespace substring in the name
        tenant_pods = [ pod for pod in all_tenant_pods if namespace in pod.metadata.name]
        tenant_jobs = [ job for job in all_tenant_jobs if namespace in job.metadata.name]
        
        running_cores = 0
        pending_cores = 0
        completed_cores = 0
        
        # loop through pods and add the CPU limit of the first container to
        # the counter matching the pod phase
        # NOTE(review): int() only accepts whole-core limits such as "4";
        # a value like "500m" would raise ValueError - confirm limits stay integral.
        # NOTE(review): pods in any other phase (e.g. "Failed") are not counted.
        for pod in tenant_pods:
            cores = int(pod.spec.containers[0].resources.limits['cpu'])
            if pod.status.phase == "Pending":
                pending_cores = pending_cores + cores
            elif pod.status.phase == "Running":
                running_cores = running_cores + cores
            elif pod.status.phase == "Succeeded":
                completed_cores = completed_cores + cores
        
        # check for pending jobs - jobs that are unable to submit a pod due to quota limits
        for job in tenant_jobs:
            has_pod = False
            job_name = job.metadata.name
            # a pod belongs to the job when the job name is contained in the pod name
            for pod in tenant_pods:
                if job_name in pod.metadata.name:
                    has_pod = True
            # a job with no pod and no recorded activity counts as pending with
            # the CPU limit of its template's first container
            if not has_pod and job.status.active is None and job.status.terminating is None and job.status.succeeded is None and job.status.completion_time is None and job.status.ready == 0 and job.status.conditions is None :
                job_cores = int(job.spec.template.spec.containers[0].resources.limits['cpu'])
                pending_cores = pending_cores + job_cores
                
        exp_data[namespace+'_pending'].append(pending_cores)
        exp_data[namespace+'_running'].append(running_cores)
        exp_data[namespace+'_completed'].append(completed_cores)
    
    # check to see if there are still any running or pending jobs
    # NOTE(review): the loop only ends once total_completed > 0, so it never
    # terminates if no pod reaches "Succeeded"; it can also end during a quiet
    # gap while the submission workflow is still scheduled to send more bursts.
    total_pending, total_running, total_completed = get_totals(namespaces)
    print("pending", total_pending, "running", total_running, "completed", total_completed)
    if total_pending == 0 and total_running == 0 and total_completed > 0:
        finished = True

    time.sleep(1)
print("all done!")
pprint.pprint(exp_data)

In [None]:
import pandas as pd

# don't really need to track completed pods: plot only the pending/running series.
# A filtered copy is used instead of popping keys from exp_data, so the cell
# can be re-run without a KeyError and the collected data stays intact.
completed_keys = {namespace + '_completed' for namespace in namespaces}
plot_data = {
    key: series for key, series in exp_data.items() if key not in completed_keys
}

pd_data = pd.DataFrame(plot_data)
ax = pd_data.plot(x="timestamp", figsize=(20,10))
ax.set(xlabel="timestamp (epoch seconds)", ylabel="CPU cores");

In [None]:
# Post-run statistics: per namespace, the queue time, run time and makespan
# of every terminated pod, measured from its job's creation timestamp.
all_pods = core_api.list_pod_for_all_namespaces()
all_jobs = batch_api.list_job_for_all_namespaces()
# filter for pods/jobs with "tenant" in the name
all_tenant_pods = [pod for pod in all_pods.items if "tenant" in pod.metadata.name]
all_tenant_jobs = [job for job in all_jobs.items if "tenant" in job.metadata.name]

for namespace in namespaces:
    tenant_pods = [pod for pod in all_tenant_pods if namespace in pod.metadata.name]
    tenant_jobs = [job for job in all_tenant_jobs if namespace in job.metadata.name]

    # job name -> creation timestamp, so each pod finds its job in one lookup
    job_created_at = {
        job.metadata.name: job.metadata.creation_timestamp for job in tenant_jobs
    }

    total_queue_time = 0
    total_run_time = 0
    total_time = 0
    measured_pods = 0

    # start/end used to calculate makespan
    start = None
    end = None

    for pod in tenant_pods:
        # Assumes pod names have the form "<job name>-<suffix>": drop the suffix.
        job_name = pod.metadata.name.rsplit("-", 1)[0]
        schedule_time = job_created_at.get(job_name)
        if schedule_time is None:
            # Without its job there is no submission time to measure from.
            print(namespace, "Skipping pod without a matching job:", pod.metadata.name)
            continue

        statuses = pod.status.container_statuses or []
        terminated = statuses[0].state.terminated if statuses else None
        if terminated is None:
            # Pod has not finished, so start/finish times are not available.
            print(namespace, "Skipping pod that has not terminated:", pod.metadata.name)
            continue

        pod_start_time = terminated.started_at
        pod_end_time = terminated.finished_at
        pod_queue_time = pod_start_time - schedule_time
        pod_run_time = pod_end_time - pod_start_time
        pod_total_time = pod_end_time - schedule_time
        total_queue_time = total_queue_time + int(pod_queue_time.total_seconds())
        total_run_time = total_run_time + int(pod_run_time.total_seconds())
        total_time = total_time + int(pod_total_time.total_seconds())
        measured_pods += 1

        # calculate makespan: earliest job creation to latest pod finish
        if start is None or schedule_time < start:
            start = schedule_time
        if end is None or pod_end_time > end:
            end = pod_end_time

    if measured_pods == 0:
        print(namespace, "No terminated pods with a matching job; nothing to report.")
        continue

    print(namespace, "Total queue time:", total_queue_time)
    print(namespace, "Average queue time:", total_queue_time / measured_pods)
    print(namespace, "Total run time for all jobs:", total_run_time)
    print(namespace, "Total queue + run time for all jobs:", total_time)
    makespan = end - start
    print(namespace, "Total workflow run time (makespan):", int(makespan.total_seconds()))

In [None]:
# Inspect the raw API object of the first job of the last analysed namespace.
if tenant_jobs:
    print(tenant_jobs[0])
else:
    print("no tenant jobs to inspect")