# Used packages and general settings

In [None]:
import re
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch

%matplotlib inline

# Elasticsearch configuration

In [None]:
# NOTE(review): these values look like placeholders; supply real credentials
# (ideally not hardcoded in the notebook) before running.
username = "username"
password = "password"

# Single-node connection description; http_auth is "<user>:<password>".
es_host = {
    "host": "es-cms.cern.ch",
    "port": 9203,
    "http_auth": ":".join([username, password]),
}
es = Elasticsearch(
    [es_host],
    use_ssl=True,
    verify_certs=True,
    ca_certs="ca-bundle.trust.crt",
)

# Time filter

In [None]:
def time_filter(days=0, until=0):
    """Select the daily ``cms-<YYYY-MM-DD>`` indices that fall inside a time window.

    Parameters
    ----------
    days : int
        Length of the window in days. ``0`` disables the filtering.
    until : int
        Minimum age, in days relative to today, of the indices to keep.

    Returns
    -------
    list of str
        ``["cms-20*"]`` when ``days`` is 0; otherwise the names of the indices
        whose date is at least ``until`` and less than ``days + until`` days
        before today, in sorted order.

    Raises
    ------
    ValueError
        If an index name (after removing ``cms-``) is not a ``%Y-%m-%d`` date.
    """
    if days == 0:
        # No window requested: the wildcard is enough, so do not query the
        # cluster for the index listing at all.
        return ["cms-20*"]

    listing = es.cat.indices(index="cms-20*", h="index", request_timeout=600)
    # Strip each line before filtering so that blank or whitespace-only lines
    # never reach strptime.
    names = sorted(name for name in (line.rstrip() for line in listing.split("\n")) if name)

    today = datetime.date.today()
    datefmt = "%Y-%m-%d"
    filtered = []
    for name in names:
        index_date = datetime.datetime.strptime(re.sub(r"cms-", "", name), datefmt).date()
        age_in_days = (today - index_date).days
        if until <= age_in_days < days + until:
            filtered.append(name)
    return filtered

# Indices to be considered

In [None]:
# Window selection passed to time_filter; no_of_days == 0 makes it return
# the "cms-20*" wildcard instead of individual index names.
no_of_days = 0
last_day = 0
# Comma-separated index expression used by every es.search call below.
ind = ",".join(time_filter(no_of_days, last_day))

# Part 1

Produce a plot of the overall CPU efficiency of all CMS jobs in a few time windows, e.g. the last 1, 3 and 6 months:
1. for all jobs
2. for successful jobs only

## Query

In [None]:
# Weekly date histogram over completed jobs with positive CPU time and
# committed core hours; each week is split by exit code and the CPU time and
# committed core hours are summed per exit code.
body = {
    "size": 0,  # aggregations only, no individual hits
    "query": {
        "bool": {
            "must": [
                {"match": {"Status": "Completed"}},
                {"range": {"RecordTime": {
                    "gte": 1483228800000,  # 2017-01-01T00:00:00Z
                    "lte": 1600000000000,
                    "format": "epoch_millis",
                }}},
                {"range": {"CpuTimeHr": {"gt": 0}}},
                {"range": {"CommittedCoreHr": {"gt": 0}}},
                {"exists": {"field": "ExitCode"}},
            ]
        }
    },
    "aggs": {
        "RecordTime": {
            "date_histogram": {
                "field": "RecordTime",
                "interval": "week",
                "time_zone": "Europe/Berlin",
                "min_doc_count": 1,
            },
            "aggs": {
                "ExitCode": {
                    # Explicit size: without it the terms aggregation returns
                    # only its default number of top buckets, so the "all
                    # jobs" totals (summed over these buckets downstream)
                    # would silently ignore the rarer exit codes.
                    "terms": {"field": "ExitCode", "size": 1000},
                    "aggs": {
                        "CpuTimeHr": {"sum": {"field": "CpuTimeHr"}},
                        "CommittedCoreHr": {"sum": {"field": "CommittedCoreHr"}},
                    },
                }
            },
        }
    },
}

res = es.search(index=ind, body=body, request_timeout=1200)

## Function for calculation of CPU efficiency

In [None]:
def cpu_efficiency_calculation(external_buckets):
    """Compute CPU efficiency (in percent) for every outer aggregation bucket.

    Each outer bucket must have a ``"key"`` and an ``"ExitCode"``
    sub-aggregation whose buckets carry ``"CpuTimeHr"`` and
    ``"CommittedCoreHr"`` sums under ``["value"]``.

    Parameters
    ----------
    external_buckets : iterable of dict
        Outer buckets of an Elasticsearch aggregation response.

    Returns
    -------
    tuple of (list, list, list)
        The bucket keys, the efficiency over all exit codes and the efficiency
        of the ``ExitCode == 0`` bucket. The three lists always have the same
        length; an entry is NaN when it cannot be computed (no bucket with
        exit code 0, or no committed core hours).
    """
    nan = float("nan")
    external_buckets_data = []
    cpu_eff_of_all_jobs = []
    cpu_eff_of_successful_jobs = []
    for external_bucket in external_buckets:
        total_cpu = 0
        total_committed = 0
        # Stays NaN when this bucket has no successful jobs, which keeps the
        # three output lists aligned index by index.
        successful_eff = nan
        for exit_bucket in external_bucket["ExitCode"]["buckets"]:
            cpu = exit_bucket["CpuTimeHr"]["value"]
            committed = exit_bucket["CommittedCoreHr"]["value"]
            total_cpu = total_cpu + cpu
            total_committed = total_committed + committed
            if exit_bucket["key"] == 0:
                successful_eff = cpu / committed * 100 if committed else nan
        external_buckets_data.append(external_bucket["key"])
        cpu_eff_of_all_jobs.append(total_cpu / total_committed * 100 if total_committed else nan)
        cpu_eff_of_successful_jobs.append(successful_eff)

    return external_buckets_data, cpu_eff_of_all_jobs, cpu_eff_of_successful_jobs

## Calculation of CPU efficiency

In [None]:
# One bucket per week of the date histogram; its key is used as a timestamp.
buckets_of_RecordTime = res["aggregations"]["RecordTime"]["buckets"]
(time_stamps,
 cpu_eff_for_all_jobs,
 cpu_eff_for_successful_jobs) = cpu_efficiency_calculation(buckets_of_RecordTime)

## Time series

In [None]:
# Bucket keys are interpreted as epoch milliseconds; both series share the
# same datetime index.
time_index = pd.to_datetime(time_stamps, unit="ms")
time_series_for_all_jobs = pd.Series(cpu_eff_for_all_jobs, index=time_index)
time_series_for_successful_jobs = pd.Series(cpu_eff_for_successful_jobs, index=time_index)

## Function for plotting a graph of CPU efficiency

In [None]:
def plot_cpu_eff_for_all_task_types(x_from, x_to):
    """Plot both CPU-efficiency time series, restricted to a date range.

    Reads the notebook-level series ``time_series_for_all_jobs`` and
    ``time_series_for_successful_jobs``.

    Parameters
    ----------
    x_from, x_to
        Lower and upper x-axis limits; anything ``pd.Timestamp`` accepts.
    """
    curves = (
        (time_series_for_all_jobs, "r", "all jobs"),
        (time_series_for_successful_jobs, "b", "successful jobs"),
    )
    for series, line_style, caption in curves:
        series.plot(style=line_style, label=caption)
    plt.legend(loc=9, bbox_to_anchor=(1.22, 0.6))
    plt.ylabel("CPU efficiency (%)")
    x_limits = [pd.Timestamp(x_from), pd.Timestamp(x_to)]
    plt.xlim(x_limits)

## Graph of CPU efficiency

In [None]:
# Date range shown on the x axis of the efficiency plot.
date_from, date_to = "2017-01-01", "2018-04-01"
plot_cpu_eff_for_all_task_types(x_from=date_from, x_to=date_to)

# Part 2

Rank sites for CPU efficiency:
1. for all jobs
2. for successful jobs only

## Query

In [None]:
# Per-site sums of CPU time and committed core hours, split by exit code,
# for completed jobs with positive CPU time and committed core hours.
body = {
    "size": 0,  # aggregations only, no individual hits
    "query": {
        "bool": {
            "must": [
                {"match": {"Status": "Completed"}},
                {"range": {"CpuTimeHr": {"gt": 0}}},
                {"range": {"CommittedCoreHr": {"gt": 0}}},
                {"exists": {"field": "ExitCode"}},
                {"range": {"RecordTime": {
                    "gte": 1498860000000,  # 2017-06-30T22:00:00Z
                    "format": "epoch_millis",
                }}},
            ]
        }
    },
    "aggs": {
        "Sites": {
            "terms": {"field": "Site", "size": 100},
            "aggs": {
                "ExitCode": {
                    # Explicit size: without it the terms aggregation returns
                    # only its default number of top buckets, so the "all
                    # jobs" totals (summed over these buckets downstream)
                    # would silently ignore the rarer exit codes.
                    "terms": {"field": "ExitCode", "size": 1000},
                    "aggs": {
                        "CpuTimeHr": {"sum": {"field": "CpuTimeHr"}},
                        "CommittedCoreHr": {"sum": {"field": "CommittedCoreHr"}},
                    },
                }
            },
        }
    },
}

res = es.search(index=ind, body=body, request_timeout=1200)

## Calculation of CPU efficiency

In [None]:
# One bucket per site name as stored in the index (case-sensitive).
buckets_of_Sites = res["aggregations"]["Sites"]["buckets"]
(sites,
 cpu_eff_for_all_jobs,
 cpu_eff_for_successful_jobs) = cpu_efficiency_calculation(buckets_of_Sites)

## Function for merging by case insensitive names

In [None]:
def merging_by_case_insensitive_names(main_list, list_a, list_b):
    """Group entries whose names differ only by letter case and add up their values.

    Parameters
    ----------
    main_list : list of str
        Names; entries are grouped by their upper-cased form.
    list_a, list_b : list
        Values aligned with ``main_list``; values of one group are added.

    Returns
    -------
    tuple of four lists
        Upper-cased unique names, the summed ``list_a`` values, the summed
        ``list_b`` values and the number of entries merged into each name.
    """
    totals_a = {}
    totals_b = {}
    occurrences = {}
    for position, name in enumerate(main_list):
        key = name.upper()
        if key not in totals_a:
            # First spelling seen for this name: start the group.
            totals_a[key] = list_a[position]
            totals_b[key] = list_b[position]
            occurrences[key] = 1
            continue
        totals_a[key] = totals_a[key] + list_a[position]
        totals_b[key] = totals_b[key] + list_b[position]
        occurrences[key] = occurrences[key] + 1

    return (
        list(totals_a.keys()),
        list(totals_a.values()),
        list(totals_b.values()),
        list(occurrences.values()),
    )

## Merging sites by case insensitive names

In [None]:
(sites_merged,
 cpu_eff_for_all_jobs_merged,
 cpu_eff_for_successful_jobs_merged,
 merging_numbers) = merging_by_case_insensitive_names(sites, cpu_eff_for_all_jobs, cpu_eff_for_successful_jobs)
# The merge adds up the efficiencies of all spellings of a site; dividing by
# the number of merged entries turns the sums into unweighted means.
cpu_eff_for_all_jobs_merged = [
    total / count for total, count in zip(cpu_eff_for_all_jobs_merged, merging_numbers)
]
cpu_eff_for_successful_jobs_merged = [
    total / count for total, count in zip(cpu_eff_for_successful_jobs_merged, merging_numbers)
]

## Sorting by CPU efficiency for all jobs

In [None]:
# Rank the merged sites by their all-jobs efficiency, best first, keeping the
# three lists aligned.
list_to_sort = [
    (sites_merged[k], cpu_eff_for_all_jobs_merged[k], cpu_eff_for_successful_jobs_merged[k])
    for k in range(len(sites_merged))
]
ordered_list = sorted(list_to_sort, key=lambda row: row[1], reverse=True)
sites = [row[0] for row in ordered_list]
cpu_eff_for_all_jobs = [row[1] for row in ordered_list]
cpu_eff_for_successful_jobs = [row[2] for row in ordered_list]

## Function for plotting a graph of CPU efficiency

In [None]:
def plot_cpu_eff_for_all_sites(x_data, y_data_a, y_data_b, index, title=""):
    """Show a grouped bar chart with two CPU-efficiency values per site.

    Parameters
    ----------
    x_data : sequence
        Tick labels, one per bar group.
    y_data_a, y_data_b : sequence of float
        Bar heights for "all jobs" and "successful jobs".
    index : sized
        Only its length is used; it sets the number of bar groups.
    title : str
        Suffix appended to the figure title.

    Note: changes the global matplotlib figure size and font size.
    """
    plt.rcParams.update({"figure.figsize": (50, 10), "font.size": 25})
    positions = np.arange(len(index))
    bar_width = 0.35
    shared_style = {"alpha": 0.8, "align": "center"}
    plt.bar(positions, y_data_a, bar_width, color="b", label="all jobs", **shared_style)
    plt.bar(positions + bar_width, y_data_b, bar_width, color="g", label="successful jobs", **shared_style)
    # Put each tick label midway between the two bars of its group.
    plt.xticks(positions + bar_width * 0.5, x_data, rotation=90)
    plt.legend(loc=9, bbox_to_anchor=(1.07, 0.6))
    plt.margins(0.005, 0.005)
    plt.title("CPU efficiency by sites" + title)
    plt.ylabel("CPU efficiency")
    plt.ylim([0, 100])
    plt.show()

## Function for plotting a histogram of CPU efficiency

In [None]:
def histogram(cpu_eff, title):
    """Show a 100-bin histogram of efficiency values over the range 0 to 100.

    Parameters
    ----------
    cpu_eff : sequence of float
        Values to bin.
    title : str
        Figure title.
    """
    bin_counts = plt.hist(cpu_eff, bins=100, range=[0, 100])[0]
    plt.title(title)
    # Leave one unit of headroom above the tallest bin.
    tallest_bin = bin_counts.max()
    plt.ylim([0, tallest_bin + 1])
    plt.xlabel("CPU efficiency")
    plt.ylabel("Frequency")
    plt.show()

## Graphs of CPU efficiency

In [None]:
# Bar chart of the ranked sites, then one histogram per efficiency series.
plot_cpu_eff_for_all_sites(sites, cpu_eff_for_all_jobs, cpu_eff_for_successful_jobs, sites)
for efficiencies, caption in (
    (cpu_eff_for_all_jobs, "All jobs"),
    (cpu_eff_for_successful_jobs, "Successful jobs"),
):
    histogram(efficiencies, caption)

# Part 3

Rank the task types by cumulative wall-clock time:
1. “Task types” = e.g. GENSIM, DIGI, RECO, etc.

## Query

In [None]:
# Per-task-type sums of requested CPUs, committed core hours and core hours
# for completed jobs with positive RequestCpus and CommittedCoreHr.
body = {
    "size": 0,  # aggregations only, no individual hits
    "query": {
        "bool": {
            "must": [
                {"match": {"Status": "Completed"}},
                {"range": {"RequestCpus": {"gt": 0}}},
                {"range": {"CommittedCoreHr": {"gt": 0}}},
                {"range": {"RecordTime": {"gte": 1498860000000, "format": "epoch_millis"}}},
            ]
        }
    },
    "aggs": {
        "TaskType": {
            "terms": {"field": "TaskType", "size": 100},
            "aggs": {
                "RequestCpus": {"sum": {"field": "RequestCpus"}},
                "CommittedCoreHr": {"sum": {"field": "CommittedCoreHr"}},
                "CoreHr": {"sum": {"field": "CoreHr"}},
            },
        }
    },
}

res = es.search(index=ind, body=body, request_timeout=1200)

## Listing the wall-clock

In [None]:
buckets = res["aggregations"]["TaskType"]["buckets"]
task_types = [bucket["key"] for bucket in buckets]
# Summed committed core hours divided by summed requested CPUs.
wall_clock = [
    bucket["CommittedCoreHr"]["value"] / bucket["RequestCpus"]["value"]
    for bucket in buckets
]
wall_clock_CoreHr = [bucket["CoreHr"]["value"] for bucket in buckets]

## Merging task types by case insensitive names

In [None]:
(task_types_merged,
 wall_clock_merged,
 wall_clock_CoreHr_merged,
 merging_numbers) = merging_by_case_insensitive_names(task_types, wall_clock, wall_clock_CoreHr)
# Only the per-core figure is averaged over the merged spellings; the CoreHr
# figure stays a plain sum.
wall_clock_merged = [
    total / count for total, count in zip(wall_clock_merged, merging_numbers)
]

## Function for plotting a graph of wall-clock

In [None]:
def plot_wall_clock(list_of_task_types, list_of_wall_clock, ylabel):
    """Show a bar chart of the task types that dominate the wall-clock time.

    Task types are sorted by decreasing wall-clock value and plotted up to and
    including the one at which the cumulative share of the total reaches 90%.
    Nothing is plotted when the input is empty.

    Parameters
    ----------
    list_of_task_types : sequence of str
        Task type names.
    list_of_wall_clock : sequence of float
        Wall-clock values aligned with ``list_of_task_types``.
    ylabel : str
        Label of the y axis.

    Note: changes the global matplotlib figure size and font size.
    """
    plt.rcParams["figure.figsize"] = (50, 10)
    plt.rcParams.update({"font.size": 25})
    task_types_by_wall_clock = dict(zip(list_of_task_types, list_of_wall_clock))
    if not task_types_by_wall_clock:
        # An empty frame has no column 0 to sort by.
        return
    wall_clock_df = pd.DataFrame.from_dict(task_types_by_wall_clock, orient="index")
    wall_clock_df = wall_clock_df.sort_values(by=[0], ascending=False)
    total_sum_of_wall_clock = sum(wall_clock_df[0])
    sum_of_wall_clock = 0
    number_of_data = 0
    # Iterate over the raw values: the frame is indexed by task type names, so
    # integer lookups such as df[0][i] would rely on the deprecated positional
    # fallback of Series.__getitem__.
    for value in wall_clock_df[0].values:
        sum_of_wall_clock = sum_of_wall_clock + value
        number_of_data = number_of_data + 1
        if not sum_of_wall_clock / total_sum_of_wall_clock < 0.9:
            break
    wall_clock_df.iloc[:number_of_data].plot(kind="bar", legend=None)
    plt.ylabel(ylabel)
    plt.title("Wall-clock by task types")
    plt.show()

## Graphs of wall-clock

In [None]:
# One chart per wall-clock definition, same task types on both.
for wall_clock_values, axis_label in (
    (wall_clock_merged, "Wall-clock (average of hours for one core)"),
    (wall_clock_CoreHr_merged, "Wall-clock (sum of hours)"),
):
    plot_wall_clock(task_types_merged, wall_clock_values, axis_label)

# Part 4

Repeat Part 2 for each of the task types selected in Part 3:
1. filter the task types that globally account for > 90% of the total wall-clock time (as in Part 3) and rank the sites as in Part 2 for each of them.


## Function for finding the task types which will be considered

In [None]:
def considered_task_types(list_of_task_types_merged, list_of_wall_clock_merged, original_task_types=None):
    """Build ``term`` filters for the task types that dominate the wall-clock time.

    Task types are ranked by decreasing wall-clock value and selected up to
    and including the one at which the cumulative share of the total reaches
    90%, so the selection always covers at least 90% (the same cut-off as
    ``plot_wall_clock``). Every original spelling of a selected (upper-cased)
    task type gets its own filter clause.

    Parameters
    ----------
    list_of_task_types_merged : sequence of str
        Upper-cased task type names.
    list_of_wall_clock_merged : sequence of float
        Wall-clock values aligned with ``list_of_task_types_merged``.
    original_task_types : sequence of str, optional
        Task type names as stored in the index. Defaults to the notebook-level
        ``task_types`` list.

    Returns
    -------
    list of dict
        ``{"term": {"TaskType": name}}`` clauses, in the order of
        ``original_task_types``.
    """
    if original_task_types is None:
        original_task_types = task_types

    ranked = sorted(
        zip(list_of_task_types_merged, list_of_wall_clock_merged),
        key=lambda pair: pair[1],
        reverse=True,
    )
    total_wall_clock = sum(wall_clock for _, wall_clock in ranked)

    selected = set()
    cumulative_share = 0
    for name, wall_clock in ranked:
        # Include the task type that crosses the threshold as well; stopping
        # before it would leave the selection below 90% (or even empty when a
        # single task type dominates).
        selected.add(name)
        cumulative_share = cumulative_share + wall_clock / total_wall_clock
        if not cumulative_share < 0.9:
            break

    return [
        {"term": {"TaskType": original}}
        for original in original_task_types
        if original.upper() in selected
    ]

##  Considered task types

In [None]:
# One filter list per wall-clock definition (per-core average and CoreHr sum).
task_types_for_query_wall_clock = considered_task_types(
    list_of_task_types_merged=task_types_merged,
    list_of_wall_clock_merged=wall_clock_merged,
)
task_types_for_query_wall_clock_CoreHr = considered_task_types(
    list_of_task_types_merged=task_types_merged,
    list_of_wall_clock_merged=wall_clock_CoreHr_merged,
)

## Function for query

In [None]:
def make_query_for_wall_clock(names_of_task_types, exit_code_buckets=1000):
    """Query per-task-type, per-site CPU and committed-core-hour sums by exit code.

    Only completed jobs with positive CPU time and committed core hours, an
    exit code and a ``RecordTime`` at or after 1498860000000 (epoch
    milliseconds) are considered, restricted to the given task types.

    Parameters
    ----------
    names_of_task_types : list of dict
        Query clauses (e.g. ``{"term": {"TaskType": name}}``); a job must
        match at least one of them.
    exit_code_buckets : int
        Maximum number of exit-code buckets returned per site.

    Returns
    -------
    The response of ``es.search`` for the notebook-level index selection ``ind``.
    """
    filters = [
        {"match": {"Status": "Completed"}},
        {"range": {"CpuTimeHr": {"gt": 0}}},
        {"range": {"CommittedCoreHr": {"gt": 0}}},
        {"exists": {"field": "ExitCode"}},
        {"range": {"RecordTime": {"gte": 1498860000000, "format": "epoch_millis"}}},
    ]
    per_exit_code = {
        # Explicit size: without it the terms aggregation returns only its
        # default number of top buckets, truncating the "all jobs" totals that
        # are summed over these buckets downstream.
        "terms": {"field": "ExitCode", "size": exit_code_buckets},
        "aggs": {
            "CpuTimeHr": {"sum": {"field": "CpuTimeHr"}},
            "CommittedCoreHr": {"sum": {"field": "CommittedCoreHr"}},
        },
    }
    body = {
        "size": 0,  # aggregations only, no individual hits
        "query": {
            "constant_score": {
                "filter": {
                    "bool": {
                        "must": filters,
                        "should": names_of_task_types,
                        # Be explicit that one task-type clause has to match:
                        # the implicit default for "should" next to "must"
                        # differs between Elasticsearch versions.
                        "minimum_should_match": 1,
                    }
                }
            }
        },
        "aggs": {
            "TaskType": {
                "terms": {"field": "TaskType", "size": len(names_of_task_types)},
                "aggs": {
                    "Sites": {
                        "terms": {"field": "Site", "size": 100},
                        "aggs": {"ExitCode": per_exit_code},
                    }
                },
            }
        },
    }
    res = es.search(index=ind, body=body, request_timeout=1200)

    return res

## Queries for wall-clock

In [None]:
# One search per wall-clock definition, using the matching task-type filters.
res_wall_clock = make_query_for_wall_clock(
    names_of_task_types=task_types_for_query_wall_clock,
)
res_wall_clock_CoreHr = make_query_for_wall_clock(
    names_of_task_types=task_types_for_query_wall_clock_CoreHr,
)

## Function for calculation of CPU efficiency and plotting a graph of CPU efficiency for wall-clock

In [None]:
def calculate_and_plot_cpu_eff(res):
    """Rank sites by CPU efficiency for every task type in a response and plot them.

    For each site of each task type the efficiency over all exit codes and the
    efficiency of the ``ExitCode == 0`` bucket are computed (in percent).
    Sites without an ``ExitCode == 0`` bucket are skipped. Task types and
    sites whose names differ only by letter case are merged, and their
    efficiencies are averaged without weighting. For every merged task type a
    bar chart of the sites (sorted by all-jobs efficiency, best first) and two
    histograms are shown.

    Parameters
    ----------
    res : dict
        Response with a ``TaskType`` -> ``Sites`` -> ``ExitCode`` aggregation
        tree whose leaves carry ``CpuTimeHr`` and ``CommittedCoreHr`` sums.
    """
    # {TASK TYPE: {SITE: (sum of all-jobs eff, sum of successful eff, count)}}
    task_types_dic = {}
    for b_TaskType in res["aggregations"]["TaskType"]["buckets"]:
        sites_dict = {}
        for b_Sites in b_TaskType["Sites"]["buckets"]:
            sum_of_CpuTimeHr = 0
            sum_of_CommittedCoreHr = 0
            successful_eff = None
            for b_ExitCode in b_Sites["ExitCode"]["buckets"]:
                cpu = b_ExitCode["CpuTimeHr"]["value"]
                committed = b_ExitCode["CommittedCoreHr"]["value"]
                sum_of_CpuTimeHr = sum_of_CpuTimeHr + cpu
                sum_of_CommittedCoreHr = sum_of_CommittedCoreHr + committed
                if b_ExitCode["key"] == 0:
                    successful_eff = cpu / committed * 100
            if successful_eff is None:
                # No successful jobs at this site: leave it out of the ranking.
                continue
            all_eff = sum_of_CpuTimeHr / sum_of_CommittedCoreHr * 100
            site_key = b_Sites["key"].upper()
            previous = sites_dict.get(site_key, (0, 0, 0))
            sites_dict[site_key] = (previous[0] + all_eff, previous[1] + successful_eff, previous[2] + 1)

        merged_sites = task_types_dic.setdefault(b_TaskType["key"].upper(), {})
        for site_key, (all_sum, successful_sum, count) in sites_dict.items():
            previous = merged_sites.get(site_key, (0, 0, 0))
            merged_sites[site_key] = (previous[0] + all_sum, previous[1] + successful_sum, previous[2] + count)

    # dict.items() instead of the Python-2-only dict.iteritems().
    for task_type_key, task_type_value in task_types_dic.items():
        list_to_sort = [
            (site_key, site_value[0] / site_value[2], site_value[1] / site_value[2])
            for site_key, site_value in task_type_value.items()
        ]
        ordered_list = sorted(list_to_sort, key=lambda site: site[1], reverse=True)
        sites = [row[0] for row in ordered_list]
        cpu_eff_for_all_jobs = [row[1] for row in ordered_list]
        cpu_eff_for_successful_jobs = [row[2] for row in ordered_list]
        plot_cpu_eff_for_all_sites(sites, cpu_eff_for_all_jobs, cpu_eff_for_successful_jobs, task_type_value, " for " + task_type_key)
        histogram(cpu_eff_for_all_jobs, "CPU efficiency by sites for " + task_type_key + " (all jobs)")
        histogram(cpu_eff_for_successful_jobs, "CPU efficiency by sites for " + task_type_key + " (successful jobs)")

## Calculation of CPU efficiency and plotting graphs of CPU efficiency for wall-clock (average of hours for one core)

In [None]:
# Task types selected by the per-core wall-clock average.
calculate_and_plot_cpu_eff(res=res_wall_clock)

## Calculation of CPU efficiency and plotting graphs of CPU efficiency for wall-clock (sum of hours)

In [None]:
# Task types selected by the summed CoreHr wall-clock.
calculate_and_plot_cpu_eff(res=res_wall_clock_CoreHr)