# Used packages and general settings

In [None]:
import re
import datetime
from itertools import cycle
import matplotlib.pyplot as plt
import pandas as pd
from elasticsearch import Elasticsearch

%matplotlib inline

# Elasticsearch configuration

In [None]:
# Credentials for HTTP basic auth (the literal values look like placeholders;
# presumably they must be replaced with a real account before running).
username = "username"
password = "password"

# Client for the CMS Elasticsearch host. TLS is enabled and the server
# certificate is verified against the CA bundle file named below.
es = Elasticsearch(
    [
        {
            "host": "es-cms.cern.ch",
            "port": 9203,
            "http_auth": ":".join((username, password)),
        }
    ],
    use_ssl=True,
    verify_certs=True,
    ca_certs="ca-bundle.trust.crt",
)

# Time filter

In [None]:
def time_filter(days=0, until=0):
    """Return the names of the daily ``cms-*`` indices inside a time window.

    Index names are expected to look like ``cms-YYYY-MM-DD``. The window is
    counted in whole days back from today: an index dated ``d`` is kept when
    ``until <= (today - d).days < days + until``.

    Args:
        days: Length of the window in days. ``0`` (the default) disables the
            filtering and selects everything through the ``cms-20*`` wildcard.
        until: Number of most recent days to leave out before the window
            starts.

    Returns:
        A list of index names in ascending order, or ``["cms-20*"]`` when
        ``days`` is ``0``.
    """
    # No filtering requested: answer immediately instead of first listing
    # every index on the cluster (a request with a 600 s timeout) and then
    # throwing the listing away.
    if days == 0:
        return ["cms-20*"]

    listing = es.cat.indices(index="cms-20*", h="index", request_timeout=600)
    names = sorted(line.rstrip() for line in listing.split("\n") if line.strip())

    today = datetime.date.today()
    filtered = []
    for name in names:
        try:
            day = datetime.datetime.strptime(name.replace("cms-", ""), "%Y-%m-%d").date()
        except ValueError:
            # The wildcard also matches names that are not daily
            # "cms-YYYY-MM-DD" indices; they carry no date to filter on.
            continue
        if until <= (today - day).days < days + until:
            filtered.append(name)
    return filtered

# Indices to be considered

In [None]:
# Window passed to time_filter: length in days and number of most recent
# days to skip. With no_of_days == 0 time_filter returns the "cms-20*"
# wildcard, i.e. every index is searched.
no_of_days, last_day = 0, 0

# Comma-separated index list, used as the `index` argument of the searches.
ind = ",".join(time_filter(no_of_days, last_day))

# Part 1

Produce a plot of the CPU hours spent on MINIAOD over the last (say) 12 months, extended to (at least) these task types as well: AOD, AODSIM, RECO.

## Query

In [None]:
def query(name_of_task_type, time_from=1483228800000, time_to=1600000000000, interval="week"):
    """Sum ``CpuTimeHr`` per time bucket for one task type.

    Only documents with ``Status`` "Completed", a positive ``CpuTimeHr``, a
    ``RecordTime`` inside ``[time_from, time_to]`` and a matching
    ``TaskType`` are aggregated. The search runs against the module-level
    index list ``ind`` through the module-level client ``es``.

    Args:
        name_of_task_type: Value matched against the ``TaskType`` field.
        time_from: Inclusive lower bound of ``RecordTime`` in epoch
            milliseconds (default: 2017-01-01T00:00:00Z).
        time_to: Inclusive upper bound of ``RecordTime`` in epoch
            milliseconds (default: 2020-09-13T12:26:40Z).
        interval: Bucket width of the date histogram (default: "week").

    Returns:
        The raw search response; the buckets are found under
        ``["aggregations"]["RecordTime"]["buckets"]``, each with a
        ``CpuTimeHr`` sum.
    """
    body = {
        # Only the aggregation is of interest, so no hits are requested.
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {"match": {"Status": "Completed"}},
                    {
                        "range": {
                            "RecordTime": {
                                "gte": time_from,
                                "lte": time_to,
                                "format": "epoch_millis",
                            }
                        }
                    },
                    {"range": {"CpuTimeHr": {"gt": 0}}},
                    {"match": {"TaskType": name_of_task_type}},
                ]
            }
        },
        "aggs": {
            "RecordTime": {
                "date_histogram": {
                    "field": "RecordTime",
                    "interval": interval,
                    "time_zone": "Europe/Berlin",
                    # Leave out buckets that contain no documents.
                    "min_doc_count": 1,
                },
                "aggs": {"CpuTimeHr": {"sum": {"field": "CpuTimeHr"}}},
            }
        },
    }

    return es.search(index=ind, body=body, request_timeout=1200)

## Listing of CpuTimeHr

In [None]:
def listing_of_cpu_time_hr(buckets):
    """Split histogram buckets into parallel lists of keys and CPU-hour sums.

    Args:
        buckets: Iterable of bucket mappings, each providing ``"key"`` and
            ``["CpuTimeHr"]["value"]``.

    Returns:
        A tuple ``(keys, values)`` of two lists in bucket order.
    """
    # Walk the buckets once, then unzip the collected pairs.
    pairs = [(entry["key"], entry["CpuTimeHr"]["value"]) for entry in buckets]
    stamps = [pair[0] for pair in pairs]
    hours = [pair[1] for pair in pairs]
    return stamps, hours

## Function for plotting

In [None]:
def plot_cpu_hours(series_of_time, name_of_task_type, style, color, x_from, x_to):
    """Draw one CPU-hours time series onto the current matplotlib figure.

    Args:
        series_of_time: Object with a pandas-style ``plot`` method (callers
            in this file pass a ``pd.Series`` indexed by time stamps).
        name_of_task_type: Legend label of the line.
        style: Line style passed to ``plot``.
        color: Line colour passed to ``plot``.
        x_from: Left limit of the x axis, anything ``pd.Timestamp`` accepts.
        x_to: Right limit of the x axis, anything ``pd.Timestamp`` accepts.
    """
    # Global plot settings: figure size and font size.
    plt.rcParams.update({"figure.figsize": (25, 10), "font.size": 25})

    series_of_time.plot(label=name_of_task_type, style=style, color=color)
    plt.ylabel("CPU time hours (sum)")

    # loc=9 is matplotlib's code for "upper center"; anchoring that point at
    # x=1.15 (axes coordinates) puts the legend to the right of the plot area.
    plt.legend(loc=9, bbox_to_anchor=(1.15, 1.0))

    left = pd.Timestamp(x_from)
    right = pd.Timestamp(x_to)
    plt.xlim([left, right])

## Graph

In [None]:
# Limits of the x axis; these module-level names are also read by
# plot_cpu_hours_from_dict further down.
date_from = "2017-01-01"
date_to = "2018-04-01"
# Line styles and colours are cycled independently. With 3 styles and 2
# colours the first 6 (style, colour) pairs are all distinct, one per task
# type below as long as no iteration fails.
style = ["-", "--", "-."]
style_cycler = cycle(style)
colors = ["b", "g"]
colors_cycler = cycle(colors)
task_types = ["MINIAOD", "AOD", "AODSIM", "RECO", "MINIAODSIM", "USER"]
for task_type in task_types:
    try:
        # One search per task type; the response holds the weekly buckets.
        res = query(task_type)
        buckets_of_RecordTime = res["aggregations"]["RecordTime"]["buckets"]
        time_stamps, cpu_time_hr = listing_of_cpu_time_hr(buckets_of_RecordTime)
        # Bucket keys are converted from epoch milliseconds to timestamps.
        time_series = pd.Series(cpu_time_hr, index=pd.to_datetime(time_stamps, unit="ms"))
        # The cyclers only advance here, i.e. only when everything above succeeded.
        plot_cpu_hours(time_series, task_type, next(style_cycler), next(colors_cycler), date_from, date_to)
    except TypeError as te:
        # Report the failing task type and carry on with the next one.
        print("Oops, found a TypeError for %s. Here it is: %s" % (task_type, te))

# Part 2

Produce a plot of the CPU hours spent on each task type.

## Query

In [None]:
# Document filters: completed jobs with a positive CpuTimeHr whose RecordTime
# lies inside the fixed epoch-millisecond window.
must_clauses = [
    {"match": {"Status": "Completed"}},
    {
        "range": {
            "RecordTime": {
                "gte": 1483228800000,
                "lte": 1600000000000,
                "format": "epoch_millis",
            }
        }
    },
    {"range": {"CpuTimeHr": {"gt": 0}}},
]

# Weekly buckets over RecordTime, each carrying the sum of CpuTimeHr; buckets
# without documents are left out.
weekly_cpu_sum = {
    "date_histogram": {
        "field": "RecordTime",
        "interval": "week",
        "time_zone": "Europe/Berlin",
        "min_doc_count": 1,
    },
    "aggs": {"CpuTimeHr": {"sum": {"field": "CpuTimeHr"}}},
}

body = {
    # Only the aggregations are of interest, so no hits are requested.
    "size": 0,
    "query": {"bool": {"must": must_clauses}},
    "aggs": {
        # One bucket per TaskType value (at most 18), each split by week.
        "TaskType": {
            "terms": {"field": "TaskType", "size": 18},
            "aggs": {"RecordTime": weekly_cpu_sum},
        }
    },
}

res = es.search(index=ind, body=body, request_timeout=1200)

## Listing of CpuTimeHr for task types

In [None]:
# Per task type: the list of bucket keys and the list of CPU-hour sums,
# both keyed by the task type name exactly as the aggregation returned it.
time_stamps_dict = {}
cpu_time_hr_dict = {}
buckets_of_TaskType = res["aggregations"]["TaskType"]["buckets"]
for task_bucket in buckets_of_TaskType:
    stamps_and_hours = listing_of_cpu_time_hr(task_bucket["RecordTime"]["buckets"])
    name = task_bucket["key"]
    time_stamps_dict[name], cpu_time_hr_dict[name] = stamps_and_hours

## Function for plotting

In [None]:
def plot_cpu_hours_from_dict(dictionary_of_time_stamps, dictionary_of_cpu_time_hr, list_of_styles, list_of_colors):
    """Plot one line per key of the two parallel dictionaries.

    Args:
        dictionary_of_time_stamps: Maps a label to its list of time stamps
            in epoch milliseconds.
        dictionary_of_cpu_time_hr: Maps the same labels to the values
            plotted at those time stamps.
        list_of_styles: Line styles, reused cyclically.
        list_of_colors: Line colours, reused cyclically.

    The x-axis limits come from the module-level ``date_from`` and
    ``date_to``.
    """
    style_source = cycle(list_of_styles)
    color_source = cycle(list_of_colors)
    for label, stamps in dictionary_of_time_stamps.items():
        hours = dictionary_of_cpu_time_hr[label]
        when = pd.to_datetime(stamps, unit="ms")
        series = pd.Series(hours, index=when)
        plot_cpu_hours(series, label, next(style_source), next(color_source), date_from, date_to)

## Merging task types by case-insensitive names

In [None]:
def merge_two_lists(list_a, list_b):
    """Merge two iterables of ``(key, value)`` pairs, adding values of equal keys.

    Args:
        list_a: Pairs that seed the result (a repeated key keeps its last
            value).
        list_b: Pairs whose values are added onto the seeded totals.

    Returns:
        A list of ``(key, total)`` tuples sorted by key.
    """
    totals = {}
    totals.update(list_a)
    for stamp, amount in list_b:
        totals[stamp] = totals.get(stamp, 0) + amount
    return sorted(totals.items(), key=lambda pair: pair[0])

# Group the (time stamp, CPU hours) pairs of every task type under its
# upper-cased name, so that spellings differing only in case share one entry.
big_dict = {}
for key in time_stamps_dict:
    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator,
    # and a group that is never merged would be exhausted by the time-stamp
    # comprehension below, leaving its CPU-hour list empty.
    pairs = list(zip(time_stamps_dict[key], cpu_time_hr_dict[key]))
    big_dict.setdefault(key.upper(), []).append(pairs)

# Collapse every group into a single pair list by merging from the end until
# one list is left; values with equal time stamps are added up.
for key in big_dict:
    while len(big_dict[key]) > 1:
        last = big_dict[key].pop()
        penultimate = big_dict[key].pop()
        big_dict[key].append(merge_two_lists(last, penultimate))
    big_dict[key] = big_dict[key][0]

# Rebuild the two parallel dictionaries, now keyed by the upper-cased names.
time_stamps_dict = {}
cpu_time_hr_dict = {}
for key in big_dict:
    time_stamps_dict[key] = [pair[0] for pair in big_dict[key]]
    cpu_time_hr_dict[key] = [pair[1] for pair in big_dict[key]]

## Graph

In [None]:
# Two line styles and five colours, each reused cyclically across the
# plotted task types.
styles = ["-", "--"]
colors = ["b", "g", "r", "m", "y"]
plot_cpu_hours_from_dict(
    time_stamps_dict,
    cpu_time_hr_dict,
    list_of_styles=styles,
    list_of_colors=colors,
)