# Sizing of pods' CPU request

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib ipympl
import matplotlib.pyplot as plt
import numpy as np
from stats.stats_service import StatsService

## Load a time series from disk

In [None]:
# Location of the recorded series, relative to the notebook's working directory.
cpu_usage_path = "data/cpu_usage.json"

# Service object shared by every cell below; its name is "CPU usage".
stats_service = StatsService(name="CPU usage")
stats_service.load_time_series(path=cpu_usage_path)

## Signal processing
- Smoothing with the _moving average_ approach
- Flooring of lower values

In [None]:
from stats import smoothing

# Tunable signal-processing parameters.
moving_avg_window = 11    # window size handed to the moving-average smoother
floor_threshold = 100e-3  # presumably in the series' own unit (CPU cores?) — confirm

# Smooth first, then floor the result; the floored object is the cell's output.
smoothed = stats_service.smooth(
    algo_type=smoothing.Method.MOVING_AVG,
    window_size=moving_avg_window,
)
smoothed.floor(threshold=floor_threshold)

In [None]:
# Plot the time series currently held by the stats service.
fig, ax = plt.subplots()
ax.plot(
    stats_service.time_series.time,
    stats_service.time_series.resource,
    linewidth=2.0,
)

# Title and axis labels let the figure stand alone when the notebook is skimmed.
ax.set_title("CPU usage over time")
ax.set_xlabel("Time")
ax.set_ylabel("CPU usage")  # unit not visible from this notebook — add it once confirmed

# Rotate x tick labels through the Axes API: unlike plt.xticks(rotation=...),
# the setting is kept for ticks regenerated on pan/zoom in the interactive backend.
ax.tick_params(axis="x", labelrotation=45)
plt.show()

## CPU Peak values

In [None]:
# Find peaks
# The returned object is reused by the cells below: threshold filtering
# (`peaks.datetimes`), the histogram (`peaks.plots.histogram`) and the
# percentile table (`peaks.percentiles`).
peaks = stats_service.get_peaks()

In [None]:
# Sanity check of the threshold-based peak filtering: show the datetimes
# returned for the threshold below.
peak_threshold = 3300e-3  # presumably CPU cores (i.e. 3300 millicores) — confirm
peaks.datetimes(threshold=peak_threshold)

In [None]:
# Histogram of the peak values, restricted to a fixed x-range.
histogram_bins = 40
histogram_xlim = (2, 3.5)
peaks.plots.histogram(bins=histogram_bins, xlim=histogram_xlim)

## Percentiles
The percentiles are computed with a **non-parametric** approach. The assessment is not sensitive to the numerical method.

💡 These non-parametric approaches are bounded by the sample range. Using a distribution model would allow a more "robust" result, provided the model is properly determined.

In [None]:
import pandas as pd

# Percentile levels to evaluate, in percent.
percents = [90, 95, 99]

# Estimation methods to compare, one table row per method.
# NOTE(review): these names match numpy.percentile's `method` options —
# presumably `peaks.percentiles` forwards them to numpy; confirm.
methods = [
    "linear", "inverted_cdf", "averaged_inverted_cdf",
    "closest_observation", "interpolated_inverted_cdf",
    "hazen", "weibull", "median_unbiased", "normal_unbiased",
    "lower", "higher", "midpoint", "nearest",
]

# One column per percentile level: "q-90", "q-95", "q-99".
column_labels = [f"q-{level}" for level in percents]

# One record per method, pairing each column label with the matching percentile value.
records = [
    dict(zip(column_labels, peaks.percentiles(percents, method=method_name)))
    for method_name in methods
]

# Full table first, then summary statistics showing the spread across methods.
df = pd.DataFrame(records, index=methods)
display(df)
display(df.describe())