In [None]:
import json
from pathlib import Path
from datetime import datetime

import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter

from python.snapshots import read_snapshot_json, lookup_snapshot_size
from python.indices import read_indices_json, get_index_snapshot_history


# Date stamp that selects which dated data directory is analysed.
DATE = "2021-12-07"
# Root of that data set: ~/Notebooks/Elasticsearch Data Usage/data/<DATE>
data_dir = Path.home().joinpath("Notebooks", "Elasticsearch Data Usage", "data", DATE)

In [None]:
# Build per-site snapshot and index tables, write each one to CSV inside the
# site directory, then stack all sites into the combined `snapshot_data` and
# `index_data` frames used by the cells below.
site_data = {}
# sorted() makes the processing (and concat) order deterministic. The is_dir()
# guard skips plain files in data_dir -- e.g. the combined CSVs that a later
# cell writes there -- which the '[!.]*' pattern would otherwise treat as sites
# and fail on when the notebook is re-run.
for site_dir in sorted(data_dir.glob('[!.]*')):
    if not site_dir.is_dir():
        continue

    site = site_dir.name
    print(f"Reading {site} data")
    index_data_file = site_dir/"indices.json"
    snapshots_data_file = site_dir/"local-rgw-snapshots.json"
    snapshot_detail_dir = site_dir/"snapshot_details"

    # One record per snapshot, tagged with its site and enriched with whatever
    # lookup_snapshot_size() reports from the per-snapshot detail file.
    # NOTE(review): assumes both helpers return plain dicts -- confirm.
    snapshot_records = read_snapshot_json(snapshots_data_file)
    for snapshot, record in snapshot_records.items():
        record['site'] = site
        detail_file = snapshot_detail_dir/f"{snapshot}.json"
        record.update(lookup_snapshot_size(detail_file))
    site_snapshots = pd.DataFrame.from_dict(snapshot_records, orient='index')

    # One record per index, enriched with its snapshot history.
    index_records = read_indices_json(index_data_file)
    for index, record in index_records.items():
        record.update(get_index_snapshot_history(snapshots_data_file, index))
    site_indices = pd.DataFrame.from_dict(index_records, orient='index')

    site_snapshots.to_csv(site_dir/f"{site}-snapshot-history.csv")
    site_indices.to_csv(site_dir/f"{site}-index-history.csv")

    site_data[site] = {
        'snapshot_data': site_snapshots,
        'index_data': site_indices,
    }

# Fail with an actionable message instead of pandas' bare
# "No objects to concatenate" when the data directory has no site folders.
if not site_data:
    raise ValueError(f"No site directories found under {data_dir}")

snapshot_data = pd.concat(
    [frames['snapshot_data'] for frames in site_data.values()]
)
index_data = pd.concat(
    [frames['index_data'] for frames in site_data.values()]
)

In [3]:
# Persist the combined (all-site) tables at the top of the dated data directory.
# Plain string literals: the original f-strings had no placeholders.
snapshot_data.to_csv(data_dir/"snapshot-history.csv")
index_data.to_csv(data_dir/"index-history.csv")

In [4]:
# Anticipated storage requirement based on recent usage patterns.
# The 30-day window is anchored on the data set's DATE rather than the wall
# clock, so re-running the notebook later against the same data selects the
# same snapshots instead of a shrinking (eventually empty) window.
recent_cutoff = pd.Timestamp(DATE) - pd.Timedelta(days=30)
recent_snapshots = snapshot_data[snapshot_data.start_time_utc > recent_cutoff]
# Snapshot sizes grouped by kind (a GroupBy object, not yet aggregated).
snapshot_daily_incremental = recent_snapshots.groupby(['kind'])['total_size_bytes']

# Mean snapshot size per kind, converted from bytes to GiB.
# NOTE(review): this is a per-snapshot mean; it is a per-day figure only if
# each kind is snapshotted once a day -- confirm.
daily_size = snapshot_daily_incremental.mean() / 1024 ** 3
print(f"Each day (GiB): \n\n{daily_size}")

Each day (GiB): 

kind
non-security-logs    16.764209
security-logs        24.614419
Name: total_size_bytes, dtype: float64


In [None]:

# Show the recent snapshots taken on 2021-09-01.
# `datetime` here is the class (from datetime import datetime), so the date is
# built as datetime(...).date(); `datetime.date(2021, 9, 1)` would invoke the
# instance method date() on an int and raise TypeError.
# NOTE(review): assumes `snapshot_date` holds datetime.date values -- confirm.
recent_snapshots[recent_snapshots['snapshot_date'] == datetime(2021, 9, 1).date()]


In [71]:
# Total primary size (GiB) for every (site, kind, index_date) combination ...
gib_per_site_kind_day = (
    index_data
    .groupby(['site', 'kind', 'index_date'])['primary_size_bytes']
    .sum()
    .div(1024 ** 3)
)
# ... then the mean of those totals for each kind.
daily_size = gib_per_site_kind_day.groupby(['kind']).mean()
print(f"Each day (GiB): \n\n{daily_size}")



Each day (GiB): 

kind
airship            7.357143
auth               0.010093
calico             0.876050
ceph               0.899316
flows              3.535714
ingress            0.001566
journal            0.102448
kernel_syslog      0.181240
kubernetes        11.382143
libvirt            0.097255
lma                2.864286
logstash           1.300000
openstack          5.462069
tenant-ceph        1.785714
utility_access     0.008050
Name: primary_size_bytes, dtype: float64


In [41]:
# Sum of the per-kind daily averages held in `daily_size` (GiB).
# NOTE(review): `daily_size` is assigned in two earlier cells (per snapshot kind
# and per index kind) and this cell's execution count is out of order; the
# recorded output appears to match the per-index-kind table -- confirm which
# one is intended before relying on this total.
daily_size.sum()

35.86308782793618