In [8]:
import json
from pathlib import Path
from datetime import datetime

import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter

from python.snapshots import read_snapshot_json, lookup_snapshot_size
from python.indices import read_indices_json, get_index_snapshot_history


# Date label that selects which dated data directory the notebook reads.
DATE = "2021-10-01"
# <home>/Notebooks/Elasticsearch Data Usage/data/<DATE>
data_dir = Path.home().joinpath("Notebooks", "Elasticsearch Data Usage", "data", DATE)

In [None]:
# Build per-site snapshot and index history tables from the dated data
# directory, write one CSV pair into each site directory, then combine all
# sites into the notebook-wide `snapshot_data` and `index_data` frames.
site_data = {}

# Only directories are sites. The glob pattern alone also matches regular,
# non-hidden files that live in data_dir (for example CSV exports written
# there), and those have no indices.json / snapshots.json inside them.
# Sorting makes the row order of the combined frames independent of the
# filesystem's listing order.
site_dirs = sorted(path for path in data_dir.glob('[!.]*') if path.is_dir())

for site_dir in site_dirs:
    site = site_dir.name
    print(f"Reading {site} data")
    index_data_file = site_dir/"indices.json"
    snapshots_data_file = site_dir/"snapshots.json"
    snapshot_detail_dir = site_dir/"snapshot_details"

    # Tag every snapshot record with its site and merge in whatever
    # lookup_snapshot_size returns for that snapshot's detail file
    # (presumably size fields such as incremental_size_bytes — confirm in
    # python.snapshots).
    site_snapshots = read_snapshot_json(snapshots_data_file)
    for snapshot, record in site_snapshots.items():
        record['site'] = site
        detail_file = snapshot_detail_dir/f"{snapshot}.json"
        record.update(lookup_snapshot_size(detail_file))
    site_snapshot_frame = pd.DataFrame.from_dict(site_snapshots, orient='index')

    # Merge each index record with its snapshot history, looked up from the
    # same snapshots.json file.
    site_indices = read_indices_json(index_data_file)
    for index, record in site_indices.items():
        record.update(get_index_snapshot_history(snapshots_data_file, index))
    site_index_frame = pd.DataFrame.from_dict(site_indices, orient='index')

    site_snapshot_frame.to_csv(site_dir/f"{site}-snapshot-history.csv")
    site_index_frame.to_csv(site_dir/f"{site}-index-history.csv")

    site_data[site] = {
        'snapshot_data': site_snapshot_frame,
        'index_data': site_index_frame,
    }

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the directory instead.
if not site_data:
    raise ValueError(f"No site directories found under {data_dir}")

snapshot_data = pd.concat(
    [frames['snapshot_data'] for frames in site_data.values()]
)
index_data = pd.concat(
    [frames['index_data'] for frames in site_data.values()]
)

In [4]:
# Anticipated storage requirement based on recent usage patterns.
#
# The "recent" window is anchored to DATE (the label of the dated dataset
# being analysed) instead of the wall clock. With datetime.today() the result
# depended on when the cell was run: more than 30 days after DATE the
# selection is empty and the per-kind lookups below fail with a KeyError.
# NOTE(review): start_time_utc is assumed to hold naive UTC timestamps —
# confirm against python.snapshots.
RECENT_WINDOW = pd.to_timedelta('30d')
# Day multiplier applied to each kind's average daily incremental size
# (presumably the retention period per log kind — confirm with the policy).
RETENTION_DAYS = {'security-logs': 180, 'non-security-logs': 30}
BYTES_PER_GIB = 1024 ** 3

recent_cutoff = pd.Timestamp(DATE) - RECENT_WINDOW
recent_snapshots = snapshot_data[snapshot_data.start_time_utc > recent_cutoff]
if recent_snapshots.empty:
    raise ValueError(f"No snapshots found after {recent_cutoff}; cannot estimate storage")

# Mean incremental snapshot size per kind, converted from bytes to GiB.
recent_avg_size = recent_snapshots.groupby(['kind'])['incremental_size_bytes'].mean() / BYTES_PER_GIB
anticipated_storage_requirement = sum(
    recent_avg_size[kind] * days for kind, days in RETENTION_DAYS.items()
)
print(f"{anticipated_storage_requirement:.2f} GiB storage required for retention on average\n")

print(f"Each day (GiB): \n\n{recent_avg_size}")

5530.75 GiB storage required for retention on average

Each day (GiB): 

kind
non-security-logs    32.991709
security-logs        25.227764
Name: incremental_size_bytes, dtype: float64


In [7]:
# Write the combined (all-site) frames as CSV files directly inside data_dir.
for frame, filename in (
    (snapshot_data, "snapshot-history.csv"),
    (index_data, "index-history.csv"),
):
    frame.to_csv(data_dir/filename)

In [None]:
# Average snapshot size across all sites, per kind and snapshot date.
plt.close()
# Select the style before the figure exists: style settings are rcParams and
# are read when the figure and its artists are created, so switching the
# style after plotting leaves this figure in the previous style and only
# affects figures created later.
plt.style.use("fivethirtyeight")

avg_incremental_size_bytes = snapshot_data.groupby(['kind', 'snapshot_date'])['incremental_size_bytes'].mean()

fig, ax = plt.subplots(figsize=(16, 9))
# The plotted values stay in bytes; only the tick labels are shown in GiB.
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, pos: f"{y/1024**3:.2f}"))

# One line per snapshot kind, indexed by snapshot_date.
for kind in ('security-logs', 'non-security-logs'):
    ax.plot(avg_incremental_size_bytes[kind], label=kind)

ax.set_xlabel('Snapshot Date')
ax.set_ylabel('Average Incremental Snapshot Size (GiB)')
ax.set_title("Snapshot Sizes over Time")

ax.legend()
fig.tight_layout()
plt.show()

# TODO: Make plot look better