In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LogNorm

# Functions to generate heatmaps

In [55]:
def heatmap_epss_initial_vs_max(data):
    """Plot a heatmap of CVE counts by initial EPSS versus maximum EPSS.

    Every CVE contributes one count to the cell whose column is its first
    ``epss`` value and whose row is its largest ``epss`` value, both rounded
    to one decimal place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'cve'`` and ``'epss'``. The first row of
        each CVE, in the frame's current order, is used as its initial score
        (presumably rows are sorted chronologically -- confirm with the
        dataset builder).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.

    Notes
    -----
    CVEs whose rounded scores do not fall on the 0.0..1.0 grid (for example
    a missing score, which becomes the label ``'nan'``) are left out of the
    counts instead of raising ``KeyError``.
    """
    scores = data.groupby('cve')['epss']
    epss_day_0 = scores.apply(lambda l: l.values[0]).round(1).astype(str)
    max_epss_per_cve = scores.max().round(1).astype(str)

    # Derive the labels from integers so they never depend on accumulated
    # floating-point steps: '0.0', '0.1', ..., '1.0'.
    columns = [str(i / 10) for i in range(11)]
    rows = columns[::-1]  # highest maximum on the top row

    # Start from an integer grid of zeros so that fmt='d' annotations do not
    # rely on fillna() downcasting an object-dtype frame.
    heatmap_data = pd.DataFrame(0, index=rows, columns=columns, dtype=int)

    # Count each distinct (max, initial) pair once instead of doing a
    # read-modify-write on the frame for every single CVE.
    pairs = pd.DataFrame({'final': max_epss_per_cve, 'init': epss_day_0})
    for (final, init), count in pairs.groupby(['final', 'init']).size().items():
        if final in heatmap_data.index and init in heatmap_data.columns:
            heatmap_data.loc[final, init] = count

    plt.figure(figsize=(8, 5))
    sns.heatmap(heatmap_data, annot=True, square=True, fmt='d', norm=LogNorm())
    plt.xlabel('Initial EPSS')
    plt.ylabel('Max EPSS')
    plt.show()

In [61]:
def heatmap_epss_initial_vs_days_needed(data):
    """Plot a heatmap of CVE counts by initial EPSS versus time to maximum.

    For every CVE the position (0-based row offset within the CVE's group) at
    which its ``epss`` value first equals its maximum is computed. That
    offset is bucketed into 11 bins of width ``step_days`` and counted
    against the CVE's first ``epss`` value rounded to one decimal place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'cve'`` and ``'epss'``. Offsets are
        positional, so this presumably expects one row per CVE per day in
        chronological order -- confirm with the dataset builder.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.

    Raises
    ------
    ValueError
        If no CVE has at least one non-missing ``epss`` value.

    Notes
    -----
    * Row labels are ``step_days, 2*step_days, ..., 11*step_days``. As in the
      original binning, offsets smaller than ``step_days`` are counted in the
      first row.
    * ``step_days`` is never smaller than 1, so short histories (maximum
      offset below 11) no longer fail with a zero step.
    * Offsets beyond the last row are counted in the last row instead of
      raising ``KeyError``.
    * CVEs without any non-missing score, or whose initial score does not
      fall on the 0.0..1.0 grid, are left out of the counts.
    """
    n_bins = 11

    def get_days_before_maximum_epss(series):
        # Offset of the first occurrence of the group's maximum; NaN when the
        # group has no usable score at all.
        values = series.to_numpy(dtype=float)
        if np.isnan(values).all():
            return np.nan
        return int(np.nanargmax(values))

    scores = data.groupby('cve')['epss']
    epss_day_0 = scores.apply(lambda l: l.values[0]).round(1).astype(str)
    days_needed = scores.apply(get_days_before_maximum_epss).dropna()
    if days_needed.empty:
        raise ValueError('no CVE with a non-missing EPSS score to plot')

    max_days = days_needed.max()
    # Guard against a zero step when the longest offset is below n_bins.
    step_days = max(1, int(max_days / n_bins))

    # Largest bin on the top row, matching the original reversed ordering.
    indexes = [str(k * step_days) for k in range(n_bins, 0, -1)]
    columns = [str(i / 10) for i in range(11)]
    heatmap_data = pd.DataFrame(0, index=indexes, columns=columns, dtype=int)

    # Bin number 0 is folded into the first bin and anything past the last
    # bin is folded into it, so every label is guaranteed to be on the grid.
    bin_number = (days_needed // step_days).clip(lower=1, upper=n_bins).astype(int)
    pairs = pd.DataFrame({
        'days': (bin_number * step_days).astype(str),
        'init': epss_day_0.reindex(days_needed.index),
    })
    for (days, init), count in pairs.groupby(['days', 'init']).size().items():
        if days in heatmap_data.index and init in heatmap_data.columns:
            heatmap_data.loc[days, init] = count

    plt.figure(figsize=(8, 6))
    sns.heatmap(heatmap_data, annot=True, square=True, fmt='d', norm=LogNorm())
    plt.xlabel('Initial EPSS')
    plt.ylabel('#days to reach max EPSS')
    plt.show()

# Datasets

In [None]:
# Load the three pickled EPSS frames (full, highest and lowest variants).
# NOTE(review): unpickling can execute arbitrary code -- only load pickles
# produced by a trusted pipeline.
df_dataset, df_highest_epss, df_lowest_epss = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'datasets/epss_v3.pkl',
        'datasets/epss_v3_highest.pkl',
        'datasets/epss_v3_lowest.pkl',
    )
)

# Load the CVE metadata, parse the publication timestamp and order the rows
# by it.
# NOTE(review): this path uses 'dataset/' while the pickles above live under
# 'datasets/' -- confirm the directory name is intentional.
with open('dataset/cve_info.csv', 'r') as f:
    df_cve_info = pd.read_csv(f)
df_cve_info = df_cve_info.assign(
    published=pd.to_datetime(df_cve_info['published'])
).sort_values(by=['published'])

In [None]:
# Split the CVE ids into those published strictly before `cut_date` and those
# published on or after it.
# NOTE(review): `cut_date` is not defined in any cell visible here -- it must
# be set before this cell for a fresh "Restart & Run All" to succeed.
historical_cve = df_cve_info.loc[df_cve_info['published'] < cut_date, 'id'].values
current_cve = df_cve_info.loc[df_cve_info['published'] >= cut_date, 'id'].values

In [None]:
# Historical / current views of the three EPSS frames, selected by CVE id.
df_dataset_historical = df_dataset.loc[df_dataset['cve'].isin(historical_cve)]
df_highest_epss_historical = df_highest_epss.loc[df_highest_epss['cve'].isin(historical_cve)]
df_lowest_epss_historical = df_lowest_epss.loc[df_lowest_epss['cve'].isin(historical_cve)]

df_dataset_current = df_dataset.loc[df_dataset['cve'].isin(current_cve)]
df_highest_epss_current = df_highest_epss.loc[df_highest_epss['cve'].isin(current_cve)]
df_lowest_epss_current = df_lowest_epss.loc[df_lowest_epss['cve'].isin(current_cve)]

# NOTE(review): `dataset_creation` is not imported in the visible import
# cell -- it has to be imported before this cell runs.

# Rows of the full dataset whose CVE is in the "pz" list, then the
# historical / current parts of that subset.
_, cve_pz = dataset_creation.download_dataset_and_extract_cve_pz()
df_pz = df_dataset.loc[df_dataset['cve'].isin(cve_pz)]
df_pz_historical = df_pz.loc[df_pz['cve'].isin(historical_cve)]
df_pz_current = df_pz.loc[df_pz['cve'].isin(current_cve)]

# Same three subsets for the CVEs in the "kev" list.
_, cve_kev = dataset_creation.download_dataset_and_extract_cve_kev()
df_kev = df_dataset.loc[df_dataset['cve'].isin(cve_kev)]
df_kev_historical = df_kev.loc[df_kev['cve'].isin(historical_cve)]
df_kev_current = df_kev.loc[df_kev['cve'].isin(current_cve)]