In [2]:
!pip install -q fastplot

In [10]:
import pandas as pd
import os
import fastplot
%matplotlib inline
import numpy as np

In [7]:
# Load the EPSS history tables and the CVE metadata from the local datasets folder.
df_nvd_v3 = pd.read_pickle('datasets/epss_v3.pkl')
df_nvd_v3_highest = pd.read_pickle('datasets/epss_v3_highest.pkl')
with open('datasets/cve_info.csv', 'r') as f:
    df_cve_info = pd.read_csv(f)
df_cve_info['published'] = pd.to_datetime(df_cve_info['published'])

# Keep only the EPSS rows of CVEs published on or after 2023-03-07.
current_cve = df_cve_info.loc[df_cve_info['published'] >= '2023-03-07', 'id'].values
df_nvd_v3_c = df_nvd_v3[df_nvd_v3['cve'].isin(current_cve)]

# Series.isin() given a whole DataFrame iterates over its column labels, not
# its rows, so the membership test must be done against the CVE id column.
# NOTE(review): assumes df_nvd_v3_highest exposes a 'cve' column like
# df_nvd_v3 does -- confirm against the pickle's schema.
df_nvd_v3_c_highest = df_nvd_v3[df_nvd_v3['cve'].isin(df_nvd_v3_highest['cve'])]

In [27]:
# Feature table skeleton: one row per distinct CVE in the filtered EPSS data;
# the remaining feature columns start empty and are filled in by later cells.
feature_columns = ['cve', '#daysToT', '#daysDT>0', '#daysDT<=0']
df_features = pd.DataFrame(columns=feature_columns)
df_features['cve'] = df_nvd_v3_c['cve'].unique()

# Number of days to reach the threshold

In [28]:
def get_days_before_threshold(series, threshold=0.7):
    """Return the position of the first value strictly above ``threshold``.

    Parameters
    ----------
    series : object exposing ``.values`` (e.g. a pandas Series)
        Scores to scan, in the order they should be counted.
    threshold : float, default 0.7
        Value that must be strictly exceeded.

    Returns
    -------
    int or float
        Zero-based position of the first value ``> threshold``, or ``np.nan``
        when no value exceeds it (including an empty input).
    """
    # next() stops at the first hit instead of materialising every match.
    return next(
        (position for position, value in enumerate(series.values) if value > threshold),
        np.nan,
    )


def compute_days_needed_to_reach_threshold(df):
    """Apply ``get_days_before_threshold`` to the ``epss`` values of each CVE.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cve`` and ``epss`` columns. When a ``date`` column is
        present the rows are ordered by it first, so that the positional
        result follows chronological order rather than storage order.

    Returns
    -------
    pandas.Series
        Indexed by ``cve``; each value is whatever
        ``get_days_before_threshold`` returns for that CVE's ``epss`` values.
    """
    if 'date' in df.columns:
        # groupby keeps the row order inside each group, so sorting up front
        # is enough; a stable sort leaves equal dates in their original order.
        df = df.sort_values('date', kind='stable')
    return df.groupby('cve')['epss'].apply(get_days_before_threshold)

In [35]:
# Look up, for every CVE in the feature table, the per-CVE result computed
# on the filtered EPSS history, then show the table.
days_to_threshold = compute_days_needed_to_reach_threshold(df_nvd_v3_c)
df_features['#daysToT'] = df_features['cve'].map(days_to_threshold)
print(df_features)

                  cve  #daysToT #daysDT>0 #daysDT<=0
0      CVE-2017-20181       NaN       NaN        NaN
1      CVE-2022-25655       NaN       NaN        NaN
2      CVE-2022-25694       NaN       NaN        NaN
3      CVE-2022-25705       NaN       NaN        NaN
4      CVE-2022-25709       NaN       NaN        NaN
...               ...       ...       ...        ...
31133   CVE-2024-2313       NaN       NaN        NaN
31134   CVE-2024-2184       NaN       NaN        NaN
31135   CVE-2024-2365       NaN       NaN        NaN
31136  CVE-2024-28816       NaN       NaN        NaN
31137  CVE-2024-28823       NaN       NaN        NaN

[31138 rows x 4 columns]


# Number of consecutive days with delta-EPSS > 0

In [43]:
from datetime import timedelta


def compute_consecutive_days_feature(df):
    """Summarise the day-over-day EPSS changes contained in ``df``.

    Rows are ordered by ``date`` and each row is compared with the row dated
    exactly one day earlier. Pairs separated by any other gap (missing days,
    repeated dates) are skipped, as are changes that are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column (datetime-like or parseable by
        ``pd.to_datetime``) and a numeric ``epss`` column.

    Returns
    -------
    tuple
        ``(delta_up, delta_down, days_up, days_down)`` where ``delta_up`` is
        the sum of the strictly positive daily changes, ``delta_down`` the sum
        of the changes that are ``<= 0``, and ``days_up`` / ``days_down`` the
        number of days in each group. Sums are floats, counts are ints.
    """
    ordered = df.sort_values('date', kind='stable')
    dates = pd.to_datetime(ordered['date'])
    epss = ordered['epss'].to_numpy(dtype=float)
    if len(epss) < 2:
        # Nothing to compare against.
        return 0.0, 0.0, 0, 0

    deltas = np.diff(epss)
    # The first diff is NaT (no predecessor), so drop it to line up with deltas.
    is_next_day = (dates.diff() == pd.Timedelta(days=1)).to_numpy()[1:]
    rising = is_next_day & (deltas > 0)
    not_rising = is_next_day & (deltas <= 0)

    delta_up = float(deltas[rising].sum())
    delta_down = float(deltas[not_rising].sum())
    days_up = int(rising.sum())
    days_down = int(not_rising.sum())
    return delta_up, delta_down, days_up, days_down
            
    
    
# Run the per-CVE summary over the filtered EPSS history and show the result.
per_cve_history = df_nvd_v3_c.groupby('cve')[['epss', 'date']]
epss_dates = per_cve_history.apply(compute_consecutive_days_feature)
print(epss_dates)

KeyboardInterrupt: 