In [1]:
!pip install -q fastplot
!pip install -q xgboost

In [2]:
import pandas as pd
import fastplot
%matplotlib inline
import numpy as np
from datetime import timedelta, datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import seaborn as sns
from matplotlib.colors import LogNorm
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Dataset

In [130]:
# EPSS value a CVE's score must exceed to count as a threshold crossing
# when its time series is truncated.
THRESHOLD = 0.7

### Loading

In [3]:
# EPSS time series (all CVEs) and the subset listed in the "highest" dataset.
# NOTE(review): read_pickle executes arbitrary code on load; only use trusted files.
df_nvd_v3 = pd.read_pickle('datasets/epss_v3.pkl')
df_nvd_v3_highest = pd.read_pickle('datasets/epss_v3_highest.pkl')

# CVE metadata, with the publication date parsed into datetimes.
with open('datasets/cve_info.csv', 'r') as csv_file:
    df_cve_info = pd.read_csv(csv_file)
df_cve_info['published'] = pd.to_datetime(df_cve_info['published'])

# Keep only the EPSS rows of CVEs published on or after 2023-03-07.
current_cve = df_cve_info.loc[df_cve_info['published'] >= '2023-03-07', 'id'].values
df_nvd_v3_c = df_nvd_v3.loc[df_nvd_v3['cve'].isin(current_cve)]

### Filter 60 days and discard already high CVEs

In [162]:
def truncate_timeseries_before_delta_days(df_cve, threshold=None, days_before=None):
    """Cut one CVE's EPSS series so it ends a few rows before its first threshold crossing.

    The frame is re-indexed from 0, the first row whose ``epss`` is strictly
    greater than the threshold is located, and every row up to (and including)
    the one ``days_before`` positions earlier is kept. The offset is counted in
    rows, so it equals days only if the series has one row per day
    (assumption -- confirm against the dataset).

    Parameters
    ----------
    df_cve : pandas.DataFrame
        Rows of a single CVE; must contain an ``epss`` column.
    threshold : float, optional
        Crossing level. Defaults to the notebook-level ``THRESHOLD``.
    days_before : int, optional
        Number of rows to step back from the crossing. Defaults to the
        notebook-level ``delta_days``.

    Returns
    -------
    pandas.DataFrame
        The truncated series with a fresh 0-based index. When the crossing is
        closer than ``days_before`` rows to the start, only the first row is
        kept. When the score never exceeds the threshold there is nothing to
        trim against, so the whole (re-indexed) series is returned.
    """
    if threshold is None:
        threshold = THRESHOLD
    if days_before is None:
        days_before = delta_days

    df = df_cve.reset_index(drop=True)
    above = (df['epss'] > threshold).to_numpy()
    if not above.any():
        # Handled explicitly: the previous code relied on max(0, nan) being 0.
        return df

    # argmax on a boolean array gives the position of the first True.
    first_crossing = int(above.argmax())
    last_kept = max(0, first_crossing - days_before)
    return df.iloc[:last_kept + 1]


# Number of rows (days, assuming one row per day) to cut before the first
# threshold crossing of a CVE listed in the "highest" dataset.
delta_days = 7

# Split the current CVEs into those that appear in the "highest" dataset and the rest.
is_highest = df_nvd_v3_c['cve'].isin(df_nvd_v3_highest['cve'].unique())
highest_current_cve = set(df_nvd_v3_c.loc[is_highest, 'cve'].values)
df_interval = df_nvd_v3_c[~is_highest]
df_highest_current_cve = df_nvd_v3_c[is_highest]

# Truncate each high CVE's series. groupby visits the CVEs in sorted order, so the
# resulting row order is identical on every kernel restart (iterating the set of
# strings is not), and the frame is scanned once instead of once per CVE.
dfs = [
    truncate_timeseries_before_delta_days(df_cve)
    for _, df_cve in df_highest_current_cve.groupby('cve', sort=True)
]
# pd.concat raises on an empty list; fall back to an empty frame with the same columns.
df_temp = pd.concat(dfs, ignore_index=True) if dfs else df_highest_current_cve.iloc[0:0]
df_interval = pd.concat([df_interval, df_temp], ignore_index=True)

### Remove 'rejected' CVEs

In [164]:
# Drop every row that belongs to a CVE whose vulnStatus is 'Rejected'.
rejected_cve = df_cve_info.loc[df_cve_info['vulnStatus'] == 'Rejected', 'id'].values
df_interval = df_interval.loc[~df_interval['cve'].isin(rejected_cve)]

## Features

In [269]:
# Column layout of the feature table; the feature functions below address
# these columns by position in this list:
#   [0] cve                - CVE identifier (one row per CVE)
#   [1] #days_grow         - number of days whose EPSS is higher than the previous day
#   [2] #days_drop         - number of days whose EPSS is lower than the previous day
#   [3] epss_grow_sum      - sum of the day-over-day increases
#   [4] epss_drop_sum      - sum of the day-over-day decreases (non-positive)
#   [5] std_dev            - standard deviation of the EPSS series
#   [6] max_delta_epss     - EPSS jump measured at the series' peak
#   [7] #delta_increments  - number of day-over-day rises of at least `delta`
features_columns = ['cve',
                    '#days_grow', 
                    '#days_drop', 
                    'epss_grow_sum', 
                    'epss_drop_sum',  
                    'std_dev',
                    'max_delta_epss',
                    '#delta_increments']
# Start from an empty frame, then create one row per distinct CVE in df_interval;
# the remaining columns stay empty until the feature functions fill them in.
df_features = pd.DataFrame(columns=features_columns)
df_features['cve'] = df_interval['cve'].unique()

### Number of days with delta-EPSS >(<) 0 and relative cumulative EPSS

In [166]:
def compute_consecutive_days_feature(df):
    """Count day-over-day EPSS rises and drops of one CVE and store them in ``df_features``.

    For every date after the first, the EPSS of the previous calendar day is
    looked up and compared with the current value. The function writes four
    columns of the ``df_features`` row matching this CVE: number of rising
    days, number of dropping days, and the summed rises and drops (rounded to
    three decimals; the drop sum is non-positive).

    Dates are sorted, but EPSS values are taken in row order, so the rows are
    assumed to already be chronological with one row per date
    (NOTE(review): confirm against the dataset).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single CVE with ``cve``, ``date`` and ``epss`` columns.

    Returns
    -------
    None
        The result is written into the notebook-level ``df_features``.
    """
    days_up = 0
    days_down = 0
    delta_up = 0.0
    delta_down = 0.0

    one_day = pd.Timedelta(days=1)
    sorted_dates = sorted(df['date'].unique())
    for date, epss in zip(sorted_dates[1:], df['epss'].iloc[1:]):
        previous_day = df.loc[df['date'] == date - one_day, 'epss']
        if previous_day.empty:
            # A missing day ends the scan (same best-effort behaviour as before),
            # but unrelated errors are no longer swallowed by a bare except.
            break
        prev_epss = float(previous_day.iloc[0])
        if epss > prev_epss:
            days_up += 1
            delta_up += float(epss - prev_epss)
        elif epss < prev_epss:
            days_down += 1
            delta_down += float(epss - prev_epss)

    # Locate this CVE's row once and fill the four feature columns.
    target_row = df_features['cve'] == df['cve'].unique()[0]
    df_features.loc[target_row, features_columns[1]] = days_up
    df_features.loc[target_row, features_columns[2]] = days_down
    df_features.loc[target_row, features_columns[3]] = round(delta_up, 3)
    df_features.loc[target_row, features_columns[4]] = round(delta_down, 3)

### Standard deviation

In [169]:
def compute_standard_deviation(df):
    """Store the standard deviation of one CVE's EPSS values in ``df_features``.

    ``df`` holds the rows of a single CVE. A NaN result (which pandas returns,
    for example, for a one-row series) is stored as 0 instead.
    """
    spread = df['epss'].std()
    value = 0 if np.isnan(spread) else spread
    target_row = df_features['cve'] == df['cve'].unique()[0]
    df_features.loc[target_row, features_columns[5]] = value

### Maximum delta EPSS

In [170]:
def compute_max_delta_epss(df):
    """Store the EPSS jump at the highest rising point of one CVE in ``df_features``.

    Among the rows whose EPSS is higher than the preceding row, the one with
    the largest EPSS is taken as the peak. The feature is the peak EPSS minus
    the EPSS recorded on the calendar day before the peak. It is 0 when the
    series never rises or when the previous day is missing from the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single CVE with ``cve``, ``date`` and ``epss`` columns, in
        chronological order (the row-to-row difference relies on that order).

    Returns
    -------
    None
        The result is written into the ``max_delta_epss`` column of the
        notebook-level ``df_features``.
    """
    target_row = df_features['cve'] == df['cve'].unique()[0]
    column = features_columns[6]

    rising = df.loc[df['epss'].diff() > 0]
    if rising.empty:
        # The previous code indexed features_columns[8], which does not exist.
        df_features.loc[target_row, column] = 0
        return

    # Pick ONE row so the EPSS and the date belong together; a column-wise
    # .max() would pair the largest EPSS with the latest date.
    peak = rising.iloc[int(rising['epss'].to_numpy().argmax())]

    day_before = df.loc[df['date'] == peak['date'] - pd.Timedelta(days=1), 'epss']
    # Without a previous-day value the jump cannot be measured; report 0.
    epss_before_peak = peak['epss'] if day_before.empty else day_before.iloc[0]
    df_features.loc[target_row, column] = peak['epss'] - epss_before_peak

### Number of delta-increments in EPSS

In [172]:
# Minimum day-over-day EPSS rise that is counted as a "delta increment".
delta = 0.02


def count_increment_of_delta(df):
    """Count how often one CVE's EPSS rises by at least ``delta`` in a day.

    For every date after the first, the EPSS of the previous calendar day is
    looked up; the counter increases when the rise is at least the
    notebook-level ``delta``. The count is written into the
    ``#delta_increments`` column of the ``df_features`` row for this CVE.

    Dates are sorted, but EPSS values are taken in row order, so the rows are
    assumed to already be chronological with one row per date
    (NOTE(review): confirm against the dataset).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single CVE with ``cve``, ``date`` and ``epss`` columns.

    Returns
    -------
    None
        The result is written into the notebook-level ``df_features``.
    """
    counter = 0
    one_day = pd.Timedelta(days=1)
    sorted_dates = sorted(df['date'].unique())
    for date, epss in zip(sorted_dates[1:], df['epss'].iloc[1:]):
        previous_day = df.loc[df['date'] == date - one_day, 'epss']
        if previous_day.empty:
            # A missing day ends the scan (same best-effort behaviour as before),
            # but unrelated errors are no longer swallowed by a bare except.
            break
        if (epss - float(previous_day.iloc[0])) >= delta:
            counter += 1

    target_row = df_features['cve'] == df['cve'].unique()[0]
    df_features.loc[target_row, features_columns[7]] = counter

## Model

### Features creation

In [None]:
# Each builder writes its own columns of df_features for the CVE it receives.
feature_builders = (
    compute_consecutive_days_feature,
    compute_standard_deviation,
    compute_max_delta_epss,
    count_increment_of_delta,
)

# Iterate the groups explicitly instead of using groupby.apply for its side effects:
# one pass over the data, no meaningless Series of None displayed by the cell, and
# each group keeps its 'cve' column (which the builders read) on every pandas version.
for _, df_cve in df_interval.groupby('cve'):
    for build_feature in feature_builders:
        build_feature(df_cve)

In [None]:
# Derive the label from each row's own CVE id. Taking `.values` from a groupby
# (sorted by CVE) and assigning them positionally would attach labels to the wrong
# rows, because df_features is ordered by first appearance in df_interval.
df_features['has_reached_threshold'] = df_features['cve'].isin(highest_current_cve)

# The feature columns were filled cell by cell; convert them to numeric dtypes
# (values that cannot be parsed become NaN).
cols_to_convert = features_columns[1:]
df_features[cols_to_convert] = df_features[cols_to_convert].apply(pd.to_numeric, errors='coerce')

### Training and results

In [None]:
def print_results(model, predictions):
    """Print feature importances and test-set metrics, then plot the confusion matrix.

    The metrics are computed against the notebook-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``feature_importances_``, ``predict_proba`` and ``classes_``.
        Importances are paired with ``features_columns[1:]`` in order.
    predictions : array-like
        Hard class predictions of ``model`` for ``X_test``.

    Returns
    -------
    None
        Everything is printed or plotted.
    """
    n_decimal = 4

    def _print_row(label, value):
        # Fixed-width "name   value" line shared by all printed figures.
        print(f"{label.ljust(20)} {str(round(value, n_decimal)).rjust(10)}")

    print('Feature importances')
    for feature_name, weight in zip(features_columns[1:], model.feature_importances_):
        _print_row(feature_name, weight)

    # Probability of the second class in model.classes_. AUC-ROC needs a ranking
    # score: with hard 0/1 predictions the curve collapses to a single point.
    y_scores = model.predict_proba(X_test)[:, 1]

    print()
    _print_row('Accuracy', accuracy_score(y_test, predictions))
    _print_row('Precision', precision_score(y_test, predictions))
    _print_row('Recall', recall_score(y_test, predictions))
    _print_row('F1-score', f1_score(y_test, predictions))
    _print_row('AUC-ROC', roc_auc_score(y_test, y_scores))
    print()

    conf_matrix = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(5, 3))
    # Log colour scale so small off-diagonal counts stay visible.
    sns.heatmap(conf_matrix, annot=True, square=True, cmap='rocket_r', norm=LogNorm(), fmt='d',
                xticklabels=model.classes_, yticklabels=model.classes_)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(model.__class__.__name__)
    plt.show()

In [None]:
# Seed used for the split below (and passed to the model later on).
rs = 42

# Target: the boolean label; inputs: every remaining column except the CVE id.
y = df_features['has_reached_threshold']
X = df_features.drop(columns=['has_reached_threshold', 'cve'])

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rs,
)

In [None]:
# Random forest with class weights balanced against the label frequencies.
model_rf = RandomForestClassifier(class_weight='balanced', random_state=rs)
model_rf = model_rf.fit(X_train, y_train)  # fit() returns the estimator itself

predictions_rf = model_rf.predict(X_test)
print_results(model_rf, predictions_rf)

## Features distribution

In [None]:
# Summary statistics of the CVEs that are NOT in the "highest" set.
# (`highest_cves` was never defined; the set built earlier is `highest_current_cve`.)
df_features[~df_features['cve'].isin(highest_current_cve)].describe()

In [None]:
# Summary statistics of the CVEs that ARE in the "highest" set.
# (`highest_cves` was never defined; the set built earlier is `highest_current_cve`.)
df_features[df_features['cve'].isin(highest_current_cve)].describe()

In [None]:
# CDF of the '#days_grow' feature across all rows of df_features.
fastplot.plot(
    df_features[features_columns[1]].values,
    None,
    mode='CDF',
    xlabel=features_columns[1],
    grid=True,
    figsize=(6, 3),
)

In [None]:
# CDF of the '#days_drop' feature across all rows of df_features.
fastplot.plot(
    df_features[features_columns[2]].values,
    None,
    mode='CDF',
    xlabel=features_columns[2],
    grid=True,
    figsize=(6, 3),
)

In [None]:
# CDF of the 'epss_grow_sum' feature; the y-axis is restricted to 0.9-1.0.
fastplot.plot(
    df_features[features_columns[3]].values,
    None,
    mode='CDF',
    xlabel=features_columns[3],
    ylim=(0.9, 1.0),
    grid=True,
    figsize=(6, 3),
)

In [None]:
# CDF of the 'epss_drop_sum' feature; the y-axis is restricted to 0.0-0.01.
fastplot.plot(
    df_features[features_columns[4]].values,
    None,
    mode='CDF',
    xlabel=features_columns[4],
    ylim=(0.0, 0.01),
    grid=True,
    figsize=(6, 3),
)

In [None]:
# CDF of the 'std_dev' feature; the y-axis is restricted to 0.9-1.0.
fastplot.plot(
    df_features[features_columns[5]].values,
    None,
    mode='CDF',
    xlabel=features_columns[5],
    ylim=(0.9, 1.0),
    grid=True,
    figsize=(6, 3),
)

In [None]:
# CDF of the 'max_delta_epss' feature; the y-axis is restricted to 0.9-1.0.
fastplot.plot(
    df_features[features_columns[6]].values,
    None,
    mode='CDF',
    xlabel=features_columns[6],
    ylim=(0.9, 1.0),
    grid=True,
    figsize=(6, 3),
)

In [None]:
# CDF of the '#delta_increments' feature; the y-axis is restricted to 0.98-1.0.
fastplot.plot(
    df_features[features_columns[7]].values,
    None,
    mode='CDF',
    xlabel=features_columns[7],
    ylim=(0.98, 1.0),
    grid=True,
    figsize=(6, 3),
)