In [17]:
!pip install -q fastplot

In [18]:
import pandas as pd
import fastplot
%matplotlib inline

In [3]:
# Load the NVD CVE table from CSV and parse its 'published' column into
# datetimes in the same step.
with open('datasets/cve_info.csv', 'r') as f:
    df_cve_nvd = pd.read_csv(f).assign(
        published=lambda frame: pd.to_datetime(frame['published'])
    )

# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
df_cve_first = pd.read_pickle('datasets/dataset.pkl')

# Distinct CVE identifiers present in each dataset.
cve_nvd = set(pd.unique(df_cve_nvd['id']))
cve_first = set(pd.unique(df_cve_first['cve']))

In [4]:
# Report how many distinct CVE identifiers each dataset contains.
print(
    f"#CVEs from NVD  : {len(cve_nvd)}",
    f"#CVEs from FIRST: {len(cve_first)}",
    sep="\n",
)

#CVEs from NVD  : 241103
#CVEs from FIRST: 239192


In [5]:
# Identifiers found in one dataset but not in the other:
#   missing_first -> present in NVD, absent from FIRST
#   missing_nvd   -> present in FIRST, absent from NVD
missing_first = cve_nvd.difference(cve_first)
missing_nvd = cve_first.difference(cve_nvd)
print(
    f"Missing CVEs from FIRST dataset wrt NVD: {len(missing_first)}",
    f"Missing CVEs from NVD dataset wrt FIRST: {len(missing_nvd)}",
    sep="\n",
)

Missing CVEs from FIRST dataset wrt NVD: 1960
Missing CVEs from NVD dataset wrt FIRST: 49


In [6]:
# Classify the CVEs that NVD has but FIRST lacks by their NVD 'vulnStatus'.
#
# A single vectorised selection replaces the per-CVE filter of the whole
# frame. It also avoids testing the truth value of a NumPy array
# (`if data == 'Rejected'`), which raises ValueError as soon as an id matches
# more than one row.
status_of_missing = (
    df_cve_nvd.loc[df_cve_nvd['id'].isin(missing_first), ['id', 'vulnStatus']]
    .drop_duplicates('id')  # one status per CVE; the first row wins if an id repeats
    .set_index('id')['vulnStatus']
)

count_rejected = int((status_of_missing == 'Rejected').sum())
count_received = int((status_of_missing == 'Received').sum())

# CVEs whose status is neither 'Rejected' nor 'Received' (including a missing
# status) are the ones still unexplained.
is_explained = status_of_missing.isin(['Rejected', 'Received'])
missing = status_of_missing.index[~is_explained].tolist()

print(f"Missing CVEs in FIRST dataset: {missing}")
print(f"Rejected: {count_rejected}")
print(f"Received: {count_received}")
print(f"Total: {count_rejected + count_received}/{len(missing_first)}")

Missing CVEs in FIRST dataset: ['CVE-2005-2759']
Rejected: 1819
Received: 140
Total: 1959/1960


In [27]:
# Count, per publication year, the CVEs that NVD has but FIRST lacks.
print('Publication year of the missing CVEs in FIRST dataset')

# One vectorised selection instead of filtering the whole frame per CVE.
# 'published' is read through the .dt accessor, so it must already be a
# datetime column.
published_of_missing = (
    df_cve_nvd.loc[df_cve_nvd['id'].isin(missing_first), ['id', 'published']]
    .drop_duplicates('id')['published']  # first row per CVE, as `.values[0]` did
)
# value_counts() skips missing years, so a NaT publication date is ignored
# rather than raising.
year_counts = published_of_missing.dt.year.value_counts()

# Keep the original 1999-2024 zero-filled keys, but accept any other year that
# shows up in the data instead of failing with KeyError.
publish_year_dict = {str(year): 0 for year in range(1999, 2025)}
for year, count in year_counts.items():
    publish_year_dict[str(int(year))] = int(count)

# Optional bar chart of the same data (requires fastplot):
# data = [(year, count) for year, count in sorted(publish_year_dict.items()) if count != 0]
# fastplot.plot(data, None, mode='bars', xlabel='Publication year', ylabel='#CVEs',
#               xticks_rotate=30, figsize=(6, 3), grid=True)

# Four-digit year strings sort chronologically, so sorting keeps the listing
# ordered even when a year outside the pre-filled range was added.
for year, count in sorted(publish_year_dict.items()):
    if count != 0:
        print(f"{year}: {count}")

Publication year of the missing CVEs in FIRST dataset
2005: 1
2009: 1
2013: 3
2014: 3
2015: 7
2016: 1
2017: 2
2018: 2
2019: 4
2020: 3
2021: 5
2022: 528
2023: 1094
2024: 306


In [26]:
# Compare the distribution of NVD 'vulnStatus' values over all NVD CVEs with
# the distribution over the CVEs that also appear in FIRST.

# Drop missing statuses so the list can be sorted and formatted as strings.
vuln_status = list(df_cve_nvd['vulnStatus'].dropna().unique())

# Count every status once instead of re-filtering the frame per status.
vuln_nvd_count = df_cve_nvd['vulnStatus'].value_counts()
print('NVD')
for status in sorted(vuln_status):
    count = int(vuln_nvd_count.get(status, 0))
    print(f"{status.ljust(20)} {str(count).rjust(10)}")

print('\nFIRST')
df_first_in_nvd = df_cve_nvd[df_cve_nvd['id'].isin(cve_first)]
vuln_first_count = df_first_in_nvd['vulnStatus'].value_counts()
for status in sorted(vuln_status):
    # value_counts() omits statuses with no rows, so look them up with a
    # default of 0 instead of indexing (which raises KeyError).
    first_count = int(vuln_first_count.get(status, 0))
    print(f"{status.ljust(20)} {str(first_count).rjust(10)}")

print(f"\n{len(cve_first) - len(df_first_in_nvd['id'].unique())} CVEs of FIRST not in NVD (probably only submitted)")

NVD
Analyzed                 130652
Awaiting Analysis          2521
Modified                  93779
Received                    170
Rejected                  13970
Undergoing Analysis          11

FIRST
Analyzed                 130651
Awaiting Analysis          2521
Modified                  93779
Received                     30
Rejected                  12151
Undergoing Analysis          11

49 CVEs of FIRST not in NVD (probably only submitted)


In [53]:
# CVEs listed in FIRST on 2024-02-08 that were not listed on 2024-02-07,
# broken down by their NVD 'vulnStatus'.
first_dates = df_cve_first['date']
cve_first_before_increment = set(df_cve_first.loc[first_dates == '2024-02-07', 'cve'].unique())
cve_first_on_increment = set(df_cve_first.loc[first_dates == '2024-02-08', 'cve'].unique())
cve_first_added = list(cve_first_on_increment.difference(cve_first_before_increment))

print('Distribution of CVEs added on 08/02/2024 in FIRST dataset')
# The membership mask does not depend on the status, so build it once.
is_added = df_cve_nvd['id'].isin(cve_first_added)
for status in sorted(vuln_status):
    has_status = df_cve_nvd['vulnStatus'] == status
    count = int((is_added & has_status).sum())
    # print() joins its arguments with a single space, giving the same layout.
    print(status.ljust(20), str(count).rjust(10))

Distribution of CVEs added on 08/02/2024 in FIRST dataset
Analyzed                    712
Awaiting Analysis            25
Modified                     94
Received                      2
Rejected                  11152
Undergoing Analysis           0
