## 1. Imports & Helper Function

In [40]:
import json
import pandas as pd
from typing import Tuple

def _collect_cpe_matches(nodes) -> list:
    """Return every ``cpe_match`` entry under ``nodes``, depth-first.

    Configuration nodes may nest further nodes under ``children``; a node's
    own matches are emitted before those of its children.
    """
    matches = []
    for node in nodes or []:
        matches.extend(node.get('cpe_match') or [])
        matches.extend(_collect_cpe_matches(node.get('children')))
    return matches


def process_nvd_json(file_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Flatten an NVD JSON feed file into a CVE table and a CPE table.

    Parameters
    ----------
    file_path : str
        Path to a JSON file with a top-level ``CVE_Items`` list.

    Returns
    -------
    (cve_df, cpe_df) : Tuple[pd.DataFrame, pd.DataFrame]
        ``cve_df`` has one row per CVE (id, assigner, dates, English
        description, first CWE value, CVSS v3 vector/score/severity), sorted
        by ``id``. ``cpe_df`` has one row per ``cpe_match`` entry (including
        entries nested under ``children`` nodes), sorted by ``cve_id`` then
        ``cpe23Uri``. Both frames always carry their full column set, even
        when the feed contains no items.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    KeyError
        If the document has no ``CVE_Items`` key or an item lacks the
        mandatory ``cve`` metadata/description structure.
    """
    cve_columns = [
        'id', 'assigner', 'published_date', 'last_modified_date',
        'description', 'cwe', 'cvss3_vector', 'cvss3_base_score',
        'cvss3_base_severity',
    ]
    cpe_columns = [
        'cve_id', 'cpe23Uri', 'vulnerable', 'versionStartIncluding',
        'versionEndIncluding', 'vendor', 'product', 'version',
    ]

    # JSON text is UTF-8; decoding it as ISO-8859-1 turns every non-ASCII
    # character into mojibake. errors='replace' keeps the "never fail on a
    # stray byte" behaviour the single-byte codec used to provide.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        nvd_data = json.load(f)

    cve_items = []
    cpe_items = []
    for item in nvd_data['CVE_Items']:
        meta = item['cve']['CVE_data_meta']
        cve_id = meta['ID']
        descs = item['cve']['description']['description_data']

        # Only the first listed CWE value is kept; a missing or empty
        # problemtype structure yields an empty string.
        try:
            cwe = item['cve']['problemtype']['problemtype_data'][0]['description'][0]['value']
        except (KeyError, IndexError, TypeError):
            cwe = ''

        impact3 = item.get('impact', {}) \
                      .get('baseMetricV3', {}) \
                      .get('cvssV3', {})
        cve_items.append({
            'id': cve_id,
            'assigner': meta.get('ASSIGNER', ''),
            'published_date': item.get('publishedDate'),
            'last_modified_date': item.get('lastModifiedDate'),
            'description': next((d['value'] for d in descs if d.get('lang') == 'en'), ''),
            'cwe': cwe,
            'cvss3_vector': impact3.get('vectorString', ''),
            'cvss3_base_score': impact3.get('baseScore', None),
            'cvss3_base_severity': impact3.get('baseSeverity', '')
        })

        # 'configurations' may be absent; treat that as "no CPE data".
        nodes = (item.get('configurations') or {}).get('nodes', [])
        for cm in _collect_cpe_matches(nodes):
            uri = cm.get('cpe23Uri', '')
            # cpe:2.3:<part>:<vendor>:<product>:<version>:...
            parts = uri.split(':')
            cpe_items.append({
                'cve_id': cve_id,
                'cpe23Uri': uri,
                'vulnerable': cm.get('vulnerable', False),
                'versionStartIncluding': cm.get('versionStartIncluding', ''),
                'versionEndIncluding': cm.get('versionEndIncluding', ''),
                'vendor': parts[3] if len(parts) > 3 else '',
                'product': parts[4] if len(parts) > 4 else '',
                'version': parts[5] if len(parts) > 5 else ''
            })

    # Explicit columns keep the schema stable when either list is empty.
    cve_df = pd.DataFrame(cve_items, columns=cve_columns)
    cpe_df = pd.DataFrame(cpe_items, columns=cpe_columns)
    for col in ['published_date', 'last_modified_date']:
        cve_df[col] = pd.to_datetime(cve_df[col], errors='coerce')
    cve_df = cve_df.sort_values('id')
    cpe_df = cpe_df.sort_values(['cve_id', 'cpe23Uri'])
    return cve_df, cpe_df

Error: File 'Project/nvdcve-1.1-2024.json' not found.


## 2. Load & Process NVD JSON (2024)

In [32]:
# Parse the NVD feed file into the per-CVE and per-CPE frames, then show the
# CVE frame's schema followed by its first rows.
nvd_file = 'nvdcve-1.1-2024.json'
cve_df, cpe_df = process_nvd_json(file_path=nvd_file)
cve_df.info()
cve_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37730 entries, 0 to 37729
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   id                   37730 non-null  object             
 1   assigner             37730 non-null  object             
 2   published_date       37730 non-null  datetime64[ns, UTC]
 3   last_modified_date   37730 non-null  datetime64[ns, UTC]
 4   description          37730 non-null  object             
 5   cwe                  37730 non-null  object             
 6   cvss3_vector         37730 non-null  object             
 7   cvss3_base_score     20271 non-null  float64            
 8   cvss3_base_severity  37730 non-null  object             
dtypes: datetime64[ns, UTC](2), float64(1), object(6)
memory usage: 2.6+ MB


Unnamed: 0,id,assigner,published_date,last_modified_date,description,cwe,cvss3_vector,cvss3_base_score,cvss3_base_severity
0,CVE-2024-0001,psirt@purestorage.com,2024-09-23 18:15:00+00:00,2024-09-27 14:08:00+00:00,A condition exists in FlashArray Purity whereb...,CWE-1188,CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H,9.8,CRITICAL
1,CVE-2024-0002,psirt@purestorage.com,2024-09-23 18:15:00+00:00,2024-09-27 14:13:00+00:00,A condition exists in FlashArray Purity whereb...,NVD-CWE-noinfo,CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H,9.8,CRITICAL
2,CVE-2024-0003,psirt@purestorage.com,2024-09-23 18:15:00+00:00,2024-09-27 14:23:00+00:00,A condition exists in FlashArray Purity whereb...,NVD-CWE-noinfo,CVSS:3.1/AV:N/AC:L/PR:H/UI:N/S:U/C:H/I:H/A:H,7.2,HIGH
3,CVE-2024-0004,psirt@purestorage.com,2024-09-23 18:15:00+00:00,2024-09-27 14:24:00+00:00,A condition exists in FlashArray Purity whereb...,CWE-94,CVSS:3.1/AV:N/AC:L/PR:H/UI:N/S:U/C:H/I:H/A:H,7.2,HIGH
4,CVE-2024-0005,psirt@purestorage.com,2024-09-23 18:15:00+00:00,2024-09-27 15:25:00+00:00,A condition exists in FlashArray and FlashBlad...,CWE-77,CVSS:3.1/AV:N/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H,8.8,HIGH


## 3. Load KEV Catalog

In [24]:
# Read the KEV catalog CSV, rename the CVE id and date-added columns to the
# names used for the join, and parse the date-added column (unparseable
# values become NaT).
kev_file = 'known_exploited_vulnerabilities.csv'
df_kev = (
    pd.read_csv(kev_file)
    .rename(columns={'cveID': 'id', 'dateAdded': 'kev_added_date'})
    .assign(
        kev_added_date=lambda frame: pd.to_datetime(
            frame['kev_added_date'], errors='coerce'
        )
    )
)
df_kev.info()
df_kev.head()

=== KEV DataFrame ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1323 entries, 0 to 1322
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          1323 non-null   object        
 1   vendorProject               1323 non-null   object        
 2   product                     1323 non-null   object        
 3   vulnerabilityName           1323 non-null   object        
 4   kev_added_date              1323 non-null   datetime64[ns]
 5   shortDescription            1323 non-null   object        
 6   requiredAction              1323 non-null   object        
 7   dueDate                     1323 non-null   object        
 8   knownRansomwareCampaignUse  1323 non-null   object        
 9   notes                       1323 non-null   object        
 10  cwes                        1177 non-null   object        
dtypes: datetime64[ns](1), object(10)
m

Unnamed: 0,id,vendorProject,product,vulnerabilityName,kev_added_date,shortDescription,requiredAction,dueDate,knownRansomwareCampaignUse,notes,cwes
0,CVE-2025-24054,Microsoft,Windows,Microsoft Windows NTLM Hash Disclosure Spoofin...,2025-04-17,Microsoft Windows NTLM contains an external co...,"Apply mitigations per vendor instructions, fol...",2025-05-08,Unknown,https://msrc.microsoft.com/update-guide/vulner...,CWE-73
1,CVE-2025-31201,Apple,Multiple Products,Apple Multiple Products Arbitrary Read and Wri...,2025-04-17,"Apple iOS, iPadOS, macOS, and other Apple prod...","Apply mitigations per vendor instructions, fol...",2025-05-08,Unknown,https://support.apple.com/en-us/122282 ; https...,
2,CVE-2025-31200,Apple,Multiple Products,Apple Multiple Products Memory Corruption Vuln...,2025-04-17,"Apple iOS, iPadOS, macOS, and other Apple prod...","Apply mitigations per vendor instructions, fol...",2025-05-08,Unknown,https://support.apple.com/en-us/122282 ; https...,
3,CVE-2021-20035,SonicWall,SMA100 Appliances,SonicWall SMA100 Appliances OS Command Injecti...,2025-04-16,SonicWall SMA100 appliances contain an OS comm...,"Apply mitigations per vendor instructions, fol...",2025-05-07,Unknown,https://psirt.global.sonicwall.com/vuln-detail...,CWE-78
4,CVE-2024-53150,Linux,Kernel,Linux Kernel Out-of-Bounds Read Vulnerability,2025-04-09,Linux Kernel contains an out-of-bounds read vu...,"Apply mitigations per vendor instructions, fol...",2025-04-30,Unknown,This vulnerability affects a common open-sourc...,CWE-125


## 4. Merge & Flag Exploited

In [34]:
# Left-join the KEV date onto every CVE row. The KEV side is de-duplicated on
# 'id' first so a repeated catalog entry cannot multiply CVE rows.
kev_dates = df_kev[['id','kev_added_date']].drop_duplicates(subset='id')
df_merged = cve_df.merge(kev_dates, on='id', how='left')

# Flag by catalog membership rather than by a non-null date: the date column
# was parsed with errors='coerce', so a malformed date would otherwise make a
# listed CVE look unexploited.
df_merged['is_exploited'] = df_merged['id'].isin(df_kev['id'])

df_merged.info()
print("\nFirst 5 rows:")
# Only a cell's last expression is rendered automatically; a bare
# df_merged.head() here would be silently discarded, so print it explicitly.
print(df_merged.head().to_string())
print("\nExploited counts:")
df_merged['is_exploited'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37730 entries, 0 to 37729
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   id                   37730 non-null  object             
 1   assigner             37730 non-null  object             
 2   published_date       37730 non-null  datetime64[ns, UTC]
 3   last_modified_date   37730 non-null  datetime64[ns, UTC]
 4   description          37730 non-null  object             
 5   cwe                  37730 non-null  object             
 6   cvss3_vector         37730 non-null  object             
 7   cvss3_base_score     20271 non-null  float64            
 8   cvss3_base_severity  37730 non-null  object             
 9   kev_added_date       142 non-null    datetime64[ns]     
 10  is_exploited         37730 non-null  bool               
dtypes: bool(1), datetime64[ns, UTC](2), datetime64[ns](1), float64(1), object(6)
mem

is_exploited
False    37588
True       142
Name: count, dtype: int64

## 5. Save Final Table

In [30]:
# Write the merged CVE + KEV table to CSV without the index column, then
# report the destination path.
out_csv = 'nvd_plus_kev_2024.csv'
df_merged.to_csv(path_or_buf=out_csv, index=False)
print(f"Saved merged table to {out_csv}")

Saved merged table to nvd_plus_kev_2024.csv
