# Cybersecurity Vulnerability Analysis

This notebook loads NVD, KEV, and CWE data, merges them, and performs basic analysis.

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load NVD JSON
url = 'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2024.json.gz'
data = json.loads(gzip.decompress(requests.get(url).content))
rows = []
for item in data['CVE_Items']:
    cid = item['cve']['CVE_data_meta']['ID']
    pd0 = item['publishedDate']
    score = item.get('impact',{}).get('baseMetricV3',{}).get('cvssV3',{}).get('baseScore')
    cwes = [d['value'] for p in item['cve']['problemtype']['problemtype_data'] for d in p['description'] if d['value'].startswith('CWE-')]
    rows.append({'id':cid,'published':pd0,'cvss':score,'cwe':','.join(set(cwes))})
nvd = pd.DataFrame(rows)
nvd['published'] = pd.to_datetime(nvd['published'])


In [6]:
# Load KEV
kev = pd.DataFrame(requests.get(
    'https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json'
).json()['vulnerabilities'])
kev = kev.rename(columns={'cveID':'id','dateAdded':'kev_date'})
kev['kev_date'] = pd.to_datetime(kev['kev_date'])


In [7]:
# Load CWE
z = requests.get('https://cwe.mitre.org/data/xml/cwec_latest.xml.zip').content
with gzip.GzipFile(fileobj=BytesIO(z)) as f:
    xml = xmltodict.parse(f.read())
weaks = xml['Weakness_Catalog']['Weaknesses']['Weakness']
cwe = pd.DataFrame([{'cwe':w['@ID'],'name':w.get('Name')} for w in weaks])


NameError: name 'BytesIO' is not defined

In [None]:
# Merge
df = nvd.merge(kev[['id','kev_date']],on='id',how='left')
df['exploited']=df['kev_date'].notna()
df = df.assign(cwe_id=df['cwe'].str.split(',')).explode('cwe_id').merge(cwe,on_left='cwe_id',on='cwe',how='left')


In [None]:
# Feature Engineering
now = datetime.utcnow()
df['age_days'] = (now - df['published']).dt.days
df['sev']=pd.cut(df['cvss'],bins=[0,4,7,9,10],labels=['Low','Med','High','Crit'])


In [15]:
# Time Series
ts = df.set_index('published').resample('M').size()
ts_e = df[df['exploited']].resample('M',on='published').size()
plt.plot(ts,label='All'); plt.plot(ts_e,label='Ex'); plt.legend(); plt.show()

NameError: name 'df' is not defined

In [13]:
# Top CWEs
print(df['cwe_id'].value_counts().head(10))

NameError: name 'df' is not defined

In [None]:
# Stats & Model
print(df.groupby('exploited')['cvss','age_days'].agg(['mean','median']))
X=df[['cvss','age_days']].dropna(); y=df.loc[X.index,'exploited']
m=LogisticRegression(max_iter=500).fit(X,y)
print(m.coef_)

In [None]:
# Survival
km=KaplanMeierFitter(); km.fit(df['age_days'],event_observed=df['exploited']); km.plot_survival_function(); plt.show()