# Cybersecurity Vulnerability Analysis

This notebook performs data acquisition, cleaning, exploratory analysis, and modeling using NVD, KEV, and CWE datasets.

## 1. Imports and Setup

In [None]:
import json
import gzip
import requests
import pandas as pd
import xmltodict
import zipfile
from io import BytesIO
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

pd.options.display.max_columns = None

## 2. Data Processing Functions

In [None]:
def process_nvd_json(url: str) -> pd.DataFrame:
    """
    Download and process a gzip-compressed NVD JSON feed.

    Args:
        url: Location of the ``.json.gz`` feed to download.

    Returns:
        DataFrame with columns: id, published_date, cvss3_base_score, cwe_ids.
        ``published_date`` is parsed as UTC and ``cvss3_base_score`` as a
        number; values that cannot be parsed become NaT/NaN. ``cwe_ids`` is a
        comma-joined, sorted, de-duplicated string of the ``CWE-*`` values
        found on the item ('' when there are none). The columns are present
        even when the feed contains no items.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    columns = ['id', 'published_date', 'cvss3_base_score', 'cwe_ids']

    resp = requests.get(url, timeout=60)
    # Fail here with a clear HTTP error instead of handing an error page to
    # gzip.decompress, which would only report a confusing format error.
    resp.raise_for_status()
    data = json.loads(gzip.decompress(resp.content))

    records = []
    for item in data.get('CVE_Items', []):
        cve = item['cve']
        cvss3 = item.get('impact', {}).get('baseMetricV3', {}).get('cvssV3', {})
        # Keep only real CWE identifiers; other problem-type values are dropped.
        cwes = {
            desc['value']
            for ptype in cve.get('problemtype', {}).get('problemtype_data', [])
            for desc in ptype.get('description', [])
            if desc.get('value', '').startswith('CWE-')
        }
        records.append({
            'id': cve['CVE_data_meta']['ID'],
            'published_date': item.get('publishedDate'),
            'cvss3_base_score': cvss3.get('baseScore'),
            'cwe_ids': ','.join(sorted(cwes)),
        })

    # Passing `columns` keeps the schema stable for an empty feed, so the
    # conversions below cannot raise KeyError.
    df = pd.DataFrame(records, columns=columns)
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce', utc=True)
    df['cvss3_base_score'] = pd.to_numeric(df['cvss3_base_score'], errors='coerce')
    return df


In [None]:
def process_kev_json(url: str) -> pd.DataFrame:
    """
    Download and process CISA KEV JSON catalog.

    Args:
        url: Location of the KEV JSON document.

    Returns:
        DataFrame with columns: id, kev_added_date. ``id`` is the upper-cased
        ``cveID`` and ``kev_added_date`` is ``dateAdded`` parsed with
        ``pd.to_datetime`` (unparseable values become NaT). An empty catalog
        yields an empty frame that still has both columns.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, timeout=60)
    # Surface HTTP failures directly rather than as a JSON decoding error.
    resp.raise_for_status()
    data = resp.json()

    vulnerabilities = data.get('vulnerabilities', [])
    if not vulnerabilities:
        # A frame built from an empty list has no columns, so the renames and
        # column accesses below would raise KeyError; return the schema only.
        return pd.DataFrame({
            'id': pd.Series(dtype='object'),
            'kev_added_date': pd.Series(dtype='datetime64[ns]'),
        })

    df = pd.DataFrame(vulnerabilities)
    df = df.rename(columns={'cveID': 'id', 'dateAdded': 'kev_added_date'})
    df['id'] = df['id'].str.upper()
    df['kev_added_date'] = pd.to_datetime(df['kev_added_date'], errors='coerce')
    return df[['id', 'kev_added_date']]


In [None]:
def process_cwe_xml(url: str) -> pd.DataFrame:
    """
    Download and parse CWE XML taxonomy.

    Args:
        url: Location of a zip archive containing the CWE catalog XML.

    Returns:
        DataFrame with columns: cwe_id, name, description. ``cwe_id`` is the
        raw ``ID`` attribute of each ``Weakness`` element, exactly as it
        appears in the XML.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, timeout=60)
    # Fail with a clear HTTP error instead of a "not a zip file" error.
    resp.raise_for_status()
    with zipfile.ZipFile(BytesIO(resp.content)) as z:
        xml_file = [n for n in z.namelist() if n.endswith('.xml')][0]
        with z.open(xml_file) as f:
            doc = xmltodict.parse(f.read())

    weaknesses = doc['Weakness_Catalog']['Weaknesses']['Weakness']
    # xmltodict returns a single mapping (not a list) when an element occurs
    # only once; normalise so the loop always sees one mapping per weakness.
    if isinstance(weaknesses, dict):
        weaknesses = [weaknesses]

    records = []
    for w in weaknesses:
        # xmltodict exposes XML attributes with an '@' prefix (as with '@ID'),
        # so look for the name as an attribute first, then as a child element.
        name = w.get('@Name')
        if name is None:
            name = w.get('Name')

        # Description may parse to plain text or to a nested mapping depending
        # on how the element is written; calling .get on a str would raise.
        description = w.get('Description')
        if isinstance(description, dict):
            description = description.get('Description_Text') or description.get('#text')

        records.append({
            'cwe_id': w['@ID'],
            'name': name,
            'description': description,
        })
    return pd.DataFrame(records, columns=['cwe_id', 'name', 'description'])


## 3. Main ETL Pipeline

In [None]:
def main():
    """
    Run the ETL pipeline and write ``merged_vuln_analysis.csv``.

    Steps:
      1. Download the NVD feed, the KEV catalog and the CWE taxonomy.
      2. Left-join KEV onto NVD by CVE id and flag matches as ``is_exploited``.
      3. Split ``cwe_ids`` into one row per (CVE, CWE) pair and attach the CWE
         name and description.
      4. Add ``vuln_age_days`` (days since publication, measured in UTC) and
         ``severity_level`` (CVSS v3 qualitative band).

    The output has one row per (CVE, CWE) pair, so a CVE with several CWEs
    appears several times.
    """
    # URLs
    nvd_url = 'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2024.json.gz'
    kev_url = 'https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json'
    cwe_url = 'https://cwe.mitre.org/data/xml/cwec_latest.xml.zip'

    # Process datasets
    print('Processing NVD data...')
    nvd_df = process_nvd_json(nvd_url)
    print('Processing KEV data...')
    kev_df = process_kev_json(kev_url)
    print('Processing CWE taxonomy...')
    cwe_df = process_cwe_xml(cwe_url)

    # Merge NVD and KEV
    merged = nvd_df.merge(kev_df, on='id', how='left', indicator=True)
    merged['is_exploited'] = (merged['_merge'] == 'both')

    # Attach CWE names.
    # The NVD side carries identifiers such as 'CWE-79', while the taxonomy
    # frame holds the raw XML ID attribute. Put both on the 'CWE-<n>' form,
    # otherwise the join below matches nothing.
    cwe_lookup = cwe_df.copy()
    cwe_key = cwe_lookup['cwe_id'].astype(str)
    cwe_lookup['cwe_id'] = cwe_key.where(cwe_key.str.startswith('CWE-'), 'CWE-' + cwe_key)

    exploded = merged.assign(cwe_id=merged['cwe_ids'].str.split(',')).explode('cwe_id')
    final_df = exploded.merge(cwe_lookup, on='cwe_id', how='left')

    # Feature engineering.
    # Use a tz-aware "now" and a UTC-normalised column: subtracting a
    # tz-aware series from a naive datetime raises TypeError.
    final_df['published_date'] = pd.to_datetime(
        final_df['published_date'], errors='coerce', utc=True
    )
    now = pd.Timestamp.now(tz='UTC')
    final_df['vuln_age_days'] = (now - final_df['published_date']).dt.days

    # CVSS v3 bands: Low 0.1-3.9, Medium 4.0-6.9, High 7.0-8.9,
    # Critical 9.0-10.0. Intervals are right-closed, so the edges sit on the
    # top of each band; a score of 0.0 ("None") stays unlabelled.
    final_df['severity_level'] = pd.cut(
        final_df['cvss3_base_score'],
        bins=[0.0, 3.9, 6.9, 8.9, 10.0],
        labels=['Low', 'Medium', 'High', 'Critical'],
    )

    final_df.to_csv('merged_vuln_analysis.csv', index=False)
    print('ETL complete, saved merged_vuln_analysis.csv')

# Run the ETL only when this code executes as the main module.
if __name__ == "__main__":
    main()

## 4. Exploratory Data Analysis & Modeling

In [None]:
# Reload the ETL output. CSV stores timestamps as text, so re-parse them.
df = pd.read_csv('merged_vuln_analysis.csv')
# Parse both date columns as UTC so they share one timezone and can be
# subtracted from each other and from a UTC "now" without a naive/aware
# TypeError.
df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce', utc=True)
df['kev_added_date'] = pd.to_datetime(df['kev_added_date'], errors='coerce', utc=True)
# Recompute the age at analysis time; Timestamp.now(tz=...) replaces the
# deprecated Timestamp.utcnow().
df['vuln_age_days'] = (pd.Timestamp.now(tz='UTC') - df['published_date']).dt.days


In [None]:
# Time series plot
# The loaded table has one row per (CVE, CWE) pair. Count each CVE once so
# the "CVE counts" are not inflated by CVEs that map to several CWEs, and
# drop rows whose publication date could not be parsed.
unique_cves = df.drop_duplicates(subset='id').dropna(subset=['published_date'])
# NOTE(review): newer pandas releases prefer the 'ME' alias over 'M' -
# confirm the installed version before switching.
ts = unique_cves.set_index('published_date').resample('M').size()
ts_ke = unique_cves[unique_cves['is_exploited']].set_index('published_date').resample('M').size()
plt.figure(figsize=(10,5))
plt.plot(ts.index, ts.values, label='All CVEs')
plt.plot(ts_ke.index, ts_ke.values, label='Exploited CVEs')
plt.title('Monthly CVE Counts')
plt.legend()
plt.show()

In [None]:
# CWE frequency: horizontal bar chart of the ten most common cwe_id values.
cwe_counts = df['cwe_id'].value_counts()
top_cwes = cwe_counts.nlargest(10)
fig, ax = plt.subplots(figsize=(8, 4))
top_cwes.plot(kind='barh', ax=ax)
ax.set_title('Top 10 CWEs')
plt.show()

In [None]:
# Descriptive stats: mean / median / std of score and age, split by the
# is_exploited flag.
summary_columns = ['cvss3_base_score', 'vuln_age_days']
summary_functions = ['mean', 'median', 'std']
by_exploited = df.groupby('is_exploited')
stats = by_exploited[summary_columns].agg(summary_functions)
stats

In [None]:
# Logistic regression: exploitation flag ~ CVSS v3 base score + age in days.
# The loaded table has one row per (CVE, CWE) pair; fit on one row per CVE so
# CVEs with several CWEs do not carry extra weight in the model.
model_rows = df.drop_duplicates(subset='id')
# Drop rows missing either feature, then pick the matching labels by index.
X = model_rows[['cvss3_base_score','vuln_age_days']].dropna()
y = model_rows.loc[X.index, 'is_exploited']
model = LogisticRegression(max_iter=1000)
model.fit(X, y)
print('Coefficients:', model.coef_, 'Intercept:', model.intercept_)

In [None]:
# Survival analysis: Kaplan-Meier estimate of time from publication to KEV
# listing.
# One row per CVE, since the loaded table repeats a CVE for each of its CWEs.
survival_rows = df.drop_duplicates(subset='id')
# Re-parse both dates as UTC so they can be subtracted from each other.
published = pd.to_datetime(survival_rows['published_date'], errors='coerce', utc=True)
kev_added = pd.to_datetime(survival_rows['kev_added_date'], errors='coerce', utc=True)

E = survival_rows['is_exploited'].astype(bool)
# Exploited CVEs: the event happened when the CVE was added to KEV, so the
# duration ends there. Non-exploited CVEs are censored at their current age.
days_to_kev = (kev_added - published).dt.days
T = days_to_kev.where(E, survival_rows['vuln_age_days'])
# A KEV date earlier than the publication date would give a negative
# duration; clamp those to day 0.
T = T.clip(lower=0)
# Drop rows without a usable duration before fitting.
has_duration = T.notna()
T = T[has_duration]
E = E[has_duration]

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)
kmf.plot_survival_function()
plt.title('Time to Exploitation Survival')
plt.show()