# 7. Create CVE manual analysis file

In [1]:
import codecs
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
from IPython.display import clear_output
from ast import literal_eval

import settings
from utils import vplot, vprint

In [2]:
# Data directories, taken from the project's settings module.
DISTILLED_DIR = settings.DATA_CONFIG['distilled_dir']
RAW_DIR = settings.DATA_CONFIG['raw_dir']
MANUAL_DIR = settings.DATA_CONFIG['manual_dir']

# Frameworks handled by this notebook.
FRAMEWORKS = ['tensorflow', 'caffe', 'opencv', 'pytorch', 'keras']

# Columns (and their order) of the exported manual-analysis sheet.
MANUAL_COLUMNS = [
    # identification
    'CVE ID', 'framework', 'Description',
    # annotation columns (created empty by the save cell; presumably filled in by hand)
    'Taxonomy', 'Root Cause', 'Fixing Pattern', 'Symptom',
    # weakness classification and linked commit
    'CWE ID', 'CWE Name', 'commit hash', 'commit description',
    # CVSS v2 metrics
    'CVSS2 Access Complexity',
    'CVSS2 Authentication Required',
    'CVSS2 Availability Impact',
    'CVSS2 Confidentiality Impact',
    'CVSS2 Score',
    'CVSS2 Integrity Impact',
    # CVSS v3 metrics
    'CVSS3attackVector',
    'CVSS3attackComplexity',
    'CVSS3privilegesRequired',
    'CVSS3userInteraction',
    'CVSS3scope',
    'CVSS3confidentialityImpact',
    'CVSS3integrityImpact',
    'CVSS3availabilityImpact',
    'CVSS3baseScore',
    'CVSS3baseSeverity',
]

# Plain string concatenation: MANUAL_DIR must already end with a path separator.
OUTPUT_FILE = MANUAL_DIR + 'vulnerability_official.xlsx'

In [3]:
def get_cve_file(framework):
    """Return the path of ``nvd_<framework>.csv`` inside DISTILLED_DIR."""
    filename = f'nvd_{framework}.csv'
    return DISTILLED_DIR + filename
def get_vuln_pr_file(framework):
    """Return the path of ``vuln_<framework>_pr.csv`` inside DISTILLED_DIR."""
    filename = f'vuln_{framework}_pr.csv'
    return DISTILLED_DIR + filename
def get_vuln_commit_file(framework):
    """Return the path of ``vuln_<framework>_commit.csv`` inside DISTILLED_DIR."""
    filename = f'vuln_{framework}_commit.csv'
    return DISTILLED_DIR + filename
def get_vuln_pr_commit_file(framework):
    """Return the path of ``vuln_<framework>_pr+commit.csv`` inside DISTILLED_DIR."""
    filename = f'vuln_{framework}_pr+commit.csv'
    return DISTILLED_DIR + filename
def get_pr_file(framework):
    """Return the path of ``<framework>_pr.csv`` inside RAW_DIR."""
    filename = f'{framework}_pr.csv'
    return RAW_DIR + filename
def get_commit_file(framework):
    """Return the path of ``<framework>_commit.csv`` inside RAW_DIR."""
    filename = f'{framework}_commit.csv'
    return RAW_DIR + filename
def get_issue_file(framework):
    """Return the path of ``<framework>_issue.csv`` inside RAW_DIR."""
    filename = f'{framework}_issue.csv'
    return RAW_DIR + filename

In [4]:
# Regular-expression patterns for picking apart single text lines.

# Start of string, a captured run of non-comma characters, then the first comma.
REGEX_ONLY_FIRST_COMMA = r'^([^,]*),'
# Start of string, a captured run of non-whitespace characters, then the first
# whitespace character. Used in this notebook as the `sep` for reading the raw commit
# files. NOTE(review): the capture group presumably makes the regex split emit an extra
# leading field, which would explain the throw-away "tmp" column at the call sites - confirm.
REGEX_ONLY_FIRST_WHITESPACE = r'^([^\s]*)\s'
# The run of non-comma characters at the start of the string.
REGEX_TEXT_BEFORE_FIRST_COMMA = r'^[^,]*'
# The run of non-comma characters at the end of the string.
REGEX_TEXT_AFTER_LAST_COMMA = r'[^,]*$'
# Greedy match of the text that is preceded by a comma and followed by a comma.
REGEX_TEXT_BETWEEN_FIRST_AND_LAST_COMMA = r'(?<=,).+(?=,)'

### Coverage Analysis

Check how many of the official CVE records are also covered by the latent-vulnerability search results.

In [5]:
def stat_verified_vuln_search_coverage(frameworks):
    """Print a per-framework table of how many CVE-linked commits the vulnerability search found.

    For every framework the printed row contains:

    * Total      - number of rows in the framework's NVD CSV.
    * Commit     - number of rows obtained by joining the CVE rows that have a commit hash
                   with the raw commit list on the first 7 characters of the hash.
    * Cover      - how many of those joined rows have their 7-character hash occurring
                   anywhere in the text of the ``vuln_<framework>_pr+commit.csv`` file.
    * Cover Rate - ``Cover / Total`` as a percentage (0.0 when Total is 0).

    Args:
        frameworks: iterable of framework names understood by the ``get_*_file`` helpers.

    Returns:
        None. The table is written to stdout.
    """
    print('| Framework | Total | Commit | Cover | Cover Rate |')
    for framework in frameworks:
        cve_df = pd.read_csv(get_cve_file(framework=framework))
        cve_df = cve_df[['commit hash', 'CWE ID', 'CVE ID']]

        # Only CVE rows that reference a commit can be matched against the commit list.
        cve_commit_df = cve_df[~cve_df['commit hash'].isnull()].reset_index(drop=True)
        cve_commit_df['hash'] = cve_commit_df['commit hash'].str[:7]

        # Each raw commit line is split at its first whitespace only; the regex separator
        # yields a leading helper field ("tmp") that is discarded right away.
        commit_df = pd.read_csv(
            get_commit_file(framework=framework),
            sep=REGEX_ONLY_FIRST_WHITESPACE,
            engine='python',
            header=0,
            names=["tmp", "commit", "description"],
        ).drop(columns=["tmp"])
        commit_df['hash'] = commit_df['commit'].str[:7]

        ex_cve_df = pd.merge(cve_commit_df, commit_df, on=["hash"])[
            ['hash', 'CWE ID', 'CVE ID', 'description']
        ]

        # Read the search result once and release the file handle before scanning it.
        with open(get_vuln_pr_commit_file(framework), errors='ignore') as f:
            search_result_text = f.read()

        # Plain substring test: a hash counts as covered if it occurs anywhere in the file.
        cover = sum(commit_hash in search_result_text for commit_hash in ex_cve_df['hash'])
        total, commit = len(cve_df), len(ex_cve_df)

        # An empty CVE file must not abort the whole table with a ZeroDivisionError.
        cover_rate = round(cover / total * 100, 2) if total else 0.0
        print(f"| {framework:10}| {total:5} | {commit:6} | {cover:5} | {cover_rate:8} % |")
# Print the coverage table for every framework listed in FRAMEWORKS.
stat_verified_vuln_search_coverage(frameworks=FRAMEWORKS)

| Framework | Total | Commit | Cover | Cover Rate |
| tensorflow|   407 |    395 |    68 |    16.71 % |
| caffe     |    12 |      0 |     0 |      0.0 % |
| opencv    |    38 |      2 |     1 |     2.63 % |
| pytorch   |     4 |      1 |     0 |      0.0 % |
| keras     |     4 |      0 |     0 |      0.0 % |


### Merge CVE

In [6]:
def get_all_framework(frameworks):
    """Build one table of CVE records for the given frameworks, enriched with commit messages.

    For each framework the NVD CSV is loaded and, where a CVE row references a commit,
    the description of the raw commit with the same 7-character hash prefix is attached
    as ``commit description``. A ``framework`` column records the origin of each row.

    Args:
        frameworks: iterable of framework names understood by the ``get_*_file`` helpers.

    Returns:
        pandas.DataFrame: the per-framework tables concatenated with a fresh index.
    """
    df_list = []
    # Iterate over the argument (the original looped over the global FRAMEWORKS and
    # silently ignored whatever the caller passed in).
    for framework in frameworks:
        # load
        cve_df = pd.read_csv(get_cve_file(framework=framework))
        # Each raw commit line is split at its first whitespace only; the regex separator
        # yields a leading helper field ("tmp") that is discarded right away.
        raw_commit_df = pd.read_csv(
            get_commit_file(framework=framework),
            sep=REGEX_ONLY_FIRST_WHITESPACE,
            engine='python',
            header=0,
            names=["tmp", "commit", "description"],
        ).drop(columns=["tmp"])

        # join key: first 7 characters of the commit hash on both sides
        cve_df['hash'] = cve_df['commit hash'].str[:7]
        raw_commit_df['hash'] = raw_commit_df['commit'].str[:7]

        # only CVE rows that actually reference a commit take part in the lookup
        cve_commit_df = cve_df[~cve_df['commit hash'].isnull()].reset_index(drop=True)

        # hash -> commit description lookup for the referenced commits
        merged_df = pd.merge(cve_commit_df, raw_commit_df, on='hash')
        merged_df['commit description'] = merged_df['description']
        merged_df = merged_df[['hash', 'commit description']]

        # Left join keeps every CVE row; de-duplicating the lookup keeps the row count stable.
        df = pd.merge(cve_df, merged_df.drop_duplicates(subset=['hash']), on='hash', how='left')
        df['framework'] = framework
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)
# Combined CVE table for every framework in FRAMEWORKS; used by the save cell below.
dfs = get_all_framework(frameworks=FRAMEWORKS)

## Save

In [7]:
# Write the workbook only when it does not exist yet, so an existing file is never overwritten.
if not os.path.isfile(OUTPUT_FILE):
    # Add the annotation columns as empty strings.
    for blank_column in ('Taxonomy', 'Root Cause', 'Fixing Pattern', 'Symptom', 'CWE Name'):
        dfs[blank_column] = ''
    # Keep only the sheet columns, in the configured order.
    dfs = dfs[MANUAL_COLUMNS]
    dfs.to_excel(OUTPUT_FILE, sheet_name='manual', index=False)