In [1]:
import codecs
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
from IPython.display import clear_output
from ast import literal_eval

import settings
from utils import vplot, vprint

In [2]:
# Frameworks iterated over by the cells below; each one has its own
# '<framework>_manual.xlsx' workbook under MANUAL_DIR.
FRAMEWORKS = ['tensorflow', 'opencv', 'pytorch', 'keras', 'caffe']

# Directories come from the project-level settings module.
FILTERED_DIR = settings.DATA_CONFIG['distilled_dir']
MANUAL_DIR = settings.DATA_CONFIG['manual_dir']

# Workbook and sheet read by get_vuln().
# NOTE: MANUAL_DIR is joined by plain concatenation, so it must already end
# with a path separator.
VULN_SHEET_NAME = 'manual'
VULN_FILE = MANUAL_DIR + 'vulnerability.xlsx'

# Ordered list of column names. Not referenced by the cells visible here --
# presumably the schema of the released sheet (TODO confirm).
RELEASE_COLS = [
    'Framework', 'Description', 'PR Number', 'Keyword',
    'Root Cause', 'Fixing Pattern', 'Symptom', 'Taxonomy',
    'CVE ID', 'CWE ID', 'CWE Name', 'CWE Pillar',
    'Line Addition', 'Line Deletion', 'Line Changed', 'File Changed',
    'Created At', 'Merged At', 'Time Cost (Hours)',
    'Comment Count', 'Review Comment Count',
    'Patch URLs',
]

In [3]:
def read_csv(framework):
    """Load one framework's manual sheet into a DataFrame.

    Despite the name, the source is an Excel workbook, not a CSV file: the
    'manual' sheet of ``MANUAL_DIR + '<framework>_manual.xlsx'``.

    Args:
        framework: Framework name used to build the workbook file name.

    Returns:
        The DataFrame produced by ``pandas.read_excel`` for that sheet.
    """
    workbook_name = f'{framework}_manual.xlsx'
    return pd.read_excel(MANUAL_DIR + workbook_name, sheet_name='manual')

In [4]:
# One DataFrame per framework, keyed by framework name. Every column is cast
# to str, so 'pr_number' can later be compared against string PR numbers.
dfs = {name: read_csv(name).astype(str) for name in FRAMEWORKS}

# Show the column layout of the first framework's sheet.
first_framework = FRAMEWORKS[0]
print(dfs[first_framework].columns.tolist())

['title', 'pr_number', 'keyword', 'taxonomy', 'root_cause', 'fixing_pattern', 'symptom', 'pr_html']


In [5]:
def get_vuln():
    """Read sheet ``VULN_SHEET_NAME`` of the workbook at ``VULN_FILE``.

    Returns:
        The DataFrame produced by ``pandas.read_excel`` for that sheet.
    """
    return pd.read_excel(VULN_FILE, sheet_name=VULN_SHEET_NAME)

In [6]:
# Load the vulnerability sheet once; later cells add a 'PR Number' column to
# it and look rows up in it via find_row().
vuln_df = get_vuln()

In [7]:
# Digits that immediately follow "pull/" in a URL.
# Raw string: the previous non-raw literal relied on the invalid escape
# sequences "\/" and "\d", which emit a SyntaxWarning on Python 3.12+.
# The pattern text itself is unchanged.
REGEX_PR_NUM = r"(?<=pull\/)(\d+)"

# Take the first PR number found in each 'Patch URLs' cell (NaN when there is
# no match). The pattern has a single capture group, so expand=False yields a
# Series directly; the match is digits only, so no case normalisation is needed.
vuln_df['PR Number'] = vuln_df['Patch URLs'].str.extract(
    REGEX_PR_NUM, flags=re.IGNORECASE, expand=False
)

In [8]:
def find_row(framework, pr_num, df):
    """Return the first row of ``df`` matching a framework and PR number.

    Args:
        framework: Value compared for equality against ``df['Framework']``.
        pr_num: PR number; converted with ``str()`` before being compared
            against ``df['PR Number']``.
        df: DataFrame with 'PR Number' and 'Framework' columns.

    Returns:
        The first matching row as a Series, or ``None`` when nothing matches.
    """
    same_pr = df['PR Number'] == str(pr_num)
    same_framework = df['Framework'] == framework
    matches = df[same_pr & same_framework]
    if len(matches) == 0:
        return None
    return matches.iloc[0]

In [9]:
# Copy the label columns from the vulnerability sheet onto each framework's
# manual sheet, matching rows on (framework, PR number). The per-framework
# DataFrames in `dfs` are updated in place.

# manual-sheet column -> vulnerability-sheet column
LABEL_COLUMNS = {
    'taxonomy': 'Taxonomy',
    'root_cause': 'Root Cause',
    'fixing_pattern': 'Fixing Pattern',
    'symptom': 'Symptom',
}

for framework in FRAMEWORKS:
    df = dfs[framework]
    total = len(df)
    # `position` counts rows processed, independent of the index labels.
    for position, (idx, manual_row) in enumerate(df.iterrows(), start=1):
        # Keep the lookup result in its own name instead of overwriting the
        # row currently being iterated.
        vuln_row = find_row(framework=framework, pr_num=manual_row['pr_number'], df=vuln_df)
        if vuln_row is not None:
            for manual_col, vuln_col in LABEL_COLUMNS.items():
                df.at[idx, manual_col] = vuln_row[vuln_col]
        # Report progress for every row, not only for rows that had a match,
        # so the counter always reaches total/total.
        clear_output(wait=True)
        print(f"[{framework}]({position}/{total})")

[caffe](96/96)


In [10]:
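# Disabled on purpose: uncommenting this writes the updated labels back to each
# framework's '<framework>_manual.xlsx' workbook, overwriting the file that
# read_csv() loaded.
# for framework in FRAMEWORKS:
#     MANUAL_FILE = MANUAL_DIR + f'{framework}_manual.xlsx'
#     dfs[framework].to_excel(MANUAL_FILE,sheet_name='manual', index=False)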