In [1]:
import codecs
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
from IPython.display import clear_output
from ast import literal_eval

import settings
from utils import vplot, vprint

In [65]:
# Framework names; not referenced by the other cells shown in this notebook.
FRAMEWORKS = ['tensorflow', 'opencv', 'pytorch', 'keras', 'caffe']

# Directory configured under 'manual_dir' in the project settings.
MANUAL_DIR = settings.DATA_CONFIG['manual_dir']

# os.path.join adds a path separator only when MANUAL_DIR does not already end
# with one, so the paths are valid whether or not the setting has a trailing "/".
OLD_FILE = os.path.join(MANUAL_DIR, 'vulnerability.xlsx')         # loaded as old_df (label source)
NEW_FILE = os.path.join(MANUAL_DIR, 'vulnerability_manual.xlsx')  # loaded as new_df (receives labels)
OUT_FILE = os.path.join(MANUAL_DIR, 'vulnerability_sync.xlsx')    # written by the final export cell

# Columns, in order, selected from new_df when the result is exported.
COLUMNS = [
    'Source', 'Framework', 'Description', 'PR Num', 'Keyword', 'CVE ID',
    'Taxonomy', 'CWE ID', 'CWE Name', 'CWE Pillar',
    'Root Cause', 'Fixing Pattern', 'Symptom',
    'Line Addition', 'Line Deletion', 'Line Changed', 'File Changed',
    'Created At', 'Merged At', 'Time Cost (Hours)',
    'Comment Count', 'Review Comment Count',
    'Patch URLs',
]

In [4]:
def read_csv(filename, sheet_name='manual'):
    """Load one worksheet of an Excel workbook with every column cast to ``string`` dtype.

    Despite the name, this reads an Excel file through ``pd.read_excel``, not a
    CSV; the name is kept so existing calls keep working.

    Args:
        filename: Path of the workbook to read.
        sheet_name: Single worksheet to load (name or position). Defaults to
            ``'manual'``, the sheet the original implementation hard-coded.

    Returns:
        pandas.DataFrame in which every column has pandas ``string`` dtype.
    """
    return pd.read_excel(filename, sheet_name=sheet_name).astype('string')

In [54]:
# Load both workbooks through read_csv.
old_df = read_csv(OLD_FILE)
# The new sheet gets an empty-string 'CWE Pillar' column (any existing values
# are overwritten); the sync cells below write into it.
new_df = read_csv(NEW_FILE).assign(**{'CWE Pillar': ""})

In [16]:
# Keep only old rows where both 'Created At' and 'Merged At' are present;
# find_row matches on these two columns.
fdf = old_df.dropna(subset=['Created At', 'Merged At'])

In [18]:
# Preview the first filtered row.
fdf.head(1)

Unnamed: 0,Framework,Description,CVE ID,Root Cause,Fixing Pattern,Symptom,Taxonomy,CWE ID,CWE Name,CWE Pillar,Line Addition,Line Deletion,Line Changed,File Changed,Created At,Merged At,Time Cost (Hours),Comment Count,Review Comment Count,Patch URLs
0,tensorflow,[Crash fix] Fix cudaMallocAsync crashes.,,Improper Type Conversion,Fix Type Conversion,Crash,Access of Resource Using Incompatible Type ('T...,CWE-843,Access of Resource Using Incompatible Type ('T...,Improper Control of a Resource Through its Lif...,55.0,12.0,67.0,6.0,2021-05-13T22:51:35Z,2021-06-17T04:27:01Z,821.59,10.0,12.0,['https://github.com/tensorflow/tensorflow/pul...


In [20]:
def find_row(framework, created_at, merged_at, df):
    """Return the first row of ``df`` matching framework and both timestamps.

    Args:
        framework: Value compared against the 'Framework' column.
        created_at: Value compared against the 'Created At' column.
        merged_at: Value compared against the 'Merged At' column.
        df: DataFrame containing those three columns.

    Returns:
        The first matching row as a pandas Series, or None when no row matches.
    """
    same_framework = df['Framework'] == framework
    same_created = df['Created At'] == created_at
    same_merged = df['Merged At'] == merged_at
    matches = df[same_framework & same_created & same_merged]
    return matches.iloc[0] if len(matches) > 0 else None

In [21]:
# Spot-check find_row against the row previewed above.
find_row(
    framework='tensorflow',
    created_at='2021-05-13T22:51:35Z',
    merged_at='2021-06-17T04:27:01Z',
    df=fdf,
)

Framework                                                      tensorflow
Description                      [Crash fix] Fix cudaMallocAsync crashes.
CVE ID                                                               <NA>
Root Cause                                       Improper Type Conversion
Fixing Pattern                                        Fix Type Conversion
Symptom                                                             Crash
Taxonomy                Access of Resource Using Incompatible Type ('T...
CWE ID                                                            CWE-843
CWE Name                Access of Resource Using Incompatible Type ('T...
CWE Pillar              Improper Control of a Resource Through its Lif...
Line Addition                                                        55.0
Line Deletion                                                        12.0
Line Changed                                                         67.0
File Changed                          

In [60]:
# Copy the label columns from the old sheet onto matching rows of the new sheet.
# A row matches when Framework, Created At and Merged At are all equal (find_row).
label_columns = ['Taxonomy', 'Root Cause', 'Fixing Pattern', 'Symptom',
                 'CWE Name', 'CWE ID', 'CWE Pillar']

total = len(new_df)
# `position` counts rows seen so far, so the progress text stays correct even
# when new_df's index labels are not 0..n-1.
for position, (idx, row) in enumerate(new_df.iterrows(), start=1):
    # Rows without both timestamps cannot be matched. pd.isna tests for a
    # missing value directly instead of comparing formatted text with "<NA>".
    if pd.isna(row['Created At']) or pd.isna(row['Merged At']):
        continue

    result = find_row(framework=row['Framework'],
                      created_at=row['Created At'],
                      merged_at=row['Merged At'],
                      df=fdf)
    if result is not None:
        for column in label_columns:
            new_df.at[idx, column] = result[column]

    # Replace the previous progress line rather than appending a new one.
    clear_output(wait=True)
    print(f"({position}/{total})")

(3611/3611)


In [70]:
def find_cve_row(cve, df):
    """Return the first row of ``df`` whose 'CVE ID' equals ``cve``.

    Args:
        cve: Value compared against the 'CVE ID' column.
        df: DataFrame containing a 'CVE ID' column.

    Returns:
        The first matching row as a pandas Series, or None when no row matches.
    """
    hits = df.loc[df['CVE ID'] == cve]
    if hits.shape[0] == 0:
        return None
    return hits.iloc[0]

In [80]:
# Second pass: for rows that have a CVE ID, take 'CWE Pillar' from the first
# old row with the same CVE ID. This searches all of old_df, not the
# timestamp-filtered fdf.
total = len(new_df)
# `position` counts rows seen so far, so the progress text stays correct even
# when new_df's index labels are not 0..n-1.
for position, (idx, row) in enumerate(new_df.iterrows(), start=1):
    # Only rows with a CVE ID can be matched. pd.notna tests for a present
    # value directly instead of comparing formatted text with "<NA>".
    if pd.notna(row['CVE ID']):
        result = find_cve_row(cve=row['CVE ID'], df=old_df)
        if result is not None:
            new_df.at[idx, 'CWE Pillar'] = result['CWE Pillar']

    # Replace the previous progress line rather than appending a new one.
    clear_output(wait=True)
    print(f"({position}/{total})")

(3611/3611)


In [82]:
# Write the synced sheet, restricted to COLUMNS in that order, without the index.
new_df.loc[:, COLUMNS].to_excel(
    OUT_FILE,
    sheet_name='manual',
    index=False,
)