# Merge Manual Files

In [1]:
import codecs
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
from IPython.display import clear_output
from ast import literal_eval

import settings
from utils import vplot, vprint

In [3]:
# Input/output directories are read from the project-level settings module.
MANUAL_DIR = settings.DATA_CONFIG['manual_dir']
FILTERED_DIR = settings.DATA_CONFIG['distilled_dir']

# Columns kept from each per-framework manual workbook.
MANUAL_COLUMNS = [
    'pr_number',
    'title',
    'keyword',
    'taxonomy',
    'root_cause',
    'fixing_pattern',
    'symptom',
]

# Columns kept from each per-framework vulnerability PR CSV.
VULN_PR_COLUMNS = [
    'number',
    'comments_count',
    'review_comments_count',
    'commits_count',
    'line_additions',
    'line_deletions',
    'changed_files_count',
    'created_at',
    'updated_at',
    'closed_at',
    'merged_at',
]

# Column order of the concatenated manual table: framework first.
OUTPUT_COLUMNS = ['framework', *MANUAL_COLUMNS]

MANUAL_SHEET_NAME = 'manual'
FRAMEWORKS = ['tensorflow', 'pytorch', 'opencv', 'keras', 'caffe']

# NOTE(review): paths are built by plain string concatenation, so MANUAL_DIR
# presumably ends with a path separator -- confirm in settings.
OUTPUT_FILE = MANUAL_DIR + 'manual_vulnerability.xlsx'

In [4]:
def get_manual(framework):
    """Load the manual sheet for one framework.

    Reads ``{framework}_manual.xlsx`` from ``MANUAL_DIR`` (sheet
    ``MANUAL_SHEET_NAME``) and keeps only ``MANUAL_COLUMNS``, in that order.

    Args:
        framework: Framework name used to build the workbook filename.

    Returns:
        An independent DataFrame containing exactly ``MANUAL_COLUMNS``.

    Raises:
        KeyError: If any of ``MANUAL_COLUMNS`` is missing from the sheet.
    """
    filename = MANUAL_DIR + f'{framework}_manual.xlsx'
    manual_df = pd.read_excel(filename, sheet_name=MANUAL_SHEET_NAME)
    # Return an explicit copy: callers add columns to the result, and doing
    # that on a column-subset slice triggers pandas' SettingWithCopyWarning.
    return manual_df[MANUAL_COLUMNS].copy()

In [5]:
def get_vuln_pr(framework):
    """Load the vulnerability pull-request metadata for one framework.

    Reads ``vuln_{framework}_pr.csv`` from ``FILTERED_DIR`` and keeps only
    ``VULN_PR_COLUMNS``, in that order.

    Args:
        framework: Framework name used to build the CSV filename.

    Returns:
        An independent DataFrame containing exactly ``VULN_PR_COLUMNS``.

    Raises:
        KeyError: If any of ``VULN_PR_COLUMNS`` is missing from the CSV.
    """
    filename = FILTERED_DIR + f'vuln_{framework}_pr.csv'
    vuln_pr_df = pd.read_csv(filename)
    # Return an explicit copy: callers add columns to the result, and doing
    # that on a column-subset slice triggers pandas' SettingWithCopyWarning.
    return vuln_pr_df[VULN_PR_COLUMNS].copy()

### Concatenate All Frameworks

In [6]:
# Load every framework's manual sheet, tag each row with its framework,
# and stack them into a single table.
df_list = []
for framework in FRAMEWORKS:
    # .assign returns a new frame, so the frame returned by get_manual is
    # never mutated in place (avoids SettingWithCopyWarning).
    df_ = get_manual(framework).assign(framework=framework)
    df_list.append(df_)

# Reorder so that 'framework' comes first, as defined by OUTPUT_COLUMNS.
df = pd.concat(df_list, ignore_index=True)[OUTPUT_COLUMNS]

In [7]:
# Preview the first row of the concatenated manual table.
df.head(1)

Unnamed: 0,framework,pr_number,title,keyword,taxonomy,root_cause,fixing_pattern,symptom
0,tensorflow,59038,Fix memory leaks in xla::CpuGpuFusionTest,memory leak,Missing Release of Memory after Effective Life...,Incorrect Indices,Update Memory Allocation Method,


### Concatenate All Vuln PRs

In [8]:
# Load every framework's vulnerability PR metadata, tag each row with its
# framework, and expose the PR id as 'pr_number' so it can be joined with
# the manual table.
pr_df_list = []
for framework in FRAMEWORKS:
    df_ = get_vuln_pr(framework)
    # .assign returns a new frame instead of mutating the one returned by
    # get_vuln_pr (avoids SettingWithCopyWarning); the keyword order keeps
    # 'framework' before 'pr_number' in the resulting columns.
    df_ = df_.assign(framework=framework, pr_number=df_['number'])
    pr_df_list.append(df_)

# 'number' is now duplicated by 'pr_number', so drop it after stacking.
pr_df = pd.concat(pr_df_list, ignore_index=True).drop(columns=['number'])

In [10]:
# Preview the first row of the concatenated PR metadata table.
pr_df.head(1)

Unnamed: 0,comments_count,review_comments_count,commits_count,line_additions,line_deletions,changed_files_count,created_at,updated_at,closed_at,merged_at,framework,pr_number
0,0,0,1,3,3,1,2022-12-28T18:27:54Z,2023-01-09T09:08:06Z,2022-12-30T21:05:05Z,2022-12-30T21:05:05Z,tensorflow,59038


### Merge

In [11]:
# Row counts of the manual table and the PR metadata table before merging.
n_manual_rows = len(df)
n_pr_rows = len(pr_df)
print(n_manual_rows, n_pr_rows)

3163 3164


In [12]:
# Inner join (the pandas default) on framework + PR number: rows present in
# only one of the two tables are dropped from the result.
merged_df = pd.merge(df, pr_df, how='inner', on=['framework', 'pr_number'])

In [13]:
# Show the column labels of the merged table.
merged_df.columns

Index(['framework', 'pr_number', 'title', 'keyword', 'taxonomy', 'root_cause',
       'fixing_pattern', 'symptom', 'comments_count', 'review_comments_count',
       'commits_count', 'line_additions', 'line_deletions',
       'changed_files_count', 'created_at', 'updated_at', 'closed_at',
       'merged_at'],
      dtype='object')

In [14]:
# Export the merged table only when the output workbook does not exist yet,
# so an existing file is never overwritten.
if not os.path.isfile(OUTPUT_FILE):
    # Use the shared sheet-name constant rather than repeating the literal.
    merged_df.to_excel(OUTPUT_FILE, sheet_name=MANUAL_SHEET_NAME, index=False)
else:
    # Report the skip so a stale workbook is not mistaken for a fresh export.
    print(f'{OUTPUT_FILE} already exists; skipping export.')