## [tool] 2. Data Cleaning

In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
from IPython.display import clear_output

import settings
from utils import vplot, vprint

In [52]:
# Input/output directories are read from the project-level settings module.
FILTERED_DIR = settings.DATA_CONFIG['distilled_dir']
RAW_DIR = settings.DATA_CONFIG['raw_dir']

# Framework processed by this run; it selects the file names below.
FRAMEWORK = "tensorflow"

# os.path.join yields the same path as plain concatenation when the directory
# already ends with a separator, and still yields a valid path when it does not.
VULN_PR_FILE = os.path.join(FILTERED_DIR, f'vuln_{FRAMEWORK}_pr.csv')
VULN_COMMIT_FILE = os.path.join(FILTERED_DIR, f'vuln_{FRAMEWORK}_commit.csv')
VULN_PR_COMMIT_FILE = os.path.join(FILTERED_DIR, f'vuln_{FRAMEWORK}_pr+commit.csv')

In [53]:
# Load the vulnerability-related pull requests and commits of the selected framework.
vuln_pr_df = pd.read_csv(VULN_PR_FILE)
vuln_commit_df = pd.read_csv(VULN_COMMIT_FILE)

# Row counts of both tables, reported side by side.
print(f'Pull Request: {vuln_pr_df.shape[0]} | Commit:{vuln_commit_df.shape[0]}')

Pull Request: 1950 | Commit:3072


#### (Paper) Table 3:  Statistics of the identified latent Vulnerabilities in Iteration 1

Notices:
* Here we have to leave out the issues, for two reasons:
    * Not every PR mentions its corresponding issue in its description, which means the issue is not traceable.
    * Each PR has its own corresponding issue link with the same ID.
* Here we only consider merged PRs, i.e. those that have completed their life cycle.

| Repository  | Official CVE |Pull Requests|   Commits   |    Issue    | 
| ----------- | -----------  | ----------- | ----------- | ----------- |
| TensorFlow  |              |     1950    |    3072     |     N/A     |            
| Caffe       |              |     96      |    241      |     N/A     |            
| OpenCV      |              |     735     |    2137     |     N/A     |            
| PyTorch     |              |     286     |    930      |     N/A     |            
| Keras       |              |     97      |    373      |     N/A     |       

###### <center>Current Version: Data Collected On: 11 Jan 2023</center>

| Repository  | Official CVE |Pull Requests|   Commits   |    Issue    | 
| ----------- | -----------  | ----------- | ----------- | ----------- |
| TensorFlow  |    323       |    1665     |    2654     |     N/A     |            
| Caffe       |     12       |      90     |     242     |     N/A     |            
| OpenCV      |     38       |     667     |    2083     |     N/A     |            
| PyTorch     |      3       |     270     |     925     |     N/A     |            
| Keras       |      3       |      83     |     366     |     N/A     |       

###### <center>Deprecated Version: Data Collected On: 16 July 2022</center>


| Repository  | Official CVE |Pull Requests|   Commits   |    Issue    | 
| ----------- | -----------  | ----------- | ----------- | ----------- |
| TensorFlow  |    301       |     216     |    5285     |    1148     |            
| Caffe       |      0       |      31     |     133     |    107      |            
| OpenCV      |      0       |      71     |     749     |    314      |            
| PyTorch     |      0       |    1588     |    1485     |    857      |            
| Keras       |      0       |      94     |     194     |    206      |

###### <center>Deprecated Version: Data Collected On: 1 February 2022</center>


## Extract Information for Manual Investigation

##### Find all vulnerability keywords for each record

To do so, we match keywords one by one

### Extract PR

##### Apply Keyword

In [61]:
from searchkey import VulnerabilityRegex

# One alternation that combines the basic() and enhence() keyword patterns.
REGEX_VULN = '|'.join([VulnerabilityRegex.basic(), VulnerabilityRegex.enhence()])

# Keep the PR fields needed for review, blank out missing values, and prefix
# the generic column names with "pr_".
pr_df = (
    vuln_pr_df[['number', 'html', 'title', 'description']]
    .fillna('')
    .rename(columns={'number': 'pr_number', 'html': 'pr_html'})
)

# Search title and description together; str.extract keeps the first match
# per record, which is then lower-cased.
pr_df['keyword'] = (
    (pr_df['title'] + ':::' + pr_df['description'])
    .str.extract(f"({REGEX_VULN})", flags=re.IGNORECASE)[0]
    .str.lower()
)
print(len(pr_df.index))

1950


### Extract Commit

In [62]:
# Keep only the commit fields used below; 'html' becomes 'commit_html',
# mirroring the 'pr_html' column name used for pull requests.
commit_df = (
    vuln_commit_df[['pr_number', 'html', 'sha', 'message', 'changed_files']]
    .rename(columns={'html': 'commit_html'})
)
print(f"Total Unique PR: {len(pd.unique(commit_df['pr_number']))}")
commit_df.head(1)

Total Unique PR: 1949


Unnamed: 0,pr_number,commit_html,sha,message,changed_files
0,59038,https://github.com/tensorflow/tensorflow/commi...,ecabacdde8c7d9531fcab969f144a6ecaaa72ce5,Fix memory leaks in xla::CpuGpuFusionTest,['tensorflow/compiler/xla/tests/cpu_gpu_fusion...


##### Missing Commit Validation

In [63]:
def missing_commit_validation(commit_df, pr_df):
    """
    Return the PR numbers that have no commit recorded for them.

    Parameters
    ----------
    commit_df : DataFrame with a 'pr_number' column (one row per commit).
    pr_df : DataFrame with a 'pr_number' column (one row per pull request).

    Returns
    -------
    list
        Every value of ``pr_df['pr_number']`` that does not occur in
        ``commit_df['pr_number']``, in the order (and with the repetitions)
        in which it appears in ``pr_df``.
    """
    # A set gives constant-time membership tests; looking each PR up in a
    # NumPy array would rescan every commit row per PR.
    commit_pr_numbers = set(commit_df['pr_number'])
    return [number for number in pr_df['pr_number'] if number not in commit_pr_numbers]
# print(f"Commit Missing:" + str(missing_commit_validation(commit_df, pr_df)))

### Group Commit by PR Number

In [64]:
from ast import literal_eval

In [65]:
def nochange(x):
    """Return the values of ``x`` as a plain list, in their original order."""
    return list(x)
def merged_array(x):
    """
    Flatten an iterable of list literals into a single list.

    Each element of ``x`` is a string holding a Python list literal
    (e.g. "['a.py', 'b.py']"); it is parsed with ``literal_eval`` and its
    items are appended, in order, to the result.
    """
    return [item for arraystr in x for item in literal_eval(arraystr)]
# Collapse the commits to one row per PR: links and messages are collected
# into lists, and the per-commit file lists are flattened into one list.
# Columns not named here (e.g. 'sha') are not carried over.
grouped_commit_df = commit_df.groupby("pr_number").agg(
    {
        'commit_html': nochange,
        'message': nochange,
        'changed_files': merged_array,
    }
)
grouped_commit_df.head(1)

Unnamed: 0_level_0,commit_html,message,changed_files
pr_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
673,[https://github.com/tensorflow/tensorflow/comm...,"[PoolAlloc: Remove div by zero, demote WARN->I...",[tensorflow/core/common_runtime/gpu/pool_alloc...


### Merge PR with Commit

In [66]:
# Inner join (the pd.merge default): a PR with no grouped commits is dropped
# from the result.
merged_df = pr_df.merge(grouped_commit_df, how="inner", on="pr_number")
len(merged_df)

1949

## Output File

In [67]:
# Optional sanitising, currently disabled: replace commas and line breaks
# in every cell with spaces.
# merged_df = merged_df.replace(',',' ', regex=True)
# merged_df = merged_df.replace({r'\r\n|\r|\n': ' '}, regex=True)
# Write the merged PR + commit table without the positional index column.
merged_df.to_csv(path_or_buf=VULN_PR_COMMIT_FILE, index=False)