In [1]:
import bq_utils as bqu
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import arrow
import gc

In [2]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../auth/bq_key.json'
gh_archive = bqu.BigQueryHelper(active_project= "githubarchive", 
                                dataset_name = "day")

In [3]:
pd.read_json('https://api.github.com/repos/openshift/origin/issues/6829')

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.

In [115]:
gh_repo_links = ['https://github.com/golang/go', 'https://github.com/hashicorp/consul']

In [116]:
import re

pattern = re.compile(r'.*?github.com/(.*)', re.I)
repo_names = np.array(list(filter(None,[pattern.search(item).group(1) 
                                            if pattern.search(item) else None 
                                               for item in gh_repo_links])))
repo_names[:10], repo_names.shape

(array(['golang/go', 'hashicorp/consul'], dtype='<U16'), (2,))

In [117]:
def add_query_params(query, params_dict):
    for i, j in params_dict.items():
        query = query.replace(i, j)
    return query

In [118]:
specific_dates = [arrow.get('2019-03-05 00:00:00').to('UTC'),
                  arrow.get('2019-03-06 00:00:00').to('UTC'), 
                  arrow.get('2019-03-13 00:00:00').to('UTC')]
specific_days = [dt.format('YYYYMMDD') for dt in specific_dates]
specific_days, len(specific_days)

(['20190305', '20190306', '20190313'], 3)

In [119]:
year_prefix = '20*'
date_list = [item[2:] for item in specific_days]
query_params = {
    '{year_prefix_wildcard}': year_prefix,
    '{year_suffix_month_day}': '('+', '.join(["'"+d+"'" for d in date_list])+')',
    '{repo_names}': '('+', '.join(["'"+r+"'" for r in repo_names])+')'
}

In [120]:
query = """
SELECT  type, count(*)
        FROM `githubarchive.day.{year_prefix_wildcard}`
        WHERE _TABLE_SUFFIX IN {year_suffix_month_day}
        AND repo.name in {repo_names}
        AND type in ('PullRequestEvent', 'IssuesEvent')
        GROUP BY type
"""
query = add_query_params(query, query_params)
gh_archive.estimate_query_size(query)

0.20680708345025778

In [121]:
df = gh_archive.query_to_pandas(query)
df

Unnamed: 0,type,f0_
0,IssuesEvent,169
1,PullRequestEvent,43


In [7]:
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as issue_status,
    JSON_EXTRACT_SCALAR(payload, '$.issue.url') as issue_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as issue_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as issue_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.url') as issue_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.html_url') as issue_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.comments') as comment_count,
    JSON_EXTRACT_SCALAR(payload, '$.issue.id') as issue_id,
    JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
    JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as issue_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.updated_at') as issue_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as issue_closed_at,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as issue_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as issue_body
        
FROM `githubarchive.year.2016`
    WHERE repo.name in ('openshift/origin')
    AND type = 'IssuesEvent'
    AND JSON_EXTRACT_SCALAR(payload, '$.issue.url') = 'https://api.github.com/repos/openshift/origin/issues/6829'
    """

gh_archive.estimate_query_size(query)

694.2594735706225

In [8]:
issues_df = gh_archive.query_to_pandas(query)
issues_df.issue_created_at = pd.to_datetime(issues_df.issue_created_at)
issues_df.issue_updated_at = pd.to_datetime(issues_df.issue_updated_at)
issues_df.issue_closed_at = pd.to_datetime(issues_df.issue_closed_at)
issues_df = issues_df.loc[issues_df.groupby('issue_url').issue_updated_at.idxmax(skipna=False)]
issues_df = issues_df.reset_index(drop=True)
issues_df.shape

(1, 18)

In [9]:
issues_df

Unnamed: 0,repo_name,event_type,actor_id,actor_name,issue_status,issue_api_url,issue_url,issue_creator_name,issue_creator_api_url,issue_creator_url,comment_count,issue_id,issue_number,issue_created_at,issue_updated_at,issue_closed_at,issue_title,issue_body
0,openshift/origin,IssuesEvent,1779249,openshift-bot,closed,https://api.github.com/repos/openshift/origin/...,https://github.com/openshift/origin/issues/6829,elyscape,https://api.github.com/users/elyscape,https://github.com/elyscape,0,128645936,6829,2016-01-25 21:39:25,2016-01-26 03:44:46,2016-01-26 03:44:46,Webhook secrets are vulnerable to timing attack,The webhook secret validation code uses an ins...


In [14]:
df1 = pd.DataFrame()
df1['repository'] = issues_df['repo_name'].tolist()
df1['ecosystem'] = ['golang'] * len(issues_df)
df1['repo_url'] = ['https://github.com/'+repo_name 
                       for repo_name in issues_df['repo_name'].tolist()]
df1['package'] = df1['repository']
df1['cause_type'] = ['Issue'] * len(issues_df)
df1['issue_url'] = issues_df['issue_url']
df1['issue_date'] = issues_df['issue_created_at']
df1['fixed_url'] = 'null'
df1['fixed_date'] = 'null'
df1['commit_url'] = 'null'
df1['commit_date'] = 'null'
df1['identified_url'] = df1['issue_url']
df1['identified_date'] = df1['issue_date']
df1['files_changed'] = 'null'
df1['flagged_score'] = 'null'
df1['flagged_at'] = 'null'
df1['description'] = issues_df['issue_title'].map(str) + ' ' + issues_df['issue_body']
df1.head()

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at,description
0,openshift/origin,golang,https://github.com/openshift/origin,openshift/origin,Issue,https://github.com/openshift/origin/issues/6829,2016-01-25 21:39:25,,,,,https://github.com/openshift/origin/issues/6829,2016-01-25 21:39:25,,,,Webhook secrets are vulnerable to timing attac...


In [20]:
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as pr_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.id') as pr_id,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') as pr_number,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.url') as pr_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.diff_url') as pr_diff_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.patch_url') as pr_patch_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.url') as pr_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.html_url') as pr_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as pr_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.updated_at') as pr_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as pr_closed_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as pr_merged_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') as pr_merged_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.comments') as pr_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.review_comments') as pr_review_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.commits') as pr_commits_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.additions') as pr_additions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.deletions') as pr_deletions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.changed_files') as pr_changed_files_count,    
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as pr_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as pr_body
        
FROM `githubarchive.year.2016`
    WHERE repo.name in ('openshift/origin')
    AND type = 'PullRequestEvent'
        AND JSON_EXTRACT_SCALAR(payload, '$.pull_request.url') = 'https://api.github.com/repos/openshift/origin/pulls/6830'

"""

gh_archive.estimate_query_size(query)

694.2594735706225

In [28]:
prs_df = gh_archive.query_to_pandas(query)
prs_df.pr_created_at = pd.to_datetime(prs_df.pr_created_at)
prs_df.pr_updated_at = pd.to_datetime(prs_df.pr_updated_at)
prs_df.pr_closed_at = pd.to_datetime(prs_df.pr_closed_at)
prs_df.pr_merged_at = pd.to_datetime(prs_df.pr_merged_at)
prs_df = prs_df.loc[prs_df.groupby('pr_url').pr_updated_at.idxmax(skipna=False)]
prs_df = prs_df.reset_index(drop=True)
prs_df.head()

Unnamed: 0,repo_name,event_type,actor_id,actor_name,pr_status,pr_id,pr_number,pr_api_url,pr_url,pr_diff_url,...,pr_merged_at,pr_merged_status,pr_comments_count,pr_review_comments_count,pr_commits_count,pr_additions_count,pr_deletions_count,pr_changed_files_count,pr_title,pr_body
0,openshift/origin,PullRequestEvent,1779249,openshift-bot,closed,57136191,6830,https://api.github.com/repos/openshift/origin/...,https://github.com/openshift/origin/pull/6830,https://github.com/openshift/origin/pull/6830....,...,2016-01-26 03:44:46,True,9,2,1,4,2,2,Webhooks: use constant-time string secret comp...,"For performance reasons, string comparisons in..."


In [29]:
df2 = pd.DataFrame()
df2['repository'] = prs_df['repo_name'].tolist()
df2['ecosystem'] = ['golang'] * len(prs_df)
df2['repo_url'] = ['https://github.com/'+repo_name 
                       for repo_name in prs_df['repo_name'].tolist()]
df2['package'] = df2['repository']
df2['cause_type'] = ['Pull Request'] * len(prs_df)
df2['issue_url'] = 'null'
df2['issue_date'] = 'null'
df2['fixed_url'] = prs_df['pr_url']
df2['fixed_date'] = prs_df['pr_created_at']
df2['commit_url'] = 'null'
df2['commit_date'] = 'null'
df2['identified_url'] = df2['fixed_url']
df2['identified_date'] = df2['fixed_date']
df2['files_changed'] = 'null'
df2['flagged_score'] = 'null'
df2['flagged_at'] = 'null'
df2['description'] = prs_df['pr_title'].map(str) + ' ' + prs_df['pr_body']
df2.head()

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at,description
0,openshift/origin,golang,https://github.com/openshift/origin,openshift/origin,Pull Request,,,https://github.com/openshift/origin/pull/6830,2016-01-25 21:50:37,,,https://github.com/openshift/origin/pull/6830,2016-01-25 21:50:37,,,,Webhooks: use constant-time string secret comp...


In [30]:
df = pd.concat([df1, df2], axis=0, sort=False).sample(frac=1).reset_index(drop=True)
df.shape

(2, 17)

In [31]:
df = df[df['description'] != '']
data_descriptions = df['description'].values

total_docs = len(data_descriptions)
data_desc_input = [[idx, doc, total_docs] for idx, doc in enumerate(data_descriptions)]

In [32]:
%%time

from utils import text_normalizer as tn
from concurrent import futures
import threading


def parallel_preprocessing(idx, doc, total_docs):
    if idx % 5000 == 0 or idx == (total_docs - 1):
        print('{}: working on doc num: {}'.format(threading.current_thread().name,
                                                  idx)
    )
    return tn.pre_process_document(doc)


ex = futures.ThreadPoolExecutor(max_workers=None)
print('preprocessing: starting')
norm_descriptions_map = ex.map(parallel_preprocessing, 
                               [record[0] for record in data_desc_input],
                               [record[1] for record in data_desc_input],
                               [record[2] for record in data_desc_input])
norm_descriptions = list(norm_descriptions_map)

preprocessing: starting
ThreadPoolExecutor-1_0: working on doc num: 0
ThreadPoolExecutor-1_1: working on doc num: 1
CPU times: user 13 ms, sys: 3.45 ms, total: 16.5 ms
Wall time: 17.2 ms


In [33]:
df['norm_description'] = norm_descriptions

In [34]:
from models import security_dl_classifier as sdc

sc = sdc.SecurityClassifier(embedding_size=300, max_length=1000, 
                                        tokenizer_path='../../../tokenizer_vocab/sec_tokenizer_word2idx.pkl')
sc.build_model_architecture()
sc.load_model_weights(model_weights_path='../../../models/model1_sec_nonsec_demo_weights2.h5')

sc_model = sc.get_model()

Loading Tokenizer Vocabulary
Building Model Architecture
Loading Model Weights


In [35]:
norm_descriptions = df['norm_description'].tolist()
sec_docs = sc.prepare_inference_data(norm_descriptions)
sec_docs.shape

(2, 1000)

In [36]:
sec_pred_probs = sc_model.predict(sec_docs, batch_size=2048, verbose=1)



In [37]:
sec_pred_probsr = sec_pred_probs.ravel()
sec_pred_labels = [1 if prob > 0.35 else 0 for prob in sec_pred_probsr]

In [38]:
sec_pred_probsr

array([0.9999995, 0.9999999], dtype=float32)

In [39]:
sec_idx = np.nonzero(sec_pred_labels)
sec_df = df.iloc[sec_idx]
sec_df.shape

(2, 18)

In [40]:
del sc
del sc_model
gc.collect()

4142

In [41]:
from models import cve_dl_classifier as cdc

cc = cdc.CVEClassifier(embedding_size=300, max_length=1000, 
                                        tokenizer_path='../../../tokenizer_vocab/cve_tokenizer_word2idx.pkl')
cc.build_model_architecture()
cc.load_model_weights(model_weights_path='../../../models/model2_cve_noncve_demo_weights.h5')

cc_model = cc.get_model()

Loading Tokenizer Vocabulary
Building Model Architecture
Loading Model Weights


In [42]:
cve_norm_descriptions = sec_df['norm_description'].tolist()
cve_docs = cc.prepare_inference_data(cve_norm_descriptions)
cve_docs.shape

(2, 1000)

In [43]:
cve_doc_lengths = np.array([len(np.nonzero(item)[0]) for item in cve_docs])
cve_docs_to_predict_idx = np.argwhere(cve_doc_lengths >= 10).ravel()
cve_df = sec_df.iloc[cve_docs_to_predict_idx].copy(deep=True).reset_index(drop=True)
cve_norm_descriptions = cve_df['norm_description'].tolist()
cve_docs = cc.prepare_inference_data(cve_norm_descriptions)
cve_docs.shape

(2, 1000)

In [44]:
cve_pred_probs = cc_model.predict(cve_docs, batch_size=2048, verbose=1)



In [45]:
cve_pred_probsr = cve_pred_probs.ravel()
cve_pred_labels = [1 if prob > 0.01 else 0 for prob in cve_pred_probsr]

In [46]:
cve_pred_probsr

array([1.0751285e-05, 6.2123327e-06], dtype=float32)

In [48]:
conf_scores = [0.75, 0.83]
cve_df['flagged_score'] = conf_scores
now = arrow.now()
now = now.format('YYYY-MM-DD HH:mm:ss')
cve_df['flagged_at'] = now

In [49]:
cve_df

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at,description,norm_description
0,openshift/origin,golang,https://github.com/openshift/origin,openshift/origin,Pull Request,,,https://github.com/openshift/origin/pull/6830,2016-01-25 21:50:37,,,https://github.com/openshift/origin/pull/6830,2016-01-25 21:50:37,,0.75,2019-04-04 12:21:54,Webhooks: use constant-time string secret comp...,webhooks use constant time string secret compa...
1,openshift/origin,golang,https://github.com/openshift/origin,openshift/origin,Issue,https://github.com/openshift/origin/issues/6829,2016-01-25 21:39:25,,,,,https://github.com/openshift/origin/issues/6829,2016-01-25 21:39:25,,0.83,2019-04-04 12:21:54,Webhook secrets are vulnerable to timing attac...,webhook secrets are vulnerable to timing attac...


In [50]:
from utils import github_events_linker as gle
import os

In [51]:
event_links = (cve_df[['issue_url', 'fixed_url', 'commit_url']]
                .replace('null', np.nan)
                .fillna(method='bfill',axis=1)
               .iloc[:,0]).tolist()

event_types = cve_df['cause_type'].tolist()
len(event_links), len(event_types)

(2, 2)

In [52]:
%%time

gh_events_linkage_data = gle.generate_github_events_dependency_data(gh_urls=event_links, 
                                                                    gh_event_types=event_types, 
                                                                    github_user='dipanjanS',
                                                                    github_auth=os.environ['GITHUB_TOKEN'])

CPU times: user 171 ms, sys: 10.5 ms, total: 181 ms
Wall time: 7.02 s


In [53]:
gh_events_linkage_df = pd.DataFrame(gh_events_linkage_data)
gh_events_linkage_df.head()

Unnamed: 0,commit_url,files_changed,fixed_url,issue_url
0,[https://github.com/openshift/origin/commit/5a...,[[https://github.com/openshift/origin/pull/683...,[https://github.com/openshift/origin/pull/6830],[]
1,[https://github.com/openshift/origin/commit/03...,[[https://github.com/openshift/origin/pull/565...,[https://github.com/openshift/origin/pull/5657],[https://github.com/openshift/origin/issues/6829]


In [54]:
flatten = lambda l: [item for sublist in l for item in sublist]
gh_events_linkage_df['files_changed'] = [flatten(list_items) 
                                             for list_items 
                                                 in gh_events_linkage_df['files_changed'].tolist()]

In [55]:
import ast
import json

gh_events_linkage_df = (gh_events_linkage_df.applymap(str)
                    .replace(to_replace='[]', value='null')
                    .applymap(lambda x: x if x == 'null' 
                                          else json.dumps(ast.literal_eval(x))))
gh_events_linkage_df.head()

Unnamed: 0,commit_url,files_changed,fixed_url,issue_url
0,"[""https://github.com/openshift/origin/commit/5...","[""https://github.com/openshift/origin/pull/683...","[""https://github.com/openshift/origin/pull/6830""]",
1,"[""https://github.com/openshift/origin/commit/0...","[""https://github.com/openshift/origin/pull/565...","[""https://github.com/openshift/origin/pull/5657""]","[""https://github.com/openshift/origin/issues/6..."


In [56]:
results_df = cve_df.copy(deep=True).reset_index(drop=True)
results_df['issue_url'] = gh_events_linkage_df['issue_url']
results_df['fixed_url'] = gh_events_linkage_df['fixed_url']
results_df['commit_url'] = gh_events_linkage_df['commit_url']
results_df['files_changed'] = gh_events_linkage_df['files_changed']
results_df['identified_url'] = results_df.apply(lambda row: row['issue_url'] if row['cause_type'] == 'Issue' 
                                                 else row['fixed_url'] 
                                                     if row['cause_type'] == 'Pull Request'
                                                         else 'null', axis=1)

In [73]:
cols = ['repository', 'ecosystem', 'repo_url', 'package', 'cause_type', 'issue_url', 'issue_date', 
        'fixed_url', 'fixed_date',  'commit_url', 'commit_date', 'identified_url', 'identified_date', 
        'files_changed', 'flagged_score', 'flagged_at']
results_df = results_df[cols]
results_df.head()

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at
0,openshift/origin,golang,https://github.com/openshift/origin,openshift/origin,Pull Request,,,"[""https://github.com/openshift/origin/pull/6830""]",2016-01-25 21:50:37,"[""https://github.com/openshift/origin/commit/5...",,"[""https://github.com/openshift/origin/pull/6830""]",2016-01-25 21:50:37,"[""https://github.com/openshift/origin/pull/683...",0.75,2019-04-04 12:21:54
1,openshift/origin,golang,https://github.com/openshift/origin,openshift/origin,Issue,"[""https://github.com/openshift/origin/issues/6...",2016-01-25 21:39:25,,,,,"[""https://github.com/openshift/origin/issues/6...",2016-01-25 21:39:25,,0.83,2019-04-04 12:21:54


In [72]:
results_df.loc[1, 'files_changed'] = 'null'

In [74]:
results_df.to_csv('../../../data/os-kube_gh-oscves.csv', sep=';', header=False, index=False)

In [75]:
import re

infile = '../../../data/os-kube_gh-oscves.csv'
lines = []
with open(infile, "r") as f:
    for line in f:
        line = re.sub(r'""', '"', line)
        line = re.sub(r'"\[', '[', line)
        line = re.sub(r'\]"', ']', line)
        lines.append(line)

In [76]:
lines[-1] = lines[-1].strip('\n')

In [77]:
with open(infile, 'w') as f:
    f.writelines(lines)