In [None]:
import pandas as pd
import json
from guesslang import Guess
from tqdm.auto import tqdm
import pickle
import numpy
import os
import subprocess
from sklearn.metrics import classification_report
from gensim.models.word2vec import Word2Vec
from utils.functions import parse
import warnings
import nltk
import torch
from numpyencoder import NumpyEncoder

warnings.filterwarnings('ignore') 



# Comparison for not vulnerable
## Static analysis

In [3]:
with open('ReVeal/non-vulnerables.json','r') as f:
    data = json.loads(f.read())
df_non_vulnerable = pd.json_normalize(data)

In [4]:
df_non_vulnerable

Unnamed: 0,code,hash,project,size
0,"int curl_mvsprintf ( char * buffer , const cha...",-8228664527580018723,debian,6
1,"char * curl_mvaprintf ( const char * format , ...",-8228664527580018723,debian,18
2,"int curl_mprintf ( const char * format , ... )...",-8228664527580018723,debian,8
3,static __inline __uint16_t __uint16_identity (...,-8228664527580018723,debian,3
4,extern rtype gnu_dev_ ## name proto __THROW __...,-8228664527580018723,debian,2
...,...,...,...,...
20489,IN_PROC_BROWSER_TEST_F ( ExtensionMessageBubbl...,-5300415683778646743,chrome,3
20490,IN_PROC_BROWSER_TEST_F ( ExtensionMessageBubbl...,-5300415683778646743,chrome,3
20491,IN_PROC_BROWSER_TEST_F ( ExtensionMessageBubbl...,-5300415683778646743,chrome,3
20492,IN_PROC_BROWSER_TEST_F ( ExtensionMessageBubbl...,-5300415683778646743,chrome,3


In [5]:
for index in tqdm(range(df_non_vulnerable.shape[0])):
    file_name = f"{index}.cpp"
    with open("data/code/"+file_name, "w", encoding="utf-8") as f:
        f.write(df_non_vulnerable.iloc[index, 0])

HBox(children=(FloatProgress(value=0.0, max=20494.0), HTML(value='')))




In [6]:
y_true = numpy.zeros(df_non_vulnerable.shape[0], dtype=int)

In [7]:
len(y_true)

20494

In [17]:
result_static_analysis = dict()
index = 0

for file in tqdm(os.listdir('data/code')):
    try:
        i = int(file.split('.')[0])
        proc = subprocess.run(["cppcheck", "--enable=style", "--template={cwe}:{file}:{line}:{severity}:{code}:{message}", "data/code/"+file], capture_output=True)
        result = proc.stderr.decode('utf-8')
        if result:
            items = result.split('\r\n')
            items = list(filter(None, items))

            for x, y in zip(*[iter(items)] * 2):
                row = x + y
                row = row.split(':')
                result_static_analysis[index]={'code': df_non_vulnerable.iloc[i, :]['code'], 'cwe':row[0], 'file': file, 'line': row[2], 'severity': row[3],
                                      'line_code': row[4], 'message': row[5], 'is_vulnerable': 1}
                index = index + 1
                break; #take only first error
        else:
            result_static_analysis[index]={'code': df_non_vulnerable.iloc[i, :]['code'], 'cwe':0, 'file': file, 'line': '', 'severity': '',
                                      'line_code': '', 'message': '', 'is_vulnerable': 0}
            index = index + 1
    except UnicodeDecodeError as e:
        print(e)
        print(file)
        break;

HBox(children=(FloatProgress(value=0.0, max=20494.0), HTML(value='')))




In [18]:
df_result_static_analysis = pd.DataFrame.from_dict(result_static_analysis, orient='index')
df_result_static_analysis

Unnamed: 0,code,cwe,file,line,severity,line_code,message,is_vulnerable
0,"int curl_mvsprintf ( char * buffer , const cha...",0,0.cpp,,,,,0
1,"char * curl_mvaprintf ( const char * format , ...",0,1.cpp,,,,,0
2,extern rtype gnu_dev_ ## name proto __THROW __...,0,10.cpp,,,,,0
3,"int i2d_ ## name ( type * a , unsigned char * ...",0,100.cpp,40,error,} ^,Unmatched '}'. Configuration,1
4,static void curses_kill_connections ( void ) {...,0,1000.cpp,,,,,0
...,...,...,...,...,...,...,...,...
20489,static void dissect_zcl_color_control_move_to_...,0,9995.cpp,,,,,0
20490,static void dissect_zcl_color_control_move_hue...,0,9996.cpp,,,,,0
20491,static void dissect_zcl_color_control_move_to_...,0,9997.cpp,,,,,0
20492,static void dissect_zcl_color_control_move_col...,0,9998.cpp,,,,,0


In [19]:
df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==0]

Unnamed: 0,code,cwe,file,line,severity,line_code,message,is_vulnerable
0,"int curl_mvsprintf ( char * buffer , const cha...",0,0.cpp,,,,,0
1,"char * curl_mvaprintf ( const char * format , ...",0,1.cpp,,,,,0
2,extern rtype gnu_dev_ ## name proto __THROW __...,0,10.cpp,,,,,0
4,static void curses_kill_connections ( void ) {...,0,1000.cpp,,,,,0
5,void dissect_zcl_color_control_attr_data ( pro...,0,10000.cpp,,,,,0
...,...,...,...,...,...,...,...,...
20489,static void dissect_zcl_color_control_move_to_...,0,9995.cpp,,,,,0
20490,static void dissect_zcl_color_control_move_hue...,0,9996.cpp,,,,,0
20491,static void dissect_zcl_color_control_move_to_...,0,9997.cpp,,,,,0
20492,static void dissect_zcl_color_control_move_col...,0,9998.cpp,,,,,0


In [20]:
df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==1]

Unnamed: 0,code,cwe,file,line,severity,line_code,message,is_vulnerable
3,"int i2d_ ## name ( type * a , unsigned char * ...",0,100.cpp,40,error,} ^,Unmatched '}'. Configuration,1
17,static int dissect_zbee_zcl_color_control ( tv...,398,10011.cpp,8,style,zcl = ( zbee_zcl_packet * ) data ; ^,C-style pointer casting,1
34,int qemuMonitorTextGetPtyPaths ( qemuMonitorPt...,0,10027.cpp,1,error,int qemuMonitorTextGetPtyPaths ( qemuMonitorPt...,Unmatched '{'. Configuration,1
46,int qemuMonitorTextIOProcess ( qemuMonitorPtr ...,758,10038.cpp,68,error,} ^,Found a exit path from function with non-void ...,1
50,int qemuMonitorTextSetVNCPassword ( qemuMonito...,398,10041.cpp,3,style,if ( qemuMonitorTextCommandWithHandler ( mon ...,C-style pointer casting,1
...,...,...,...,...,...,...,...,...
20475,static picture_t * DecodeBlock ( decoder_t * p...,398,9982.cpp,50,style,p_pic = ( ( struct picture_free_t * ) p_schro...,C-style pointer casting,1
20478,static int OpenEncoder ( vlc_object_t * p_this...,0,9985.cpp,61,error,"SCHRO_SET_ENUM ( enc_rate_control_list , ENC_...",There is an unknown macro here somewhere. Conf...,1
20480,static void CloseDecoder ( vlc_object_t * p_th...,398,9987.cpp,2,style,decoder_t * p_dec = ( decoder_t * ) p_this ; ...,C-style pointer casting,1
20483,static void join_print_po ( struct packet_obje...,398,999.cpp,5,style,if ( GBL_OPTIONS -> regex && regexec ( GBL_OP...,C-style pointer casting,1


In [21]:
print("Static analysis report")
print(classification_report(y_true, numpy.array(df_result_static_analysis['is_vulnerable'])))

Static analysis report
              precision    recall  f1-score   support

           0       1.00      0.65      0.79     20494
           1       0.00      0.00      0.00         0

    accuracy                           0.65     20494
   macro avg       0.50      0.33      0.40     20494
weighted avg       1.00      0.65      0.79     20494



In [22]:
df_result_static_analysis.to_csv('ReVeal/df_result_static_analysis_not_vul.csv')

## Machine Learning

In [23]:
proc = subprocess.Popen(["echo", "%BASEDIR%"], stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
path_to_joern = out.decode('utf-8').split('\r')[0]

In [24]:
subprocess.run([path_to_joern + "/joern-parse.bat", "data/code"],
                                      stdout=subprocess.PIPE, text=True, check=True)

CompletedProcess(args=['C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/joern-parse.bat', 'data/code'], returncode=0, stdout='\nC:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection>java -cp "C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/extensions/joern-fuzzyc/build/libs/joern-fuzzyc.jar;C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/extensions/jpanlib/build/libs/jpanlib.jar;C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/octopus/lib/*" tools.parser.ParserMain -outformat csv -outdir parsed data/code \ndata\\code\\0.cpp\ndata\\code\\1.cpp\ndata\\code\\10.cpp\ndata\\code\\100.cpp\ndata\\code\\1000.cpp\ndata\\code\\10000.cpp\ndata\\code\\10001.cpp\ndata\\code\\10002.cpp\ndata\\code\\10003.cpp\ndata\\code\\10004.cpp\ndata\\cod

In [25]:
df_non_vulnerable = df_non_vulnerable.rename(columns={'code':'tokens'})

df_non_vulnerable.head()

Unnamed: 0,tokens,hash,project,size
0,"int curl_mvsprintf ( char * buffer , const cha...",-8228664527580018723,debian,6
1,"char * curl_mvaprintf ( const char * format , ...",-8228664527580018723,debian,18
2,"int curl_mprintf ( const char * format , ... )...",-8228664527580018723,debian,8
3,static __inline __uint16_t __uint16_identity (...,-8228664527580018723,debian,3
4,extern rtype gnu_dev_ ## name proto __THROW __...,-8228664527580018723,debian,2


In [26]:
for index in tqdm(range(df_non_vulnerable.shape[0])):
    df_non_vulnerable['tokens'][index] = parse.tokenizer(df_non_vulnerable['tokens'][index])

HBox(children=(FloatProgress(value=0.0, max=20494.0), HTML(value='')))




In [27]:
df_non_vulnerable

Unnamed: 0,tokens,hash,project,size
0,"[int, FUN1, (, char, *, VAR1, ,, const, char, ...",-8228664527580018723,debian,6
1,"[char, *, FUN1, (, const, char, *, VAR1, ,, va...",-8228664527580018723,debian,18
2,"[int, FUN1, (, const, char, *, VAR1, ,, ., ., ...",-8228664527580018723,debian,8
3,"[static, __inline, __uint16_t, FUN1, (, __uint...",-8228664527580018723,debian,3
4,"[extern, rtype, VAR1, ##, name, proto, __THROW...",-8228664527580018723,debian,2
...,...,...,...,...
20489,"[FUN1, (, VAR1, ,, VAR2, ), {, FUN2, (, ), ;, }]",-5300415683778646743,chrome,3
20490,"[FUN1, (, VAR1, ,, VAR2, ), {, FUN2, (, ), ;, }]",-5300415683778646743,chrome,3
20491,"[FUN1, (, VAR1, ,, VAR2, ), {, FUN2, (, ), ;, }]",-5300415683778646743,chrome,3
20492,"[FUN1, (, VAR1, ,, VAR2, ), {, FUN2, (, ), ;, }]",-5300415683778646743,chrome,3


In [28]:
df_non_vulnerable.to_csv("ReVeal/df_non_vulnerable_tokenized.csv", index=False)

In [None]:
codes = pd.read_csv("ReVeal/df_non_vulnerable_tokenized.csv")
print(codes.shape)
w2vmodel = Word2Vec(min_count=1)
w2vmodel.build_vocab(sentences=tqdm(codes.tokens))
w2vmodel.train(sentences=tqdm(codes.tokens), total_examples=w2vmodel.corpus_count, epochs=1)

In [30]:
print("Saving w2vmodel.")
w2vmodel.save(f"ReVeal/w2v_model_not_vul")

Saving w2vmodel.


In [31]:
EDGE_TYPES = {
    "CONTROLS": 0,
    "DECLARES": 1,
    "DEF": 2,
    "DOM": 3,
    "FLOWS_TO": 4,
    "IS_AST_PARENT": 5,
    "IS_CLASS_OF": 6,
    "IS_FILE_OF": 7,
    "IS_FUNCTION_OF_AST": 8,
    "IS_FUNCTION_OF_CFG": 9,
    "POST_DOM": 10,
    "REACHES": 11,
    "USE": 12,
}

TYPE_MAP = {
    "AndExpression": 1,
    "Sizeof": 2,
    "Identifier": 3,
    "ForInit": 4,
    "ReturnStatement": 5,
    "SizeofOperand": 6,
    "InclusiveOrExpression": 7,
    "PtrMemberAccess": 8,
    "AssignmentExpression": 9,
    "ParameterList": 10,
    "IdentifierDeclType": 11,
    "SizeofExpression": 12,
    "SwitchStatement": 13,
    "IncDec": 14,
    "Function": 15,
    "BitAndExpression": 16,
    "UnaryExpression": 17,
    "DoStatement": 18,
    "GotoStatement": 19,
    "Callee": 20,
    "OrExpression": 21,
    "ShiftExpression": 22,
    "Decl": 23,
    "CFGErrorNode": 24,
    "WhileStatement": 25,
    "InfiniteForNode": 26,
    "RelationalExpression": 27,
    "CFGExitNode": 28,
    "Condition": 29,
    "BreakStatement": 30,
    "CompoundStatement": 31,
    "UnaryOperator": 32,
    "CallExpression": 33,
    "CastExpression": 34,
    "ConditionalExpression": 35,
    "ArrayIndexing": 36,
    "PostIncDecOperationExpression": 37,
    "Label": 38,
    "ArgumentList": 39,
    "EqualityExpression": 40,
    "ReturnType": 41,
    "Parameter": 42,
    "Argument": 43,
    "Symbol": 44,
    "ParameterType": 45,
    "Statement": 46,
    "AdditiveExpression": 47,
    "PrimaryExpression": 48,
    "DeclStmt": 49,
    "CastTarget": 50,
    "IdentifierDeclStatement": 51,
    "IdentifierDecl": 52,
    "CFGEntryNode": 53,
    "TryStatement": 54,
    "Expression": 55,
    "ExclusiveOrExpression": 56,
    "ClassDef": 57,
    "File": 58,
    "UnaryOperationExpression": 59,
    "ClassDefStatement": 60,
    "FunctionDef": 61,
    "IfStatement": 62,
    "MultiplicativeExpression": 63,
    "ContinueStatement": 64,
    "MemberAccess": 65,
    "ExpressionStatement": 66,
    "ForStatement": 67,
    "InitializerList": 68,
    "ElseStatement": 69,
    'ThrowStatement': 70,
    'CFGExceptionNode': 71,
    'CatchStatement': 72,
    'CatchList': 73
}

TYPE_MAP_OH = numpy.eye(len(TYPE_MAP))

In [32]:
def create_input_from_cpg(filepath, w2v, etypemap = EDGE_TYPES):
    try:
        nodes = pd.read_csv(filepath+"/nodes.csv", sep="\t", engine='python')
        edges = pd.read_csv(filepath+"/edges.csv", sep="\t", engine='python')
    except Exception as E:
        print(E)
        print(filepath)
        return [None, None]
    if len(nodes) == 0 or len(edges) == 0:
        print("Empty node / edge CSV")
        return [None, None]
    n, e = format_node_edges(nodes, edges)
    e = e[e.type != "IS_FILE_OF"]
    e = e[e.type.isin(etypemap)]
    
    node_id_dict = (
    n.reset_index(drop=True)
    .reset_index()[["key", "index"]]
    .set_index("key")
    .to_dict()["index"]
    )
    e = e[(e.start.isin(n.key)) & (e.end.isin(n.key))].copy()
    n.key = n.key.apply(lambda x: node_id_dict[x])
    e.start = e.start.apply(lambda x: node_id_dict[x])
    e.end = e.end.apply(lambda x: node_id_dict[x])
    
    n.code = n.code.fillna("")
    n.code = n.code.apply(embed_code, w2v=w2v).to_list()
    n.type = n.type.apply(one_hot_encode_type)
    n = n[n.type != -1]
    
    src = e["start"].to_numpy()
    dst = e["end"].to_numpy()
    nnodes = len(n)
    nfeat = torch.tensor([list(i.type) + list(i.code) for i in n.itertuples()]).float()
    etype = torch.tensor([etypemap[i] for i in e.type]).int()
    return n, e
    
    
    
def format_node_edges(n, e):
    """Format node and edges into appropriate input form."""
    nodes = n.copy()
    edges = e.copy()
    nodes.key -= 1
    edges.start -= 1
    edges.end -= 1
    node_key_type_map = nodes[["key", "type"]].set_index("key").to_dict()["type"]
    node_key_code_map = nodes[["key", "code"]].set_index("key").to_dict()["code"]
    edges["src"] = edges.start.apply(lambda x: node_key_type_map[x])
    edges["dest"] = edges.end.apply(lambda x: node_key_type_map[x])
    edges["src_feat"] = edges.start.apply(lambda x: node_key_code_map[x])
    edges["dest_feat"] = edges.end.apply(lambda x: node_key_code_map[x])
    return nodes, edges

def one_hot_encode_type(type_):
    try:
        return TYPE_MAP_OH[TYPE_MAP[type_] - 1].tolist()
    except:
        return -1
    
def embed_code(code: str, w2v):
    """Embed code using given word2vec model by averaging code embeddings."""
    code = nltk.word_tokenize(code.strip())
    if len(code) == 0:
        return numpy.zeros(100)
    wvecs = []
    for word in code:
        try:
            wvecs.append(w2v.wv[word])
        except:
            wvecs.append(numpy.zeros(100))
    return numpy.array(sum(numpy.array(wvecs)) / len(code), dtype="float32")

In [34]:
base_dir = "parsed/data/code/"
index_ben = 0
index_vul = 0
w2vmodel = Word2Vec.load("ReVeal/w2v_model_not_vul")


for file in tqdm(os.listdir(base_dir)):  
    flaw_loc = 0
    n, e = create_input_from_cpg(base_dir+file, w2vmodel)

    if n is not None and e is not None:
        edges_list = list()
        targets = list()
        map_line = list()

        for index_edge in range(e.shape[0]):
            if index_edge in e.index:
                type_edge = EDGE_TYPES[e['type'][index_edge]]

                edges_list.append([e['start'][index_edge], type_edge, e['end'][index_edge]])


        targets.append([0])


        for index_node, row in n.iterrows():
            if pd.isnull(n['location'][index_node]):
                map_line.append(0)
            else:
                map_line.append(int(n['location'][index_node].split(':')[0]))


        dictionary = {"node_features": n.code.tolist(), "graph": edges_list, "targets": targets, 'map_line': map_line, 
                      'index': [int(file.split('.')[0]), int(file.split('.')[0])]}

        with open("ReVeal/input_not_vuln/eval/"+file.split('.')[0]+".txt", 'w') as file:
            json.dump(dictionary, file, cls=NumpyEncoder)
            file.close()

HBox(children=(FloatProgress(value=0.0, max=20496.0), HTML(value='')))

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

In [35]:
%run VELVET\src\run_model.py ReVeal/input_not_vuln VELVET\src\config.yml -t True -x gnn -y transformer -p gnn/log.txt -q transformer/log.txt

Training with configuration: {'model': {'configuration': 'ggnn', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'model_2': {'configuration': 'transformer', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'data': {'max_batch_size': 1250, 'max_buffer_size': 100, 'max_node_size': 1024, 'valid_interval': 2500, 'max_valid_samples': 25000, 'max_token_length': 10, 'w2v_dimension': 256}, 'training': {'max_steps': 50, 'print_freq': 500, 'learning_rate': 

In [37]:
results_ml_for_not_vul = pd.read_csv('ReVeal/results_ml_not_vul.csv')

In [38]:
results_ml_for_not_vul

Unnamed: 0,index_code,id_post,is_vul,vul_line
0,10077,10077,0,0
1,1015,1015,0,0
2,10083,10083,0,0
3,1017,1017,0,0
4,10,10,0,0
...,...,...,...,...
18224,7573,7573,0,0
18225,9488,9488,0,0
18226,8208,8208,0,0
18227,8990,8990,0,0


In [39]:
y_true = numpy.zeros(results_ml_for_not_vul.shape[0], dtype=int)
y_true

array([0, 0, 0, ..., 0, 0, 0])

In [43]:
print("Machine Learning report")
print(classification_report(y_true, numpy.array(results_ml_for_not_vul['is_vul'])))

Machine Learning report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18229
           1       0.00      0.00      0.00         0

    accuracy                           1.00     18229
   macro avg       0.50      0.50      0.50     18229
weighted avg       1.00      1.00      1.00     18229



In [6]:
%run VELVET/src/run_model.py ReVeal/input_not_vuln VELVET/src/config.yml -e True -m transformer -l transformer/log.txt

Training with configuration: {'model': {'configuration': 'ggnn', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'model_2': {'configuration': 'transformer', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'data': {'max_batch_size': 1250, 'max_buffer_size': 100, 'max_node_size': 1024, 'valid_interval': 2500, 'max_valid_samples': 25000, 'max_token_length': 10, 'w2v_dimension': 256}, 'training': {'max_steps': 50, 'print_freq': 500, 'learning_rate': 

In [8]:
results_ml_for_not_vul = pd.read_csv('ReVeal/results_ml_not_vul_transformer.csv')

In [9]:
results_ml_for_not_vul

Unnamed: 0,index_code,id_post,is_vul,vul_line
0,10123,10123,0,0
1,10087,10087,1,0
2,10094,10094,0,0
3,10119,10119,0,0
4,10046,10046,0,0
...,...,...,...,...
18224,7903,7903,0,0
18225,7019,7019,0,0
18226,7681,7681,0,0
18227,9716,9716,0,0


In [12]:
y_true = numpy.zeros(results_ml_for_not_vul.shape[0], dtype=int)
y_true

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
print("Machine Learning report")
print(classification_report(y_true, numpy.array(results_ml_for_not_vul['is_vul'])))

Machine Learning report
              precision    recall  f1-score   support

           0       1.00      0.68      0.81     18229
           1       0.00      0.00      0.00         0

    accuracy                           0.68     18229
   macro avg       0.50      0.34      0.41     18229
weighted avg       1.00      0.68      0.81     18229



## Both approaches predict non vulnerable

In [20]:
df_result_static_analysis = pd.read_csv('ReVeal/df_result_static_analysis_not_vul.csv')
df_non_vulnerable = pd.read_csv("ReVeal/df_non_vulnerable_tokenized.csv")

In [21]:
df_result_static_analysis_not_vul_predicted = df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==0]
predicted_by_static = [int(x.split('.')[0]) for x in df_result_static_analysis_not_vul_predicted['file']]
df_result_ml_not_vul_predicted = results_ml_for_not_vul.loc[results_ml_for_not_vul['is_vul']==0]
predicted_by_ml = [x for x in df_result_ml_not_vul_predicted['index_code']]

print(len(predicted_by_static))
print(len(predicted_by_ml))

13418
12433


In [22]:
# index files of df_non_vulnerable_only_c predicted not_vulnerable from both approaches
indexes = set(predicted_by_static).intersection(predicted_by_ml)
len(list(indexes))

7425

In [23]:
df_non_vulnerable.iloc[list(indexes)]

Unnamed: 0,tokens,hash,project,size
1,"['char', '*', 'FUN1', '(', 'const', 'char', '*...",-8228664527580018723,debian,18
3,"['static', '__inline', '__uint16_t', 'FUN1', '...",-8228664527580018723,debian,3
4,"['extern', 'rtype', 'VAR1', '##', 'name', 'pro...",-8228664527580018723,debian,2
5,"['static', '__inline', '__uint64_t', 'FUN1', '...",-8228664527580018723,debian,3
7,"['int', 'FUN1', '(', 'char', '*', 'VAR1', ',',...",-8228664527580018723,debian,9
...,...,...,...,...
20438,"['FUN1', '(', 'VAR1', ',', 'VAR2', ')', '{', '...",8073241806104522127,chrome,28
20441,"['FUN1', '(', 'VAR1', ',', 'VAR2', ')', '{', '...",8073241806104522127,chrome,38
20442,"['FUN1', '(', 'VAR1', ',', 'VAR2', ')', '{', '...",8073241806104522127,chrome,22
20463,"['FUN1', '(', 'VAR1', ',', 'VAR2', ')', '{', '...",7484321273243811670,chrome,33


## Both approaches predict vulnerable

In [24]:
df_result_static_analysis_not_vul_not_predicted = df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==1]
predicted_by_static = [int(x.split('.')[0]) for x in df_result_static_analysis_not_vul_not_predicted['file']]
df_result_ml_not_vul_not_predicted = results_ml_for_not_vul.loc[results_ml_for_not_vul['is_vul']==1]
predicted_by_ml = [x for x in df_result_ml_not_vul_not_predicted['index_code']]

print(len(predicted_by_static))
print(len(predicted_by_ml))

7076
5796


In [25]:
# index files of df_non_vulnerable_only_c predicted_vulnerable from both approaches
indexes = set(predicted_by_static).intersection(predicted_by_ml)
len(list(indexes))

1403

In [26]:
df_non_vulnerable.iloc[list(indexes)]

Unnamed: 0,tokens,hash,project,size
16392,"['static', 'int', 'FUN1', '(', 'VAR1', '*', 'V...",5917118798851365454,debian,40
8203,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-4055702019813176658,debian,36
12,"['static', 'int', 'FUN1', '(', 'int', 'VAR1', ...",-8228664527580018723,debian,7
21,"['static', 'int', 'FUN1', '(', 'int', 'VAR1', ...",-8228664527580018723,debian,11
8213,"['static', 'guint32', 'FUN1', '(', 'VAR1', '*'...",-4055702019813176658,debian,27
...,...,...,...,...
8125,"['static', 'bool', 'FUN1', '(', 'const', 'stru...",2335116444795821310,debian,20
16319,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-8884965407551053630,debian,36
16365,"['static', 'int', 'FUN1', '(', 'struct', 'VAR1...",-8575765905118000367,debian,11
16367,"['int', 'FUN1', '(', 'struct', 'VAR1', '*', 'V...",-8575765905118000367,debian,19


## Static analysis predict non vulnerable, ML not

In [27]:
index_saved = []

for file in tqdm(df_result_static_analysis['file']):
    index = int(file.split('.')[0])
    index_row = df_result_static_analysis.loc[df_result_static_analysis['file'] == file].index

    row_ml_to_check = results_ml_for_not_vul.loc[results_ml_for_not_vul['index_code']==index].index.tolist()
    if row_ml_to_check:
        if df_result_static_analysis['is_vulnerable'][index_row[0]]==0 and  results_ml_for_not_vul['is_vul'][row_ml_to_check[0]]==1:
                index_saved.append(index)

HBox(children=(FloatProgress(value=0.0, max=20494.0), HTML(value='')))




In [28]:
df_non_vulnerable.iloc[list(index_saved)]

Unnamed: 0,tokens,hash,project,size
0,"['int', 'FUN1', '(', 'char', '*', 'VAR1', ',',...",-8228664527580018723,debian,6
1000,"['static', 'void', 'FUN1', '(', 'void', ')', '...",8895005875911486570,debian,5
10002,"['void', 'FUN1', '(', 'void', ')', '{', 'FUN2'...",-7236893719053583356,debian,3
10003,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-7236893719053583356,debian,6
10006,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-7236893719053583356,debian,9
...,...,...,...,...
9994,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-7236893719053583356,debian,14
9995,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-7236893719053583356,debian,8
9996,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-7236893719053583356,debian,12
9998,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-7236893719053583356,debian,10


## ML predict non vulnerable, static analysis not

In [29]:
index_saved = []

for file in tqdm(df_result_static_analysis['file']):
    index = int(file.split('.')[0])
    index_row = df_result_static_analysis.loc[df_result_static_analysis['file'] == file].index

    row_ml_to_check = results_ml_for_not_vul.loc[results_ml_for_not_vul['index_code']==index].index.tolist()
    if row_ml_to_check:
        if results_ml_for_not_vul['is_vul'][row_ml_to_check[0]]==0 and df_result_static_analysis['is_vulnerable'][index_row[0]]==1:
                index_saved.append(index)

HBox(children=(FloatProgress(value=0.0, max=20494.0), HTML(value='')))




In [30]:
df_non_vulnerable.iloc[list(index_saved)]

Unnamed: 0,tokens,hash,project,size
100,"['int', 'VAR1', '##', 'FUN1', '(', 'VAR2', '*'...",3599206110384554647,debian,97
10011,"['static', 'int', 'FUN1', '(', 'VAR1', '*', 'V...",-7236893719053583356,debian,58
10027,"['int', 'FUN1', '(', 'qemuMonitorPtr', 'VAR1',...",378220832151730865,debian,44
10038,"['int', 'FUN1', '(', 'qemuMonitorPtr', 'mon', ...",378220832151730865,debian,71
10041,"['int', 'FUN1', '(', 'qemuMonitorPtr', 'VAR1',...",378220832151730865,debian,9
...,...,...,...,...
9975,"['gboolean', 'FUN1', '(', 'VAR1', '*', 'VAR2',...",-1381237731082039179,debian,15
9976,"['static', 'int', 'FUN1', '(', 'VAR1', '*', 'V...",2125998987781803371,debian,24
9978,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",2125998987781803371,debian,9
9982,"['static', 'VAR1', '*', 'FUN1', '(', 'VAR2', '...",2125998987781803371,debian,68


# Comparison for vulnerable
## Static analysis

In [59]:
with open('ReVeal/vulnerables.json','r') as f:
    data = json.loads(f.read())
df_vulnerable = pd.json_normalize(data)

In [60]:
df_vulnerable

Unnamed: 0,code,hash,project,size
0,"static int alloc_addbyter ( int output , FILE ...",-8228664527580018723,debian,26
1,static int jbig2_word_stream_buf_get_next_word...,-1768728156572022708,debian,16
2,"int jbig2_text_region ( Jbig2Ctx * ctx , Jbig2...",4991783214228010911,debian,343
3,int jbig2_decode_text_region ( Jbig2Ctx * ctx ...,4991783214228010911,debian,337
4,int jbig2_immediate_generic_region ( Jbig2Ctx ...,8458201026073747179,debian,64
...,...,...,...,...
2235,"TEST_F ( AutocompleteResultTest , SortAndCullR...",8073241806104522127,chrome,37
2236,"TEST_F ( AutocompleteResultTest , SortAndCullW...",8073241806104522127,chrome,35
2237,"TEST_F ( AutocompleteResultTest , SortAndCullW...",8073241806104522127,chrome,48
2238,"TEST_F ( AutocompleteResultTest , SortAndCullD...",8073241806104522127,chrome,40


In [62]:
for index in tqdm(range(df_vulnerable.shape[0])):
    file_name = f"{index}.cpp"
    with open("data/code/"+file_name, "w", encoding="utf-8") as f:
        f.write(df_vulnerable.iloc[index, 0])

HBox(children=(FloatProgress(value=0.0, max=2240.0), HTML(value='')))




In [63]:
y_true = numpy.ones(df_vulnerable.shape[0], dtype=int)

In [64]:
len(y_true)

2240

In [65]:
result_static_analysis = dict()
index = 0

for file in tqdm(os.listdir('data/code')):
    try:
        i = int(file.split('.')[0])
        proc = subprocess.run(["cppcheck", "--enable=style", "--template={cwe}:{file}:{line}:{severity}:{code}:{message}", "data/code/"+file], capture_output=True)
        result = proc.stderr.decode('utf-8')
        if result:
            items = result.split('\r\n')
            items = list(filter(None, items))

            for x, y in zip(*[iter(items)] * 2):
                row = x + y
                row = row.split(':')
                result_static_analysis[index]={'code': df_vulnerable.iloc[i, :]['code'], 'cwe':row[0], 'file': file, 'line': row[2], 'severity': row[3],
                                      'line_code': row[4], 'message': row[5], 'is_vulnerable': 1}
                index = index + 1
                break; #take only first error
        else:
            result_static_analysis[index]={'code': df_vulnerable.iloc[i, :]['code'], 'cwe':0, 'file': file, 'line': '', 'severity': '',
                                      'line_code': '', 'message': '', 'is_vulnerable': 0}
            index = index + 1
    except UnicodeDecodeError:
        print(file)
        break;

HBox(children=(FloatProgress(value=0.0, max=2240.0), HTML(value='')))




In [66]:
df_result_static_analysis = pd.DataFrame.from_dict(result_static_analysis, orient='index')
df_result_static_analysis

Unnamed: 0,code,cwe,file,line,severity,line_code,message,is_vulnerable
0,"static int alloc_addbyter ( int output , FILE ...",398,0.cpp,2,style,struct asprintf * infop = ( struct asprintf *...,C-style pointer casting,1
1,static int jbig2_word_stream_buf_get_next_word...,398,1.cpp,2,style,Jbig2WordStreamBuf * z = ( Jbig2WordStreamBuf...,C-style pointer casting,1
2,"static void jbig2_set_bits ( byte * line , int...",398,10.cpp,2,style,"int a0 , a1 , b0 , b1 , a ; ...",The scope of the variable 'a' can be reduced.,1
3,void name ## _free ( type * a ) ;\n # define D...,0,100.cpp,35,error,} ^,Unmatched '}'. Configuration,1
4,static char * * create_argv_command ( struct r...,398,1000.cpp,4,style,const char * program ; ^,The scope of the variable 'program' can be red...,1
...,...,...,...,...,...,...,...,...
2235,static guint32 dissect_minivideopacket ( tvbuf...,398,995.cpp,5,style,proto_item * item ; ^,The scope of the variable 'item' can be reduced.,1
2236,static guint32 dissect_minipacket ( tvbuff_t *...,398,996.cpp,4,style,proto_item * item ; ^,The scope of the variable 'item' can be reduced.,1
2237,void proto_register_iax2 ( void ) {\n static h...,0,997.cpp,,,,,0
2238,static guint32 dissect_fullpacket ( tvbuff_t *...,398,998.cpp,20,style,iax_packet = ( iax_packet_data * ) p_get_prot...,C-style pointer casting,1


In [67]:
df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==0]

Unnamed: 0,code,cwe,file,line,severity,line_code,message,is_vulnerable
5,static void mspack_fmap_free ( void * mem ) {\...,0,1001.cpp,,,,,0
7,int uwsgi_php_init ( void ) {\n struct uwsgi_s...,0,1003.cpp,,,,,0
8,"void bn_mul_comba4 ( BN_ULONG * r , BN_ULONG *...",0,1004.cpp,,,,,0
9,"void bn_sqr_comba8 ( BN_ULONG * r , const BN_U...",0,1005.cpp,,,,,0
10,"void bn_sqr_comba4 ( BN_ULONG * r , const BN_U...",0,1006.cpp,,,,,0
...,...,...,...,...,...,...,...,...
2228,struct config_filter_parser * const * config_f...,0,989.cpp,,,,,0
2231,static int execstack_continue ( i_ctx_t * i_ct...,0,991.cpp,,,,,0
2232,static int execstack2_continue ( i_ctx_t * i_c...,0,992.cpp,,,,,0
2233,int archive_read_support_format_all ( struct a...,0,993.cpp,,,,,0


In [68]:
df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==1]

Unnamed: 0,code,cwe,file,line,severity,line_code,message,is_vulnerable
0,"static int alloc_addbyter ( int output , FILE ...",398,0.cpp,2,style,struct asprintf * infop = ( struct asprintf *...,C-style pointer casting,1
1,static int jbig2_word_stream_buf_get_next_word...,398,1.cpp,2,style,Jbig2WordStreamBuf * z = ( Jbig2WordStreamBuf...,C-style pointer casting,1
2,"static void jbig2_set_bits ( byte * line , int...",398,10.cpp,2,style,"int a0 , a1 , b0 , b1 , a ; ...",The scope of the variable 'a' can be reduced.,1
3,void name ## _free ( type * a ) ;\n # define D...,0,100.cpp,35,error,} ^,Unmatched '}'. Configuration,1
4,static char * * create_argv_command ( struct r...,398,1000.cpp,4,style,const char * program ; ^,The scope of the variable 'program' can be red...,1
...,...,...,...,...,...,...,...,...
2234,static void iax2_add_ts_fields ( packet_info *...,398,994.cpp,4,style,proto_item * item ; ^,The scope of the variable 'item' can be reduced.,1
2235,static guint32 dissect_minivideopacket ( tvbuf...,398,995.cpp,5,style,proto_item * item ; ^,The scope of the variable 'item' can be reduced.,1
2236,static guint32 dissect_minipacket ( tvbuff_t *...,398,996.cpp,4,style,proto_item * item ; ^,The scope of the variable 'item' can be reduced.,1
2238,static guint32 dissect_fullpacket ( tvbuff_t *...,398,998.cpp,20,style,iax_packet = ( iax_packet_data * ) p_get_prot...,C-style pointer casting,1


In [69]:
print("Static analysis report")
print(classification_report(y_true, numpy.array(df_result_static_analysis['is_vulnerable'])))

Static analysis report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.57      0.73      2240

    accuracy                           0.57      2240
   macro avg       0.50      0.29      0.36      2240
weighted avg       1.00      0.57      0.73      2240



In [70]:
df_result_static_analysis.to_csv('ReVeal/df_result_static_analysis_vuln.csv')

## Machine Learning

In [71]:
proc = subprocess.Popen(["echo", "%BASEDIR%"], stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
path_to_joern = out.decode('utf-8').split('\r')[0]

In [72]:
subprocess.run([path_to_joern + "/joern-parse.bat", "data/code"],
                                      stdout=subprocess.PIPE, text=True, check=True)

CompletedProcess(args=['C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/joern-parse.bat', 'data/code'], returncode=0, stdout='\nC:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection>java -cp "C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/extensions/joern-fuzzyc/build/libs/joern-fuzzyc.jar;C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/extensions/jpanlib/build/libs/jpanlib.jar;C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/octopus/lib/*" tools.parser.ParserMain -outformat csv -outdir parsed data/code \ndata\\code\\0.cpp\ndata\\code\\1.cpp\ndata\\code\\10.cpp\ndata\\code\\100.cpp\ndata\\code\\1000.cpp\ndata\\code\\1001.cpp\ndata\\code\\1002.cpp\ndata\\code\\1003.cpp\ndata\\code\\1004.cpp\ndata\\code\\1005.cpp\ndata\\code\\10

In [73]:
df_vulnerable = df_vulnerable.rename(columns={'code':'tokens'})

df_vulnerable.head()

Unnamed: 0,tokens,hash,project,size
0,"static int alloc_addbyter ( int output , FILE ...",-8228664527580018723,debian,26
1,static int jbig2_word_stream_buf_get_next_word...,-1768728156572022708,debian,16
2,"int jbig2_text_region ( Jbig2Ctx * ctx , Jbig2...",4991783214228010911,debian,343
3,int jbig2_decode_text_region ( Jbig2Ctx * ctx ...,4991783214228010911,debian,337
4,int jbig2_immediate_generic_region ( Jbig2Ctx ...,8458201026073747179,debian,64


In [74]:
for index in tqdm(range(df_vulnerable.shape[0])):
    df_vulnerable['tokens'][index] = parse.tokenizer(df_vulnerable['tokens'][index])

HBox(children=(FloatProgress(value=0.0, max=2240.0), HTML(value='')))




In [75]:
df_vulnerable

Unnamed: 0,tokens,hash,project,size
0,"[static, int, FUN1, (, int, VAR1, ,, VAR2, *, ...",-8228664527580018723,debian,26
1,"[static, int, FUN1, (, VAR1, *, VAR2, ,, int, ...",-1768728156572022708,debian,16
2,"[int, FUN1, (, VAR1, *, VAR2, ,, VAR3, *, VAR4...",4991783214228010911,debian,343
3,"[int, FUN1, (, VAR1, *, VAR2, ,, VAR3, *, VAR4...",4991783214228010911,debian,337
4,"[int, FUN1, (, VAR1, *, VAR2, ,, VAR3, *, VAR4...",8458201026073747179,debian,64
...,...,...,...,...
2235,"[FUN1, (, VAR1, ,, VAR2, ), {, TestData, VAR3,...",8073241806104522127,chrome,37
2236,"[FUN1, (, VAR1, ,, VAR2, ), {, ACMatches, VAR3...",8073241806104522127,chrome,35
2237,"[FUN1, (, VAR1, ,, VAR2, ), {, TemplateURLData...",8073241806104522127,chrome,48
2238,"[FUN1, (, VAR1, ,, VAR2, ), {, TemplateURLData...",8073241806104522127,chrome,40


In [76]:
df_vulnerable.to_csv("ReVeal/df_vulnerable_tokenized.csv", index=False)

In [None]:
codes = pd.read_csv("ReVeal/df_vulnerable_tokenized.csv")
print(codes.shape)
w2vmodel = Word2Vec(min_count=1)
w2vmodel.build_vocab(sentences=tqdm(codes.tokens))
w2vmodel.train(sentences=tqdm(codes.tokens), total_examples=w2vmodel.corpus_count, epochs=1)

In [78]:
print("Saving w2vmodel.")
w2vmodel.save(f"ReVeal/w2v_model_vul")

Saving w2vmodel.


In [79]:
EDGE_TYPES = {
    "CONTROLS": 0,
    "DECLARES": 1,
    "DEF": 2,
    "DOM": 3,
    "FLOWS_TO": 4,
    "IS_AST_PARENT": 5,
    "IS_CLASS_OF": 6,
    "IS_FILE_OF": 7,
    "IS_FUNCTION_OF_AST": 8,
    "IS_FUNCTION_OF_CFG": 9,
    "POST_DOM": 10,
    "REACHES": 11,
    "USE": 12,
}

TYPE_MAP = {
    "AndExpression": 1,
    "Sizeof": 2,
    "Identifier": 3,
    "ForInit": 4,
    "ReturnStatement": 5,
    "SizeofOperand": 6,
    "InclusiveOrExpression": 7,
    "PtrMemberAccess": 8,
    "AssignmentExpression": 9,
    "ParameterList": 10,
    "IdentifierDeclType": 11,
    "SizeofExpression": 12,
    "SwitchStatement": 13,
    "IncDec": 14,
    "Function": 15,
    "BitAndExpression": 16,
    "UnaryExpression": 17,
    "DoStatement": 18,
    "GotoStatement": 19,
    "Callee": 20,
    "OrExpression": 21,
    "ShiftExpression": 22,
    "Decl": 23,
    "CFGErrorNode": 24,
    "WhileStatement": 25,
    "InfiniteForNode": 26,
    "RelationalExpression": 27,
    "CFGExitNode": 28,
    "Condition": 29,
    "BreakStatement": 30,
    "CompoundStatement": 31,
    "UnaryOperator": 32,
    "CallExpression": 33,
    "CastExpression": 34,
    "ConditionalExpression": 35,
    "ArrayIndexing": 36,
    "PostIncDecOperationExpression": 37,
    "Label": 38,
    "ArgumentList": 39,
    "EqualityExpression": 40,
    "ReturnType": 41,
    "Parameter": 42,
    "Argument": 43,
    "Symbol": 44,
    "ParameterType": 45,
    "Statement": 46,
    "AdditiveExpression": 47,
    "PrimaryExpression": 48,
    "DeclStmt": 49,
    "CastTarget": 50,
    "IdentifierDeclStatement": 51,
    "IdentifierDecl": 52,
    "CFGEntryNode": 53,
    "TryStatement": 54,
    "Expression": 55,
    "ExclusiveOrExpression": 56,
    "ClassDef": 57,
    "File": 58,
    "UnaryOperationExpression": 59,
    "ClassDefStatement": 60,
    "FunctionDef": 61,
    "IfStatement": 62,
    "MultiplicativeExpression": 63,
    "ContinueStatement": 64,
    "MemberAccess": 65,
    "ExpressionStatement": 66,
    "ForStatement": 67,
    "InitializerList": 68,
    "ElseStatement": 69,
    'ThrowStatement': 70,
    'CFGExceptionNode': 71,
    'CatchStatement': 72,
    'CatchList': 73
}

TYPE_MAP_OH = numpy.eye(len(TYPE_MAP))

In [80]:
def create_input_from_cpg(filepath, w2v, etypemap = EDGE_TYPES):
    try:
        nodes = pd.read_csv(filepath+"/nodes.csv", sep="\t", engine='python')
        edges = pd.read_csv(filepath+"/edges.csv", sep="\t", engine='python')
    except Exception as E:
        print(E)
        print(filepath)
        return [None, None]
    if len(nodes) == 0 or len(edges) == 0:
        print("Empty node / edge CSV")
        return [None, None]
    n, e = format_node_edges(nodes, edges)
    e = e[e.type != "IS_FILE_OF"]
    e = e[e.type.isin(etypemap)]
    
    node_id_dict = (
    n.reset_index(drop=True)
    .reset_index()[["key", "index"]]
    .set_index("key")
    .to_dict()["index"]
    )
    e = e[(e.start.isin(n.key)) & (e.end.isin(n.key))].copy()
    n.key = n.key.apply(lambda x: node_id_dict[x])
    e.start = e.start.apply(lambda x: node_id_dict[x])
    e.end = e.end.apply(lambda x: node_id_dict[x])
    
    n.code = n.code.fillna("")
    n.code = n.code.apply(embed_code, w2v=w2v).to_list()
    n.type = n.type.apply(one_hot_encode_type)
    n = n[n.type != -1]
    
    src = e["start"].to_numpy()
    dst = e["end"].to_numpy()
    nnodes = len(n)
    nfeat = torch.tensor([list(i.type) + list(i.code) for i in n.itertuples()]).float()
    etype = torch.tensor([etypemap[i] for i in e.type]).int()
    return n, e
    
    
    
def format_node_edges(n, e):
    """Format node and edges into appropriate input form."""
    nodes = n.copy()
    edges = e.copy()
    nodes.key -= 1
    edges.start -= 1
    edges.end -= 1
    node_key_type_map = nodes[["key", "type"]].set_index("key").to_dict()["type"]
    node_key_code_map = nodes[["key", "code"]].set_index("key").to_dict()["code"]
    edges["src"] = edges.start.apply(lambda x: node_key_type_map[x])
    edges["dest"] = edges.end.apply(lambda x: node_key_type_map[x])
    edges["src_feat"] = edges.start.apply(lambda x: node_key_code_map[x])
    edges["dest_feat"] = edges.end.apply(lambda x: node_key_code_map[x])
    return nodes, edges

def one_hot_encode_type(type_):
    try:
        return TYPE_MAP_OH[TYPE_MAP[type_] - 1].tolist()
    except:
        return -1
    
def embed_code(code: str, w2v):
    """Embed code using given word2vec model by averaging code embeddings."""
    code = nltk.word_tokenize(code.strip())
    if len(code) == 0:
        return numpy.zeros(100)
    wvecs = []
    for word in code:
        try:
            wvecs.append(w2v.wv[word])
        except:
            wvecs.append(numpy.zeros(100))
    return numpy.array(sum(numpy.array(wvecs)) / len(code), dtype="float32")

In [81]:
base_dir = "parsed/data/code/"
index_ben = 0
index_vul = 0
w2vmodel = Word2Vec.load("ReVeal/w2v_model_vul")


for file in tqdm(os.listdir(base_dir)):  
    flaw_loc = 0
    n, e = create_input_from_cpg(base_dir+file, w2vmodel)

    if n is not None and e is not None:
        edges_list = list()
        targets = list()
        map_line = list()

        for index_edge in range(e.shape[0]):
            if index_edge in e.index:
                type_edge = EDGE_TYPES[e['type'][index_edge]]

                edges_list.append([e['start'][index_edge], type_edge, e['end'][index_edge]])


        targets.append([0])


        for index_node, row in n.iterrows():
            if pd.isnull(n['location'][index_node]):
                map_line.append(0)
            else:
                map_line.append(int(n['location'][index_node].split(':')[0]))


        dictionary = {"node_features": n.code.tolist(), "graph": edges_list, "targets": targets, 'map_line': map_line, 
                      'index': [int(file.split('.')[0]), int(file.split('.')[0])]}

        with open("ReVeal/input_vuln/eval/"+file.split('.')[0]+".txt", 'w') as file:
            json.dump(dictionary, file, cls=NumpyEncoder)
            file.close()

HBox(children=(FloatProgress(value=0.0, max=2242.0), HTML(value='')))

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

In [82]:
%run VELVET\src\run_model.py ReVeal/input_vuln VELVET\src\config.yml -t True -x gnn -y transformer -p gnn/log.txt -q transformer/log.txt

Training with configuration: {'model': {'configuration': 'ggnn', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'model_2': {'configuration': 'transformer', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'data': {'max_batch_size': 1250, 'max_buffer_size': 100, 'max_node_size': 1024, 'valid_interval': 2500, 'max_valid_samples': 25000, 'max_token_length': 10, 'w2v_dimension': 256}, 'training': {'max_steps': 50, 'print_freq': 500, 'learning_rate': 

In [84]:
results_ml_for_vul = pd.read_csv('ReVeal/results_ml_vul.csv')

In [85]:
results_ml_for_vul

Unnamed: 0,index_code,id_post,is_vul,vul_line
0,1022,1022,0,0
1,1082,1082,0,0
2,1184,1184,0,0
3,1020,1020,0,0
4,1079,1079,0,0
...,...,...,...,...
1893,6,6,0,0
1894,674,674,0,0
1895,963,963,0,0
1896,1348,1348,0,0


In [86]:
y_true = numpy.ones(results_ml_for_vul.shape[0], dtype=int)
y_true

array([1, 1, 1, ..., 1, 1, 1])

In [87]:
print("Machine Learning report")
print(classification_report(y_true, numpy.array(results_ml_for_vul['is_vul'])))

Machine Learning report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00    1898.0

    accuracy                           0.00    1898.0
   macro avg       0.00      0.00      0.00    1898.0
weighted avg       0.00      0.00      0.00    1898.0



In [32]:
%run VELVET/src/run_model.py ReVeal/input_vuln VELVET/src/config.yml -e True -m transformer -l transformer/log.txt

Training with configuration: {'model': {'configuration': 'ggnn', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'model_2': {'configuration': 'transformer', 'base': {'hidden_dim': 256, 'dropout_rate': 0.1, 'num_edge_types': 24}, 'evaluation': {'top': 1, 'node_of_line': 7, 'window_size': 9}, 'ggnn': {'time_steps': [3, 1, 3, 1], 'residuals': {'1': [0], '3': [0, 1]}, 'add_type_bias': True}, 'transformer': {'ff_dim': 2048, 'num_layers': 6, 'attention_dim': 512, 'num_heads': 8}}, 'data': {'max_batch_size': 1250, 'max_buffer_size': 100, 'max_node_size': 1024, 'valid_interval': 2500, 'max_valid_samples': 25000, 'max_token_length': 10, 'w2v_dimension': 256}, 'training': {'max_steps': 50, 'print_freq': 500, 'learning_rate': 

In [33]:
results_ml_for_vul = pd.read_csv('ReVeal/results_ml_vul_transformer.csv')

In [34]:
results_ml_for_vul

Unnamed: 0,index_code,id_post,is_vul,vul_line
0,1023,1023,0,0
1,1024,1024,0,0
2,1118,1118,0,0
3,1122,1122,0,0
4,1124,1124,0,0
...,...,...,...,...
1893,614,614,0,0
1894,869,869,0,0
1895,722,722,0,0
1896,1587,1587,0,0


In [36]:
y_true = numpy.ones(results_ml_for_vul.shape[0], dtype=int)
y_true

array([1, 1, 1, ..., 1, 1, 1])

In [37]:
print("Machine Learning report")
print(classification_report(y_true, numpy.array(results_ml_for_vul['is_vul'])))

Machine Learning report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.15      0.26      1898

    accuracy                           0.15      1898
   macro avg       0.50      0.07      0.13      1898
weighted avg       1.00      0.15      0.26      1898



## Both approaches predict vulnerable

In [47]:
df_result_static_analysis = pd.read_csv('ReVeal/df_result_static_analysis_vuln.csv')
df_vulnerable = pd.read_csv("ReVeal/df_vulnerable_tokenized.csv")

In [48]:
df_result_static_analysis_vul_predicted = df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==1]
predicted_by_static = [int(x.split('.')[0]) for x in df_result_static_analysis_vul_predicted['file']]
df_result_ml_vul_predicted = results_ml_for_vul.loc[results_ml_for_vul['is_vul']==1]
predicted_by_ml = [x for x in df_result_ml_vul_predicted['index_code']]

print(len(predicted_by_static))
print(len(predicted_by_ml))

1277
279


In [49]:
indexes = set(predicted_by_static).intersection(predicted_by_ml)
len(list(indexes))

113

In [50]:
df_vulnerable.iloc[list(indexes)]

Unnamed: 0,tokens,hash,project,size
1,"['static', 'int', 'FUN1', '(', 'VAR1', '*', 'V...",-1768728156572022708,debian,16
1541,"['int', 'FUN1', '(', 'void', '*', 'VAR1', ',',...",-7283224897680777091,chrome,6
1043,"['static', 'const', 'char', '*', 'FUN1', '(', ...",-5493081488206619677,debian,14
2069,"['static', 'void', 'FUN1', '(', 'void', '*', '...",-6216181551924087864,chrome,5
21,"['static', 'ossl_inline', 'FUN1', '(', 'VAR1',...",3599206110384554647,debian,64
...,...,...,...,...
968,"['unsigned', 'long', '#', 'define', 'BN_LONG',...",-6554160531728146583,debian,247
1997,"['static', 'err_status_t', 'FUN1', '(', 'VAR1'...",-8620097311326519087,chrome,66
465,"['static', 'inline', 'int', 'FUN1', '(', 'VAR1...",-1658505547857059267,debian,15
94,"['int', 'VAR1', '##', 'FUN1', '(', 'VAR2', '*'...",1366214969540455677,debian,72


## Both approaches predict non vulnerable

In [51]:
df_result_static_analysis_non_vul_predicted = df_result_static_analysis.loc[df_result_static_analysis['is_vulnerable']==0]
predicted_by_static = [int(x.split('.')[0]) for x in df_result_static_analysis_non_vul_predicted['file']]
df_result_ml_non_vul_predicted = results_ml_for_vul.loc[results_ml_for_vul['is_vul']==0]
predicted_by_ml = [x for x in df_result_ml_non_vul_predicted['index_code']]

print(len(predicted_by_static))
print(len(predicted_by_ml))

963
1619


In [52]:
indexes = set(predicted_by_static).intersection(predicted_by_ml)
len(list(indexes))

721

In [53]:
df_vulnerable.iloc[list(indexes)]

Unnamed: 0,tokens,hash,project,size
2050,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-4527380754569407959,chrome,3
3,"['int', 'FUN1', '(', 'VAR1', '*', 'VAR2', ',',...",4991783214228010911,debian,337
2052,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-4527380754569407959,chrome,3
2053,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-4527380754569407959,chrome,3
5,"['int', 'FUN1', '(', 'VAR1', '*', 'VAR2', ',',...",-3955770052982962380,debian,51
...,...,...,...,...
2034,"['static', 'inline', 'void', 'FUN1', '(', 'VAR...",-3165961362620712443,chrome,42
2035,"['static', 'void', 'FUN1', '(', 'const', 'VAR1...",6298409723927571872,chrome,68
2039,"['static', 'void', 'FUN1', '(', 'const', 'VAR1...",6298409723927571872,chrome,16
2040,"['static', 'void', 'FUN1', '(', 'const', 'VAR1...",6298409723927571872,chrome,13


## Static analysis predict vulnerable, ML not

In [54]:
index_saved = []

for file in df_result_static_analysis['file']:
    index = int(file.split('.')[0])
    index_row = df_result_static_analysis.loc[df_result_static_analysis['file'] == file].index

    row_ml_to_check = results_ml_for_vul.loc[results_ml_for_vul['index_code']==index].index.tolist()
    if row_ml_to_check:
        if df_result_static_analysis['is_vulnerable'][index_row[0]]==1 and  results_ml_for_vul['is_vul'][row_ml_to_check[0]]==0:
                index_saved.append(index)

In [55]:
df_vulnerable.iloc[list(index_saved)]

Unnamed: 0,tokens,hash,project,size
0,"['static', 'int', 'FUN1', '(', 'int', 'VAR1', ...",-8228664527580018723,debian,26
10,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",1286774465219423144,debian,17
1000,"['static', 'char', '*', '*', 'FUN1', '(', 'str...",9175972616340456250,debian,44
1010,"['int', 'FUN1', '(', 'void', ')', '#', 'else',...",-7034148853749334444,debian,1040
1011,"['static', 'int', 'FUN1', '(', 'VAR1', '*', 'V...",-3502382902740740634,debian,24
...,...,...,...,...
990,"['static', 'int', 'FUN1', '(', 'VAR1', '*', 'V...",-6119142299306266167,debian,30
994,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-4055702019813176658,debian,31
995,"['static', 'guint32', 'FUN1', '(', 'VAR1', '*'...",-4055702019813176658,debian,27
996,"['static', 'guint32', 'FUN1', '(', 'VAR1', '*'...",-4055702019813176658,debian,23


## ML predict vulnerable, static analysis not

In [56]:
index_saved = []

for file in df_result_static_analysis['file']:
    index = int(file.split('.')[0])
    index_row = df_result_static_analysis.loc[df_result_static_analysis['file'] == file].index

    row_ml_to_check = results_ml_for_vul.loc[results_ml_for_vul['index_code']==index].index.tolist()
    if row_ml_to_check:
        if results_ml_for_vul['is_vul'][row_ml_to_check[0]]==1 and df_result_static_analysis['is_vulnerable'][index_row[0]]==0:
                index_saved.append(index)

In [57]:
df_vulnerable.iloc[list(index_saved)]

Unnamed: 0,tokens,hash,project,size
1001,"['static', 'void', 'FUN1', '(', 'void', '*', '...",6028689830752105670,debian,3
1039,"['char', '*', 'FUN1', '(', 'const', 'char', '*...",2518148041181868265,debian,8
1065,"['static', 'void', 'FUN1', '(', 'VAR1', '*', '...",-331579171381918893,debian,8
1068,"['tdata_t', 'FUN1', '(', 'tsize_t', 'VAR1', ')...",-6468986929796051453,debian,4
1070,"['static', 'VALUE', 'FUN1', '(', 'VALUE', 'VAR...",3668456668028959955,debian,18
...,...,...,...,...
917,"['static', 'VAR1', '*', 'FUN1', '(', 'VAR2', '...",-4979531020783644238,debian,6
918,"['char', '*', 'FUN1', '(', 'VAR1', '*', 'VAR2'...",-4979531020783644238,debian,6
927,"['static', 'void', 'FUN1', '(', 'MPI', 'VAR1',...",-2751047985690578371,debian,16
991,"['static', 'int', 'FUN1', '(', 'VAR1', '*', 'V...",-6119142299306266167,debian,4
