In [1]:
import gc
import glob
import json
import os
import re
import subprocess
import warnings
from itertools import islice

import numpy as np
import pandas as pd
import torch
from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.models.word2vec import Word2Vec

from utils import log as logger
from utils.functions import parse, cpg
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import shutil
from pathlib import Path
import nltk
from numpyencoder import NumpyEncoder

warnings.filterwarnings("ignore")



# Get codes from Juliet

In [8]:
base_directory = 'data/Juliet/C'
new_base_directory = 'data/Juliet'

In [7]:
shutil.move(os.path.join(base_directory, 'manifest.xml'), new_base_directory)

'data/Juliet\\manifest.xml'

In [12]:
for dirpath, subdirs, files in tqdm(os.walk(base_directory)):
    for file in files:
        if file.endswith('.c') or file.endswith('.cpp') and not file.startswith('main'):
            shutil.move(os.path.join(dirpath, file), new_base_directory)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [13]:
shutil.rmtree(base_directory)

In [2]:
tree = ElementTree.parse('data/Juliet/manifest.xml')

In [3]:
root = tree.getroot()

In [34]:
juliet = pd.DataFrame(columns=['filename', 'code', 'flaw', 'flaw_loc'])

for testcase in tqdm(root):
    files = [element for element in testcase if element.tag  == 'file']
    
    for file in files:
        if file:
            for mixed in file:
                flaw = mixed.attrib['name'].split(':')[0]
                flaw_loc = mixed.attrib['line']
    
    for file in files:
        if file:
            if os.path.exists(os.path.join(new_base_directory, file.attrib['path'])):
                new_row = {
                "filename": file.attrib['path'],
                "code": None,
                "flaw": flaw,
                "flaw_loc": flaw_loc
                }
                with open(os.path.join(new_base_directory, file.attrib['path'])) as code_file:
                    new_row['code'] = code_file.read()
                juliet = juliet.append(new_row, ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=64123.0), HTML(value='')))




In [35]:
juliet.to_csv("Datasets for VELVET/juliet.csv", index=False)

# Splitting scripts into Good and Bad

In [36]:
juliet = pd.read_csv("Datasets for VELVET/juliet.csv")

In [37]:
bad_df = juliet.copy()
good_df = juliet.copy()

In [38]:
def extract_good(text):
    text = text.split('\n')
    output_txt = []
    
    in_OMITBAD_block = False
    
    for line in text:
        if line == '#ifndef OMITBAD':
            in_OMITBAD_block = True
            
        if not in_OMITBAD_block:
            output_txt.append(line)
            
        if line == '#endif /* OMITBAD */':
            in_OMITBAD_block = False
    return "\n".join(output_txt)

def extract_bad(text):
    text = text.split('\n')
    output_txt = []
    
    in_OMITGOOD_block = False
    
    for line in text:
        if line == '#ifndef OMITGOOD':
            in_OMITGOOD_block = True
            
        if not in_OMITGOOD_block:
            output_txt.append(line)
            
        if line == '#endif /* OMITGOOD */':
            in_OMITGOOD_block = False
    return "\n".join(output_txt)

In [39]:
good_df.code = good_df.code.apply(extract_good)
bad_df.code = bad_df.code.apply(extract_bad)

In [45]:
good_df['flaw'] = 0
good_df['flaw_loc'] = 0

In [46]:
juliet = good_df.append(bad_df)
juliet = juliet.sample(frac=1).reset_index(drop=True)

In [47]:
juliet

Unnamed: 0,filename,code,flaw,flaw_loc
0,CWE400_Resource_Exhaustion__listen_socket_for_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-400,30
1,CWE190_Integer_Overflow__short_max_square_15.c,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-190,44
2,CWE127_Buffer_Underread__wchar_t_alloca_loop_14.c,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-127,42
3,CWE124_Buffer_Underwrite__malloc_char_loop_65b.c,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-124,33
4,CWE78_OS_Command_Injection__char_console_popen...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,0,0
...,...,...,...,...
111357,CWE464_Addition_of_Data_Structure_Sentinel__ba...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-464,43
111358,CWE114_Process_Control__w32_char_environment_01.c,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,0,0
111359,CWE126_Buffer_Overread__CWE170_wchar_t_memcpy_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-126,33
111360,CWE36_Absolute_Path_Traversal__char_file_ifstr...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-036,41


In [48]:
juliet.to_csv("Datasets for VELVET/juliet.csv", index=False)

# Create cpg

In [5]:
raw = pd.read_csv("Datasets for VELVET/juliet.csv")

In [25]:
proc = subprocess.Popen(["echo", "%BASEDIR%"], stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
path_to_joern = out.decode('utf-8').split('\r')[0]

In [19]:
def to_files(data_frame: pd.DataFrame, out_path):
    for idx, row in tqdm(data_frame.iterrows()):
        extension = row.filename.split('.')[-1]
        file_name = f"{idx}.{extension}"
        with open(out_path + file_name, 'w') as f:
            f.write(row.code)

In [18]:
to_files(raw, 'data/code/')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [26]:
subprocess.run([path_to_joern + "/joern-parse.bat", "data/code"],
                                      stdout=subprocess.PIPE, text=True, check=True)

CompletedProcess(args=['C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/joern-parse.bat', 'data/code'], returncode=0, stdout='\nC:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection>java -cp "C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/extensions/joern-fuzzyc/build/libs/joern-fuzzyc.jar;C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/extensions/jpanlib/build/libs/jpanlib.jar;C:\\Users\\Grazia\\Desktop\\Tesi\\StackOverflow_Vulnerabilities\\1-Vulnerability_Detection\\joern/projects/octopus/lib/*" tools.parser.ParserMain -outformat csv -outdir parsed data/code \ndata\\code\\0.cpp\ndata\\code\\1.c\ndata\\code\\10.cpp\ndata\\code\\100.c\ndata\\code\\1000.cpp\ndata\\code\\10000.c\ndata\\code\\100000.cpp\ndata\\code\\100001.cpp\ndata\\code\\100002.c\ndata\\code\\100003.cpp\ndata\\code\\1

# Embedding

In [33]:
raw = raw.rename(columns={'code':'tokens'})

raw.head()

Unnamed: 0,filename,tokens,flaw,flaw_loc
0,CWE400_Resource_Exhaustion__listen_socket_for_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-400,30
1,CWE190_Integer_Overflow__short_max_square_15.c,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-190,44
2,CWE127_Buffer_Underread__wchar_t_alloca_loop_14.c,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-127,42
3,CWE124_Buffer_Underwrite__malloc_char_loop_65b.c,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-124,33
4,CWE78_OS_Command_Injection__char_console_popen...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,0,0


In [36]:
for index in tqdm(range(raw.shape[0])):
    raw['tokens'][index] = parse.tokenizer(raw['tokens'][index])

HBox(children=(FloatProgress(value=0.0, max=111362.0), HTML(value='')))




In [39]:
raw.to_csv("Datasets for VELVET/juliet_tokenized.csv", index=False)

In [44]:
w2vmodel = Word2Vec(min_count=1)
w2vmodel.build_vocab(sentences=tqdm(raw.tokens))
w2vmodel.train(sentences=tqdm(raw.tokens), total_examples=w2vmodel.corpus_count, epochs=1)
print("Saving w2vmodel.")
w2vmodel.save(f"data/w2v/w2v_model")

HBox(children=(FloatProgress(value=0.0, max=111362.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=111362.0), HTML(value='')))


Saving w2vmodel.


# Creating input files 

In [2]:
EDGE_TYPES = {
    "CONTROLS": 0,
    "DECLARES": 1,
    "DEF": 2,
    "DOM": 3,
    "FLOWS_TO": 4,
    "IS_AST_PARENT": 5,
    "IS_CLASS_OF": 6,
    "IS_FILE_OF": 7,
    "IS_FUNCTION_OF_AST": 8,
    "IS_FUNCTION_OF_CFG": 9,
    "POST_DOM": 10,
    "REACHES": 11,
    "USE": 12,
}

TYPE_MAP = {
    "AndExpression": 1,
    "Sizeof": 2,
    "Identifier": 3,
    "ForInit": 4,
    "ReturnStatement": 5,
    "SizeofOperand": 6,
    "InclusiveOrExpression": 7,
    "PtrMemberAccess": 8,
    "AssignmentExpression": 9,
    "ParameterList": 10,
    "IdentifierDeclType": 11,
    "SizeofExpression": 12,
    "SwitchStatement": 13,
    "IncDec": 14,
    "Function": 15,
    "BitAndExpression": 16,
    "UnaryExpression": 17,
    "DoStatement": 18,
    "GotoStatement": 19,
    "Callee": 20,
    "OrExpression": 21,
    "ShiftExpression": 22,
    "Decl": 23,
    "CFGErrorNode": 24,
    "WhileStatement": 25,
    "InfiniteForNode": 26,
    "RelationalExpression": 27,
    "CFGExitNode": 28,
    "Condition": 29,
    "BreakStatement": 30,
    "CompoundStatement": 31,
    "UnaryOperator": 32,
    "CallExpression": 33,
    "CastExpression": 34,
    "ConditionalExpression": 35,
    "ArrayIndexing": 36,
    "PostIncDecOperationExpression": 37,
    "Label": 38,
    "ArgumentList": 39,
    "EqualityExpression": 40,
    "ReturnType": 41,
    "Parameter": 42,
    "Argument": 43,
    "Symbol": 44,
    "ParameterType": 45,
    "Statement": 46,
    "AdditiveExpression": 47,
    "PrimaryExpression": 48,
    "DeclStmt": 49,
    "CastTarget": 50,
    "IdentifierDeclStatement": 51,
    "IdentifierDecl": 52,
    "CFGEntryNode": 53,
    "TryStatement": 54,
    "Expression": 55,
    "ExclusiveOrExpression": 56,
    "ClassDef": 57,
    "File": 58,
    "UnaryOperationExpression": 59,
    "ClassDefStatement": 60,
    "FunctionDef": 61,
    "IfStatement": 62,
    "MultiplicativeExpression": 63,
    "ContinueStatement": 64,
    "MemberAccess": 65,
    "ExpressionStatement": 66,
    "ForStatement": 67,
    "InitializerList": 68,
    "ElseStatement": 69,
    'ThrowStatement': 70,
    'CFGExceptionNode': 71,
    'CatchStatement': 72,
    'CatchList': 73
}

TYPE_MAP_OH = np.eye(len(TYPE_MAP))

In [3]:
def create_input_from_cpg(filepath, flaw_loc, w2v, etypemap = EDGE_TYPES):
    try:
        nodes = pd.read_csv(filepath+"/nodes.csv", sep="\t")
        edges = pd.read_csv(filepath+"/edges.csv", sep="\t")
    except Exception as E:
        print(E)
        return [None, None]
    if len(nodes) == 0 or len(edges) == 0:
        print("Empty node / edge CSV")
        return [None, None]
    n, e = format_node_edges(nodes, edges)
    e = e[e.type != "IS_FILE_OF"]
    e = e[e.type.isin(etypemap)]
    
    node_id_dict = (
    n.reset_index(drop=True)
    .reset_index()[["key", "index"]]
    .set_index("key")
    .to_dict()["index"]
    )
    e = e[(e.start.isin(n.key)) & (e.end.isin(n.key))].copy()
    n.key = n.key.apply(lambda x: node_id_dict[x])
    e.start = e.start.apply(lambda x: node_id_dict[x])
    e.end = e.end.apply(lambda x: node_id_dict[x])
    
    n.code = n.code.fillna("")
    n.code = n.code.apply(embed_code, w2v=w2v).to_list()
    n.type = n.type.apply(one_hot_encode_type)
    n = n[n.type != -1]
    
    src = e["start"].to_numpy()
    dst = e["end"].to_numpy()
    nnodes = len(n)
    nfeat = torch.tensor([list(i.type) + list(i.code) for i in n.itertuples()]).float()
    etype = torch.tensor([etypemap[i] for i in e.type]).int()
    return n, e
    
    
    
def format_node_edges(n, e):
    """Format node and edges into appropriate input form."""
    nodes = n.copy()
    edges = e.copy()
    nodes.key -= 1
    edges.start -= 1
    edges.end -= 1
    node_key_type_map = nodes[["key", "type"]].set_index("key").to_dict()["type"]
    node_key_code_map = nodes[["key", "code"]].set_index("key").to_dict()["code"]
    edges["src"] = edges.start.apply(lambda x: node_key_type_map[x])
    edges["dest"] = edges.end.apply(lambda x: node_key_type_map[x])
    edges["src_feat"] = edges.start.apply(lambda x: node_key_code_map[x])
    edges["dest_feat"] = edges.end.apply(lambda x: node_key_code_map[x])
    return nodes, edges

def one_hot_encode_type(type_):
    try:
        return TYPE_MAP_OH[TYPE_MAP[type_] - 1].tolist()
    except:
        return -1
    
def embed_code(code: str, w2v):
    """Embed code using given word2vec model by averaging code embeddings."""
    code = nltk.word_tokenize(code.strip())
    if len(code) == 0:
        return np.zeros(100)
    wvecs = []
    for word in code:
        try:
            wvecs.append(w2v.wv[word])
        except:
            wvecs.append(np.zeros(100))
    return np.array(sum(np.array(wvecs)) / len(code), dtype="float32")

In [8]:
base_dir = "parsed/data/code/"
index_ben = 0
index_vul = 0
w2vmodel = Word2Vec.load("data/w2v/w2v_model")

for file in tqdm(os.listdir('data/code')):
    filename = file.split('.')[0]
    flaw_loc = raw.iloc[int(filename), 3]
    n, e = create_input_from_cpg(base_dir+file, flaw_loc, w2vmodel)
    
    if n is not None and e is not None:
        edges_list = list()
        targets = list()
        map_line = list()

        for index_edge in range(e.shape[0]):
            if index_edge in e.index:
                type_edge = EDGE_TYPES[e['type'][index_edge]]

                edges_list.append([e['start'][index_edge], type_edge, e['end'][index_edge]])

        if flaw_loc == 0:
            targets.append([0])
            index_ben = index_ben + 1
        else:
            index_vul = index_vul + 1
            locations = n[n['location'].notnull()]

            for index_location,row in locations.iterrows():
                if locations['location'][index_location].split(':')[0]==str(flaw_loc):
                    node_idx = (locations['key'][index_location])
                    targets.append([node_idx])
                    break;

        for index_node, row in n.iterrows():
            if pd.isnull(n['location'][index_node]):
                map_line.append(0)
            else:
                map_line.append(int(n['location'][index_node].split(':')[0]))


        dictionary = {"node_features": n.code.tolist(), "graph": edges_list, "targets": targets, 'map_line': map_line, 'index': [index_ben, index_vul]}

        with open("data/input/"+str(filename)+".txt", 'w') as file:
            json.dump(dictionary, file, cls=NumpyEncoder)
            file.close()

HBox(children=(FloatProgress(value=0.0, max=111362.0), HTML(value='')))

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node

Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV
Empty node / edge CSV



# Train/Test/Validation split

In [26]:
train_files, rest_files = train_test_split(os.listdir('data/input'), train_size=0.9)
validation_files, test_files = train_test_split(rest_files, test_size=0.5)

In [27]:
print(len(train_files))
print(len(validation_files))
print(len(test_files))

94905
5272
5273


In [28]:
for file in train_files:
    shutil.move('data/input/'+file, 'data/dataset_juliet/train/'+file)

In [None]:
for file in validation_files:
    shutil.move('data/input/'+file, 'data/dataset_juliet/eval/'+file)

In [30]:
for file in test_files:
    shutil.move('data/input/'+file, 'data/dataset_juliet/dev/'+file)