In [31]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-1 creation                                                        #
#                                                                            #
##############################################################################

In [32]:
import json
import numpy as np
import os
import pandas as pd
import random
import shutil

from collections import Counter
from collections import defaultdict
from itertools import chain
from itertools import compress
from tqdm import tqdm

pdcsv = lambda x: pd.read_csv(x, index_col=0)

The following table summarizes the criteria used to generate positive pairs for each task:
* The `X` indicates that the variable is required to be different in each pair
* The `*` indicates that the variable is free and may differ (but it isn't required).

```
|       | Architecture | Bitness | Compiler | Version | Optimization |
|-------|--------------|---------|----------|---------|--------------|
| arch  | X            |         |          |         |              |
| bit   |              | X       |          |         |              |
| comp  |              |         | X        | X       |              |
| ver   |              |         |          | X       |              |
| opt   |              |         |          |         | X            |
| XA    | X            | X       |          |         |              |
| XA+XO | X            | X       |          |         | X            |
| XC    |              |         | X        | X       | X            |
| XC+XB |              | X       | X        | X       | X            |
| XM    | *            | *       | *        | *       | *            |
```

In [33]:
CATEGORIES = [
    "project",
    "library",
    "arch",
    "bit",
    "compiler",
    "version",
    "optimizations",
]

TASKS_DICT = {
    # For any positive pair, the project and the library are the same.
    #   True: the variable is required to have the same value in the positive pair
    #   False: the variable is required to have different values in the negative pair.
    "arch": [
        True, True, False, True, True, True, True],
    "bit": [
        True, True, True, False, True, True, True],
    "comp": [
        True, True, True, True, False, False, True],
    "ver": [
        True, True, True, True, True, False, True],
    "opt": [
        True, True, True, True, True, True, False],
    "XA": [
        True, True, False, False, True, True, True],
    "XA+XO": [
        True, True, False, False, True, True, False],
    "XC": [
        True, True, True, True, False, False, False],
    "XC+XB": [
        True, True, True, False, False, False, False],
    # The following would be the XA+XC test
    # "XA+XC": [
    #    True, True, False, False, False, False, False]
}

# The XO test is the same as the opt one.
TASKS_DICT["XO"] = TASKS_DICT["opt"]

In [34]:
DATASET_ONE_DICT = {
    "projects": {
        "training": ["openssl", "clamav", "curl", "unrar"],
        "validation": ["zlib"],
        "test": ["z3", "nmap"],
    },
    "eval": {
        "validation": {
            "similarity": {"XA": 10000, "XC": 10000, "XC+XB": 10000, "XM": 10000}
        },
        "test": {
            "similarity": {
                "XA": 50000,
                "XC": 50000,
                "XC+XB": 50000,
                "XM": 50000,
                "arch": 50000,
                "bit": 50000,
                "comp": 50000,
                "opt": 50000,
                "ver": 50000,
            },
            "rank": {"XA": 200, "XC": 200, "XC+XB": 200, "XM": 200},
        },
    }
}

In [35]:
# where to save the new dataset
OUTPUT_DIR = "../Dataset-1/"

if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    print(f"[D] DIR created: {OUTPUT_DIR}")
    
for dirname in ['validation', 'testing']:
    tmp_path = os.path.join(OUTPUT_DIR, "pairs", dirname)
    if not os.path.isdir(tmp_path):
        os.makedirs(tmp_path)
        print(f"[D] DIR created: {tmp_path}")

for dirname in ['training', 'validation', 'testing']:
    tmp_path = os.path.join(OUTPUT_DIR, "features", dirname)
    if not os.path.isdir(tmp_path):
        os.makedirs(tmp_path)
        print(f"[D] DIR created: {tmp_path}")

### Create a training / validation / test split

In [36]:
# The starting point
CSV_FLOWCHART_FP = "features/flowchart_Dataset-1.csv"

# Copy the flowchart file to the new folder
shutil.copy(CSV_FLOWCHART_FP, os.path.join(OUTPUT_DIR, "features", "flowchart_Dataset-1.csv")) 

SameFileError: 'features/flowchart_Dataset-1.csv' and '../Dataset-1/features/flowchart_Dataset-1.csv' are the same file

Summary:
   * Step 0 - Read the list of functions from the output of IDA flowchart
   * Step 1 -  Filter the functions with less than 5 BBs
   * Step 2 - Remove duplicated hashopcodes to remove duplicated functions
   * Step 3 - Extract compilation variables from idb_path
   * Step 4 - Create training, validation and test splits
   * Step 5 - Remove common function names across splits
   * Step 6 - Remove singleton functions.

In [37]:
# Step0 - Read the list of functions from the output of IDA flowchart
df = pd.read_csv(CSV_FLOWCHART_FP)
print(f"Shape: {df.shape}")

Shape: (8664141, 8)


In [38]:
# Remove the column with the list of basic-blocks
del df['bb_list']
print(f"Shape: {df.shape}")

Shape: (8664141, 7)


In [39]:
# Step1 -  Filter the functions with less than 5 BBs
df = df[df['bb_num'] >= 5]
print(f"Shape: {df.shape}")

Shape: (8664141, 7)


In [None]:
# Step2 - Remove duplicated hashopcodes to remove duplicated functions
df.drop_duplicates('hashopcodes', keep='first', inplace=True)
print(f"Shape: {df.shape}")

Shape: (3296093, 7)


In [40]:
# Step3 - Extract compilation variables from idb_path
compilation_var = list()
for path in df['idb_path']:
    slist = path.split("/")[2:]
    project = slist[0]
    slist = slist[1].split("_")
    library = slist[1].replace(".i64", "")
    arch, comp, ver, opt = slist[0].split("-")
    bit = "32" if "32" in arch.replace("86", "32") else "64"
    arch = arch.replace("32", "").replace("64", "").replace("86", "")
    if comp == "gcc":
        ver = "gcc_" + ver
    compilation_var.append([project, library, arch, bit, comp, ver, opt])

# Convert to NumPy Array
compilation_var = np.array(compilation_var)

# Add compilation variables to the DataFrame
df['project'] = compilation_var[:,0].tolist()
df['library'] = compilation_var[:,1].tolist()
df['arch'] = compilation_var[:,2].tolist()
df['bit'] = compilation_var[:,3].tolist()
df['compiler'] = compilation_var[:,4].tolist()
df['version'] = compilation_var[:,5].tolist()
df['optimizations'] = compilation_var[:,6].tolist()

print(f"Shape: {df.shape}")

Shape: (8664141, 14)


In [41]:
# Step4 - Create training, validation and test splits
df_training = df[df['project'].isin(DATASET_ONE_DICT['projects']['training'])]
print(f"Shape df_training: \t{df_training.shape}")

df_validation = df[df['project'].isin(DATASET_ONE_DICT['projects']['validation'])]
print(f"Shape df_validation: \t{df_validation.shape}")

df_test = df[df['project'].isin(DATASET_ONE_DICT['projects']['test'])]
print(f"Shape df_test: \t\t{df_test.shape}")

# Reset indexes
df_training.reset_index(inplace=True, drop=True)
df_validation.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)

Shape df_training: 	(2800498, 14)
Shape df_validation: 	(75217, 14)
Shape df_test: 		(5788426, 14)


In [42]:
# Step5 - Remove common function names across splits

# Check for common function names in training and test
r1 = set(df_training['func_name'].values) & set(df_test['func_name'].values)
print(f"# function names to remove: {len(r1)} (train & test)")

# Check for common function names in training and validation
r2 = set(df_training['func_name'].values) & set(df_validation['func_name'].values)
print(f"# function names to remove: {len(r2)} (train & validation)")

df_training = df_training[~df_training['func_name'].isin(r1 | r2)]
df_training.reset_index(inplace=True, drop=True)
print(f"Shape df_training: \t{df_training.shape}")

print()

# Check for common function names in validation and test
r3 = set(df_validation['func_name'].values) & set(df_test['func_name'].values)
print(f"# function names to remove: {len(r3)} (validation & test)")

df_test = df_test[~df_test['func_name'].isin(r3)]
df_test.reset_index(inplace=True, drop=True)
print(f"Shape df_test: \t\t{df_test.shape}")

# function names to remove: 232 (train & test)
# function names to remove: 50 (train & validation)
Shape df_training: 	(2778985, 14)

# function names to remove: 125 (validation & test)
Shape df_test: 		(5778795, 14)


In [43]:
# Step6 - Remove singleton functions
for df_t in [df_training, df_validation, df_test]:
    sl = [x for x, y in df_t[["library", "func_name"]].value_counts().items() if y < 2]
    gg = df_t.groupby(["library", "func_name"]).groups
    idx_list = list(chain(*[list(gg[i]) for i in sl]))
    print(f"[D] # function to remove: {len(idx_list)}")
    
    df_t.drop(idx_list, inplace=True)
    df_t.reset_index(inplace=True, drop=True)
    print(f"[D] Shape: {df_t.shape}\n")

[D] # function to remove: 2116
[D] Shape: (2776869, 14)

[D] # function to remove: 17
[D] Shape: (75200, 14)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t.drop(idx_list, inplace=True)


[D] # function to remove: 34482
[D] Shape: (5744313, 14)



In [44]:
print(f"Shape df_training: \t{df_training.shape}")
print(f"Shape df_validation: \t{df_validation.shape}")
print(f"Shape df_test: \t\t{df_test.shape}")

Shape df_training: 	(2776869, 14)
Shape df_validation: 	(75200, 14)
Shape df_test: 		(5744313, 14)


### Create positive and negative pairs for validation and test

In [None]:
def create_similarity_pairs(df_input, num_pairs, test):
    """
    Generate "num_pairs" positive function pairs by sub sampling all the
    possible function combinations. Use this function when the number
    of ((libraries, function_names)) is limited to few hundreds.
    """
    # Map (libraries, function_names) to the indexes in the DB
    libfunc_dict = {
        k: list(v) for k, v in df_input.groupby(["library", "func_name"]).groups.items()
    }

    pos_pair_set = set()
    neg_pair_set = set()
    pos_pair_list = list()
    neg_pair_list = list()

    # Iterate over each library/func_name pair
    for entry in tqdm(libfunc_dict.keys(), ncols=100):
        libname, fname = entry

        # Get the list of indexes associated to the ((libname, fname)) pair
        idx_libfunc = libfunc_dict[entry]
        # DataFrame for the library/func_name pair
        df_libfunc = df_input.iloc[idx_libfunc]

        # Get the list of indexes to select negative pairs
        # idx_list_neg = df_input[df_input["func_name"] != fname].index
        bool_mask = ~df_input['func_name'].isin([fname])
        idx_list_neg = np.where(bool_mask)[0]
        # (<-- left) Iterate over each function for the ((libname, fname)) pair
        for idx_left_p in idx_libfunc:

            # Extract the compilation variables
            comp_data = df_input.iloc[idx_left_p][CATEGORIES].values

            # For the XM test, any combination is valid
            idx_list_pos = idx_libfunc

            if test != "XM":
                mask = TASKS_DICT[test]
                # Build the constraints dict
                #   if m is True: the variable is required to be the same in the positive pair
                fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if m}
                constraints = [(df_libfunc[k] == v) for k, v in fd.items()]
                #   if m is False: the variable is required to be different in the positive pair.
                fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if not m}
                constraints += [(df_libfunc[k] != v) for k, v in fd.items()]

                # Get the list of indexes of candidate right functions to generate positive pairs
                idx_list_pos = df_libfunc[np.logical_and.reduce(constraints)].index

            # Remove the left function from the list
            idx_list_pos = [idx for idx in idx_list_pos if idx != idx_left_p]

            # Iterate over each (--> right) function
            for idx_right_p in idx_list_pos:
                pos_pair = (idx_left_p, idx_right_p)

                # Check if the pos_pair is already in the list
                if tuple(sorted(pos_pair)) not in pos_pair_set:
                    pos_pair_set.add(tuple(sorted(pos_pair)))
                    pos_pair_list.append(pos_pair)

                    # Generate the corresponding negative pair
                    is_success = False
                    while not is_success:
                        idx_right_n = random.choice(idx_list_neg)
                        neg_pair = (idx_left_p, idx_right_n)

                        # Check if the neg_pair is already in the list
                        if tuple(sorted(neg_pair)) not in neg_pair_set:
                            neg_pair_set.add(tuple(sorted(neg_pair)))
                            neg_pair_list.append(neg_pair)
                            is_success = True

    # print(
    #     f"[D] Before sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
    # )

    # Sub sample the positive and negative pairs to num_pairs
    if len(pos_pair_list) > num_pairs:
        sampled_list = random.sample(list(range(len(pos_pair_list))), num_pairs)
        pos_pair_list = [pos_pair_list[x] for x in sampled_list]
        neg_pair_list = [neg_pair_list[x] for x in sampled_list]
        # print(
        #     f"[D] After sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
        # )

    return pos_pair_list, neg_pair_list

In [None]:
import multiprocessing
import pandas as pd


class MultiProcessCollector:
    def __init__(self, threshold: int) -> None:
        self.threshold = threshold
        
        self.pos_pair_set = set()
        self.neg_pair_set = set()
        self.pos_pair_list = list()
        self.neg_pair_list = list()
        
    def start_collector(self, input_channel: multiprocessing.Queue, output_channel: multiprocessing.Queue):
        bar = tqdm(total=self.threshold, desc="Collected Pairs")
        while True:
            
            if not output_channel.empty():
                signal = output_channel.get()
                if isinstance(signal, str) and signal == "STOP":
                    break
            
            pos_pair_list_recv, neg_pair_list_recv = input_channel.get()
            if not len(pos_pair_list_recv) == len(neg_pair_list_recv):
                print(f"pos_pair_list_recv: {len(pos_pair_list_recv)}, neg_pair_list_recv: {len(neg_pair_list_recv)}")
                print(f"pos_pair_list_recv: {pos_pair_list_recv}")
                print(f"neg_pair_list_recv: {neg_pair_list_recv}")
                raise ValueError("The length of the positive and negative pairs are not the same")
            
            
            for i in range(len(pos_pair_list_recv)):
                pos_pair = pos_pair_list_recv[i]
                neg_pair = neg_pair_list_recv[i]
                if tuple(sorted(pos_pair)) not in self.pos_pair_set and tuple(sorted(neg_pair)) not in self.neg_pair_set:
                    self.pos_pair_set.add(tuple(sorted(pos_pair)))
                    self.neg_pair_set.add(tuple(sorted(neg_pair)))
                    self.pos_pair_list.append(pos_pair)
                    self.neg_pair_list.append(neg_pair)
                    bar.update(1)
                    
            if len(self.pos_pair_list) >= self.threshold:
                output_channel.put("STOP")
                break
        
        output_channel.put((self.pos_pair_list, self.neg_pair_list))
        bar.close()

class MultiProcessWorker:
    def __init__(self, df_input: pd.DataFrame, test: str, libfunc_dict: dict, categories: list, task_dict: dict):
        self.df_input = df_input
        self.test = test
        self.libfunc_dict = libfunc_dict
        self.categories = categories
        self.task_dict = task_dict

    def start_worker(self, input_queue: multiprocessing.Queue, output_queue: multiprocessing.Queue):
        while True:
            entry = input_queue.get()
            if isinstance(entry, str) and entry == "STOP":
                break
            
            res = self.handle_entry(entry)
            output_queue.put(res)

    def handle_entry(self, entry):
        idx_libfunc = self.libfunc_dict[entry]
        df_libfunc = self.df_input.iloc[idx_libfunc]
        
        pos_pair_list = []
        neg_pair_list = []
        
        for idx_left_p in idx_libfunc:
            comp_data = self.df_input.iloc[idx_left_p][self.categories].values
            idx_list_pos = idx_libfunc
            if self.test != "XM":
                # might use the query to speed up the process
                df_libfunc: pd.DataFrame = self.df_input.iloc[idx_libfunc]
                mask = self.task_dict[self.test]

                condition = ''

                for m, c, v in zip(mask, self.categories, comp_data):
                    if isinstance(v, str):
                        condition += f'({c} == "{v}") and ' if m else f'({c} != "{v}") and '
                    else:
                        condition += f'({c} == {v}) and ' if m else f'({c} != {v}) and '

                idx_list_pos = df_libfunc.query(condition[:-5]).index

            idx_list_pos = [idx for idx in idx_list_pos if idx != idx_left_p]
            for idx_right_p in idx_list_pos:
                pos_pair = (idx_left_p, idx_right_p)
                pos_pair_list.append(pos_pair)
                is_success = False
                while not is_success:
                    idx_right_n = random.choice(range(len(self.df_input)))

                    if self.df_input.iloc[idx_right_n]["func_name"] == self.df_input.iloc[idx_left_p]["func_name"]:
                        continue

                    neg_pair = (idx_left_p, idx_right_n)
                    if neg_pair not in neg_pair_list:
                        neg_pair_list.append(neg_pair)
                        is_success = True

        assert len(pos_pair_list) == len(neg_pair_list)
        return pos_pair_list, neg_pair_list


def spawn_collector_process_for_no_rand(threshold: int, input_queue: multiprocessing.Queue, output_queue: multiprocessing.Queue, lock: multiprocessing.Lock):
    collector_process = MultiProcessCollector(threshold)
    lock.acquire()
    collector_process.start_collector(input_queue, output_queue)
    lock.release()
    
def spawn_worker_process_for_no_rand(df_input: pd.DataFrame, test: str, libfunc_dict: dict, categories: list, task_dict: dict, input_queue: multiprocessing.Queue, output_queue: multiprocessing.Queue, lock: multiprocessing.Lock):
    worker_process = MultiProcessWorker(df_input, test, libfunc_dict, categories, task_dict)
    lock.acquire()
    worker_process.start_worker(input_queue, output_queue)
    lock.release()
    

def create_similarity_pairs_parallel(df_input: pd.DataFrame, num_pairs: int, test: str):
    libfunc_dict = {
        k: list(v) for k, v in df_input.groupby(["library", "func_name"]).groups.items()
    }

    threshold = min(len(libfunc_dict), num_pairs)

    manager = multiprocessing.Manager()
    
    collector_lock = multiprocessing.Lock()
    collector_lock.acquire()

    cpu_count = multiprocessing.cpu_count()
    
    worker_input_queue = manager.Queue(maxsize=cpu_count*10)
    worker_output_queue = manager.Queue(maxsize=cpu_count*10)
    collector_output_queue = manager.Queue(maxsize=cpu_count*10)
    
    
    collector_process = multiprocessing.Process(target=spawn_collector_process_for_no_rand, args=(threshold, worker_output_queue, collector_output_queue, collector_lock))
    
    worker_locks = [multiprocessing.Lock() for _ in range(cpu_count-1)]
    worker_processes = []
    
    for i in range(cpu_count-1):
        worker_locks[i].acquire()
        worker_process = multiprocessing.Process(target=spawn_worker_process_for_no_rand, args=(df_input, test, libfunc_dict, CATEGORIES, TASKS_DICT, worker_input_queue, worker_output_queue, worker_locks[i]))
        worker_processes.append(worker_process)
        
    collector_process.start()
    collector_lock.release()
    
    for i in range(len(worker_processes)):
        worker_processes[i].start()
        worker_locks[i].release()
        
    # Start give task to the worker processes
    bar_main = tqdm(total=len(libfunc_dict), desc="All Function consumed")
    for entry in libfunc_dict.keys():
        worker_input_queue.put(entry)
        bar_main.update(1)
        if not collector_output_queue.empty():
            signal = collector_output_queue.get()
            if isinstance(signal, str) and signal == "STOP":
                break
    bar_main.close()
            
    collector_output_queue.put("STOP")
    
    for i in range(len(worker_processes)):
        worker_input_queue.put("STOP")
        
    
    while not worker_output_queue.empty():
        _ = worker_output_queue.get()
    
    for i in range(len(worker_processes)):
        worker_processes[i].join()
    
    collector_process.join()
    
    while True:
        result = collector_output_queue.get()
        if isinstance(result, str) and result == "STOP":
            continue
        else:
            break
    
    pos_pair_list, neg_pair_list = result
    
    print(
        f"[D] Before sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
    )

    # Sub sample the positive and negative pairs to num_pairs
    if len(pos_pair_list) > num_pairs:
        sampled_list = random.sample(list(range(len(pos_pair_list))), num_pairs)
        pos_pair_list = [pos_pair_list[x] for x in sampled_list]
        neg_pair_list = [neg_pair_list[x] for x in sampled_list]
        print(
            f"[D] After sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
        )

    return pos_pair_list, neg_pair_list

In [None]:
def create_similarity_pairs_random_version(df_input, num_pairs, test, num_negatives=1):
    """
    Randomly generate "num_pairs" positive function pairs. Use this function
    when the number of (libraries, function_names) pairs is > 1 thousand.
    """
    # Map (libraries, function_names) to the indexes in the DB
    libfunc_dict = {
        k: list(v) for k, v in df_input.groupby(["library", "func_name"]).groups.items()
    }
    libfunc_list = list(libfunc_dict.keys())

    pos_pair_set = set()
    neg_pair_set = set()
    pos_pair_list = list()
    neg_pair_list = list()

    with tqdm(total=num_pairs, ncols=100) as pbar:
        # Iterate num_pairs time to create the pos/neg function pairs
        for _ in range(num_pairs):

            # Iterate until a positive function pair is generated
            is_success_pos = False
            while not is_success_pos:

                # Randomly select a library/func_name pair
                entry = random.choice(libfunc_list)
                libname, fname = entry
                # Get the list of indexes associated to the library/func_name pair
                idx_libfunc = libfunc_dict[entry]
                # DataFrame for the library/func_name pair
                df_libfunc = df_input.iloc[idx_libfunc]

                # Randomly select a (<-- left) function
                idx_left_p = random.choice(idx_libfunc)
                # Extract the compilation variables
                comp_data = df_input.iloc[idx_left_p][CATEGORIES].values

                # For the XM test, any combination is valid
                idx_list_pos = idx_libfunc

                if test != "XM":
                    mask = TASKS_DICT[test]
                    # Build the constraints dict
                    #   if m is True: the variable is required to be the same in the positive pair
                    fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if m}
                    constraints = [(df_libfunc[k] == v) for k, v in fd.items()]
                    #   if m is False: the variable is required to be different in the positive pair.
                    fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if not m}
                    constraints += [(df_libfunc[k] != v) for k, v in fd.items()]

                    # Get the list of indexes of candidate right functions to generate positive pairs
                    idx_list_pos = df_libfunc[np.logical_and.reduce(constraints)].index

                # Remove the left function from the list
                idx_list_pos = [idx for idx in idx_list_pos if idx != idx_left_p]

                # No functions are left. Retry
                if len(idx_list_pos) == 0:
                    continue

                # Randomly select a (<-- right) function
                idx_right_p = random.choice(idx_list_pos)
                pos_pair = (idx_left_p, idx_right_p)
                if tuple(sorted(pos_pair)) not in pos_pair_set:
                    pos_pair_set.add(tuple(sorted(pos_pair)))
                    pos_pair_list.append(pos_pair)
                    is_success_pos = True

                    for _ in range(num_negatives):
                        # Generate the corresponding negative pair
                        is_success_neg = False
                        while not is_success_neg:
                            idx_right_n = random.randint(0, df_input.shape[0] - 1)
                            if df_input.iloc[idx_right_n]["func_name"] == fname:
                                continue
                            neg_pair = (idx_left_p, idx_right_n)

                            # Check if the neg_pair is already in the list
                            if tuple(sorted(neg_pair)) not in neg_pair_set:
                                neg_pair_set.add(tuple(sorted(neg_pair)))
                                neg_pair_list.append(neg_pair)
                                is_success_neg = True

            # Update the progress bar
            pbar.update(1)

        # print(f"[D] pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}")

    return pos_pair_list, neg_pair_list


In [None]:
import itertools

class RandomPairsWorker:
    def __init__(self, df_input: pd.DataFrame, libfunc_dict: dict, test: str, num_negatives: int, task_dict: dict, categories: list, scaler: float = 1.5):
        self.df_input = df_input
        self.libfunc_dict = libfunc_dict
        self.test = test
        self.num_negatives = num_negatives
        
        self.categories = categories
        self.task_dict = task_dict
        
        self.scaler = scaler
        
    def start_worker(self, input_queue: multiprocessing.Queue, output_queue: multiprocessing.Queue):
        while True:
            entry = input_queue.get()
            if isinstance(entry, str) and entry == "STOP":
                break
            succ, pos_pair_list, neg_pair_list = self.handle_one(entry)
            if succ:
                output_queue.put((pos_pair_list, neg_pair_list))
        
    def handle_one(self, entry):
        
        pos_pair_list = []
        
        total_pos_pair_list = []
        total_neg_pair_list = []
        
        is_success_pos = False

        # Randomly select a library/func_name pair
        libname, fname = entry
        # Get the list of indexes associated to the library/func_name pair
        idx_libfunc = self.libfunc_dict[entry]
        # DataFrame for the library/func_name pair
        df_libfunc = self.df_input.iloc[idx_libfunc]

        # Randomly select a (<-- left) function
        idx_left_p = random.choice(idx_libfunc)
        
        for idx_left_p in idx_libfunc:
            # Extract the compilation variables
            comp_data = self.df_input.iloc[idx_left_p][self.categories].values

            # For the XM test, any combination is valid
            idx_list_pos = idx_libfunc

            if self.test != "XM":
                mask = self.task_dict[self.test]
                # Build the constraints dict
                #   if m is True: the variable is required to be the same in the positive pair
                fd = {c: v for m, c, v in zip(mask, self.categories, comp_data) if m}
                constraints = [(df_libfunc[k] == v) for k, v in fd.items()]
                #   if m is False: the variable is required to be different in the positive pair.
                fd = {c: v for m, c, v in zip(mask, self.categories, comp_data) if not m}
                constraints += [(df_libfunc[k] != v) for k, v in fd.items()]

                # Get the list of indexes of candidate right functions to generate positive pairs
                idx_list_pos = df_libfunc[np.logical_and.reduce(constraints)].index
                
                # need to use query
                # query_str = ""
                
                # for m, c, v in zip(mask, self.categories, comp_data):
                #     if m:
                #         if isinstance(v, str):
                #             query_str += f"{c} == '{v}' and "
                #         else:
                #             query_str += f"{c} == {v} and "
                #     else:
                #         if isinstance(v, str):
                #             query_str += f"{c} != '{v}' and "
                #         else:
                #             query_str += f"{c} != {v} and "
                            
                # idx_list_pos = df_libfunc.query(query_str[:-5]).index

            # Remove the left function from the list
            idx_list_pos = [idx for idx in idx_list_pos if idx != idx_left_p]

            number_of_positives = len(idx_list_pos)
            number_of_positives = random.randint(0, number_of_positives)
            idx_list_pos = random.sample(idx_list_pos, number_of_positives)

            # No functions are left. Retry
            if len(idx_list_pos) == 0:
                continue

            # Randomly select a (<-- right) function
            
            for idx_right_p in idx_list_pos:
                
                pos_pair = (idx_left_p, idx_right_p)

                pos_pair_list.append(pos_pair)
                
                # At lease one pair is successfully generated
                is_success_pos = True

                is_success_neg = False
                neg_pair_list = []
                neg_pair_set = set()
                
                for _ in range(round(self.num_negatives * self.scaler)):
                    # Generate the corresponding negative pair
                    is_success_neg = False
                    while not is_success_neg:
                        idx_right_n = random.randint(0, self.df_input.shape[0] - 1)
                        if self.df_input.iloc[idx_right_n]["func_name"] == fname:
                            continue
                        neg_pair = (idx_left_p, idx_right_n)

                        # Check if the neg_pair is already in the list
                        if tuple(sorted(neg_pair)) not in neg_pair_set:
                            neg_pair_set.add(tuple(sorted(neg_pair)))
                            neg_pair_list.append(neg_pair)
                            is_success_neg = True
                            
                if is_success_neg:
                    total_pos_pair_list.append(pos_pair)
                    total_neg_pair_list.append(neg_pair_list)
                    pos_pair_list = []
                    neg_pair_list = []
                    neg_pair_set = set()
                    
        if is_success_pos and is_success_neg:
            return True, total_pos_pair_list, total_neg_pair_list
        else:
            return False, [], []


class RandomPairsCollector:
    def __init__(self, threshold: int, num_negatives: int, test: str, scaler: float = 1.5):
        self.threshold = threshold
        self.test = test
        self.num_negatives = num_negatives
        self.scaler = scaler
        
    def start_collector(self, input_queue: multiprocessing.Queue, output_queue: multiprocessing.Queue):
        

        bar = tqdm(total=round(self.threshold * self.scaler), desc="Collecting Pairs", position=1)

        
        pos_pair_list = []
        neg_pair_list = []
        pos_pair_set = set()
        neg_pair_set = set()
        
        attempt = 0
        
        while True:
            if not output_queue.empty():
                msg = output_queue.get()
                if isinstance(msg, str) and msg == "STOP":
                    break
            
            if input_queue.empty():
                continue
            
            total_pos_pair_list_recv, total_neg_pair_list_recv = input_queue.get()
            
            if len(total_pos_pair_list_recv) != len(total_neg_pair_list_recv):
                print(f"[E] Positive Pair Recv Length: {len(total_pos_pair_list_recv)}, Negative Pair Recv Length: {len(total_neg_pair_list_recv)}")
                raise ValueError("The number of negative pairs is not correct.")
            
            attempt += len(total_pos_pair_list_recv)
            
            for i in range(len(total_pos_pair_list_recv)):
            
                pos_pair_list_recv = total_pos_pair_list_recv[i]
                neg_pair_list_recv = total_neg_pair_list_recv[i]
            
                if round(self.scaler * self.num_negatives) != len(neg_pair_list_recv):
                    print(f"[E] Positive Pair Recv Length: {len(pos_pair_list_recv)} - {pos_pair_list_recv}, Negative Pair Recv Length: {len(neg_pair_list_recv)}")
                    raise ValueError("The number of negative pairs is not correct.")
            
                
                if tuple(sorted(list(pos_pair_list_recv))) in pos_pair_set:
                    continue
                
                temp_neg_pair = []
                for neg_pair in neg_pair_list_recv:
                    if tuple(sorted(list(neg_pair))) in neg_pair_set:
                        continue
                    temp_neg_pair.append(neg_pair)
                
                if len(temp_neg_pair) < self.num_negatives:
                    continue
            
                temp_neg_pair = temp_neg_pair[:self.num_negatives]
            
                pos_pair_set.add(tuple(sorted(list(pos_pair_list_recv))))
                pos_pair_list.append(pos_pair_list_recv)
            
                for i in range(len(temp_neg_pair)):
                    neg_pair_set.add(tuple(sorted(list(temp_neg_pair[i]))))
                    neg_pair_list.append(temp_neg_pair[i])
                bar.update(1)
                
            # Need to run all the functions and use random sample to pick the pairs                     
            if len(pos_pair_list) >= round(self.threshold * self.scaler):
                # print("[Collector] Threshold Reached")
                output_queue.put("STOP")
                while not input_queue.empty():
                    input_queue.get()
                break
        
        # print(f"[Collector] Attempt: {attempt}, Putting Result to Main")
        output_queue.put((pos_pair_list, neg_pair_list))
        bar.close()
        

def spawn_collector_process_for_rand(threshold: int, num_negatives: int, input_channel: multiprocessing.Queue, output_channel: multiprocessing.Queue, lock: multiprocessing.Lock, test: str, scaler: float = 1.5):
    collector = RandomPairsCollector(threshold, num_negatives, test, scaler)
    lock.acquire()
    collector.start_collector(input_channel, output_channel)
    lock.release()
    
    
def spawn_worker_process_for_rand(df_input: pd.DataFrame, libfunc_dict: dict, test: str, num_negatives: int, task_dict: dict, categories: list, input_channel: multiprocessing.Queue, output_channel: multiprocessing.Queue, lock: multiprocessing.Lock, scaler: float = 1.5):
    worker = RandomPairsWorker(df_input, libfunc_dict, test, num_negatives, task_dict, categories, scaler)
    lock.acquire()
    worker.start_worker(input_channel, output_channel)
    lock.release()


def create_similarity_pairs_random_version_parallel(df_input, num_pairs, test, num_negatives=1):
    """
    Randomly generate "num_pairs" positive function pairs. Use this function
    when the number of (libraries, function_names) pairs is > 1 thousand.
    """
    libfunc_dict = {
        k: list(v) for k, v in df_input.groupby(["library", "func_name"]).groups.items()
    }
    libfunc_list = list(libfunc_dict.keys())
    
    # random.shuffle(libfunc_list)
    
    manager = multiprocessing.Manager()
    
    collector_lock = multiprocessing.Lock()
    collector_lock.acquire()
    
    cpu_count = multiprocessing.cpu_count()
    
    worker_input_channel = manager.Queue(maxsize=cpu_count*2)
    worker_output_channel = manager.Queue(maxsize=cpu_count*2)
    collector_output_channel = manager.Queue(maxsize=cpu_count*2)
    
    collector_process = multiprocessing.Process(target=spawn_collector_process_for_rand, args=(num_pairs, num_negatives, worker_output_channel, collector_output_channel, collector_lock, test))
    collector_process.daemon = True
    
    worker_locks = [multiprocessing.Lock() for _ in range(cpu_count - 1)]
    worker_processes = []
    
    for i in range(len(worker_locks)):
        worker_locks[i].acquire()
        worker_process = multiprocessing.Process(target=spawn_worker_process_for_rand, args=(df_input, libfunc_dict, test, num_negatives, TASKS_DICT, CATEGORIES, worker_input_channel, worker_output_channel, worker_locks[i]))
        worker_process.daemon = True
        worker_processes.append(worker_process)
        
    collector_process.start()
    collector_lock.release()
    
    for i in range(len(worker_processes)):
        worker_processes[i].start()
        worker_locks[i].release()
        

    random.shuffle(libfunc_list)
        
    bar_main = tqdm(total=len(libfunc_list), desc="All Function Consumed")
    for i in range(len(libfunc_list)):
        worker_input_channel.put(libfunc_list[i])
        bar_main.update(1)
        
        if not collector_output_channel.empty():
            msg = collector_output_channel.get()
            if isinstance(msg, str) and msg == "STOP":
                # print("[Main] Get STOP from Collector Process.")
                break
            
    bar_main.close()
    
    # print("[Main] Putting STOP to the worker processes and waiting for them to finish.")
    for i in range(len(worker_processes)):
        worker_input_channel.put("STOP")
        
    # Empty the worker output channel

    # print("[Main] Emptying the worker output channel.")
    while not worker_output_channel.empty():
        worker_output_channel.get()
    
    # print("[Main] Waiting for the worker processes to finish.")
    for i in range(len(worker_processes)):
        while not worker_output_channel.empty():
            worker_output_channel.get()
        worker_processes[i].terminate()
        worker_processes[i].join()
    
    # print("[Main] Putting STOP to the collector process and waiting for it to finish.")
    collector_output_channel.put("STOP")
    collector_process.join()

    # print("[Main] Collecting Result From Collector Process.")
    while True:
        result = collector_output_channel.get()
        if isinstance(result, str) and result == "STOP":
            continue
        else:
            break
    

    pos_pair_list, neg_pair_list = result
    
    print(
        f"[D] Before sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
    )

    pos_pair_list = [[x] for x in pos_pair_list]
    neg_pair_list = [neg_pair_list[i*num_negatives:(i+1)*num_negatives] for i in range(len(neg_pair_list) // num_negatives)]
    
    assert len(pos_pair_list) == len(neg_pair_list)
    
    # Sub sample the positive and negative pairs to num_pairs
    if len(pos_pair_list) > num_pairs:
        sampled_list = random.sample(list(range(len(pos_pair_list))), num_pairs)
        pos_pair_list = [pos_pair_list[x] for x in sampled_list]
        neg_pair_list = [neg_pair_list[x] for x in sampled_list]
        
        pos_pair_list = [x[0] for x in pos_pair_list]
        neg_pair_list = list(itertools.chain(*neg_pair_list))
        
        print(
            f"[D] After sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
        )

    return pos_pair_list, neg_pair_list

    

In [None]:
def convert_dicts_into_dataframes(df_input, dataset_dict):
    pair_columns = [
        "idb_path_1",
        "fva_1",
        "func_name_1",
        "idb_path_2",
        "fva_2",
        "func_name_2",
        "db_type",
    ]

    pos_pair_dict = defaultdict(list)
    neg_pair_dict = defaultdict(list)
    
    # Iterate over each positive and negative pair.
    #   Select the required info and save it in a new dictionary.
    for task in dataset_dict:
        for pos_pair in dataset_dict[task]["pos"]:
            for c in ["idb_path", "fva", "func_name"]:
                pos_pair_dict[c + "_1"].append(df_input.iloc[pos_pair[0]][c])
                pos_pair_dict[c + "_2"].append(df_input.iloc[pos_pair[1]][c])
            pos_pair_dict["db_type"].append(task)

        for neg_pair in dataset_dict[task]["neg"]:
            for c in ["idb_path", "fva", "func_name"]:
                neg_pair_dict[c + "_1"].append(df_input.iloc[neg_pair[0]][c])
                neg_pair_dict[c + "_2"].append(df_input.iloc[neg_pair[1]][c])
            neg_pair_dict["db_type"].append(task)
    
    # Convert the local pair_dicts into DataFrames
    df_pos = pd.DataFrame.from_dict(pos_pair_dict)
    df_neg = pd.DataFrame.from_dict(neg_pair_dict)
    
    # Check/change the order of the columns
    df_pos = df_pos[pair_columns]
    df_neg = df_neg[pair_columns]
    return df_pos, df_neg

In [None]:
def print_summary(dataset_dict):
    print("[D] Summary:") 
    for task in dataset_dict:
        print(
            "[D] \tTask: {:5} - pos: {:5} neg: {:5}".format(
                task, len(dataset_dict[task]["pos"]), len(dataset_dict[task]["neg"])
            )
        )
    print("\n")

In [None]:
def print_free_variables(df_input, task_list, dataset_dict):
    for task in task_list:
        # Skip "XM"
        if task not in TASKS_DICT:
            continue

        print("-" * 100 + "\n")
        print(f"[D] Task: {task}\n")

        # Get the name of the free variables for each task
        free_variables = list(
            compress(CATEGORIES, [not x for x in TASKS_DICT[task]])
        )

        v_list = list()
        for pos_pair in dataset_dict[task]["pos"]:
            # Get the values associated to the free variables
            vv = df_input.iloc[list(pos_pair)][free_variables].values
            # Sort them to avoid counting the permutations
            vv = tuple(sorted([tuple(x) for x in vv]))
            v_list.append(vv)

        # Print the frequency of each combination
        for k, v in Counter(v_list).most_common():
            print(f"\t{v:5}, {k}")

        print()

In [None]:

def create_pos_neg_dataset(
    df_input, task_dict, output_dir, output_fs, rand=True, num_negatives=1
):
    print("[D] Creating the pos/neg function pairs...", flush=True)
    dataset_dict = defaultdict(dict)

    for task, num_pairs in task_dict.items():
        ppl, npl = None, None
        if rand:
            # Use the random version of the pair generation function
            ppl, npl = create_similarity_pairs_random_version_parallel(
                df_input, num_pairs, task, num_negatives
            )
        else:
            ppl, npl = create_similarity_pairs_parallel(df_input, num_pairs, task)
        dataset_dict[task]["pos"] = ppl
        dataset_dict[task]["neg"] = npl

    print_summary(dataset_dict)

    print("[D] Converting the positive/negative pairs into CSV...", flush=True)
    df_pos, df_neg = convert_dicts_into_dataframes(df_input, dataset_dict)

    pos_fp = os.path.join(output_dir, output_fs.format("pos"))
    df_pos.to_csv(pos_fp)
    print(f"[D] \tPos CSV: {pos_fp}")

    neg_fp = os.path.join(output_dir, output_fs.format("neg"))
    df_neg.to_csv(neg_fp)
    print(f"[D] \tNeg CSV: {neg_fp}")

    # For debug only
    print_free_variables(df_input, task_dict.keys(), dataset_dict)
    
    selected_functions = set()
    for task in dataset_dict:
        for pair in dataset_dict[task]["pos"]:
            selected_functions.update(list(pair))
        for pair in dataset_dict[task]["neg"]:
            selected_functions.update(list(pair))
    return selected_functions
    

In [None]:
# Create pairs for validation dataset
sf_set = create_pos_neg_dataset(
    df_validation,
    DATASET_ONE_DICT["eval"]["validation"]["similarity"],
    os.path.join(OUTPUT_DIR, "pairs", "validation"),
    "{}_validation_Dataset-1.csv",
    rand=False,
    num_negatives=1
)

df_validation = df_validation.iloc[list(sf_set)]
df_validation.reset_index(inplace=True, drop=True)

[D] Creating the pos/neg function pairs...


All Function consumed:  99%|█████████▊| 390/395 [00:00<00:00, 1277.32it/s]



[D] Before sampling - pos: 411 - neg: 411


Collected Pairs: 418it [00:01, 325.81it/s]                       7.91it/s]
All Function consumed:  93%|█████████▎| 369/395 [00:00<00:00, 1711.48it/s]


[D] Before sampling - pos: 418 - neg: 418


Collected Pairs: 404it [00:01, 312.65it/s]                       3.04it/s]
All Function consumed:  94%|█████████▍| 371/395 [00:00<00:00, 1704.86it/s]


[D] Before sampling - pos: 404 - neg: 404


Collected Pairs: 790it [00:01, 661.18it/s]                         81it/s]
All Function consumed:  92%|█████████▏| 363/395 [00:00<00:00, 2216.18it/s]


[D] Before sampling - pos: 790 - neg: 790
[D] Summary:
[D] 	Task: XA    - pos:   411 neg:   411
[D] 	Task: XC    - pos:   418 neg:   418
[D] 	Task: XC+XB - pos:   404 neg:   404
[D] 	Task: XM    - pos:   790 neg:   790


[D] Converting the positive/negative pairs into CSV...
[D] 	Pos CSV: ../Dataset-1/pairs/validation/pos_validation_Dataset-1.csv
[D] 	Neg CSV: ../Dataset-1/pairs/validation/neg_validation_Dataset-1.csv
----------------------------------------------------------------------------------------------------

[D] Task: XA

	  104, (('arm', '64'), ('mips', '32'))
	   93, (('mips', '32'), ('x', '64'))
	   84, (('arm', '64'), ('x', '32'))
	   67, (('mips', '64'), ('x', '32'))
	   35, (('arm', '32'), ('mips', '64'))
	   28, (('arm', '32'), ('x', '64'))

----------------------------------------------------------------------------------------------------

[D] Task: XC

	   12, (('clang', '9', 'O0'), ('gcc', 'gcc_7', 'O1'))
	    9, (('clang', '7', 'O0'), ('gcc', 'gcc_4.8', 'O1'))
	  

In [None]:
# Create pairs for test dataset
sf_set_1 = create_pos_neg_dataset(
    df_test,
    DATASET_ONE_DICT["eval"]["test"]["similarity"],
    os.path.join(OUTPUT_DIR, "pairs", "testing"),
    "{}_testing_Dataset-1.csv",
    rand=True,
    num_negatives=1
)

[D] Creating the pos/neg function pairs...


All Function Consumed:   7%|▋         | 2976/45081 [00:04<01:10, 596.36it/s]
Collecting Pairs: 75040it [00:06, 12324.20it/s]                           


[D] Before sampling - pos: 75040 - neg: 75040
[D] After sampling - pos: 50000 - neg: 50000


Collecting Pairs: 75077it [00:02, 26822.85it/s]                           ] 



[D] Before sampling - pos: 75077 - neg: 75077
[D] After sampling - pos: 50000 - neg: 50000


Collecting Pairs: 75207it [00:02, 26156.52it/s]                           ] 



[D] Before sampling - pos: 75207 - neg: 75207
[D] After sampling - pos: 50000 - neg: 50000


Collecting Pairs: 76714it [00:02, 27850.60it/s]81 [00:01<05:52, 127.24it/s] 



[D] Before sampling - pos: 76714 - neg: 76714
[D] After sampling - pos: 50000 - neg: 50000


All Function Consumed:   6%|▋         | 2839/45081 [00:05<01:18, 536.08it/s]
Collecting Pairs: 75021it [00:06, 11650.35it/s]                           


[D] Before sampling - pos: 75021 - neg: 75021
[D] After sampling - pos: 50000 - neg: 50000


All Function Consumed:  12%|█▏        | 5197/45081 [00:08<01:07, 593.29it/s]
Collecting Pairs: 75039it [00:09, 7542.98it/s]                           


[D] Before sampling - pos: 75039 - neg: 75039
[D] After sampling - pos: 50000 - neg: 50000


Collecting Pairs: 75071it [00:05, 14823.69it/s]                           s]



[D] Before sampling - pos: 75071 - neg: 75071
[D] After sampling - pos: 50000 - neg: 50000


All Function Consumed:   4%|▍         | 1811/45081 [00:03<01:18, 552.19it/s]
Collecting Pairs: 75084it [00:04, 16886.71it/s]                           


[D] Before sampling - pos: 75084 - neg: 75084
[D] After sampling - pos: 50000 - neg: 50000


All Function Consumed:   5%|▍         | 2087/45081 [00:03<01:15, 569.77it/s]
Collecting Pairs: 75062it [00:04, 15386.75it/s]                           


[D] Before sampling - pos: 75062 - neg: 75062
[D] After sampling - pos: 50000 - neg: 50000
[D] Summary:
[D] 	Task: XA    - pos: 50000 neg: 50000
[D] 	Task: XC    - pos: 50000 neg: 50000
[D] 	Task: XC+XB - pos: 50000 neg: 50000
[D] 	Task: XM    - pos: 50000 neg: 50000
[D] 	Task: arch  - pos: 50000 neg: 50000
[D] 	Task: bit   - pos: 50000 neg: 50000
[D] 	Task: comp  - pos: 50000 neg: 50000
[D] 	Task: opt   - pos: 50000 neg: 50000
[D] 	Task: ver   - pos: 50000 neg: 50000


[D] Converting the positive/negative pairs into CSV...
[D] 	Pos CSV: ../Dataset-1/pairs/testing/pos_testing_Dataset-1.csv
[D] 	Neg CSV: ../Dataset-1/pairs/testing/neg_testing_Dataset-1.csv
----------------------------------------------------------------------------------------------------

[D] Task: XA

	 8764, (('arm', '64'), ('x', '32'))
	 8670, (('mips', '32'), ('x', '64'))
	 8490, (('mips', '64'), ('x', '32'))
	 8442, (('arm', '64'), ('mips', '32'))
	 7986, (('arm', '32'), ('x', '64'))
	 7648, (('arm', '32'), ('mips

In [None]:
# Create pairs for test rank dataset
sf_set_2 = create_pos_neg_dataset(
    df_test,
    DATASET_ONE_DICT["eval"]["test"]["rank"],
    os.path.join(OUTPUT_DIR, "pairs", "testing"),
    "{}_rank_testing_Dataset-1.csv",
    rand=True,
    num_negatives=100
)

[D] Creating the pos/neg function pairs...


Collecting Pairs: 317it [00:01, 180.13it/s]                         .56it/s]
All Function Consumed:   0%|          | 160/45081 [00:00<03:08, 238.06it/s] 


[D] Before sampling - pos: 317 - neg: 31700
[D] After sampling - pos: 200 - neg: 20000


Collecting Pairs: 337it [00:01, 173.27it/s]                         .54it/s]
All Function Consumed:   0%|          | 166/45081 [00:00<04:18, 173.78it/s] 


[D] Before sampling - pos: 337 - neg: 33700
[D] After sampling - pos: 200 - neg: 20000


Collecting Pairs: 313it [00:02, 111.31it/s]                         .64it/s]
All Function Consumed:   0%|          | 179/45081 [00:01<07:23, 101.19it/s] 


[D] Before sampling - pos: 313 - neg: 31300
[D] After sampling - pos: 200 - neg: 20000


Collecting Pairs: 310it [00:02, 148.69it/s]                         51it/s]
All Function Consumed:   0%|          | 118/45081 [00:01<07:50, 95.65it/s] 


[D] Before sampling - pos: 310 - neg: 31000
[D] After sampling - pos: 200 - neg: 20000
[D] Summary:
[D] 	Task: XA    - pos:   200 neg: 20000
[D] 	Task: XC    - pos:   200 neg: 20000
[D] 	Task: XC+XB - pos:   200 neg: 20000
[D] 	Task: XM    - pos:   200 neg: 20000


[D] Converting the positive/negative pairs into CSV...
[D] 	Pos CSV: ../Dataset-1/pairs/testing/pos_rank_testing_Dataset-1.csv
[D] 	Neg CSV: ../Dataset-1/pairs/testing/neg_rank_testing_Dataset-1.csv
----------------------------------------------------------------------------------------------------

[D] Task: XA

	   42, (('arm', '64'), ('mips', '32'))
	   39, (('arm', '64'), ('x', '32'))
	   34, (('mips', '32'), ('x', '64'))
	   31, (('mips', '64'), ('x', '32'))
	   27, (('arm', '32'), ('mips', '64'))
	   27, (('arm', '32'), ('x', '64'))

----------------------------------------------------------------------------------------------------

[D] Task: XC

	    4, (('clang', '7', 'O0'), ('gcc', 'gcc_9', 'Os'))
	    4, (('clang'

In [None]:
df_test = df_test.iloc[list(sf_set_1 | sf_set_2)]
df_test.reset_index(inplace=True, drop=True)

In [None]:
print(f"Shape df_training: \t{df_training.shape}")
print(f"Shape df_validation: \t{df_validation.shape}")
print(f"Shape df_test: \t\t{df_test.shape}")

# Save the "selected functions" to a CSV.
# This will be useful to post-process the results.
df_validation.to_csv(os.path.join(OUTPUT_DIR, "validation_Dataset-1.csv"))
df_training.to_csv(os.path.join(OUTPUT_DIR, "training_Dataset-1.csv"))
df_test.to_csv(os.path.join(OUTPUT_DIR, "testing_Dataset-1.csv"))

Shape df_training: 	(844841, 14)
Shape df_validation: 	(2760, 14)
Shape df_test: 		(835010, 14)


In [None]:
# Save the "selected functions" to a JSON.
# This is useful to limit the IDA analysis to some functions only.
df_list = [df_training, df_validation, df_test]
split_list = ["training", "validation", "testing"]

for split, df_t in zip(split_list, df_list):

    fset = set([tuple(x) for x in df_t[['idb_path', 'fva']].values])
    print("{}: {} functions".format(split, len(fset)))

    selected_functions = defaultdict(list)
    for t in fset:
        selected_functions[t[0]].append(int(t[1], 16))
        
    # Test
    assert(sum([len(v) for v in selected_functions.values()]) == len(fset))

    # Save to file
    with open(os.path.join(OUTPUT_DIR, "features", split, "selected_{}_Dataset-1.json".format(split)), "w") as f_out:
        json.dump(selected_functions, f_out)

training: 844841 functions
validation: 2760 functions
testing: 835010 functions
