## Various data validation and cleaning functions.

## 1. Validate PE/COFF Disassembly.
    - feature_extraction_validation.py

In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub

In [2]:
def _write_name_list(out_path, names):
    """Write one name per line to out_path and return how many were written."""
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # The report directory may not exist yet in a fresh working directory.
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'w') as fop:
        fop.writelines(name + "\n" for name in names)
    return len(names)


def _find_unpaired(names, partners, old_ext, new_ext, label):
    """Return the partner names (old_ext swapped for new_ext) missing from partners."""
    missing = []
    for fname in names:
        # Swap only the trailing extension; str.replace would also rewrite
        # an identical substring occurring earlier in the name.
        partner = fname[:-len(old_ext)] + new_ext
        if partner not in partners:
            print("{:s} not in {:s} file list.".format(partner, label))
            missing.append(partner)
    return missing


def _find_undersized(dir_path, names, min_size):
    """Return the names in dir_path whose file size is below min_size bytes."""
    undersized = []
    for fname in names:
        fsize = os.path.getsize(os.path.join(dir_path, fname))
        if fsize < min_size:
            print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
            undersized.append(fname)
    return undersized


def validate_disassembly(asm_path, hdr_path, file_ext, min_size=1000):
    """Check disassembly results for the PE/COFF files in the malware set.

    Every '*.pe.asm' file in asm_path is expected to have a matching
    '*.pe.txt' file in hdr_path and vice versa, and each of those files is
    expected to be at least min_size bytes long. Problems are printed and,
    when any are found, written one name per line to
    'data/temp-disass-{missing,bad}-{asm,hdr}-files<file_ext>.txt'
    (relative to the current working directory).

    Parameters:
        asm_path: directory holding the ASM files.
        hdr_path: directory holding the header files.
        file_ext: tag inserted into the report file names, e.g. '-vs251'.
        min_size: files smaller than this many bytes are reported as bad.

    Returns:
        None.
    """
    t1asm = os.listdir(asm_path)
    t1hdr = os.listdir(hdr_path)
    asm_files = [fname for fname in t1asm if fname.endswith('.pe.asm')]
    hdr_files = [fname for fname in t1hdr if fname.endswith('.pe.txt')]

    print("asm dir: {:d} asm files {:d} hdr dir {:d} hdr files {:d}".format(len(t1asm),len(asm_files),len(t1hdr),len(hdr_files)))

    # Sets give constant-time membership tests; the directories hold tens of
    # thousands of files, so list lookups here would be quadratic.
    missing_hdr_list = _find_unpaired(asm_files, set(hdr_files), '.asm', '.txt', 'header')
    print("{:d} missing header files.".format(len(missing_hdr_list)))

    missing_asm_list = _find_unpaired(hdr_files, set(asm_files), '.txt', '.asm', 'asm')
    print("{:d} missing assembly files.".format(len(missing_asm_list)))

    if missing_asm_list:
        written = _write_name_list('data/temp-disass-missing-asm-files' + file_ext + '.txt', missing_asm_list)
        print("Wrote {:d} missing asm file names.".format(written))

    if missing_hdr_list:
        written = _write_name_list('data/temp-disass-missing-hdr-files' + file_ext + '.txt', missing_hdr_list)
        print("Wrote {:d} missing hdr file names.".format(written))

    bad_asm_list = _find_undersized(asm_path, asm_files, min_size)
    print("{:d} bad asm files.".format(len(bad_asm_list)))

    bad_hdr_list = _find_undersized(hdr_path, hdr_files, min_size)
    print("{:d} bad header files.".format(len(bad_hdr_list)))

    # These two summaries are always printed, so each count starts from zero
    # rather than inheriting whatever the previous section left behind.
    written = 0
    if bad_hdr_list:
        written = _write_name_list('data/temp-disass-bad-hdr-files' + file_ext + '.txt', bad_hdr_list)
    print("Wrote {:d} bad hdr file names.".format(written))

    written = 0
    if bad_asm_list:
        written = _write_name_list('data/temp-disass-bad-asm-files' + file_ext + '.txt', bad_asm_list)
    print("Wrote {:d} bad asm file names.".format(written))

    return

In [None]:
validate_disassembly('/opt/vs/train1asm/', '/opt/vs/train1hdr/', '-vs251')

In [None]:
validate_disassembly('/opt/vs/train2asm/', '/opt/vs/train2hdr/', '-vs252')

In [None]:
validate_disassembly('/opt/vs/train3asm/', '/opt/vs/train3hdr/', '-vs263')

In [None]:
validate_disassembly('/opt/vs/train4asm/', '/opt/vs/train4hdr/', '-vs264')

In [5]:
validate_disassembly('/opt/vs/aptasm/', '/opt/vs/apthdr/', '-vsapt')

asm dir: 271 asm files 271 hdr dir 275 hdr files 275
0 missing header files.
VirusShare_d8b7b276710127d233abcdb7313aac36.pe.asm not in asm file list.
VirusShare_d4ba6430996fb4021241efc97c607504.pe.asm not in asm file list.
VirusShare_af719814507fdca4b96184f33b6b92ea.pe.asm not in asm file list.
VirusShare_6a4fbcfb44717eae2145c761c1c99b6a.pe.asm not in asm file list.
4 missing assembly files.
Wrote 4 missing asm file names.
0 bad asm files.
0 bad header files.
Wrote 0 bad hdr file names.
Wrote 0 bad asm file names.


In [None]:
# 1. Run the validate_disassembly function to identify missing and incomplete ASM and Header files.
# 2. List the missing and incomplete files and retry disassembly.
# 3. If the retry still fails, manually analyse the culprit file to determine the cause of the error.
# 4. Run the feature extraction processes again on the ASM and Header files.
# 5. Run the validate_features function to identify missing or incomplete features for each sample.
# 6. Manually analyse files that have missing or incomplete feature sets to determine the cause of the errors.
# 7. TODO:

## Analyse Disassembly Errors and Attempt to Fix Them.

In [6]:
# Open the bad or missing ASM file list and copy the binaries
# to another directory for manual analysis.

def copy_bad_pe(in_file, ext_dir, err_dir):
    """Copy the binaries named in an ASM problem list to a directory for analysis.

    Each line of in_file that ends with '.pe.asm' names a disassembly file;
    the part before that suffix is taken as the binary's file name, which is
    copied from ext_dir to err_dir with the external 'cp' command.

    Parameters:
        in_file: path of the text file listing one ASM file name per line.
        ext_dir: directory holding the original binaries.
        err_dir: directory the binaries are copied to.

    Returns:
        None. Progress and the number of successful copies are printed.
    """
    if not os.path.isfile(in_file):
        print("{:s} input file not found.".format(in_file))
        return

    with open(in_file, 'r') as fip:
        in_lines = fip.readlines()

    print("Got {:d} file names.".format(len(in_lines)))
    counter = 0
    suffix = '.pe.asm'

    for line in in_lines:
        line = line.rstrip()
        if not line.endswith(suffix):
            continue

        fname = line[:-len(suffix)]
        print("Copying file: {:s}".format(fname))
        # os.path.join keeps the paths valid whether or not the directory
        # arguments carry a trailing separator.
        status = sub.call(["cp", os.path.join(ext_dir, fname), os.path.join(err_dir, fname)])
        if status == 0:
            counter += 1
        else:
            # Only successful copies are counted so the summary is trustworthy.
            print("Failed to copy file: {:s}".format(fname))

    print("Completed copyinging {:d} files.".format(counter))

    return

In [None]:
copy_bad_pe('data/temp-disass-missing-asm-files-vs251.txt', '/opt/vs/train1/', '/opt/vs/train1err/')

In [None]:
copy_bad_pe('data/temp-disass-missing-asm-files-vs252.txt', '/opt/vs/train2/', '/opt/vs/train2err/')

In [None]:
copy_bad_pe('data/temp-disass-missing-asm-files-vs263.txt', '/opt/vs/train3/', '/opt/vs/train3err/')

In [None]:
copy_bad_pe('data/temp-disass-missing-asm-files-vs264.txt', '/opt/vs/train4/', '/opt/vs/train4err/')

In [None]:
copy_bad_pe('data/temp-disass-bad-asm-files-vs251.txt', '/opt/vs/train1/', '/opt/vs/train1err/')

In [None]:
copy_bad_pe('data/temp-disass-bad-asm-files-vs252.txt', '/opt/vs/train2/', '/opt/vs/train2err/')

In [None]:
copy_bad_pe('data/temp-disass-bad-asm-files-vs263.txt', '/opt/vs/train3/', '/opt/vs/train3err/')

In [None]:
copy_bad_pe('data/temp-disass-bad-asm-files-vs264.txt', '/opt/vs/train4/', '/opt/vs/train4err/')

In [14]:
def copy_bad_pe_header(in_file, ext_dir, err_dir):
    """Copy the binaries named in a header problem list to a directory for analysis.

    Each line of in_file that ends with '.pe.txt' names a PE header dump;
    the part before that suffix is taken as the binary's file name, which is
    copied from ext_dir to err_dir with the external 'cp' command.

    Parameters:
        in_file: path of the text file listing one header file name per line.
        ext_dir: directory holding the original binaries.
        err_dir: directory the binaries are copied to.

    Returns:
        None. Progress and the number of successful copies are printed.
    """
    if not os.path.isfile(in_file):
        print("{:s} input file not found.".format(in_file))
        return

    with open(in_file, 'r') as fip:
        in_lines = fip.readlines()

    print("Got {:d} file names.".format(len(in_lines)))
    counter = 0
    suffix = '.pe.txt'

    for line in in_lines:
        hdr_file_name = line.rstrip()
        if not hdr_file_name.endswith(suffix):
            continue

        fname = hdr_file_name[:-len(suffix)]
        print("Copying PE file: {:s}".format(fname))
        # os.path.join keeps the paths valid whether or not the directory
        # arguments carry a trailing separator.
        status = sub.call(["cp", os.path.join(ext_dir, fname), os.path.join(err_dir, fname)])
        if status == 0:
            counter += 1
        else:
            # Only successful copies are counted so the summary is trustworthy.
            print("Failed to copy PE file: {:s}".format(fname))

    print("Completed copying {:d} PE files.".format(counter))

    return

In [15]:
copy_bad_pe_header('data/temp-disass-bad-hdr-files-vs251.txt', '/opt/vs/train1/', '/opt/vs/train1err/')

Got 9 file names.
Copying PE file: VirusShare_7dc02b8661b9cd6311943701c90aef4e
Copying PE file: VirusShare_adbcb556aabf27758191f3be3f466c36
Copying PE file: VirusShare_787d3645c5b5393984e7557daa389249
Copying PE file: VirusShare_36cb828738111e0580a28607c713fcc7
Copying PE file: VirusShare_592d7ac775519110d58e9ce1975c1b5b
Copying PE file: VirusShare_4a0c79f6ad27b0a674b08005d102e16d
Copying PE file: VirusShare_7e681c6b0488c8533389660c86a70982
Copying PE file: VirusShare_d5eff38b212286c46db007aa7159ffd8
Copying PE file: VirusShare_c80d9b2dbf9b7953a3b6e9b51a39a0c2
Completed copying 9 PE files.


In [16]:
copy_bad_pe_header('data/temp-disass-bad-hdr-files-vs252.txt', '/opt/vs/train2/', '/opt/vs/train2err/')

Got 12 file names.
Copying PE file: VirusShare_e8c51dae6396d78e1c42c735f99c24e6
Copying PE file: VirusShare_f7e2f4f676454287ac2a7ec1fa941f00
Copying PE file: VirusShare_76cad0f51839af82a2c55270b3e27981
Copying PE file: VirusShare_c34c0c753fdca67a21674dfc7820fa71
Copying PE file: VirusShare_8c0ea62a8a791d81a7441e835b3320a4
Copying PE file: VirusShare_5d6417a3dc81e53127c50dfad1572252
Copying PE file: VirusShare_3eae26d3da9c58ee9519e23ef6ae5371
Copying PE file: VirusShare_e18a2dbf74eb09df1518ec79aba01073
Copying PE file: VirusShare_6c19c8e181dbaf6b50fe26322389459c
Copying PE file: VirusShare_d5bf45ef758c093f3f15ed243882b105
Copying PE file: VirusShare_3c5b41a6660c1f6e65bbeb136a91ecd3
Copying PE file: VirusShare_857738eff74ce15405aac235a5e25577
Completed copying 12 PE files.


In [None]:
copy_bad_pe_header('data/temp-disass-bad-hdr-files-vs263.txt', '/opt/vs/train3/', '/opt/vs/train3err/')

In [None]:
copy_bad_pe_header('data/temp-disass-bad-hdr-files-vs264.txt', '/opt/vs/train4/', '/opt/vs/train4err/')

In [17]:
copy_bad_pe_header('data/temp-disass-missing-hdr-files-vs251.txt', '/opt/vs/train1/', '/opt/vs/train1err/')

data/temp-disass-missing-hdr-files-vs251.txt input file not found.


In [18]:
copy_bad_pe_header('data/temp-disass-missing-hdr-files-vs252.txt', '/opt/vs/train2/', '/opt/vs/train2err/')

data/temp-disass-missing-hdr-files-vs252.txt input file not found.


In [None]:
copy_bad_pe_header('data/temp-disass-missing-hdr-files-vs263.txt', '/opt/vs/train3/', '/opt/vs/train3err/')

In [None]:
copy_bad_pe_header('data/temp-disass-missing-hdr-files-vs264.txt', '/opt/vs/train4/', '/opt/vs/train4err/')

In [6]:
# Functions to fix ASM file rename stuff up for train3.

def fix_file_names(ext_drive, file_list):
    """List the '.asm' files in file_list that look like IDA Pro disassembly.

    For every name ending in '.asm', the external 'file' command is asked for
    the file type. Files reported as 'data' or 'text' have their first dozen
    lines scanned for the string 'Hex-Rays'; on a hit the full path is
    appended to 'data/temp-train3-asm-files.txt' (relative to the current
    working directory).

    Parameters:
        ext_drive: directory containing the files.
        file_list: file names (not paths) inside ext_drive.

    Returns:
        None. The number of matching files is printed.
    """
    counter = 0

    with open('data/temp-train3-asm-files.txt', 'w') as asm_list_file:
        for file_name in file_list:
            if not file_name.endswith('.asm'):
                continue

            file_path = os.path.join(ext_drive, file_name)

            # universal_newlines=True returns str rather than bytes, which the
            # string operations below require on Python 3. '-b' omits the file
            # name from the output so the path itself cannot match 'data'/'text'.
            signat = sub.check_output(["file", "-b", file_path], universal_newlines=True)
            signat = signat.replace(',', '').rstrip()  # drop commas and the trailing newline

            if 'data' not in signat and 'text' not in signat:
                continue

            # Only the header lines matter, so iterate lazily instead of
            # loading the whole file; undecodable bytes are ignored because
            # files typed as 'data' may not be valid text.
            with open(file_path, 'r', errors='ignore') as fip:
                for line_no, line in enumerate(fip):
                    if 'Hex-Rays' in line:
                        asm_list_file.write(file_path + "\n")
                        counter += 1
                        if counter % 1000 == 0:
                            print("{:d} IDA Pro ASM File: {:s}".format(counter, line))

                        break

                    if line_no > 10:
                        break

    print("Found {:d} IDA Pro ASM files.".format(counter))

    return


def move_asm_files(in_file):
    """Rename every file listed in in_file by appending '.bak' to its path.

    in_file holds one file path per line. Each path is renamed with the
    external 'mv' command; a failing 'mv' raises
    subprocess.CalledProcessError and stops the run.
    """
    with open(in_file, 'r') as list_handle:
        listed_paths = list_handle.readlines()

    print("Found {:d} files.".format(len(listed_paths)))

    moved = 0
    for moved, raw_path in enumerate(listed_paths, start=1):
        source_path = raw_path.rstrip()
        sub.check_output(["mv", source_path, source_path + ".bak"])

        # Report progress every thousand files.
        if moved % 1000 == 0:
            print("{:d} Moved file: {:s}".format(moved, source_path))

    print("Moved {:d} files.".format(moved))

    return


def revert_asm_files(ext_dir, file_list):
    """Strip the trailing '.bak' from every matching file in ext_dir.

    Parameters:
        ext_dir: directory containing the files.
        file_list: file names (not paths) inside ext_dir; names that do not
            end in '.bak' are skipped.

    Returns:
        None. Progress and the number of reverted files are printed.
        A failing 'mv' raises subprocess.CalledProcessError.
    """
    suffix = '.bak'
    counter = 0

    print("Found {:d} files.".format(len(file_list)))

    for fname in file_list:
        if not fname.endswith(suffix):
            continue

        # Slice off the suffix itself: searching for the first '.bak' would
        # truncate names that also contain '.bak' earlier on. os.path.join
        # keeps the path valid with or without a trailing separator on ext_dir.
        file_path = os.path.join(ext_dir, fname)
        new_file_path = os.path.join(ext_dir, fname[:-len(suffix)])

        sub.check_output(["mv", file_path, new_file_path])
        counter += 1

        if counter % 1000 == 0:
            print("{:d} Moved file: {:s}".format(counter, new_file_path))

    print("Reverted {:d} files.".format(counter))

    return

In [None]:
ext_drive = '/opt/vs/train3asm/'
tfiles = os.listdir(ext_drive)
revert_asm_files(ext_drive, tfiles)

In [None]:
move_asm_files('data/temp-train3-asm-files.txt')

In [None]:
#ext_drive = '/opt/vs/apt/'
ext_drive = '/opt/vs/train3/'
tfiles = os.listdir(ext_drive)
fix_file_names(ext_drive, tfiles)

In [None]:
#ext_drive = '/opt/vs/apt/'
ext_drive = '/opt/vs/asm/'
tfiles = os.listdir(ext_drive)
# fix_file_names needs the directory as well as the file names.
fix_file_names(ext_drive, tfiles)

In [None]:
ext_drive = '/opt/vs/hdr/'
tfiles = os.listdir(ext_drive)
# fix_file_names needs the directory as well as the file names.
fix_file_names(ext_drive, tfiles)

In [3]:
def rename_asm_files_fix(ext_dir):
    """Give every '*.asm' file in ext_dir the '.pe.asm' extension.

    Files already ending in '.pe.asm' are only counted. Any other file ending
    in '.asm' is renamed with the external 'mv' command so that its final
    '.asm' becomes '.pe.asm'.

    Parameters:
        ext_dir: directory whose files are renamed in place.

    Returns:
        None. Progress and totals are printed. A failing 'mv' raises
        subprocess.CalledProcessError.
    """
    file_list = os.listdir(ext_dir)
    pe_counter = 0
    unpe_counter = 0
    suffix = '.asm'

    print("Got total files: {:d}".format(len(file_list)))

    for fname in file_list:
        if fname.endswith('.pe.asm'):
            pe_counter += 1
        elif fname.endswith(suffix):
            # Strip the trailing extension only, and build paths with
            # os.path.join so ext_dir works with or without a trailing slash.
            file_path = os.path.join(ext_dir, fname)
            new_path = os.path.join(ext_dir, fname[:-len(suffix)] + '.pe.asm')
            sub.check_call(['mv', file_path, new_path])
            unpe_counter += 1

            if (unpe_counter % 1000) == 0:
                print('Renamed {:d} ASM files.'.format(unpe_counter))

    print('Completed move of {:d} ASM files with {:d} files already renamed.'.format(unpe_counter, pe_counter))

    return

In [4]:
rename_asm_files_fix('/opt/vs/train4asm/')

Got total files: 14366
Renamed 1000 ASM files.
Renamed 2000 ASM files.
Renamed 3000 ASM files.
Renamed 4000 ASM files.
Renamed 5000 ASM files.
Renamed 6000 ASM files.
Renamed 7000 ASM files.
Renamed 8000 ASM files.
Renamed 9000 ASM files.
Renamed 10000 ASM files.
Renamed 11000 ASM files.
Renamed 12000 ASM files.
Renamed 13000 ASM files.
Renamed 14000 ASM files.
Completed move of 14366 ASM files with 0 files already renamed.


In [5]:
rename_asm_files_fix('/opt/vs/train3asm/')

Got total files: 40980
Renamed 1000 ASM files.
Renamed 2000 ASM files.
Renamed 3000 ASM files.
Renamed 4000 ASM files.
Renamed 5000 ASM files.
Renamed 6000 ASM files.
Renamed 7000 ASM files.
Renamed 8000 ASM files.
Renamed 9000 ASM files.
Renamed 10000 ASM files.
Renamed 11000 ASM files.
Renamed 12000 ASM files.
Renamed 13000 ASM files.
Renamed 14000 ASM files.
Renamed 15000 ASM files.
Completed move of 15376 ASM files with 25603 files already renamed.


In [6]:
25603 + 15376

40979

In [3]:
def rename_binary_files_fix(ext_dir):
    """Rename every '*.pe.asm' file in ext_dir to '*.bin'.

    Parameters:
        ext_dir: directory whose files are renamed in place.

    Returns:
        None. Progress and the total are printed. A failing 'mv' raises
        subprocess.CalledProcessError.
    """
    file_list = os.listdir(ext_dir)
    pe_counter = 0
    suffix = '.pe.asm'

    print("Got total files: {:d}".format(len(file_list)))

    for fname in file_list:
        if not fname.endswith(suffix):
            continue

        pe_counter += 1
        # Strip the trailing extension only, and build paths with
        # os.path.join so ext_dir works with or without a trailing slash.
        file_path = os.path.join(ext_dir, fname)
        new_path = os.path.join(ext_dir, fname[:-len(suffix)] + '.bin')
        sub.check_call(['mv', file_path, new_path])

        if (pe_counter % 1000) == 0:
            print('Renamed {:d} binary files.'.format(pe_counter))

    print('Completed rename of {:d} binary files.'.format(pe_counter))

    return

In [None]:
rename_binary_files_fix('/opt/vs/train3/')

In [9]:
def train3_binary_files_fix(ext_dir):
    """Restore full 'VirusShare_<hash>' names for the '.bin' files in ext_dir.

    The part of each '*.bin' file name between the first and second
    underscore (minus a trailing '.bin') is looked up against the
    'file_name' column of 'data/sorted-file-id-features-vs263.csv', where a
    stored name matches when it equals the looked-up text plus one extra
    trailing character. On a match the file is renamed to
    'VirusShare_<stored name>' with the external 'mv' command.

    Parameters:
        ext_dir: directory whose files are renamed in place.

    Returns:
        None. Progress and totals are printed.
    """
    file_list = os.listdir(ext_dir)
    counter = 0
    name_match_counter = 0
    file_id_file = 'data/sorted-file-id-features-vs263.csv'
    file_id_features = pd.read_csv(file_id_file)

    # Index the stored names by their truncated form once, instead of
    # scanning the whole column for every file. setdefault keeps the first
    # stored name per key; a file can only be renamed once anyway.
    full_name_by_short_hash = {}
    for hash_name in file_id_features['file_name']:
        full_name_by_short_hash.setdefault(hash_name[0:-1], hash_name)

    # Pre-bound so the progress message cannot hit an unbound name when no
    # file has matched yet.
    new_path = ''
    suffix = '.bin'

    print("Got total files: {:d}".format(len(file_list)))

    for fname in file_list:
        tokens = fname.split('_')
        if len(tokens) < 2 or not fname.endswith(suffix):
            continue

        counter += 1
        file_path = os.path.join(ext_dir, fname)

        trunc_name = tokens[1]
        if trunc_name.endswith(suffix):
            trunc_name = trunc_name[:-len(suffix)]

        # Look up the correct hash value in the file id database
        # and generate the correct file name.
        hash_name = full_name_by_short_hash.get(trunc_name)
        if hash_name is not None:
            new_path = os.path.join(ext_dir, 'VirusShare_' + hash_name)
            name_match_counter += 1
            sub.call(['mv', file_path, new_path])

        if (counter % 1000) == 0:
            print('Renamed {:d} binary files {:s}'.format(counter, new_path))
            print('Match {:d} {:s}'.format(name_match_counter, fname))

    print('Completed rename of {:d} binary files with {:d} name matches.'.format(counter, name_match_counter))

    return

In [None]:
train3_binary_files_fix('/opt/vs/train3/')

In [None]:
help(os.path)

## Examine IDA Pro ASM Disassembly (call/int) Formats

In [3]:
# Test Call Graph Generation.

call_opcodes = ['call','int']   # opcodes that create an edge in the call graph
call_blocks = ['sub_']          # label prefixes that start a new vertex

def construct_call_graph(lines):
    """Build a call graph from the lines of a disassembly listing.

    Every line is stripped of comments and a fixed set of keywords and
    punctuation, then split on whitespace. The third token is treated as a
    possible subroutine label: when it starts with a prefix from call_blocks,
    or the fourth token is 'proc', it becomes the current vertex. A token
    equal to one of call_opcodes adds an edge from the current vertex to the
    token that follows it ("none" when nothing follows).

    Parameters:
        lines: iterable of text lines from an ASM file.

    Returns:
        The graph object created by gra.Graph().
        NOTE(review): 'gra' is not imported in the visible part of this
        notebook; it must be in scope before this function is called.
    """
    vertex = '.program_entry_point' # this is the root node, corresponds to the program original entry point not C main().
    cfgraph = gra.Graph()
    cfgraph.add_vertex(vertex)

    for row in lines:
        row = row.rstrip('\r\n')  # get rid of newlines
        if row.startswith(';'):
            continue

        if ';' in row:
            row = row.split(';')[0] # drop the trailing comment

        # Remove size/distance keywords and punctuation so that the
        # remaining tokens split cleanly on whitespace.
        row = row.replace('short','').replace('ds:',' ')
        row = row.replace('dword','').replace('near','').replace('far','')
        row = row.replace('ptr','').replace(':',' ').replace(',',' ')
        row = row.replace('@','').replace('?','')
        parts = row.split() # tokenize code line

        if len(parts) < 2: # this is just a comment line
            continue

        # Lines with only two or three tokens (for example an address followed
        # by a single operand-less opcode) have no fourth token, so the
        # positions are read defensively instead of indexing directly.
        label = parts[2] if len(parts) > 2 else ''
        qualifier = parts[3] if len(parts) > 3 else ''

        if qualifier == 'endp': # ignore subroutine end labels
            continue

        # check for subroutines and block labels
        # block and subroutine labels are always after the .text HHHHHHHH relative address
        if label:
            for block in call_blocks:
                if (label.find(block) == 0) or (qualifier == 'proc'):
                    # add new vertex to the graph, we are now in a new subroutine
                    vertex = label
                    cfgraph.add_vertex(vertex)
                    break

        # now check for edge opcode
        for opcode in call_opcodes: # check the line for a new edge
            if opcode in parts:
                # Extract destination address/function name/interrupt number as the directed edge.
                idx = parts.index(opcode)
                if (idx + 1) < len(parts): # in a few ASM files there is no operand, disassembly error?
                    next_vertex = parts[idx + 1]
                else:
                    next_vertex = "none"
                cfgraph.add_edge(vertex, next_vertex)
                break

    return cfgraph


def extract_call_graphs(multi_params):
    """Extract call graph features and graph dumps for a list of ASM files.

    For each file a call graph is built with construct_call_graph. Its
    vertex count, edge count, delta_max and density are written as one CSV
    row, and its 'graphviz' string form is appended to a graph file. Both
    output files live under 'data/' and are prefixed with the process id so
    that parallel workers do not collide. Results are flushed every 100 files.

    Parameters:
        multi_params: object with the attributes file_list (ASM file names),
            ext_drive (directory holding them), feature_file and graph_file
            (output file names).

    Returns:
        None.
    """
    asm_files = multi_params.file_list
    ftot = len(asm_files)
    ext_drive = multi_params.ext_drive

    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_params.feature_file
    graph_path = 'data/' + str(pid) + "-" + multi_params.graph_file

    print('Process ID: {:d} Graph Feature file: {:s}'.format(pid, feature_file))

    graph_lines = []
    graph_features = []

    # Both outputs are managed by the with-statement so they are closed even
    # if a file fails to parse part-way through. newline='' is what the csv
    # module asks for on files handed to csv.writer.
    with open(graph_path, 'w') as graph_file, open(feature_file, 'w', newline='') as f:
        fw = writer(f)
        # Column names are added later when the per-process files are combined.

        # Now iterate through the file list and extract the call graph from each file.
        for idx, fname in enumerate(asm_files):
            with open(os.path.join(ext_drive, fname), 'r', errors='ignore') as fasm:
                lines = fasm.readlines()

            call_graph = construct_call_graph(lines)
            cgvc = call_graph.n_vertices()
            cgec = call_graph.n_edges()
            cgdm = call_graph.delta_max()
            cgde = call_graph.density()

            # Truncate the file name to the hash value.
            # NOTE(review): assumes names look like '<prefix>_<hash>.pe.asm'.
            fname_parts = fname.split('_')
            trunc_name = fname_parts[1]
            trunc_name = trunc_name[:trunc_name.find('.pe.asm')]

            graph_features.append([trunc_name] + [cgvc, cgec, cgdm, cgde])
            call_graph.set_graph_name(trunc_name)
            graph_lines.append(call_graph.to_str('graphviz'))

            del(call_graph) # for some reason new graphs get appended to the previous graphs if not deleted???

            # Print progress and flush the batch.
            if (idx + 1) % 100 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(graph_features)
                graph_file.writelines(graph_lines)
                graph_features = []
                graph_lines = []

        # Write remaining files
        if len(graph_lines) > 0:
            fw.writerows(graph_features)
            graph_file.writelines(graph_lines)

    print('Process ID: {:d} finished.'.format(pid))

    return


class Multi_Params(object):
    """Bundle of settings handed to one call graph extraction worker.

    Attributes:
        feature_file: name of the CSV feature output file.
        graph_file: name of the graph output file.
        ext_drive: directory holding the input files.
        file_list: names of the input files to process.
    """

    def __init__(self, featurefile="", graphfile="", extdrive="", filelist=None):
        self.feature_file = featurefile
        self.graph_file = graphfile
        self.ext_drive = extdrive
        # A list literal as the default would be shared by every instance
        # created without an explicit file list.
        self.file_list = [] if filelist is None else filelist


In [None]:
def get_call_lines(file_list):
    """Placeholder: this function was declared without a body.

    TODO: implement. Until then calling it fails loudly instead of the
    empty definition being a syntax error.
    """
    raise NotImplementedError("get_call_lines has not been implemented yet.")

In [4]:
# Settings for a call graph extraction run over the APT sample set.
feature_file = 'sorted-pe-call-graph-features-apt.csv'
graph_file = 'pe-call-graphs-apt.gv'
ext_drive = '/opt/vs/aptasm/'
file_ext = '-apt'

# Keep only the PE disassembly files from the directory listing.
file_list = os.listdir(ext_drive)
tfiles = [i for i in file_list if '.pe.asm' in i]

## Validate PE Header Feature Values.
    The PE header features contain some negative values, which break feature reduction in sklearn.

In [1]:
# TODO:
# Open pe header feature file.
# Parse for ",-" and replace with ",".
# Done.

import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re



In [1]:
def clean_out_the_negatives(feature_file_name, cleansed_out_file_name):
    """Copy a feature file, dropping the minus sign from negative values.

    Every occurrence of ',-' in a line is replaced with ',', so a value that
    directly follows a comma loses its leading minus sign. Text before the
    first comma on a line is left untouched.

    Parameters:
        feature_file_name: path of the CSV file to read.
        cleansed_out_file_name: path of the cleaned CSV file to write.

    Returns:
        None. Progress is printed every 100 lines.
    """
    # Read first and close the input before writing; the total line count is
    # needed for the progress message.
    with open(feature_file_name, 'r') as fip:
        in_lines = fip.readlines()

    ltot = len(in_lines)

    with open(cleansed_out_file_name, 'w') as fop:
        for counter, line in enumerate(in_lines, start=1):
            fop.write(line.replace(",-", ","))

            # Print progress after every 100th line.
            if counter % 100 == 0:
                print(counter, 'of', ltot, 'lines processed.')

    print("Completed clean of file: {:s}".format(feature_file_name))

    return

In [None]:
ext_drive = "/opt/vs/"
feature_file = ext_drive + "pe-header-features-vs263.csv"
clean_feature_file = ext_drive + "pe-header-features-vs263-clean.csv"
clean_out_the_negatives(feature_file, clean_feature_file)

In [2]:
# NOTE: an earlier version of this function only stripped ".elf" from the
# sample file name and still wrote the row out. Rows for ELF samples are now
# skipped entirely. Output produced by the earlier version should be checked
# against the PE/COFF sample name list, because the original input file has
# been deleted.
def clean_out_elves_in_vs263(feature_file_name, cleansed_out_file_name):
    """Copy a feature file, leaving out the rows that belong to ELF samples.

    A row is treated as an ELF sample when it contains the text '.elf,'.

    Parameters:
        feature_file_name: path of the CSV file to read.
        cleansed_out_file_name: path of the filtered CSV file to write.

    Returns:
        None. Removed rows and progress (every 1000 lines) are printed.
    """
    counter = 0
    elf_counter = 0

    with open(feature_file_name, 'r') as fip, open(cleansed_out_file_name, 'w') as fop:
        for line in fip:
            if '.elf,' in line:
                elf_counter += 1
                print("ELF: {:s} = {:d}".format(line, elf_counter))
            else:
                fop.write(line)

            counter += 1

            # Print progress after every 1000th line.
            if counter % 1000 == 0:
                print(' {:d} lines processed.'.format(counter))

    print("Completed clean of file: {:s} with {:d} elves removed.".format(feature_file_name, elf_counter))

    return

In [3]:
clean_out_elves_in_vs263('/opt/vs/call-graph-reduced-function_counts-vs263.csv', '/opt/vs/call-graph-reduced-function_counts-vs263-clean.csv')

 999 lines processed.
ELF: af8970eb045a77ad1c427eb6333c9efd.elf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,