## 1. Process ClamAV and Windows Defender Malware Scan Reports
    Training labels will be generated from ClamAV, Windows Defender and VirusTotal.com reports.
    - vs00251.txt (clamav)
    - vs00252.txt (clamav)
    - vs00263.txt (clamav)
    - vs00264.txt (clamav)
    - MPDetection2.log (Windows Defender)
    - MPDetection3.log (Windows Defender)

In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import io # this is required as a compatability layer between 2.x and 3.x because 2.x cannot read utf-16 text files.
import re
import matplotlib.pyplot as plt
import seaborn # make it look pretty.

In [2]:
ext_drive = '/opt/vs/'
tfiles1 = os.listdir(ext_drive + "train")
tfiles2 = os.listdir(ext_drive + "train2")

In [None]:
# First load in the clamav reports and convert to csv files.
file_name = 'data/vs00263.txt'
vfr1 = open(file_name, 'r')
vlines1 = vfr1.readlines()

# Do the next clamav file.
file_name = 'data/vs00264.txt'
vfr2 = open(file_name, 'r')
vlines2 = vfr2.readlines()

# Do the next clamav file.
file_name = 'data/vs00apt.txt'
vfr3 = open(file_name, 'r')
vlines3 = vfr3.readlines()

# Open the output csv file.
fop = open('data/clamav-vs263-264.csv', 'w')
csv_wouter = writer(fop)
cols = ['file_name','malware_type'] # write out the column names.
csv_wouter.writerow(cols)

process_clamav_report(vlines1, csv_wouter)
process_clamav_report(vlines2, csv_wouter)
process_clamav_report(vlines3, csv_wouter)

vfr1.close()
vfr2.close()
vfr3.close()
fop.close()

In [15]:
# First load in the clamav reports and convert to csv files.
file_name = 'data/vs00251.txt'
vfr1 = open(file_name, 'r')
vlines1 = vfr1.readlines()

# Do the next clamav file.
file_name = 'data/vs00252.txt'
vfr2 = open(file_name, 'r')
vlines2 = vfr2.readlines()

# Open the output csv file.
fop = open('data/clamav001.csv', 'w')
csv_wouter = writer(fop)
cols = ['file_name','malware_type'] # write out the column names.
csv_wouter.writerow(cols)

process_clamav_report(vlines1, csv_wouter)
process_clamav_report(vlines2, csv_wouter)

vfr1.close()
vfr2.close()
fop.close()

Processed line number 0 : 46b510e161423a7e626adc3d95440f44 -> Win.Trojan.Dialer-729.
Processed line number 1000 : e616e7a2bfe3f3bea6a3e6d3b6c91c28 -> OK.
Processed line number 2000 : 7fb7c2ec5b58decc43e2331f1517bcda -> Win.Trojan.Morstar-7.
Processed line number 3000 : 1fb2b8ae511fc4a20ef6bc45971b8372 -> OK.
Processed line number 4000 : 62e67f7dd75056c4cda1a95ea5d53127 -> Win.Trojan.Agent-1385451.
Processed line number 5000 : ef559ec5ed3f7281e1e64da2eb96b08b -> Win.Adware.913802-1.
Processed line number 6000 : 20f5f176c8b6f0b28634ff13e36e4d98 -> Win.Adware.Ticnomultibar-1.
Processed line number 7000 : 223c88c24429a84fb3d3774509d05061 -> OK.
Processed line number 8000 : 5039ac8290c69b9118ae747fa5ecb2f8 -> Win.Virus.Elkern-9.
Processed line number 9000 : 6d69e1b1336adbbb0c0ee0d4b00ebd74 -> OK.
Processed line number 10000 : 315762706a9156364caa1e1e4e6922df -> Win.Trojan.Cosmu-4.
Processed line number 11000 : 1eccb6695b7f882fcce1edb70f5535b7 -> Win.Trojan.KillAV-43.
Processed line number 1

In [14]:
def process_clamav_report(vlines, outfile):
    counter = 0
    outlines = []
    for idx, line in enumerate(vlines):
        if line.startswith('---'): # we hit the scan summary at end of file.
            break
        else:
            line = line.rstrip() # get rid of newlines they are annoying
            line = line.replace('_', ' ').replace(':', ' ') # get rid of these things they are annoying
            tokens = line.split()
            if len(tokens) > 2:
                malware_file_name = tokens[1]
                malware_type = tokens[2]
                outlines.append([malware_file_name, malware_type])
                counter += 1
                if (idx % 1000) == 0: # write out some lines
                    outfile.writerows(outlines)
                    outlines = []
                    print("Processed line number {:d} : {:s} -> {:s}.".format(idx, malware_file_name, malware_type))
            
    # Finish off.
    if (len(outlines) > 0):
        outfile.writerows(outlines)
        outlines = []
        
    print("Completed processing {:d} lines.".format(counter))

In [10]:
help(writer)

Help on built-in function writer in module _csv:

writer(...)
    csv_writer = csv.writer(fileobj [, dialect='excel']
                                [optional keyword args])
        for row in sequence:
            csv_writer.writerow(row)
    
        [or]
    
        csv_writer = csv.writer(fileobj [, dialect='excel']
                                [optional keyword args])
        csv_writer.writerows(rows)
    
    The "fileobj" argument can be any object that supports the file API.



## 2. Load the Training Sample Classifications from ClamAV.
    Now generate integer values for the labels based on the malware type, since ClamAV does not
    recognise all types of malware, find the unclassified files and send to VirusTotal.com for a
    second opinion. As there is no standard method of defining malware type strings we will
    have to do some munging on the virustotal results and convert to a ClamAV type malware
    classification string. Also scan with Windows Defender and MalwareBytes Anti-Malware and
    compare the results.

In [2]:
# now get the clamav data
clammals = pd.read_csv('data/clamav001.csv')

In [3]:
clammals.head()

Unnamed: 0,filename,malware_type
0,46b510e161423a7e626adc3d95440f44,Win.Trojan.Dialer-729
1,1103c897ed2979339774f48ff47c0203,Win.Trojan.Jorik-10673
2,1835b8c9ed56ca729ad664e4c1725b1c,Win.Worm.Mydoom-7
3,da301519b87e8b796ece22b3f4c13429,Win.Trojan.11484026-1
4,579659363281e349a93adfe5cfadf320,Win.Trojan.Sality-8178


In [4]:
clammals.shape

(131073, 2)

In [5]:
# Now we can assign a numerical value to each malware classification.
moks = clammals[clammals['malware_type'] == 'OK'] # these are all classified as OK by ClamAV, so we have to send them 
                                          # to VirusTotal.com for a second opinion.
moks.to_csv('data/malok.csv', index=False)
# Now sort and write out the labels.



In [6]:
moks.head()

Unnamed: 0,filename,malware_type
5,3d91f9da7b6ddd05f7fc3e6854ba51b9,OK
9,afeca052db9266bcdeb97d6f2a61a5e9,OK
12,ba251cd16eb5f6b16efbdd65f28eafc2,OK
17,7d03f1d4bcf044d44dec7396e750bef9,OK
18,9d03b0c2f333fb339e4e47359af759ef,OK


In [7]:
moks.shape

(38898, 2)

## 3. Process Windows Defender Malware Scan Report.
    Windows Defender detected 48678 of the samples, MalwareBytes detected +38000 but crashed at
    the end of the scan and the logs could not be recovered.

In [4]:
# First load in the Windows Defender reports and convert to csv files.
# NOTE: windows defender logs are UTF-16, so have to use io module to open in Python 2.x
# Two scans were conducted on vs00251 and vs00252.

#file_name = 'data/MPDetection1.log'
#vfr1 = io.open(file_name, mode='r', encoding='utf-16')
#vlines1 = vfr1.readlines()

# print("Read in {:d} lines {:s}".format(len(vlines1), vlines1[0]))

# This log file contains all the detections from vs00251 and vs00252 scans after the second scan.
file_name = 'data/MPDetection2.log'
vfr2 = io.open(file_name, mode='r', encoding='utf-16')
vlines2 = vfr2.readlines()

# Open the output csv file.
fop = open('data/defender-vs251-252.csv', 'w')
csv_wouter = writer(fop)
cols = ['file_name','malware_type'] # write out the column names.
csv_wouter.writerow(cols)

#process_defender_report(vlines1, csv_wouter)
process_defender_report(vlines2, csv_wouter)

#vfr1.close()
vfr2.close()
fop.close()

Processed line number 1000 : bfa8aab3bb5c5084ed0adc2b1874f470 -> Worm:Win32/VB.AT.
Processed line number 2000 : ec9aa46c3fccfaa3bbd01ed4eae73828 -> Worm:Win32/Yuner.A.
Processed line number 3000 : 23b7df82e89fad292f2914d60c19afef -> Worm:Win32/Picsys.C.
Processed line number 4000 : 5b6677a5a4a859c152f01e918940b0e6 -> Adware:Win32/Hotbar.
Processed line number 5000 : 92c99706bd4fbe4ae0bd547177f45ca4 -> Worm:Win32/Mydoom.O@mm.
Processed line number 6000 : 7330785b76d3a07e5907c849faa5123c -> Adware:Win32/Hotbar.
Processed line number 7000 : a7ef64cdca6b96a9938fa5f1567063bb -> Backdoor:Win32/Optixpro.T.
Processed line number 8000 : 62e8f66e9c5134440e2c074aca4c96dc -> Worm:Win32/Yuner.A.
Processed line number 9000 : aa3078efdc8db3fe1b44440cc86add2b -> BrowserModifier:Win32/Diplugem.
Processed line number 10000 : 56a6157b79f6bc5b9e030ca0ba45483c -> Worm:Win32/Soltern.L.
Processed line number 11000 : 6f6de894ec3d98c1dc0bc56470d1e8d5 -> Adware:Win32/Hotbar.
Processed line number 12000 : 1d8b5d

In [2]:
def process_defender_report(vlines, outfile):
    counter = 0
    outlines = []
    for idx, line in enumerate(vlines):
        if line.find('DETECTION') > 0: # we hit the scan summary at end of file.
            line = line.rstrip() # get rid of newlines they are annoying
            #line = line.replace('_', ' ').replace(':', ' ') 
            tokens = line.split()
            if len(tokens) > 2:
                temp_file_name = tokens[3]
                malware_type = tokens[2]
                temp_file_name = temp_file_name.replace('_',' ').replace('->',' ')
                path_tokens = temp_file_name.split()
                malware_file_name = path_tokens[1]
                outlines.append([malware_file_name, malware_type])
                counter += 1
                if (idx % 1000) == 0: # write out some lines
                    outfile.writerows(outlines)
                    outlines = []
                    print("Processed line number {:d} : {:s} -> {:s}.".format(idx, malware_file_name, malware_type))
            
    # Finish off.
    if (len(outlines) > 0):
        outfile.writerows(outlines)
        outlines = []
        
    print("Completed processing {:d} lines.".format(counter))

In [4]:
help(pd.DataFrame.drop_duplicates)

Help on method drop_duplicates in module pandas.core.frame:

drop_duplicates(self, cols=None, take_last=False, inplace=False) unbound pandas.core.frame.DataFrame method
    Return DataFrame with duplicate rows removed, optionally only
    considering certain columns
    
    Parameters
    ----------
    cols : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns
    take_last : boolean, default False
        Take the last observed row in a row. Defaults to the first row
    inplace : boolean, default False
        Whether to drop duplicates in place or to return a copy
    
    Returns
    -------
    deduplicated : DataFrame



## 4. Load the Windows Defender Classifications and Combine with ClamAV Classifications.
    - script: combine_av_reports.py

In [8]:
windefmals = pd.read_csv('data/defender001.csv')
windefmals.head()

Unnamed: 0,filename,malware_type
0,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE
1,0004376a62e22f6ad359467eb742b8ff,Worm:Win32/Picsys.C
2,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip
3,00072ed24314e91b63b425b3dc572f50,VirTool:Win32/VBInject.UG
4,00092d369958b67557da8661cc9093bc,Adware:Win32/Hotbar


In [9]:
windefmals.shape

(97347, 2)

In [10]:
clammals.head()

Unnamed: 0,filename,malware_type
0,46b510e161423a7e626adc3d95440f44,Win.Trojan.Dialer-729
1,1103c897ed2979339774f48ff47c0203,Win.Trojan.Jorik-10673
2,1835b8c9ed56ca729ad664e4c1725b1c,Win.Worm.Mydoom-7
3,da301519b87e8b796ece22b3f4c13429,Win.Trojan.11484026-1
4,579659363281e349a93adfe5cfadf320,Win.Trojan.Sality-8178


In [11]:
clammals.shape

(131073, 2)

In [12]:
131073 - 97347

33726

In [13]:
moks.head()

Unnamed: 0,filename,malware_type
5,3d91f9da7b6ddd05f7fc3e6854ba51b9,OK
9,afeca052db9266bcdeb97d6f2a61a5e9,OK
12,ba251cd16eb5f6b16efbdd65f28eafc2,OK
17,7d03f1d4bcf044d44dec7396e750bef9,OK
18,9d03b0c2f333fb339e4e47359af759ef,OK


In [14]:
moks.shape

(38898, 2)

In [21]:
allmals = clammals.merge(windefmals, on='file_name', how='outer', indicator=True, sort=True)

In [22]:
allmals.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both
3,00027c21667d9119a454df8cef2dc1c7,OK,Trojan:JS/Redirector.QE,both
4,0003887ab64b8ae19ffa988638decac2,OK,,left_only
5,000403e4e488356b7535cc613fbeb80b,OK,TrojanDownloader:Win32/Fosniw.B,both
6,0004376a62e22f6ad359467eb742b8ff,Win.Worm.Picsys-1,Worm:Win32/Picsys.C,both
7,0004c8b2a0f4680a5694d74199b40ea2,Win.Adware.Loadmoney-12162,SoftwareBundler:Win32/ICLoader,both
8,000595d8b586915c12053104cf845097,Win.Adware.Mplug-2637,BrowserModifier:Win32/Diplugem,both
9,000634f03457d088c71dbffb897b1315,OK,Worm:Win32/Rebhip,both


In [27]:
uniq_allmals = allmals.drop_duplicates(subset='file_name', keep='first')

In [28]:
uniq_allmals.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both
3,00027c21667d9119a454df8cef2dc1c7,OK,Trojan:JS/Redirector.QE,both
4,0003887ab64b8ae19ffa988638decac2,OK,,left_only
5,000403e4e488356b7535cc613fbeb80b,OK,TrojanDownloader:Win32/Fosniw.B,both
6,0004376a62e22f6ad359467eb742b8ff,Win.Worm.Picsys-1,Worm:Win32/Picsys.C,both
7,0004c8b2a0f4680a5694d74199b40ea2,Win.Adware.Loadmoney-12162,SoftwareBundler:Win32/ICLoader,both
8,000595d8b586915c12053104cf845097,Win.Adware.Mplug-2637,BrowserModifier:Win32/Diplugem,both
9,000634f03457d088c71dbffb897b1315,OK,Worm:Win32/Rebhip,both


In [29]:
uniq_allmals.shape

(131074, 4)

In [33]:
filled_uniq_allmals = uniq_allmals.replace(np.NaN, 'OK')

In [34]:
filled_uniq_allmals.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both
3,00027c21667d9119a454df8cef2dc1c7,OK,Trojan:JS/Redirector.QE,both
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only
5,000403e4e488356b7535cc613fbeb80b,OK,TrojanDownloader:Win32/Fosniw.B,both
6,0004376a62e22f6ad359467eb742b8ff,Win.Worm.Picsys-1,Worm:Win32/Picsys.C,both
7,0004c8b2a0f4680a5694d74199b40ea2,Win.Adware.Loadmoney-12162,SoftwareBundler:Win32/ICLoader,both
8,000595d8b586915c12053104cf845097,Win.Adware.Mplug-2637,BrowserModifier:Win32/Diplugem,both
9,000634f03457d088c71dbffb897b1315,OK,Worm:Win32/Rebhip,both


In [35]:
filled_uniq_allmals.shape

(131074, 4)

In [36]:
# Now we have our combined AV results, write to file.
filled_uniq_allmals.to_csv('data/sorted-av-report.csv', index=False)

In [38]:
moks = filled_uniq_allmals[filled_uniq_allmals['malware_type_x'] == 'OK'] 
moks = moks[moks['malware_type_y'] == 'OK']
moks.to_csv('data/malok.csv', index=False)
# these are all classified as OK by ClamAV and Windows Defender, 
# so we have to send them to VirusTotal.com for a second opinion.

In [39]:
moks.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only
16,000ac11fa7587b2316470b154254a219,OK,OK,left_only
22,000d8dda1d4d1a88276e2b25a064fa43,OK,OK,left_only
38,001ea100e348e7f72a8f1b5f737dbd0a,OK,OK,left_only
40,001ff0574000822be988193df8166bd2,OK,OK,left_only
45,00233e467ec20f975b09c3407877f7eb,OK,OK,left_only
47,0025420de5eeae2b56a44366aabdfe7a,OK,OK,left_only
49,0025cc13683331a61986b6433e768f3f,OK,OK,left_only
56,002efa40dbab524e00c66988a51ca1c2,OK,OK,left_only
88,00423f1656a26c53a787304f27aa60cd,OK,OK,left_only


In [40]:
moks.shape

(16918, 4)

## 5. Munge the Two Malware Classifications Together and Generate Unique Scalar Values.

In [2]:
mals = pd.read_csv('data/sorted-av-report.csv')
mals.head()

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both
3,00027c21667d9119a454df8cef2dc1c7,OK,Trojan:JS/Redirector.QE,both
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only


In [3]:
mals.shape

(131074, 4)

In [5]:
scalar_labels = [0] * mals.shape[0]
len(scalar_labels)

131074

In [3]:
type_x = mals['malware_type_x']
type_y = mals['malware_type_y']
x_ok = type_x[type_x == 'OK']
y_ok = type_y[type_y == 'OK']
len(x_ok)

38899

In [5]:
len(y_ok)

39869

In [6]:
# Now generate unique scalar label map, we will use ClamAV as the default classification, if ClamAV is OK
# and Defender is not OK, then use the Defender classification, if both are OK then default to 0 label value for now.
scalar_labels = [0] * mals.shape[0]
label_map = {}
counter = 0
for idx, x_val in enumerate(type_x):
    if x_val == 'OK':
        if type_y.iloc[idx] != 'OK':
            mals.iloc[idx,1] = mals.iloc[idx,2] # copy the defender classification to ClamAV classification
        else:
            continue # leave the scalar label == 0
            
    # now add the classification to the label map with a new scalar value
    if mals.iloc[idx,1] not in label_map.keys():
        counter += 1
        label_map[mals.iloc[idx,1]] = counter
        
    # now get the scalar label for this malware sample
    scalar_labels[idx] = label_map[mals.iloc[idx,1]]
        
mals['label'] = scalar_labels
mals.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge,label
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both,1
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both,2
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both,3
3,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE,Trojan:JS/Redirector.QE,both,4
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only,0
5,000403e4e488356b7535cc613fbeb80b,TrojanDownloader:Win32/Fosniw.B,TrojanDownloader:Win32/Fosniw.B,both,5
6,0004376a62e22f6ad359467eb742b8ff,Win.Worm.Picsys-1,Worm:Win32/Picsys.C,both,6
7,0004c8b2a0f4680a5694d74199b40ea2,Win.Adware.Loadmoney-12162,SoftwareBundler:Win32/ICLoader,both,7
8,000595d8b586915c12053104cf845097,Win.Adware.Mplug-2637,BrowserModifier:Win32/Diplugem,both,8
9,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip,Worm:Win32/Rebhip,both,9


In [7]:
mals.to_csv('data/sorted_train_labels.csv', index=False)

In [14]:
# Output the malware scalar classifications.
fop = open('data/malware-class-labels.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','class'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = label_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, label_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed label {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} labels.".format(counter))    
fop.close()

Completed processing 10506 labels.


In [8]:
131074 - 16918

114156

In [None]:
help(allmals.replace)

## 6. Munge the Two Malware Classifications Together and Generate Malware Families.
    Experiment 1, use truncated ClamAV or WinDefender definitions to generate malware families and
    assign a scalar training label to each family.  
    
    - DEPRECATED: use code in section 7 below.

In [4]:
mals = pd.read_csv('data/sorted_train_labels.csv')
mals.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge,label
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both,1
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both,2
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both,3
3,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE,Trojan:JS/Redirector.QE,both,4
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only,0
5,000403e4e488356b7535cc613fbeb80b,TrojanDownloader:Win32/Fosniw.B,TrojanDownloader:Win32/Fosniw.B,both,5
6,0004376a62e22f6ad359467eb742b8ff,Win.Worm.Picsys-1,Worm:Win32/Picsys.C,both,6
7,0004c8b2a0f4680a5694d74199b40ea2,Win.Adware.Loadmoney-12162,SoftwareBundler:Win32/ICLoader,both,7
8,000595d8b586915c12053104cf845097,Win.Adware.Mplug-2637,BrowserModifier:Win32/Diplugem,both,8
9,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip,Worm:Win32/Rebhip,both,9


In [3]:
mals.shape

(131074, 4)

In [7]:
# Now generate unique scalar label map for malware families, converting Windows Defender format to ClamAV as necessary.

type_x = mals['malware_type_x']
scalar_labels = [0] * mals.shape[0]
family_labels = [' '] * mals.shape[0]
family_label_map = {}
sample_counter_map = {}
family_counter_map = {}
counter = 0
p1 = re.compile('(\w+):(\w+)/(\w+)[!.-/]+(\w+)') # Windows Defender malware definition patterns
p2 = re.compile('(\w+):(\w+)/(\w+)')
pcav = re.compile('(\w+)\.(\w+)\.(\w+)[!./-](\w+)') # ClamAV malware definition pattern
malware_family = 'unknown'

for idx, x_val in enumerate(type_x):
    # first count the sample type
    if x_val in sample_counter_map.keys():
        sample_counter_map[x_val] += 1
    else:
        sample_counter_map[x_val] = 1
        
    if x_val != 'OK':
        # now check if it is a ClamAV definition.
        pos = x_val.find('-')
        if pos > 0:
            malware_family = x_val[0:pos]
        else:
            malware_family = x_val        
        # if it is a defender classification then convert to ClamAV classification.
        m = p1.match(x_val)
        if m != None:
            malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)
        else:
            m = p2.match(x_val)
            if m != None:
                malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)           
    else:
        continue # leave the scalar label == 0, the malware sample has not been classified.
            
    # now add the classification to the label map with a new scalar value
    if malware_family not in family_label_map.keys():
        counter += 1
        family_label_map[malware_family] = counter
        
    # Count the malware family occurrences.
    if (malware_family in family_counter_map.keys()):
        family_counter_map[malware_family] += 1
    else:
        family_counter_map[malware_family] = 1
                         
    # now get the scalar label for this malware sample
    scalar_labels[idx] = family_label_map[malware_family]
    family_labels[idx] = malware_family
        
    if (idx % 1000) == 0: # report progress
        print("Processed family label {:s} -> {:d}.".format(malware_family, family_label_map[malware_family]))
        
# Finish off by adding malware family label to training label set.
mals['family_label'] = scalar_labels
mals['family_label_str'] = family_labels
mals.head(20)

Processed family label Win.Worm.Tufik -> 1.
Processed family label Win.Trojan.Antifw -> 103.
Processed family label Win.Trojan.Sality -> 60.
Processed family label Legacy.Trojan.Agent -> 19.
Processed family label JS.Trojan.Redirector -> 3.
Processed family label Win.Trojan.Small -> 447.
Processed family label Win.Worm.Soltern -> 30.
Processed family label Win.Trojan.11484026 -> 42.
Processed family label Win.Downloader.94061 -> 179.
Processed family label Win.Adware.Zango -> 612.
Processed family label Win.Trojan.Aliser -> 199.
Processed family label Win32.VirTool.CeeInject -> 197.
Processed family label Win.Trojan.Cosmu -> 82.
Processed family label Win.Adware.Trymedia -> 39.
Processed family label Win.Trojan.Dialer -> 76.
Processed family label Win.Trojan.Firseria -> 92.
Processed family label Win.Trojan.KillAV -> 41.
Processed family label Win32.TrojanDownloader.Malushka -> 122.
Processed family label Win32.SoftwareBundler.OutBrowse -> 12.
Processed family label Win.Trojan.Adinstal

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge,label,family_label,family_label_str
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both,1,1,Win.Worm.Tufik
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both,2,2,Win.Trojan.Agent
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both,3,2,Win.Trojan.Agent
3,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE,Trojan:JS/Redirector.QE,both,4,3,JS.Trojan.Redirector
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only,0,0,
5,000403e4e488356b7535cc613fbeb80b,TrojanDownloader:Win32/Fosniw.B,TrojanDownloader:Win32/Fosniw.B,both,5,4,Win32.TrojanDownloader.Fosniw
6,0004376a62e22f6ad359467eb742b8ff,Win.Worm.Picsys-1,Worm:Win32/Picsys.C,both,6,5,Win.Worm.Picsys
7,0004c8b2a0f4680a5694d74199b40ea2,Win.Adware.Loadmoney-12162,SoftwareBundler:Win32/ICLoader,both,7,6,Win.Adware.Loadmoney
8,000595d8b586915c12053104cf845097,Win.Adware.Mplug-2637,BrowserModifier:Win32/Diplugem,both,8,7,Win.Adware.Mplug
9,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip,Worm:Win32/Rebhip,both,9,8,Win32.Worm.Rebhip


In [8]:
mals.to_csv('data/sorted-family-train-labels.csv', index=False)

In [11]:
# Output the malware family scalar classifications.
fop = open('data/malware-family-labels.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','class'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = family_label_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, family_label_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed family label {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} family labels.".format(len(sorted_keys)))    
fop.close()

Completed processing 2730 family labels.


In [12]:
# Output the malware classification counts.
fop = open('data/malware-class-counts.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','count'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = sample_counter_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, sample_counter_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed sample {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} samples.".format(len(sorted_keys)))    
fop.close()

Completed processing 10507 samples.


In [13]:
# Output the malware family counts.
fop = open('data/malware-family-counts.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','count'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = family_counter_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, family_counter_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed family {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} families.".format(len(sorted_keys)))    
fop.close()

Completed processing 2730 families.


## 7. Munge the Two Malware Classifications Together and Generate Malware Families.
    Experiment 2, use truncated ClamAV or WinDefender definitions to generate malware families and
    assign a scalar training label to each family. Use the WinDefender definitions by default or ClamAV
    if WinDefender classifies as OK. Start fresh with sorted-av-report.csv and generate new malware classification
    labels and family labels.
    
    - Script: generate-train-labels.py

In [25]:
mals = pd.read_csv('data/sorted-av-report.csv')
mals.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both
3,00027c21667d9119a454df8cef2dc1c7,OK,Trojan:JS/Redirector.QE,both
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only
5,000403e4e488356b7535cc613fbeb80b,OK,TrojanDownloader:Win32/Fosniw.B,both
6,0004376a62e22f6ad359467eb742b8ff,Win.Worm.Picsys-1,Worm:Win32/Picsys.C,both
7,0004c8b2a0f4680a5694d74199b40ea2,Win.Adware.Loadmoney-12162,SoftwareBundler:Win32/ICLoader,both
8,000595d8b586915c12053104cf845097,Win.Adware.Mplug-2637,BrowserModifier:Win32/Diplugem,both
9,000634f03457d088c71dbffb897b1315,OK,Worm:Win32/Rebhip,both


In [26]:
# Now generate unique scalar label map, we will use WinDefender as the default classification, if WinDefender is OK
# and ClamAV is not OK, then use the ClamAV classification, if both are OK then default to 0 label value for now.
type_x = np.array(mals['malware_type_x'])
type_y = np.array(mals['malware_type_y'])
scalar_labels = [0] * mals.shape[0]
scalar_label_map = {}
counter = 0
scalar_label_map['OK'] = 0

for idx, y_val in enumerate(type_y):
    if y_val != 'OK':
        mals.iloc[idx,1] = mals.iloc[idx,2] # copy the defender classification to ClamAV classification
            
    # now add the classification to the label map with a new scalar value
    if mals.iloc[idx,1] not in scalar_label_map.keys():
        counter += 1
        scalar_label_map[mals.iloc[idx,1]] = counter
        
    # now get the scalar label for this malware sample
    scalar_labels[idx] = scalar_label_map[mals.iloc[idx,1]]
        
mals['sample_label'] = scalar_labels
mals.head(20)

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge,sample_label
0,00002e640cafb741bea9a48eaee27d6f,Virus:Win32/Parite.B,Virus:Win32/Parite.B,both,1
1,000118d12cbf9ad6103e8b914a6e1ac3,SoftwareBundler:Win32/Techsnab,SoftwareBundler:Win32/Techsnab,both,2
2,0001776237ac37a69fcef93c1bac0988,TrojanDropper:Win32/Sventore.B,TrojanDropper:Win32/Sventore.B,both,3
3,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE,Trojan:JS/Redirector.QE,both,4
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only,0
5,000403e4e488356b7535cc613fbeb80b,TrojanDownloader:Win32/Fosniw.B,TrojanDownloader:Win32/Fosniw.B,both,5
6,0004376a62e22f6ad359467eb742b8ff,Worm:Win32/Picsys.C,Worm:Win32/Picsys.C,both,6
7,0004c8b2a0f4680a5694d74199b40ea2,SoftwareBundler:Win32/ICLoader,SoftwareBundler:Win32/ICLoader,both,7
8,000595d8b586915c12053104cf845097,BrowserModifier:Win32/Diplugem,BrowserModifier:Win32/Diplugem,both,8
9,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip,Worm:Win32/Rebhip,both,9


In [27]:
mals.to_csv('data/sorted-av-report-labels-wd.csv', index=False)

In [28]:
# Output the malware sample scalar classifications.
fop = open('data/malware-class-labels-wd.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','class'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = scalar_label_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, scalar_label_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed label {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} labels.".format(counter))    
fop.close()

Completed processing 5986 labels.


In [29]:
# Now generate unique scalar label map for malware families.

type_x = np.array(mals['malware_type_x'])
#type_y = mals['malware_type_y']
family_scalar_labels = [0] * mals.shape[0]
family_labels = [' '] * mals.shape[0]
family_label_map = {}
sample_counter_map = {}
family_counter_map = {}
counter = 0
pwd1 = re.compile('(\w+):(\w+)/(\w+)[!.-/]+(\w+)') # Windows Defender malware definition patterns.
pwd2 = re.compile('(\w+):(\w+)/(\w+)')
pcav = re.compile('(\w+)\.(\w+)\.(\w+)[!./-](\w+)') # ClamAV malware definition pattern.
malware_family = 'unknown'
family_label_map['unknown'] = 0 # The default family scalar label.

for idx, x_val in enumerate(type_x):
    # first count the sample type
    if x_val in sample_counter_map.keys():
        sample_counter_map[x_val] += 1
    else:
        sample_counter_map[x_val] = 1
        
    if x_val != 'OK':
        # if it is a defender classification then convert to ClamAV definition style.
        m = pwd1.match(x_val)
        if m != None:
            malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3) # rearrange the components to
        else:                                                                 # (platform).(class).(type)
            m = pwd2.match(x_val)
            if m != None:
                malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3) 
            else:
                # then check if it is a ClamAV definition.
                m = pcav.match(x_val)
                if m != None:        # just truncate the end bit off.
                    malware_family = m.group(1) + '.' + m.group(2) + '.' + m.group(3)
                else:
                    malware_family = x_val  # catch the corner cases and default to original name/definition.
        
    else:
        malware_family = 'unknown' # leave the scalar label == 0, the malware sample has not been classified.
            
    # now add the classification to the label map with a new scalar value
    if malware_family not in family_label_map.keys():
        counter += 1
        family_label_map[malware_family] = counter
        
    # Count the malware family occurrences.
    if (malware_family in family_counter_map.keys()):
        family_counter_map[malware_family] += 1
    else:
        family_counter_map[malware_family] = 1
                         
    # now get the scalar label for this malware sample
    family_scalar_labels[idx] = family_label_map[malware_family]
    family_labels[idx] = malware_family
        
    if (idx % 1000) == 0: # report progress
        print("Processed family label {:s} -> {:d}.".format(malware_family, family_label_map[malware_family]))
        
# Finish off by adding malware family label to training label set.
mals['family_label'] = family_scalar_labels
mals['family_label_str'] = family_labels
mals.head(20)

Processed family label Win32.Virus.Parite -> 1.
Processed family label Win32.BrowserModifier.Diplugem -> 8.
Processed family label Win32.SoftwareBundler.Fourthrem -> 51.
Processed family label VBS.Virus.Ramnit -> 18.
Processed family label JS.Trojan.Redirector -> 4.
Processed family label Win32.Trojan.Flymux -> 479.
Processed family label Win32.Worm.Soltern -> 29.
Processed family label Win.Trojan.11484026 -> 38.
Processed family label Win32.TrojanDownloader.Renos -> 165.
Processed family label Win32.Adware.Hotbar -> 11.
Processed family label Win.Trojan.Aliser -> 183.
Processed family label Win32.VirTool.CeeInject -> 181.
Processed family label Win32.Worm.VB -> 73.
Processed family label unknown -> 0.
Processed family label Win.Adware.Trymedia -> 35.
Processed family label Win32.Dialer.CarpeDiem -> 68.
Processed family label Win.Trojan.Firseria -> 83.
Processed family label Win32.Worm.Yuner -> 78.
Processed family label Win32.TrojanDownloader.Malushka -> 113.
Processed family label Wi

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge,sample_label,family_label,family_label_str
0,00002e640cafb741bea9a48eaee27d6f,Virus:Win32/Parite.B,Virus:Win32/Parite.B,both,1,1,Win32.Virus.Parite
1,000118d12cbf9ad6103e8b914a6e1ac3,SoftwareBundler:Win32/Techsnab,SoftwareBundler:Win32/Techsnab,both,2,2,Win32.SoftwareBundler.Techsnab
2,0001776237ac37a69fcef93c1bac0988,TrojanDropper:Win32/Sventore.B,TrojanDropper:Win32/Sventore.B,both,3,3,Win32.TrojanDropper.Sventore
3,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE,Trojan:JS/Redirector.QE,both,4,4,JS.Trojan.Redirector
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only,0,0,unknown
5,000403e4e488356b7535cc613fbeb80b,TrojanDownloader:Win32/Fosniw.B,TrojanDownloader:Win32/Fosniw.B,both,5,5,Win32.TrojanDownloader.Fosniw
6,0004376a62e22f6ad359467eb742b8ff,Worm:Win32/Picsys.C,Worm:Win32/Picsys.C,both,6,6,Win32.Worm.Picsys
7,0004c8b2a0f4680a5694d74199b40ea2,SoftwareBundler:Win32/ICLoader,SoftwareBundler:Win32/ICLoader,both,7,7,Win32.SoftwareBundler.ICLoader
8,000595d8b586915c12053104cf845097,BrowserModifier:Win32/Diplugem,BrowserModifier:Win32/Diplugem,both,8,8,Win32.BrowserModifier.Diplugem
9,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip,Worm:Win32/Rebhip,both,9,9,Win32.Worm.Rebhip


In [30]:
mals.to_csv('data/sorted-family-train-labels-wd.csv', index=False)

In [31]:
# Output the malware family scalar classifications.
fop = open('data/malware-family-labels-wd.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','class'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = family_label_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, family_label_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed family label {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} family labels.".format(len(sorted_keys)))    
fop.close()

Completed processing 2055 family labels.


In [32]:
# Output the malware sample classification counts.
fop = open('data/malware-class-counts-wd.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','count'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = sample_counter_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, sample_counter_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed sample {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} samples.".format(len(sorted_keys)))    
fop.close()

Completed processing 5987 samples.


In [33]:
# Output the malware family counts.
fop = open('data/malware-family-counts-wd.csv', 'w')
csv_wouter = writer(fop)
cols = ['malware_type','count'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = family_counter_map.keys()
sorted_keys.sort()
for key in sorted_keys:
    outlines.append([key, family_counter_map[key]])
    if (idx % 100) == 0: # write out some lines
        csv_wouter.writerows(outlines)
        outlines = []
        print("Processed family {:s} -> {:d}.".format(key, val))
            
# Finish off.
if (len(outlines) > 0):
    csv_wouter.writerows(outlines)
    outlines = []
        
print("Completed processing {:d} families.".format(len(sorted_keys)))    
fop.close()

Completed processing 2055 families.


In [4]:
# Join the malware family sample scalar classifications and counts.
cldf = pd.read_csv('data/malware-family-labels-wd.csv')
ccdf = pd.read_csv('data/malware-family-counts-wd.csv')
cjdf = pd.merge(cldf,ccdf,on='malware_type')
cjdf.to_csv('data/malware-family-wd.csv', index=False)

# Join the malware sample scalar classifications and counts.
cldf = pd.read_csv('data/malware-class-labels-wd.csv')
ccdf = pd.read_csv('data/malware-class-counts-wd.csv')
cjdf = pd.merge(cldf,ccdf,on='malware_type')
cjdf.to_csv('data/malware-class-wd.csv', index=False)

In [None]:
help(pd.merge)

In [2]:
mals = pd.read_csv('data/sorted-family-train-labels-wd.csv')
mals.head()

In [5]:
mals.drop(['malware_type_y', '_merge'], axis=1, inplace=True)

In [6]:
mals.head(20)

Unnamed: 0,filename,malware_type_x,sample_label,family_label,family_label_str
0,00002e640cafb741bea9a48eaee27d6f,Virus:Win32/Parite.B,1,1,Win32.Virus.Parite
1,000118d12cbf9ad6103e8b914a6e1ac3,SoftwareBundler:Win32/Techsnab,2,2,Win32.SoftwareBundler.Techsnab
2,0001776237ac37a69fcef93c1bac0988,TrojanDropper:Win32/Sventore.B,3,3,Win32.TrojanDropper.Sventore
3,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE,4,4,JS.Trojan.Redirector
4,0003887ab64b8ae19ffa988638decac2,OK,0,0,unknown
5,000403e4e488356b7535cc613fbeb80b,TrojanDownloader:Win32/Fosniw.B,5,5,Win32.TrojanDownloader.Fosniw
6,0004376a62e22f6ad359467eb742b8ff,Worm:Win32/Picsys.C,6,6,Win32.Worm.Picsys
7,0004c8b2a0f4680a5694d74199b40ea2,SoftwareBundler:Win32/ICLoader,7,7,Win32.SoftwareBundler.ICLoader
8,000595d8b586915c12053104cf845097,BrowserModifier:Win32/Diplugem,8,8,Win32.BrowserModifier.Diplugem
9,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip,9,9,Win32.Worm.Rebhip


In [8]:
mals.to_csv('data/sorted-train-labels.csv', index=False)

### 7.1 Validate Munging and Scalar Label Generation Between Runs.

In [15]:
# Load in the train labels for each sample set run then compare the 
# training label value for each malware family and class to ensure
# each one has a unique scalar training label and the same malware
# types and families have the same label.
# [filename,malware_type_x,malware_type_y,sample_label,family_name,family_label]


def validate_label_generation():
    mals1_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
    mals2_df = pd.read_csv('data/sorted-train-labels-vs263-264-apt.csv')

    counter = 0
    m1_x = np.array(mals1_df['malware_type_x'])
    m1_f = np.array(mals1_df['family_name'])
    m1_sl = np.array(mals1_df['sample_label'])
    m1_fl = np.array(mals1_df['family_label'])
    m2_x = np.array(mals2_df['malware_type_x'])
    m21_f = np.array(mals2_df['family_name'])
    m2_sl = np.array(mals2_df['sample_label'])
    m2_fl = np.array(mals2_df['family_label'])
    
    for idx1, mname1 in enumerate(m1_x):
        for idx2, mname2 in enumerate(m2_x):
            if mname1 == mname2:
                if m1_sl[idx1] != m2_sl[idx2]:
                    print("Sample label incongruence: {:d} {:d}".format(m1_sl[idx1], m2_sl[idx2]))
                    counter += 1
                    
                if (m1_fl[idx1] != m2_fl[idx2]):
                    print("Family label incongruence: {:d} {:d}".format(m1_fl[idx1], m2_fl[idx2]))
                    counter += 1            
        
        if (idx1 % 1000) == 0:
            print("Processed {:d} malware names.".format(idx1))


    print("Total Incongruence Errors: {:d}".format(counter))
    
    return

In [16]:
validate_label_generation()

Total Incongruence Errors: 0


In [10]:
# Split out the training sample sets.
def split_training_sets(training_set_directory, train_label_file, output_file):
    mals1_df = pd.read_csv(train_label_file)
    
    counter = 0
    file_list = os.listdir(training_set_directory)
    #malnames = np.array(mals1_df['file_name'])
    malnames = np.array(mals1_df['file_name'])
    truncated_filenames = []
    
    for fname in file_list:
        mname = fname[fname.find('_') + 1:]
        truncated_filenames.append(mname)
        counter += 1        
        
    #t1_df = mals1_df[mals1_df['file_name'].isin(truncated_filenames)]
    t1_df = mals1_df[mals1_df['file_name'].isin(truncated_filenames)]
    
    t1_df.to_csv(output_file, index=False)
    
    
    return t1_df


In [11]:
s1_df = split_training_sets('/opt/vs/train1/', 'data/sorted-train-labels-vs251-252.csv', 'data/sorted-train-labels-vs251.csv')
s1_df.head()

Unnamed: 0,file_name,malware_type_x,sample_label,family_name,family_label
3,00027c21667d9119a454df8cef2dc1c7,Trojan:JS/Redirector.QE,4,JS.Trojan.Redirector,4
4,0003887ab64b8ae19ffa988638decac2,OK,0,unknown,0
6,0004376a62e22f6ad359467eb742b8ff,Worm:Win32/Picsys.C,6,Win32.Worm.Picsys,6
9,000634f03457d088c71dbffb897b1315,Worm:Win32/Rebhip,9,Win32.Worm.Rebhip,9
10,00072ed24314e91b63b425b3dc572f50,VirTool:Win32/VBInject.UG,10,Win32.VirTool.VBInject,10


In [12]:
s1_df.shape

(65536, 5)

In [13]:
s2_df = split_training_sets('/opt/vs/train2/', 'data/sorted-train-labels-vs251-252.csv', 'data/sorted-train-labels-vs252.csv')
s2_df.head()

Unnamed: 0,file_name,malware_type_x,sample_label,family_name,family_label
0,00002e640cafb741bea9a48eaee27d6f,Virus:Win32/Parite.B,1,Win32.Virus.Parite,1
1,000118d12cbf9ad6103e8b914a6e1ac3,SoftwareBundler:Win32/Techsnab,2,Win32.SoftwareBundler.Techsnab,2
2,0001776237ac37a69fcef93c1bac0988,TrojanDropper:Win32/Sventore.B,3,Win32.TrojanDropper.Sventore,3
5,000403e4e488356b7535cc613fbeb80b,TrojanDownloader:Win32/Fosniw.B,5,Win32.TrojanDownloader.Fosniw,5
7,0004c8b2a0f4680a5694d74199b40ea2,SoftwareBundler:Win32/ICLoader,7,Win32.SoftwareBundler.ICLoader,7


In [14]:
s3_df = split_training_sets('/opt/vs/train3/', 'data/sorted-train-labels-vs263-264-apt.csv', 'data/sorted-train-labels-vs263.csv')
s3_df.head()

Unnamed: 0,file_name,malware_type_x,sample_label,family_name,family_label
2,0002b2f621ea5786be03bf4153532dce,PWS:Win32/OnLineGames.LW,59,Win32.PWS.OnLineGames,23
5,000401419eccde59975c713cfadc974c,Worm:Win32/Soltern!rfn,36,Win32.Worm.Soltern,29
6,00042f23bc15b89d9c6a7bde0e316f8b,Rogue:Win32/FakeRean,117,Win32.Rogue.FakeRean,93
7,0004824a60ff9fe1fb30d669a5baa627,Worm:Win32/Soltern.L,30,Win32.Worm.Soltern,29
8,0004c49071481789f1c8c80656638497,OK,0,unknown,0


In [15]:
s4_df = split_training_sets('/opt/vs/train4/', 'data/sorted-train-labels-vs263-264-apt.csv', 'data/sorted-train-labels-vs264.csv')
s4_df.head()

Unnamed: 0,file_name,malware_type_x,sample_label,family_name,family_label
0,000070db76b6dc1ee3497a3f9319848c,Trojan:JS/Redirector.QE,4,JS.Trojan.Redirector,4
1,00009cbc0a90337e4c30950a51ae3d67,Win.Adware.ForceStartPage-1,5987,Win.Adware.ForceStartPage,2055
3,0003c05a1320e64fe72438ab48da7ecf,TrojanClicker:JS/Faceliker.S,29,JS.TrojanClicker.Faceliker,28
4,0003e52a9267b657d9b08b2cbc0a2593,Trojan:JS/Redirector.QE,4,JS.Trojan.Redirector,4
9,0005743596135fe65f61da7a0eba0bb6,TrojanClicker:JS/Faceliker.D,91,JS.TrojanClicker.Faceliker,28


In [16]:
sa_df = split_training_sets('/opt/vs/apt/', 'data/sorted-train-labels-vs263-264-apt.csv', 'data/sorted-train-labels-apt.csv')
sa_df.head()

Unnamed: 0,file_name,malware_type_x,sample_label,family_name,family_label
61,001dd76872d80801692ff942308c64e6,Trojan:Win32/Sluegot.D,5992,Win32.Trojan.Sluegot,2057
75,002325a0a67fded0381b5648d7fe9b8e,Trojan:Win32/Sluegot.C,5993,Win32.Trojan.Sluegot,2057
469,00dbb9e1c09dbdafb360f3163ba5a3de,Backdoor:Win32/Stradatu,6005,Win32.Backdoor.Stradatu,2064
697,0149b7bd7218aab4e257d28469fddb0d,Trojan:Win32/Sluegot.A,6017,Win32.Trojan.Sluegot,2057
990,01e0dc079d4e33d8edd050c4900818da,Backdoor:Win32/Stradatu,6005,Win32.Backdoor.Stradatu,2064


In [17]:
s1_df = split_training_sets('/opt/vs/train1/', 'data/sorted-entropy-features-vs251-252.csv', 'data/sorted-entropy-features-vs251.csv')
s1_df.head()

Unnamed: 0,file_name,entropy,file_size
3,00027c21667d9119a454df8cef2dc1c7,0.666599,18390
4,0003887ab64b8ae19ffa988638decac2,0.90326,1134320
6,0004376a62e22f6ad359467eb742b8ff,0.803515,149720
9,000634f03457d088c71dbffb897b1315,0.957584,1725502
10,00072ed24314e91b63b425b3dc572f50,0.486112,328093


In [18]:
s1_df = split_training_sets('/opt/vs/train2/', 'data/sorted-entropy-features-vs251-252.csv', 'data/sorted-entropy-features-vs252.csv')
s1_df.head()

Unnamed: 0,file_name,entropy,file_size
0,00002e640cafb741bea9a48eaee27d6f,0.992174,208860
1,000118d12cbf9ad6103e8b914a6e1ac3,0.834382,201600
2,0001776237ac37a69fcef93c1bac0988,0.966021,682192
5,000403e4e488356b7535cc613fbeb80b,0.773787,199168
7,0004c8b2a0f4680a5694d74199b40ea2,0.985592,1165440


In [19]:
s1_df.shape

(65536, 3)

## 8. Test Code Only

In [5]:
mals1_df = pd.read_csv('data/sorted-av-report-vs251-252.csv')
mals1_df.head()

Unnamed: 0,filename,malware_type_x,malware_type_y,_merge
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B,both
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab,both
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B,both
3,00027c21667d9119a454df8cef2dc1c7,OK,Trojan:JS/Redirector.QE,both
4,0003887ab64b8ae19ffa988638decac2,OK,OK,left_only


In [6]:
mals1_df.drop('_merge', axis=1, inplace=True)
mals1_df.head()

Unnamed: 0,filename,malware_type_x,malware_type_y
0,00002e640cafb741bea9a48eaee27d6f,Win.Worm.Tufik-182,Virus:Win32/Parite.B
1,000118d12cbf9ad6103e8b914a6e1ac3,Win.Trojan.Agent-1345346,SoftwareBundler:Win32/Techsnab
2,0001776237ac37a69fcef93c1bac0988,Win.Trojan.Agent-1309696,TrojanDropper:Win32/Sventore.B
3,00027c21667d9119a454df8cef2dc1c7,OK,Trojan:JS/Redirector.QE
4,0003887ab64b8ae19ffa988638decac2,OK,OK


In [9]:
mals1_df.to_csv('data/sorted-av-report-vs251-252.csv', index=False)

In [4]:
malcounts = mals['malware_type_x'].value_counts()
malcounts

OK                                     16918
Worm:Win32/Yuner.A                      8452
Worm:Win32/VB.AT                        7830
Adware:Win32/Hotbar                     7434
BrowserModifier:Win32/Diplugem          5689
Worm:Win32/Soltern.L                    4957
SoftwareBundler:Win32/Ogimant           4243
PWS:Win32/OnLineGames.IZ                3789
Trojan:Win32/Dynamer!ac                 2527
TrojanDropper:Win32/Sventore.B          2141
Worm:Win32/Mydoom.O@mm                  1937
Win.Trojan.Morstar-7                    1768
SoftwareBundler:Win32/OutBrowse         1726
Win.Adware.Screensaver-1                1387
PWS:Win32/OnLineGames.LW                1384
Win.Trojan.Morstar-10                   1224
Worm:Win32/Soltern!rfn                  1210
Trojan:Win32/Bulta!rfn                  1054
Worm:Win32/Picsys.C                      974
Virus:VBS/Ramnit.gen!C                   958
Win.Adware.Agent-1111578                 904
Trojan:JS/Redirector.QE                  868
Win.Trojan

In [5]:
malcounts[:10].plot(kind='barh', rot=0)
plt.show()

In [8]:
# Windows Defender malware class matching patterns.

p1 = re.compile('(\w+):(\w+)/(\w+)[!.-]+(\w+)')
p2 = re.compile('(\w+):(\w+)/(\w+)')

m = p1.match('Backdoor:MSIL/Bladabindi!rfn')

# m.group(1) == 'Backdoor'
# m.group(2) == 'MSIL'
# m.group(3) == 'Bladabindi'
# m.group(4) == 'rfn'

# Convert to ClamAV style malware family
malware_family = 'unknown'
if m != None:
    malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)
else:
    m = p2.match('Backdoor:MSIL/Bladabindi')
    if m != None:
        malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)

print(malware_family)

# Convert ClamAV malware class to malware family by removing the number and hyphen from the end of the malware string.

malware_str = 'Andr.Adware.Kuguo-2'
pos = malware_str.find('-')
if pos > 0:
    malware_family = malware_str[0:pos]
else:
    malware_family = malware_str
    
print(malware_family)

MSIL.Backdoor.Bladabindi
Andr.Adware.Kuguo


In [3]:
class_labels_df = pd.read_csv('data/av-malware-class-labels.csv')
family_labels_df = pd.read_csv('data/av-malware-family-labels.csv')
vs1_df = pd.read_csv('data/sorted-av-report-vs251-252.csv')
vs2_df = pd.read_csv('data/sorted-av-report-vs263-264-apt.csv')

print("Class Labels = {:d}, Family Labels = {:d}".format(class_labels_df.shape[0], family_labels_df.shape[0]))

Class Labels = 5987, Family Labels = 2055


In [5]:
type_x = np.array(vs2_df['malware_type_x'])
type_y = np.array(vs2_df['malware_type_y'])
scalar_labels = [0] * vs2_df.shape[0]
counter = 0
scalar_label_map = {}

for idx, y_val in enumerate(type_y):
    if y_val != 'OK':
        malware_name = y_val
    else:
        malware_name = vs2_df.iloc[idx,1]

    if malware_name not in scalar_label_map.keys():
        counter += 1
        scalar_label_map[malware_name] = counter
        
    # now get the scalar label for this malware sample
    scalar_labels[idx] = scalar_label_map[malware_name]

print("Class Labels: {:d}".format(len(scalar_label_map.keys())))

Class Labels: 4596


In [7]:
sorted_train_labels_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
type_x = np.array(sorted_train_labels_df['malware_type_x'])
counter = 0
for malware_name in scalar_label_map.keys():
    if malware_name not in type_x:
        counter += 1
        
print("New Malware Types: {:d}".format(counter))

New Malware Types: 2347


In [8]:
5987 + 2347

8334

In [2]:
class_labels_df = pd.read_csv('data/av-malware-class-labels.csv')
family_labels_df = pd.read_csv('data/av-malware-family-labels.csv')
vs1_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
vs2_df = pd.read_csv('data/sorted-train-labels-vs263-264-apt.csv')

print("Class Labels = {:d}, Family Labels = {:d}".format(class_labels_df.shape[0], family_labels_df.shape[0]))

Class Labels = 5987, Family Labels = 2055


In [3]:
newclass_labels_df = pd.read_csv('data/av-malware-class-labels-wd.csv')
newfamily_labels_df = pd.read_csv('data/av-malware-family-labels-wd.csv')

print("Class Labels = {:d}, Family Labels = {:d}".format(newclass_labels_df.shape[0], newfamily_labels_df.shape[0]))

Class Labels = 8334, Family Labels = 2737


In [10]:
newclass_labels_df = pd.read_csv('data/av-malware-class-labels-wd.csv')
newfamily_labels_df = pd.read_csv('data/av-malware-family-labels-wd.csv')

print("Class Labels = {:d}, Family Labels = {:d}".format(newclass_labels_df.shape[0], newfamily_labels_df.shape[0]))

Class Labels = 8334, Family Labels = 2737


In [4]:
class_labels_df = pd.read_csv('data/av-malware-class-labels.csv')
family_labels_df = pd.read_csv('data/av-malware-family-labels.csv')
vs1_df = pd.read_csv('data/sorted-train-labels-vs251.csv')
vs2_df = pd.read_csv('data/sorted-train-labels-vs252.csv')
vs3_df = pd.read_csv('data/sorted-train-labels-vs263.csv')
vs4_df = pd.read_csv('data/sorted-train-labels-vs264.csv')
vs5_df = pd.read_csv('data/sorted-train-labels-apt.csv')
print("Class Labels = {:d}, Family Labels = {:d}".format(class_labels_df.shape[0], family_labels_df.shape[0]))
ok_count = vs1_df["malware_type_x"].value_counts()
ok_count

Class Labels = 8334, Family Labels = 2737


OK                                   8007
Worm:Win32/Soltern.L                 4745
Worm:Win32/Yuner.A                   4035
Adware:Win32/Hotbar                  3787
Worm:Win32/VB.AT                     3756
BrowserModifier:Win32/Diplugem       2098
PWS:Win32/OnLineGames.IZ             2028
SoftwareBundler:Win32/Ogimant        1980
Worm:Win32/Soltern!rfn               1206
Trojan:Win32/Dynamer!ac              1182
Worm:Win32/Mydoom.O@mm               1127
Worm:Win32/Picsys.C                   960
Win.Trojan.Morstar-7                  890
TrojanDropper:Win32/Sventore.B        821
Win.Adware.Screensaver-1              759
SoftwareBundler:Win32/OutBrowse       755
PWS:Win32/OnLineGames.LW              750
Win.Trojan.Morstar-10                 583
Trojan:Win32/Bulta!rfn                561
Win.Trojan.11484026-1                 516
Win.Adware.Agent-1111578              447
Worm:Win32/Mydoom.L@mm                417
Win.Trojan.Trymedia-7                 313
Win.Trojan.Downloadware-15        

In [6]:
vs1_df.shape[0] - 8007

57529

In [7]:
ok_count = vs2_df["malware_type_x"].value_counts()
ok_count

OK                                 8911
Worm:Win32/Yuner.A                 4417
Worm:Win32/VB.AT                   4074
Adware:Win32/Hotbar                3647
BrowserModifier:Win32/Diplugem     3591
SoftwareBundler:Win32/Ogimant      2263
PWS:Win32/OnLineGames.IZ           1761
Trojan:Win32/Dynamer!ac            1345
TrojanDropper:Win32/Sventore.B     1320
SoftwareBundler:Win32/OutBrowse     971
Win.Trojan.Morstar-7                878
Worm:Win32/Mydoom.O@mm              810
Virus:VBS/Ramnit.gen!C              702
Trojan:JS/Redirector.QE             667
Win.Trojan.Morstar-10               641
PWS:Win32/OnLineGames.LW            634
Win.Adware.Screensaver-1            628
Trojan:Win32/Bulta!rfn              493
Win.Adware.Agent-1111578            457
Trojan:Win32/Rimecud.A              342
Win.Adware.913802-1                 336
Win.Trojan.11484026-1               318
Win.Trojan.Morstar-12               318
Win.Adware.Trymedia-3               313
Exploit:HTML/IframeRef.gen          307


In [8]:
vs2_df.shape[0] - 8911

56625

In [9]:
ok_count = vs3_df["malware_type_x"].value_counts()
ok_count

OK                                    13924
Worm:Win32/Soltern.L                   8356
Trojan:JS/Redirector.QE                3571
Adware:Win32/Hotbar                    3340
Worm:Win32/Soltern!rfn                 2474
BrowserModifier:Win32/Diplugem         1800
PWS:Win32/OnLineGames.IZ               1742
Worm:Win32/Picsys.C                    1661
Trojan:Win32/Dynamer!ac                1133
Win.Adware.Screensaver-1                828
Win.Trojan.11484026-1                   655
PWS:Win32/OnLineGames.LW                613
Worm:Win32/Mydoom.O@mm                  550
Virus:VBS/Ramnit.gen!C                  469
Win.Adware.Agent-1126070                455
Win.Trojan.Downloadware-15              455
SoftwareBundler:Win32/Ogimant           447
Win.Adware.Imali-17                     429
Win.Trojan.Trymedia-7                   408
Trojan:JS/HideLink.A                    398
TrojanClicker:JS/Faceliker.A            393
Worm:Win32/Yuner.A                      363
Trojan:JS/Iframe.AE             

In [10]:
vs3_df.shape[0] - 13924

51612

In [11]:
ok_count = vs4_df["malware_type_x"].value_counts()
ok_count

OK                                   23262
Trojan:JS/Redirector.QE              11217
Worm:Win32/Soltern.L                  2967
BrowserModifier:Win32/Diplugem        1700
Virus:VBS/Ramnit.gen!C                1654
TrojanClicker:JS/Faceliker.A          1480
TrojanClicker:JS/Faceliker.S          1270
Trojan:JS/Iframe.AE                   1168
Trojan:JS/HideLink.A                  1005
Worm:Win32/Soltern!rfn                 830
TrojanClicker:JS/Faceliker.D           763
Adware:Win32/Hotbar                    632
Trojan:JS/Redirector.QD                595
Worm:Win32/Picsys.C                    533
TrojanClicker:JS/Faceliker.C           502
Exploit:HTML/IframeRef.gen             473
Trojan:JS/Iframe.EP                    453
PWS:Win32/OnLineGames.IZ               449
Win.Adware.Imali-17                    444
SoftwareBundler:Win32/Bervisec         438
Trojan:JS/Redirector.ON                410
Trojan:JS/Iframeinject                 358
Virus:VBS/Ramnit.gen!A                 349
SoftwareBun

In [12]:
vs4_df.shape[0] - 23262

42274

In [13]:
ok_count = vs5_df["malware_type_x"].value_counts()
ok_count

Trojan:Win32/Connapts               40
Backdoor:Win32/Likseput.B           25
Backdoor:Win32/Neporoot.A           21
Trojan:Win32/Sluegot.A              13
Backdoor:Win32/Tartober.A           12
Backdoor:Win32/Stradatu             12
TrojanDownloader:Win32/Govdi.A       8
Backdoor:Win32/Neunut.A              8
Backdoor:Win32/Likseput.A            7
Backdoor:Win32/Ecltys.A              7
Trojan:Win32/Dynamer!dtc             7
Backdoor:Win32/Warood.B              7
Backdoor:Win32/Noobot.A              6
TrojanDownloader:Win32/Dalbot.A      5
Backdoor:Win32/Minaps.A              5
Backdoor:Win32/Sharat.gen!A          5
TrojanDownloader:Win32/Pingbed.A     5
Backdoor:Win32/Pingbed.A             4
Backdoor:Win32/Xifos.A               4
Backdoor:Win32/Touasper.A            4
TrojanDownloader:Win32/Small.XR      4
Backdoor:Win32/Miniasroot.A          4
Backdoor:Win32/Tosct.A               4
Trojan:Win32/Sluegot.C               4
TrojanDownloader:Win32/Coswid.A      3
Trojan:Win32/Sluegot.D   

In [14]:
vs5_df.shape[0] - 1

292

In [17]:
(vs1_df.shape[0] * 4) #+ 293 

262144

In [18]:
262144 + 293

262437

In [None]:
counter = 0
errors = 0
found = False
fip = open('/opt/vs/unpacked_file_list-vs251-252.txt','r')
unpacked_list = fip.readlines()
fip.close()
file_list = os.listdir('/opt/vs/asm/')
file_list.sort()
hdr_list = []
asm_list = []
for fname in file_list:
    if fname.endswith('.asm'):
        asm_list.append(fname)
    elif fname.endswith('.txt'):
        hdr_list.append(fname)

print("Header list size: {:d}".format(len(hdr_list)))
print("ASM list size: {:d}".format(len(asm_list)))

hdr_list.sort()
asm_list.sort()

for idx, fname in enumerate(asm_list):
    asm_name = fname[0:fname.find(".asm")]
    #hdr_name = hdr_list[idx]
    #hdr_name = hdr_name[0:hdr_name.find(".hdr")]
    #if asm_name not in asm_list:
    #if asm_name != hdr_name:
    for hname in hdr_list:
        hdr_name = hname[0:hname.find(".txt")]
        if asm_name == hdr_name:
            print("Successful Disassembly for: {:s}".format(asm_name))
            counter += 1
            found = True
            break
            
    if not found:
        errors += 1
    else:    
        found = False
    
    
        
print("Total Successful Disassemblies: {:d} Total Disassembly Errors: {:d}".format(counter, errors))

In [13]:
counter = 0
errors = 0
found = False
fip = open('/opt/vs/unpacked_file_list-vs251-252.txt','r')
unpacked_list = fip.readlines()
fip.close()
file_list = os.listdir('/opt/vs/asm/')
file_list.sort()
hdr_list = []
asm_list = []
for fname in file_list:
    if fname.endswith('.asm'):
        asm_list.append(fname)
    elif fname.endswith('.txt'):
        hdr_list.append(fname)

print("Header list size: {:d}".format(len(hdr_list)))
print("ASM list size: {:d}".format(len(asm_list)))

hdr_list.sort()
asm_list.sort()

for idx, fname in enumerate(hdr_list):
    hdr_name = fname[0:fname.find(".txt")]
    #hdr_name = hdr_list[idx]
    #hdr_name = hdr_name[0:hdr_name.find(".hdr")]
    #if asm_name not in asm_list:
    #if asm_name != hdr_name:
    for hname in asm_list:
        asm_name = hname[0:hname.find(".asm")]
        if asm_name == hdr_name:
            #print("Successful Disassembly for: {:s}".format(asm_name))
            counter += 1
            found = True
            break
            
    if not found:
        errors += 1
        print("Failed Disassembly for: {:s}".format(hdr_name))
    else:    
        found = False
    
    
        
print("Total Successful Disassemblies: {:d} Total Disassembly Errors: {:d}".format(counter, errors))

Header list size: 792
ASM list size: 774
Failed Disassembly for: VirusShare_0003887ab64b8ae19ffa988638decac2
Failed Disassembly for: VirusShare_0025cc13683331a61986b6433e768f3f
Failed Disassembly for: VirusShare_006b4c72e79e60d10515a64ec6a4e021
Failed Disassembly for: VirusShare_00d574c8f6fe8453e0c57a8a731f15b4
Failed Disassembly for: VirusShare_01561d7971d10d2192e87b75a74980a4
Failed Disassembly for: VirusShare_018c4ec104af60efebd868c6c96c4015
Failed Disassembly for: VirusShare_027aceafdea60810bd493b91fad6d83b
Failed Disassembly for: VirusShare_028a2651d8a23f8a86c6a0440b817826
Failed Disassembly for: VirusShare_02acf1da2758c291fc377d4ea18efcce
Failed Disassembly for: VirusShare_02b88fab6d6a76e3f00e99d88b42e29e
Failed Disassembly for: VirusShare_02d15c11abb5ef375e9ac3e9f05a1a52
Failed Disassembly for: VirusShare_02e6357bc2e276c4113e6de1a5b1c69c
Failed Disassembly for: VirusShare_038ae293c2dd804f41f7f7305f37ebe2
Failed Disassembly for: VirusShare_03acebfbcabb20a76e707d585aaf8c49
Failed 

In [None]:
VirusShare_0003887ab64b8ae19ffa988638decac2
VirusShare_0025cc13683331a61986b6433e768f3f
VirusShare_006b4c72e79e60d10515a64ec6a4e021
VirusShare_00d574c8f6fe8453e0c57a8a731f15b4
VirusShare_01561d7971d10d2192e87b75a74980a4
Failed Disassembly for: VirusShare_018c4ec104af60efebd868c6c96c4015
Failed Disassembly for: VirusShare_027aceafdea60810bd493b91fad6d83b
Failed Disassembly for: VirusShare_028a2651d8a23f8a86c6a0440b817826
Failed Disassembly for: VirusShare_02acf1da2758c291fc377d4ea18efcce
Failed Disassembly for: VirusShare_02b88fab6d6a76e3f00e99d88b42e29e
Failed Disassembly for: VirusShare_02d15c11abb5ef375e9ac3e9f05a1a52
Failed Disassembly for: VirusShare_02e6357bc2e276c4113e6de1a5b1c69c
Failed Disassembly for: VirusShare_038ae293c2dd804f41f7f7305f37ebe2
Failed Disassembly for: VirusShare_03acebfbcabb20a76e707d585aaf8c49
Failed Disassembly for: VirusShare_6a4fbcfb44717eae2145c761c1c99b6a
Failed Disassembly for: VirusShare_af719814507fdca4b96184f33b6b92ea
Failed Disassembly for: VirusShare_d4ba6430996fb4021241efc97c607504
Failed Disassembly for: VirusShare_d8b7b276710127d233abcdb7313aac36