Importing Libraries

In [1]:
import os
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

Setting up Paths and API Keys

In [2]:
base_dir = os.path.dirname(os.path.abspath('__file__'))

virustotal_file_dir = os.path.join(base_dir, 'virustotal_file')
virustotal_behaviour_dir = os.path.join(base_dir, 'virustotal_behaviour')
cuckoo_reports_path = os.path.join(base_dir, 'cuckoo')

<br/><br/>

--------------

<br/><br/>

## Data Extraction

In [3]:
from utils import get_all_virus_total_reports, get_all_cuckoo_reports

all_file_reports = get_all_virus_total_reports(virustotal_file_dir)
all_behaviour_reports = get_all_virus_total_reports(virustotal_behaviour_dir)
all_cuckoo_reports = get_all_cuckoo_reports(cuckoo_reports_path)

Skipping .DS_Store as it is not a folder


File Reports

In [4]:
file_reports = pd.DataFrame(all_file_reports[1:])

# expand data column into multiple columns
file_reports = pd.concat([file_reports.drop(['data'], axis=1), file_reports['data'].apply(pd.Series)], axis=1)

# expand attributes column into multiple columns
file_reports = pd.concat([file_reports.drop(['attributes'], axis=1), file_reports['attributes'].apply(pd.Series)], axis=1)

# use only the columns we need
file_reports = file_reports[['apt_group', 'file_name', 'type', 'unique_sources', 'packers', 'size', 'type_extension', 'pe_info','last_analysis_results']]

# expand pe_info column into multiple columns
file_reports = pd.concat([file_reports.drop(['pe_info'], axis=1), file_reports['pe_info'].apply(pd.Series)], axis=1)

file_reports

Unnamed: 0,apt_group,file_name,type,unique_sources,packers,size,type_extension,last_analysis_results,timestamp,imphash,...,resource_details,resource_langs,resource_types,sections,compiler_product_versions,rich_pe_header_hash,import_list,exports,overlay,0
0,APT 1,d5fc378ab31019f99f613bdbabd5aa63d97a3cd0031e90...,file,4,{'PEiD': 'Microsoft Visual C++'},53760,exe,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.318256e+09,bc02d3eb1fbd816be6bbaef57b51b664,...,"[{'lang': 'CHINESE SIMPLIFIED', 'chi2': 9849.4...",{'CHINESE SIMPLIFIED': 1},{'RT_STRING': 1},"[{'name': '.text', 'chi2': 208705.75, 'virtual...","[[---] Unmarked objects (old) count=15, [---] ...",052dffcee4266379e2fb10a075e1cf6d,"[{'library_name': 'SHELL32.dll', 'imported_fun...",,,
1,APT 1,8c414cc53009cefac9ad9aa3ffd766085a7b76aa56f69a...,file,13,{'PEiD': 'Microsoft Visual C++'},15360,exe,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.285661e+09,4d333650666fecf3482501fc120e6226,...,,,,"[{'name': '.text', 'chi2': 80016.28, 'virtual_...",[[IMP] Windows Server 2003 SP1 DDK build 4035 ...,6e82f1876480c4b87d31cf741c4fc4f6,"[{'library_name': 'KERNEL32.dll', 'imported_fu...",,,
2,APT 1,051f9ff45c531ad265489f563e6babca55f4a3f94604ff...,file,7,"{'PEiD': 'Microsoft Visual C++ v6.0 DLL', 'Cyr...",33280,dll,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.300891e+09,48a27f442dc0d3fdacf1c9677c384769,...,"[{'lang': 'CHINESE SIMPLIFIED', 'chi2': 463505...",{'CHINESE SIMPLIFIED': 1},{'EXE': 1},"[{'name': '.text', 'chi2': 31830.25, 'virtual_...","[[ C ] VS98 (6.0) build 8168 count=4, [---] Un...",80d914bd667072fd143b24cbf4e5be5b,"[{'library_name': 'KERNEL32.dll', 'imported_fu...","[InstallService, ServiceMain, UninstallService...",,
3,APT 1,010f36d6b66747e906c8f7025df1e154ceabc598f9c1d9...,file,8,{'PEiD': 'Microsoft Visual C++'},301056,exe,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.346683e+09,293741ea8e458644ab6b51e3e5150c58,...,"[{'lang': 'CHINESE SIMPLIFIED', 'chi2': 44880....",{'CHINESE SIMPLIFIED': 3},"{'RT_ICON': 1, 'FILE': 1, 'RT_GROUP_ICON': 1}","[{'name': '.text', 'chi2': 102719.52, 'virtual...","[[ C ] VS98 (6.0) build 8168 count=11, [LNK] V...",1520370b4ac12759a65745e889867c57,"[{'library_name': 'KERNEL32.dll', 'imported_fu...",,,
4,APT 1,6e417e9fadda9948ed7a8bf472d48285126369c407aad8...,file,4,{'PEiD': 'Microsoft Visual C++ v6.0 DLL'},16896,dll,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.250580e+09,d1f5d82ce2627e0b8ddbcbcd370be897,...,"[{'lang': 'ENGLISH US', 'chi2': 67894.2, 'file...",{'ENGLISH US': 1},{'RT_VERSION': 1},"[{'name': '.text', 'chi2': 82028.49, 'virtual_...","[[ C ] VS98 (6.0) build 8168 count=4, [---] Un...",facb7db511e1118ea7346b9b42988139,"[{'library_name': 'KERNEL32.dll', 'imported_fu...","[ServiceMain, install, uninstall]",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,Dark Hotel,6102993eb00bd97832b74fd802b486abb7cb43712eccfe...,file,6,"{'F-PROT': 'RAR', 'Command': 'RAR', 'Varist': ...",322664,exe,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.268634e+09,9402b48d966c911f0785b076b349b5ef,...,"[{'lang': 'KOREAN', 'chi2': 86289.31, 'filetyp...","{'KOREAN': 12, 'NEUTRAL DEFAULT': 2}","{'RT_DIALOG': 6, 'RT_ICON': 1, 'RT_MANIFEST': ...","[{'name': '.text', 'chi2': 422214.72, 'virtual...","[[ C ] VS2005 build 50727 count=8, [IMP] VS200...",2eaa9f1ec1b1d5c52443604fa9765c03,"[{'library_name': 'COMCTL32.dll', 'imported_fu...",,"{'chi2': 497.32, 'filetype': 'RAR', 'entropy':...",
3589,Dark Hotel,6d6d550d6415fc64c4dc7c68655cced8ece6b911e31176...,file,10,{'PEiD': 'Microsoft Visual C++'},26112,exe,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.301530e+09,07979bd3614fca25d8c85fecf481649c,...,"[{'lang': 'NEUTRAL', 'chi2': 409183.28, 'filet...",{'NEUTRAL': 5},"{'RT_ICON': 2, 'RT_MENU': 1, 'RT_GROUP_ICON': 2}","[{'name': '.text', 'chi2': 51316.16, 'virtual_...","[[ C ] VS98 (6.0) build 8168 count=11, [LNK] V...",563c0a6dea6577ecd9264f9730dfe897,"[{'library_name': 'WININET.dll', 'imported_fun...",,,
3590,Dark Hotel,804d47631c16751f26af0c0f892d7036f628b314bf2322...,file,14,{'PEiD': 'Microsoft Visual C++'},28672,exe,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.294369e+09,b06435e88232cde7cd0ff95f38ab3509,...,,,,"[{'name': '.text', 'chi2': 335395.53, 'virtual...","[[ C ] VS98 (6.0) build 8168 count=11, [LNK] V...",71621d7bcaea7c218d1d9697bda172ec,"[{'library_name': 'WININET.dll', 'imported_fun...",,,
3591,Dark Hotel,98165b4667ae606a8ff0c8f398f584c264f1bf337344f4...,file,11,{'PEiD': 'Microsoft Visual C++'},376832,exe,"{'Bkav': {'method': 'blacklist', 'engine_name'...",1.231317e+09,e1dd10f8b6a2d123125ba69d8a8efc62,...,,,,"[{'name': '.text', 'chi2': 386231.72, 'virtual...","[[---] Unmarked objects (old) count=6, [---] U...",0b078f49d858460cbbcbda652182fe43,"[{'library_name': 'KERNEL32.dll', 'imported_fu...",,,


In [5]:
file_reports.type_extension.value_counts()

type_extension
exe     1843
dll     1024
rtf      402
docx     192
xlsx      30
xls       23
pdf       19
rar       13
doc       10
zip        7
ps1        3
swf        2
vbs        2
jar        1
ps         1
msg        1
lnk        1
ini        1
ppt        1
Name: count, dtype: int64

Behavioral Data

In [6]:
behaviour_reports = pd.DataFrame(all_behaviour_reports[1:])

# expand data column into multiple columns
behaviour_reports = pd.concat([behaviour_reports.drop(['data'], axis=1), behaviour_reports['data'].apply(pd.Series)], axis=1)

behaviour_reports

Unnamed: 0,apt_group,file_name,processes_tree,files_deleted,registry_keys_deleted,files_dropped,tags,dns_lookups,ip_traffic,registry_keys_set,...,text_decoded,processes_killed,services_created,services_deleted,windows_searched,crypto_algorithms_observed,services_stopped,verdict_labels,signals_hooked,crypto_plain_text
0,APT 1,8c414cc53009cefac9ad9aa3ffd766085a7b76aa56f69a...,"[{'process_id': '2980', 'name': '%WINDIR%\expl...",[%SAMPLEPATH%\8c414cc53009cefac9ad9aa3ffd76608...,[\REGISTRY\USER\S-1-5-21-1482476501-1645522239...,[{'path': '%USERPROFILE%\8c414cc53009cefac9ad9...,"[SELF_DELETE, DIRECT_CPU_CLOCK_ACCESS, CHECKS_...","[{'hostname': 'suirg.imly.org', 'resolved_ips'...","[{'destination_ip': '35.205.61.67', 'destinati...",[{'key': 'HKU\%SID%\Software\Microsoft\Windows...,...,,,,,,,,,,
1,APT 1,051f9ff45c531ad265489f563e6babca55f4a3f94604ff...,,,,,,,,,...,,,,,,,,,,
2,APT 1,010f36d6b66747e906c8f7025df1e154ceabc598f9c1d9...,"[{'process_id': '5716', 'name': '""C:\Users\<US...","[C:\Users\<USER>\Desktop\software.exe, %USERPR...",[HKCU\SOFTWARE\MICROSOFT\WINDOWS\CURRENTVERSIO...,"[{'path': 'thumbcache_idx.db', 'sha256': '0bfe...","[PERSISTENCE, SELF_DELETE, LONG_SLEEPS, RUNTIM...","[{'hostname': 'software.myftp.info', 'resolved...","[{'destination_ip': '204.79.197.203', 'destina...",[{'key': 'HKEY_CURRENT_USER\Software\Classes\L...,...,,,,,,,,,,
3,APT 1,6e417e9fadda9948ed7a8bf472d48285126369c407aad8...,,,,,,,,,...,,,,,,,,,,
4,APT 1,6f6d7fe35d8e2b17bfd85752b46f2d20f71087231d8830...,"[{'process_id': '540', 'name': '****.exe', 'ch...",[C:\analyse\1641940524.5873144_f58abb97-3322-4...,[\REGISTRY\MACHINE\SYSTEM\Acrobatviewercpp304\...,[{'path': 'C:\Users\<USER>\Downloads\6f6d7fe35...,"[DETECT_DEBUG_ENVIRONMENT, DIRECT_CPU_CLOCK_AC...","[{'hostname': 'software.myftp.info', 'resolved...","[{'destination_ip': '12.38.236.32', 'destinati...",[{'key': 'HKEY_CURRENT_USER\Software\Microsoft...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,Dark Hotel,6102993eb00bd97832b74fd802b486abb7cb43712eccfe...,"[{'process_id': '5168', 'name': '""C:\Users\<US...",[C:\Users\<USER>\AppData\Roaming\Adobe\Acro\__...,[\REGISTRY\USER\S-1-5-21-1482476501-1645522239...,"[{'path': 'ssasv.exe', 'sha256': 'dfabb16cab65...","[LONG_SLEEPS, CHECKS_HOSTNAME, RUNTIME_MODULES...","[{'hostname': 'updatejava.megabyet.net', 'reso...","[{'destination_ip': '199.59.243.225', 'destina...",[{'key': '\REGISTRY\MACHINE\Software\Wow6432No...,...,,,,,[EDIT],,,,,
3588,Dark Hotel,6d6d550d6415fc64c4dc7c68655cced8ece6b911e31176...,,[C:\Windows\System32\wbem\Performance\WmiApRpl...,[\REGISTRY\USER\S-1-5-21-1482476501-1645522239...,,,"[{'hostname': 'tom-PC', 'resolved_ips': ['192....","[{'destination_ip': '185.27.134.202', 'destina...",[{'key': 'HKEY_CURRENT_USER\Software\Microsoft...,...,,,,,[(null) - Shell_TrayWnd],,,,,
3589,Dark Hotel,804d47631c16751f26af0c0f892d7036f628b314bf2322...,"[{'process_id': '716', 'name': '****.exe'}]",,,,,,,,...,,,,,,,,,,
3590,Dark Hotel,98165b4667ae606a8ff0c8f398f584c264f1bf337344f4...,"[{'process_id': '348', 'name': '""${SamplePath}...",[C:\temp\652005\98165b4667ae606a8ff0c8f398f584...,,"[{'path': 'C:\windows\SysWOW64\boof.sys', 'sha...",[PERSISTENCE],"[{'hostname': 'www.microsoft.com', 'resolved_i...","[{'destination_ip': '69.195.129.72', 'destinat...",[{'key': '\REGISTRY\MACHINE\SOFTWARE\Wow6432No...,...,,,,,,,[Beep],,,


Cuckoo Reports

In [7]:
# Load malware data and remove rows with missing 'cuckoo_id'.
df_malware = pd.read_csv(os.path.join(base_dir, 'malware.csv'))
df_malware = df_malware[df_malware.cuckoo_id.notna()]
df_malware['cuckoo_id'] = df_malware['cuckoo_id'].astype(int).astype(object)

df_malware

Unnamed: 0,apt,file,cuckoo_id
0,APT 1,4123011354d8259e919fbdf605be1973a79100074959dc...,5121514
1,APT 1,c5d3906f7c6f39c16bb9b3d8061026d06f8a6dbb9a363f...,5121628
2,APT 1,6971d8780aafa44664a469ff82074b5fea575b16aad399...,5121432
3,APT 1,f23b384c2f44d7be20389a8f0d9688f650f6b88f8ad370...,5121804
4,APT 1,71434227b085c02a969d16c264574bf49863a0ac8966d0...,5121502
...,...,...,...
3588,Dark Hotel,ab478166ea93e9dac3e37a9ad7457aa58249046003238e...,5127146
3589,Dark Hotel,9bc2309d5e391dd14c2948c55551105572ec0ae5cfc1f3...,5126365
3590,Dark Hotel,e7d65f2e23e76e2378afe028bd091d98469aa36a3a1bb3...,5127257
3591,Dark Hotel,962810f908daab4ed0796ff563433eb65a60507d23089a...,5127104


In [8]:
cuckoo_reports = pd.DataFrame(all_cuckoo_reports)
cuckoo_reports['task_id'] = cuckoo_reports['task_id'].astype(int).astype(object)

# Merge data on 'cuckoo_id' and 'task_id' with a left join
df_cuckoo_reports = df_malware.merge(cuckoo_reports, left_on='cuckoo_id', right_on='task_id', how='left')

# Keep rows with non-null 'processes'
df_cuckoo_reports = df_cuckoo_reports[df_cuckoo_reports.processes.notna()]


df_cuckoo_reports

Unnamed: 0,apt,file,cuckoo_id,task_id,score,signatures_count,signature_mark_call_count,category,package,strings_count,static,fileops,generic_behavior,apistats,processes,summary,suricata,suricata_summary
0,APT 1,4123011354d8259e919fbdf605be1973a79100074959dc...,5121514,5121514,10.0,11,140,archive,exe,195,"{'pdb_path': None, 'pe_imports': [{'imports': ...",,[{'process_path': 'C:\Windows\System32\lsass.e...,"{'172': {'RegCreateKeyExW': 12, 'NtDuplicateOb...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'regkey_written': ['HKEY_CURRENT_USER\Softwar...,[{'timestamp': '2024-08-14T03:28:23.625214+030...,{'timestamp': '2024-08-14T03:37:14.878360+0300...
1,APT 1,c5d3906f7c6f39c16bb9b3d8061026d06f8a6dbb9a363f...,5121628,5121628,10.0,5,0,archive,exe,87,"{'pdb_path': None, 'pe_imports': [{'imports': ...",,[{'process_path': 'C:\Windows\System32\lsass.e...,"{'2908': {'setsockopt': 1, 'WSASocketA': 1, 'S...",[{'process_path': 'C:\Windows\System32\lsass.e...,"{'connects_ip': ['205.159.83.91'], 'file_opene...",[{'timestamp': '2024-08-14T03:46:42.411318+030...,{'timestamp': '2024-08-14T03:55:22.499882+0300...
2,APT 1,6971d8780aafa44664a469ff82074b5fea575b16aad399...,5121432,5121432,10.0,10,44,archive,exe,103,"{'pdb_path': None, 'pe_imports': [{'imports': ...",,[{'process_path': 'C:\Windows\System32\lsass.e...,"{'316': {'RegCreateKeyExW': 5, 'LdrUnloadDll':...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'regkey_written': ['HKEY_CURRENT_USER\Softwar...,[{'timestamp': '2024-08-14T03:14:40.196844+030...,{'timestamp': '2024-08-14T03:23:20.539288+0300...
3,APT 1,f23b384c2f44d7be20389a8f0d9688f650f6b88f8ad370...,5121804,5121804,10.0,16,224,archive,exe,1015,{'pdb_path': 'd:\Projects\WinRAR\SFX\build\sfx...,[{'path': 'C:\Users\Administrator\AppData\Loca...,[{'process_path': 'C:\Users\Administrator\AppD...,"{'1560': {'LdrUnloadDll': 7, 'NtOpenSection': ...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'regkey_deleted': ['HKEY_CURRENT_USER\Softwar...,[{'timestamp': '2024-08-14T04:18:52.074571+030...,{'timestamp': '2024-08-14T04:27:47.251493+0300...
4,APT 1,71434227b085c02a969d16c264574bf49863a0ac8966d0...,5121502,5121502,10.0,10,140,archive,exe,155,"{'pdb_path': None, 'pe_imports': [{'imports': ...",,[{'process_path': 'C:\Windows\System32\lsass.e...,"{'2600': {'RegCreateKeyExW': 12, 'NtDuplicateO...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'regkey_written': ['HKEY_CURRENT_USER\Softwar...,[{'timestamp': '2024-08-14T03:26:09.010023+030...,{'timestamp': '2024-08-14T03:34:56.949981+0300...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,Dark Hotel,ab478166ea93e9dac3e37a9ad7457aa58249046003238e...,5127146,5127146,10.0,13,116,archive,exe,2041,"{'pdb_path': None, 'pe_imports': [{'imports': ...",[{'path': 'C:\Users\Administrator\AppData\Loca...,[{'process_path': 'C:\Windows\System32\lsass.e...,"{'2712': {'NtOpenSection': 3, 'RegCloseKey': 1...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'file_created': ['C:\Users\Administrator\AppD...,[{'timestamp': '2024-08-16T12:36:28.915464+030...,{'timestamp': '2024-08-16T12:47:00.150290+0300...
3589,Dark Hotel,9bc2309d5e391dd14c2948c55551105572ec0ae5cfc1f3...,5126365,5126365,10.0,10,132,archive,exe,186,"{'pdb_path': None, 'pe_imports': [{'imports': ...","[{'path': 'C:\Users', 'time': 125, 'op': 'dire...",[{'process_path': 'C:\Windows\System32\lsass.e...,"{'540': {'CreateToolhelp32Snapshot': 3, 'NtDup...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'directory_created': ['C:\Users\Administrator...,[{'timestamp': '2024-08-16T01:14:19.119004+030...,{'timestamp': '2024-08-16T01:22:49.666816+0300...
3590,Dark Hotel,e7d65f2e23e76e2378afe028bd091d98469aa36a3a1bb3...,5127257,5127257,10.0,11,56,archive,exe,172,"{'pdb_path': None, 'pe_imports': [{'imports': ...",,[{'process_path': 'C:\Windows\System32\lsass.e...,"{'2664': {'RegCreateKeyExW': 11, 'NtDuplicateO...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'regkey_written': ['HKEY_CURRENT_USER\Softwar...,[{'timestamp': '2024-08-16T12:53:11.836492+030...,{'timestamp': '2024-08-16T13:01:57.923261+0300...
3591,Dark Hotel,962810f908daab4ed0796ff563433eb65a60507d23089a...,5127104,5127104,10.0,15,100,archive,exe,533,{'pdb_path': 'd:\Projects\WinRAR\SFX\build\sfx...,[{'path': 'C:\Users\Administrator\AppData\Loca...,[{'process_path': 'C:\Windows\System32\lsass.e...,"{'2728': {'RegCreateKeyExW': 8, 'LdrUnloadDll'...",[{'process_path': 'C:\Windows\System32\lsass.e...,{'regkey_deleted': ['HKEY_CURRENT_USER\Softwar...,[{'timestamp': '2024-08-16T12:30:06.004309+030...,{'timestamp': '2024-08-16T12:36:40.673013+0300...


In [9]:
df_cuckoo_reports.describe()

Unnamed: 0,score,signatures_count,signature_mark_call_count,strings_count
count,3562.0,3562.0,3562.0,3562.0
mean,9.893038,9.070185,78.050533,626.774003
std,0.867434,5.165241,127.69572,505.53242
min,0.0,0.0,0.0,2.0
25%,10.0,5.0,4.0,266.0
50%,10.0,8.0,40.0,494.0
75%,10.0,12.0,92.0,799.0
max,10.0,34.0,1060.0,2048.0


<br/><br/>

--------------

<br/><br/>

## Data Cleaning

File Reports

In [10]:
file_cleaned = file_reports[['file_name', 'apt_group', 'unique_sources']]

file_cleaned['malicious_count'] = file_reports['last_analysis_results'].apply(lambda x: 0 if type(x) != dict else sum([1 for k, v in x.items() if v['category'] == 'malicious']))

file_cleaned['imports_count'] = file_reports['import_list'].apply(lambda x: 0 if type(x) != list else len(x))
file_cleaned['imported_functions'] = file_reports['import_list'].apply(lambda x: 0 if type(x) != list else sum([len(y['imported_functions']) for y in x]))

file_cleaned

Unnamed: 0,file_name,apt_group,unique_sources,malicious_count,imports_count,imported_functions
0,d5fc378ab31019f99f613bdbabd5aa63d97a3cd0031e90...,APT 1,4,65,9,146
1,8c414cc53009cefac9ad9aa3ffd766085a7b76aa56f69a...,APT 1,13,65,7,85
2,051f9ff45c531ad265489f563e6babca55f4a3f94604ff...,APT 1,7,63,3,53
3,010f36d6b66747e906c8f7025df1e154ceabc598f9c1d9...,APT 1,8,63,5,67
4,6e417e9fadda9948ed7a8bf472d48285126369c407aad8...,APT 1,4,53,4,70
...,...,...,...,...,...,...
3588,6102993eb00bd97832b74fd802b486abb7cb43712eccfe...,Dark Hotel,6,63,9,163
3589,6d6d550d6415fc64c4dc7c68655cced8ece6b911e31176...,Dark Hotel,10,65,9,75
3590,804d47631c16751f26af0c0f892d7036f628b314bf2322...,Dark Hotel,14,66,10,88
3591,98165b4667ae606a8ff0c8f398f584c264f1bf337344f4...,Dark Hotel,11,63,8,231


Behavioral Data

In [11]:
behaviour_cleaned = behaviour_reports[['file_name']]

behaviour_cleaned['files_written_count'] = behaviour_reports['files_written'].apply(lambda x: 0 if type(x) != list else len(x))
behaviour_cleaned['files_deleted_count'] = behaviour_reports['files_deleted'].apply(lambda x: 0 if type(x) != list else len(x))

behaviour_cleaned['registry_keys_count'] =  behaviour_reports['registry_keys_opened'].apply(lambda x: 0 if type(x) != list else len(x)) + \
                                            behaviour_reports['registry_keys_deleted'].apply(lambda x: 0 if type(x) != list else len(x)) + \
                                            behaviour_reports['registry_keys_set'].apply(lambda x: 0 if type(x) != list else len(x)) 

behaviour_cleaned['processes_count'] =  behaviour_reports['processes_created'].apply(lambda x: 0 if type(x) != list else len(x)) + \
                                        behaviour_reports['processes_injected'].apply(lambda x: 0 if type(x) != list else len(x)) + \
                                        behaviour_reports['processes_killed'].apply(lambda x: 0 if type(x) != list else len(x)) + \
                                        behaviour_reports['processes_terminated'].apply(lambda x: 0 if type(x) != list else len(x))

behaviour_cleaned['mutexes_count'] =  behaviour_reports['mutexes_created'].apply(lambda x: 0 if type(x) != list else len(x))
behaviour_cleaned['command_executions'] = behaviour_reports['command_executions'].apply(lambda x: 0 if type(x) != list else len(x))

behaviour_cleaned['ip_traffic_count'] = behaviour_reports['ip_traffic'].apply(lambda x: 0 if type(x) != list else len(x))
behaviour_cleaned['modules_loaded_count'] = behaviour_reports['modules_loaded'].apply(lambda x: 0 if type(x) != list else len(x))

behaviour_cleaned

Unnamed: 0,file_name,files_written_count,files_deleted_count,registry_keys_count,processes_count,mutexes_count,command_executions,ip_traffic_count,modules_loaded_count
0,8c414cc53009cefac9ad9aa3ffd766085a7b76aa56f69a...,14,11,637,26,17,8,4,68
1,051f9ff45c531ad265489f563e6babca55f4a3f94604ff...,0,0,0,0,0,0,0,0
2,010f36d6b66747e906c8f7025df1e154ceabc598f9c1d9...,72,31,957,64,48,27,15,120
3,6e417e9fadda9948ed7a8bf472d48285126369c407aad8...,0,0,0,0,0,0,0,0
4,6f6d7fe35d8e2b17bfd85752b46f2d20f71087231d8830...,29,14,597,53,32,12,2,139
...,...,...,...,...,...,...,...,...,...
3587,6102993eb00bd97832b74fd802b486abb7cb43712eccfe...,51,9,815,50,30,29,10,172
3588,6d6d550d6415fc64c4dc7c68655cced8ece6b911e31176...,14,5,245,11,16,13,4,46
3589,804d47631c16751f26af0c0f892d7036f628b314bf2322...,0,0,31,1,0,0,0,5
3590,98165b4667ae606a8ff0c8f398f584c264f1bf337344f4...,15,3,125,12,1,6,1,50


Extracting API Statistics

In [12]:
def extract_all_api_stats(api):
    # Return an empty dictionary if the input is not a dictionary.
    if not isinstance(api, dict): return {}
    
    compiled_api_stats = {}
    # Compile statistics for each API call across all entries.
    for key in api.keys():
        for api_call, value in api[key].items():
            compiled_api_stats[api_call] = compiled_api_stats.get(api_call, 0) + value

    return compiled_api_stats
  
  # Apply the function to extract API stats and convert results to a DataFrame.
df_ = df_cuckoo_reports['apistats'].apply(extract_all_api_stats).apply(pd.Series).fillna(0)

df_

Unnamed: 0,RegCreateKeyExW,NtDuplicateObject,CoUninitialize,RegCloseKey,NtQueryKey,GetBestInterfaceEx,HttpOpenRequestA,InternetOpenA,WSAStartup,NtClose,...,pdf_unescape,InternetOpenUrlW,WSARecvFrom,WSASendTo,InternetWriteFile,ReadCabinetState,RegisterHotKey,CertCreateCertificateContext,GetInterfaceInfo,CryptProtectData
0,12.0,3.0,5.0,43.0,6.0,2.0,1.0,1.0,7.0,59.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,4.0,3.0,30.0,4.0,2.0,0.0,4.0,4.0,41.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.0,41.0,41.0,626.0,26.0,42.0,4.0,4.0,16.0,1665.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12.0,4.0,5.0,43.0,6.0,2.0,1.0,1.0,7.0,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,18.0,10.0,5.0,264.0,8.0,2.0,3.0,3.0,10.0,456.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3589,18.0,9.0,5.0,86.0,8.0,2.0,3.0,3.0,12.0,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3590,11.0,9.0,3.0,53.0,4.0,2.0,3.0,3.0,9.0,182.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3591,9.0,8.0,8.0,110.0,4.0,2.0,3.0,3.0,6.0,182.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


 Performing PCA Analysis

In [13]:
from sklearn.decomposition import PCA
import numpy as np

# Initialize and fit PCA on the DataFrame.
pca = PCA().fit(df_)

# Calculate the importance of each feature based on their contribution to the principal components.
feature_importance = np.abs(pca.components_).sum(axis=0)

# Sort features based on their importance in descending order.
important_features = feature_importance.argsort()[::-1]
sorted_features = df_.columns[important_features]

# Select the top 8 most important features from the sorted list.
important_apis = sorted_features[:8]

# Print features ranked by their importance.
print("Features ranked by importance:", important_apis)

Features ranked by importance: Index(['InternetOpenA', 'DeleteUrlCacheEntryA', 'InternetConnectA',
       'InternetOpenUrlA', 'InternetQueryOptionA', 'HttpSendRequestA',
       'RegCreateKeyExW', 'gethostbyname'],
      dtype='object')


Cuckoo Report Data

In [14]:
# Create an empty DataFrame to hold the processed data
cuckoo_reports = pd.DataFrame()

# Remove '.zip' extension and replace it with '.json'
cuckoo_reports['file_name'] = df_cuckoo_reports['file'].str.replace('.zip', '.json')

# Calculate the number of processes, modules, and calls for each report
cuckoo_reports['process_call_count'] = df_cuckoo_reports['processes'].apply(lambda x: 0 if type(x) != list or len(x) == 0 else sum(0 if 'calls' not in y else len(y['calls']) for y in x))
cuckoo_reports['system32_module_count'] = df_cuckoo_reports['processes'].apply(lambda x: 0 if type(x) != list or len(x) == 0 else sum(len([z for z in data['modules'] if 'system32' in z['filepath'].lower()]) for data in x if 'modules' in data))
cuckoo_reports['syswow64_module_count'] = df_cuckoo_reports['processes'].apply(lambda x: 0 if type(x) != list or len(x) == 0 else sum(len([z for z in data['modules'] if 'syswow64' in z['filepath'].lower()]) for data in x if 'modules' in data))

# Calculate the number of PE resources and imports for each report
cuckoo_reports['pe_resources_count'] = df_cuckoo_reports['static'].apply(lambda x: 0 if type(x) != dict or 'pe_resources' not in x.keys() else len(x['pe_resources']))
cuckoo_reports['pe_imports_count'] = df_cuckoo_reports['static'].apply(lambda x: 0 if type(x) != dict or 'pe_imports' not in x.keys() else sum(len(z) for z in x['pe_imports'] ))

# Count files created under Administrator path from 'fileops'
cuckoo_reports['files_created_in_administrator'] =  df_cuckoo_reports['fileops'].apply(lambda x: 0 if type(x) != list else sum([1 for fileop in x if fileop['op'] == 'file_created' and fileop['path'].startswith('C:\\Users\\Administrator')]))

# Summarize the number of file operations and registry keys accessed
cuckoo_reports['files_not_ddl_exe_count'] = df_cuckoo_reports['generic_behavior'].apply(lambda x: sum(sum(len([ z for z in value if type(z) == str and not z.endswith('.dll') and not z.endswith('.exe')]) for key, value in y.get('summary').items() if 'file' in key and type(value) == list) for y in x))
cuckoo_reports['explorer_accessed_count'] = df_cuckoo_reports['generic_behavior'].apply(lambda x: sum(sum(len([ z for z in value if type(z) == str and 'Windows\\CurrentVersion\\explorer'.lower() in z.lower()]) for key, value in y.get('summary').items() if 'regkey_read' in key and type(value) == list) for y in x))

# Summarize the number of registry keys opened and values accessed
cuckoo_reports['regkey_opened_count'] = df_cuckoo_reports['summary'].apply(lambda x: 0 if type(x) != dict or 'regkey_opened' not in x.keys() else len(x['regkey_opened']))
cuckoo_reports['summary_values_count'] = df_cuckoo_reports['summary'].apply( lambda x: 0 if type(x) != dict else sum(len(val) for val in x.values()))
cuckoo_reports['keyboard_control_count'] = df_cuckoo_reports['summary'].apply(lambda x: 0 if type(x) != dict or 'regkey_opened' not in x.keys() else sum(1 for y in x['regkey_opened'] if 'Keyboard Layouts'.lower() in y.lower()))
cuckoo_reports['wpad_regkey_count'] = df_cuckoo_reports['summary'].apply(lambda x: 0 if type(x) != dict else sum( sum(1 for z in x[y] if type(z) == str and 'wpad' in z.lower()) for y in x.keys()))

# Include the number of strings
cuckoo_reports['strings_count'] = df_cuckoo_reports['strings_count']

# Include the number of signatures and signature mark call count
cuckoo_reports['signatures_count'] = df_cuckoo_reports['signatures_count']
cuckoo_reports['signature_mark_call_count'] = df_cuckoo_reports['signature_mark_call_count']

# Include the top 8 important columns identified by PCA.
for columns in important_apis:
  cuckoo_reports['api_' + columns] = df_[columns]

# Calculate average packet size from Suricata summary data
cuckoo_reports['average_pkt_size'] = df_cuckoo_reports['suricata_summary'].apply(lambda x: 0 if type(x) != dict else x['stats']['decoder']['avg_pkt_size'])

cuckoo_reports

Unnamed: 0,file_name,process_call_count,system32_module_count,syswow64_module_count,pe_resources_count,pe_imports_count,files_created_in_administrator,files_not_ddl_exe_count,explorer_accessed_count,regkey_opened_count,...,signature_mark_call_count,api_InternetOpenA,api_DeleteUrlCacheEntryA,api_InternetConnectA,api_InternetOpenUrlA,api_InternetQueryOptionA,api_HttpSendRequestA,api_RegCreateKeyExW,api_gethostbyname,average_pkt_size
0,4123011354d8259e919fbdf605be1973a79100074959dc...,0,63,26,1,12,0,3,0,23,...,140,1.0,0.0,1.0,0.0,0.0,3.0,12.0,0.0,133
1,c5d3906f7c6f39c16bb9b3d8061026d06f8a6dbb9a363f...,200,65,20,1,12,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132
2,6971d8780aafa44664a469ff82074b5fea575b16aad399...,0,62,24,1,10,0,0,0,23,...,44,4.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,270
3,f23b384c2f44d7be20389a8f0d9688f650f6b88f8ad370...,3500,86,67,24,16,16,157,43,295,...,224,4.0,0.0,4.0,0.0,0.0,4.0,11.0,0.0,260
4,71434227b085c02a969d16c264574bf49863a0ac8966d0...,0,63,26,0,12,0,3,0,23,...,140,1.0,0.0,1.0,0.0,0.0,3.0,12.0,0.0,113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,ab478166ea93e9dac3e37a9ad7457aa58249046003238e...,1000,68,76,16,6,43,115,0,224,...,116,3.0,0.0,3.0,0.0,0.0,3.0,18.0,3.0,134
3589,9bc2309d5e391dd14c2948c55551105572ec0ae5cfc1f3...,100,64,26,0,18,0,2,0,30,...,132,3.0,0.0,3.0,0.0,0.0,3.0,18.0,3.0,130
3590,e7d65f2e23e76e2378afe028bd091d98469aa36a3a1bb3...,300,64,27,0,20,0,1,0,30,...,56,3.0,0.0,3.0,0.0,0.0,3.0,11.0,3.0,140
3591,962810f908daab4ed0796ff563433eb65a60507d23089a...,300,73,47,19,20,2,8,14,73,...,100,3.0,0.0,3.0,0.0,0.0,3.0,9.0,0.0,137


Combined Data

In [15]:
# Combine data from three sources
df_cleaned = pd.merge(file_cleaned, behaviour_cleaned, on='file_name', how='inner')
df_cleaned = pd.merge(df_cleaned, cuckoo_reports, on='file_name', how='left').fillna(0)

# Backup before modifications
df_cleaned_pre_copy = df_cleaned.copy()

# Drop 'file_name' column
df_cleaned.drop('file_name', axis=1, inplace=True)

# Display cleaned DataFrame
df_cleaned

Unnamed: 0,apt_group,unique_sources,malicious_count,imports_count,imported_functions,files_written_count,files_deleted_count,registry_keys_count,processes_count,mutexes_count,...,signature_mark_call_count,api_InternetOpenA,api_DeleteUrlCacheEntryA,api_InternetConnectA,api_InternetOpenUrlA,api_InternetQueryOptionA,api_HttpSendRequestA,api_RegCreateKeyExW,api_gethostbyname,average_pkt_size
0,APT 1,13,65,7,85,14,11,637,26,17,...,60.0,1.0,0.0,1.0,0.0,0.0,1.0,9.0,1.0,134.0
1,APT 1,7,63,3,53,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,304.0
2,APT 1,8,63,5,67,72,31,957,64,48,...,76.0,0.0,0.0,0.0,0.0,2.0,0.0,21.0,0.0,121.0
3,APT 1,4,53,4,70,0,0,0,0,0,...,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0
4,APT 1,4,62,5,67,29,14,597,53,32,...,76.0,0.0,0.0,0.0,0.0,2.0,0.0,21.0,0.0,128.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,Dark Hotel,6,63,9,163,51,9,815,50,30,...,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.0
3588,Dark Hotel,10,65,9,75,14,5,245,11,16,...,56.0,3.0,0.0,3.0,0.0,0.0,3.0,11.0,3.0,152.0
3589,Dark Hotel,14,66,10,88,0,0,31,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133.0
3590,Dark Hotel,11,63,8,231,15,3,125,12,1,...,424.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.0


In [16]:
df_cleaned.dtypes

apt_group                          object
unique_sources                      int64
malicious_count                     int64
imports_count                       int64
imported_functions                  int64
files_written_count                 int64
files_deleted_count                 int64
registry_keys_count                 int64
processes_count                     int64
mutexes_count                       int64
command_executions                  int64
ip_traffic_count                    int64
modules_loaded_count                int64
process_call_count                float64
system32_module_count             float64
syswow64_module_count             float64
pe_resources_count                float64
pe_imports_count                  float64
files_created_in_administrator    float64
files_not_ddl_exe_count           float64
explorer_accessed_count           float64
regkey_opened_count               float64
summary_values_count              float64
keyboard_control_count            

In [17]:
df_cleaned.apt_group.value_counts()

apt_group
Gorgon Group      961
APT 1             404
Equation Group    395
Winnti            387
APT 29            283
Dark Hotel        273
APT 10            244
APT 28            211
APT 30            164
Energetic Bear    132
APT 21            106
APT 19             32
Name: count, dtype: int64

Performing PCA Analysis on the Combined Data

In [18]:
# Create a copy of the cleaned DataFrame and remove 'apt_group' column
df_cleaned_copy = df_cleaned.copy()
df_cleaned_copy = df_cleaned_copy.drop('apt_group', axis=1)

# Fit PCA on the modified DataFrame
pca = PCA().fit(df_cleaned_copy)

# Calculate and sort feature importance from PCA results
feature_importance = np.abs(pca.components_).sum(axis=0)
important_features = feature_importance.argsort()[::-1]
sorted_features = df_cleaned_copy.columns[important_features]

# Display sorted features by importance
print("Features ranked by importance:", sorted_features)

Features ranked by importance: Index(['syswow64_module_count', 'api_InternetConnectA', 'malicious_count',
       'api_InternetOpenUrlA', 'api_InternetOpenA', 'api_HttpSendRequestA',
       'api_DeleteUrlCacheEntryA', 'files_written_count', 'pe_resources_count',
       'files_created_in_administrator', 'wpad_regkey_count',
       'files_deleted_count', 'mutexes_count', 'keyboard_control_count',
       'unique_sources', 'files_not_ddl_exe_count', 'api_RegCreateKeyExW',
       'pe_imports_count', 'modules_loaded_count', 'imported_functions',
       'ip_traffic_count', 'regkey_opened_count', 'command_executions',
       'system32_module_count', 'processes_count', 'explorer_accessed_count',
       'process_call_count', 'average_pkt_size', 'summary_values_count',
       'imports_count', 'registry_keys_count', 'api_InternetQueryOptionA',
       'signature_mark_call_count', 'signatures_count', 'strings_count',
       'api_gethostbyname'],
      dtype='object')


<br/><br/>

--------------

<br/><br/>

## Data Preprocessed

In [19]:
# Identify numerical and categorical columns from DataFrame
numerical_cols = df_cleaned.select_dtypes(include=['int8', 'int16', 'int32', 'float16', 'float32', 'int64', 'float64']).columns.tolist()
categorical_cols = df_cleaned.select_dtypes(include=['category']).columns.tolist()

# Define the column to be excluded from feature sets
final_column = 'apt_group'

# Remove the final column from numerical columns if it exists there
if final_column in numerical_cols: numerical_cols.remove(final_column)

In [20]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Copy cleaned DataFrame for preprocessing
df_preprocessed = df_cleaned.copy()

# Encode categorical columns with integer codes
for col in categorical_cols:
    df_preprocessed[col] = df_preprocessed[col].astype('category').cat.codes

# Encode the target column using LabelEncoder
le = LabelEncoder()
df_preprocessed[final_column] = le.fit_transform(df_preprocessed[final_column])

# Retrieve 'file_name' from backup to maintain reference
df_preprocessed['file_name'] = df_cleaned_pre_copy['file_name']

# Store encoded target column back to original DataFrame for reference
df_cleaned[final_column + '_encoded'] = df_preprocessed[final_column]

# Display preprocessed DataFrame
df_preprocessed

Unnamed: 0,apt_group,unique_sources,malicious_count,imports_count,imported_functions,files_written_count,files_deleted_count,registry_keys_count,processes_count,mutexes_count,...,api_InternetOpenA,api_DeleteUrlCacheEntryA,api_InternetConnectA,api_InternetOpenUrlA,api_InternetQueryOptionA,api_HttpSendRequestA,api_RegCreateKeyExW,api_gethostbyname,average_pkt_size,file_name
0,0,13,65,7,85,14,11,637,26,17,...,1.0,0.0,1.0,0.0,0.0,1.0,9.0,1.0,134.0,8c414cc53009cefac9ad9aa3ffd766085a7b76aa56f69a...
1,0,7,63,3,53,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,304.0,051f9ff45c531ad265489f563e6babca55f4a3f94604ff...
2,0,8,63,5,67,72,31,957,64,48,...,0.0,0.0,0.0,0.0,2.0,0.0,21.0,0.0,121.0,010f36d6b66747e906c8f7025df1e154ceabc598f9c1d9...
3,0,4,53,4,70,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0,6e417e9fadda9948ed7a8bf472d48285126369c407aad8...
4,0,4,62,5,67,29,14,597,53,32,...,0.0,0.0,0.0,0.0,2.0,0.0,21.0,0.0,128.0,6f6d7fe35d8e2b17bfd85752b46f2d20f71087231d8830...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,7,6,63,9,163,51,9,815,50,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.0,6102993eb00bd97832b74fd802b486abb7cb43712eccfe...
3588,7,10,65,9,75,14,5,245,11,16,...,3.0,0.0,3.0,0.0,0.0,3.0,11.0,3.0,152.0,6d6d550d6415fc64c4dc7c68655cced8ece6b911e31176...
3589,7,14,66,10,88,0,0,31,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133.0,804d47631c16751f26af0c0f892d7036f628b314bf2322...
3590,7,11,63,8,231,15,3,125,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.0,98165b4667ae606a8ff0c8f398f584c264f1bf337344f4...


In [21]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE with the minority class strategy and a fixed random state
sm = SMOTE(sampling_strategy='minority', random_state=42)

# Combine numerical and categorical columns for features
feature_cols = numerical_cols + categorical_cols
X = df_preprocessed[feature_cols]  # Feature set
y = df_preprocessed[final_column]  # Target variable

# Apply SMOTE to generate synthetic samples
oversampled_X, oversampled_Y = sm.fit_resample(X, y)

# Split the oversampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(oversampled_X, oversampled_Y, test_size=0.4, random_state=1)


In [22]:
# Copy oversampled features to include original file names for traceability
df_sampled_x_with_origin = oversampled_X.copy()

# Define function to locate the original file name based on feature equality
def find_original_file(row):
    # Search for the original row in pre-copy data that matches the oversampled row
    all_files = df_cleaned_pre_copy[df_cleaned_pre_copy[feature_cols].eq(row).all(1)]
    if all_files.empty:
        return np.nan  # Return NaN if no match found
    else:
        return all_files['file_name'].values[0]  # Return the file name if found
    
# Apply function to add original file names to the DataFrame
df_sampled_x_with_origin['file_name'] = df_sampled_x_with_origin.apply(find_original_file, axis=1)

# Display DataFrame with file names
df_sampled_x_with_origin

Unnamed: 0,unique_sources,malicious_count,imports_count,imported_functions,files_written_count,files_deleted_count,registry_keys_count,processes_count,mutexes_count,command_executions,...,api_InternetOpenA,api_DeleteUrlCacheEntryA,api_InternetConnectA,api_InternetOpenUrlA,api_InternetQueryOptionA,api_HttpSendRequestA,api_RegCreateKeyExW,api_gethostbyname,average_pkt_size,file_name
0,13,65,7,85,14,11,637,26,17,8,...,1.0,0.0,1.0,0.0,0.000000,1.0,9.000000,1.0,134.000000,8c414cc53009cefac9ad9aa3ffd766085a7b76aa56f69a...
1,7,63,3,53,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,304.000000,051f9ff45c531ad265489f563e6babca55f4a3f94604ff...
2,8,63,5,67,72,31,957,64,48,27,...,0.0,0.0,0.0,0.0,2.000000,0.0,21.000000,0.0,121.000000,010f36d6b66747e906c8f7025df1e154ceabc598f9c1d9...
3,4,53,4,70,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,130.000000,6e417e9fadda9948ed7a8bf472d48285126369c407aad8...
4,4,62,5,67,29,14,597,53,32,12,...,0.0,0.0,0.0,0.0,2.000000,0.0,21.000000,0.0,128.000000,6f6d7fe35d8e2b17bfd85752b46f2d20f71087231d8830...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,14,51,4,125,0,0,10,0,0,0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,103.340421,
4517,13,53,15,535,25,29,289,43,23,14,...,0.0,0.0,0.0,0.0,0.069405,0.0,2.220953,0.0,134.138810,
4518,9,2,1,30,0,0,25,1,2,1,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,116.349099,
4519,7,48,10,201,2,0,13,3,0,3,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,126.087249,


In [23]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Get column names for back-transforming scaled data into DataFrame
column_names = oversampled_X.columns.tolist()

# Scale training data and reformat as DataFrame with original column names
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=column_names)

# Scale testing data and reformat as DataFrame
X_test = pd.DataFrame(scaler.transform(X_test), columns=column_names)


In [24]:
X_train

Unnamed: 0,unique_sources,malicious_count,imports_count,imported_functions,files_written_count,files_deleted_count,registry_keys_count,processes_count,mutexes_count,command_executions,...,signature_mark_call_count,api_InternetOpenA,api_DeleteUrlCacheEntryA,api_InternetConnectA,api_InternetOpenUrlA,api_InternetQueryOptionA,api_HttpSendRequestA,api_RegCreateKeyExW,api_gethostbyname,average_pkt_size
0,0.002985,0.732394,0.368421,0.0224,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.011321,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.196319
1,0.001493,0.478873,0.000000,0.0000,0.063366,0.007740,0.403509,0.000000,0.036822,0.010526,...,0.041509,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.225460
2,0.011940,0.901408,0.105263,0.1504,0.013861,0.001548,0.557018,0.064815,0.044574,0.031579,...,0.241509,0.000782,0.0,0.000782,0.000000,0.000000,0.000392,0.001145,0.0,0.559816
3,0.020896,0.957746,0.263158,0.2096,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.128302,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000655,0.0,0.205521
4,0.005970,0.605634,0.000000,0.0000,0.001980,0.003096,0.172697,0.064815,0.075581,0.000000,...,0.003774,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.202454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2707,0.016418,0.774648,0.105263,0.0080,0.000000,0.000000,0.000000,0.004630,0.000000,0.021053,...,0.007547,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.401840
2708,0.011940,0.760563,0.263158,0.2224,0.009901,0.000000,0.301535,0.041667,0.015504,0.010526,...,0.094340,0.000000,0.0,0.003912,0.000000,0.023474,0.000000,0.001964,0.0,0.176380
2709,0.004478,0.732394,0.473684,0.3376,0.000000,0.000000,0.000000,0.009259,0.001938,0.021053,...,0.018868,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.203988
2710,0.005970,0.690141,0.368421,0.1984,0.000000,0.000000,0.024123,0.004630,0.005814,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.211330


<br/><br/>

-----

<br/><br/>

Import Dependencies

In [25]:
import gymnasium
from gymnasium import spaces

## Environment Setup

In [26]:

class APTEnv(gymnasium.Env):
    """Custom Environment for APT (Advanced Persistent Threat) classification that follows gym interface."""
    
    metadata = {"render.modes": ["human"]}

    def __init__(self, data, label):
        super(APTEnv, self).__init__()
        
        # Set up spaces for actions and observations
        number_of_columns = data.shape[1]
        self.action_space = spaces.Discrete(label.nunique())
        self.observation_space = spaces.Box(low=0, high=1, shape=(number_of_columns,), dtype=np.float64)
        
        # Initialize data and labels
        self.data = data
        self.label = label.tolist()
        self.history_t = number_of_columns

    def step(self, action):
        # Execute one time step within the environment
        correct_action = self.label[self.t]
        self.reward = int(action == correct_action)
        self.t += 1
        
        # Update observation based on data columns
        self.observation = np.array([self.data.iloc[self.t, : ][col] for col in self.data.columns])
        
        # Check if episode is done
        self.done = (self.t == len(self.data) - 1)
        
        # Return step information
        return self.observation, self.reward, self.done, False, {}

    def reset(self, seed=None, options=None):
        # Reset the environment state
        self.t = 0
        self.done = False
        self.observation = np.array([0.0 for _ in range(self.history_t)])
        
        # Return initial observations and info
        return self.observation, {}

In [27]:
from stable_baselines3.common.env_checker import check_env

# Create environment instances for training and evaluation
env = APTEnv(X_train, y_train)
eval_env = APTEnv(X_test, y_test)

# Validate the environments to ensure compatibility with Stable Baselines3
check_env(env)
check_env(eval_env)


In [76]:
import time
from stable_baselines3.common.callbacks import BaseCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Lists to store evaluation metrics and other statistics
accuracy, precisions, recalls, f1_scores = [], [], [], []
all_predicted_actions, all_agent_rewards, training_time = [], [], []
debug_predictions, debug_true_labels = [], []
training_accuracy = []

# True labels for testing
y_true = y_test.to_list()

class CustomEvaluationCallback(BaseCallback):
    """
    Custom callback for evaluating agent performance periodically during training.
    """
    def __init__(self, model, eval_env, verbose=1):
        super().__init__(verbose)
        self.env = eval_env
        self.model = model
        self.step_tracker = 0
        self.steps = 0
        self.start_time = time.time()

    def _on_training_start(self):
        """ Record the start time of training. """
        self.start_time = time.time()

    def _on_step(self) -> bool:
        """
        Evaluate the agent every 500 steps by predicting over the test environment,
        calculating metrics, and recording them.
        """
        self.step_tracker += 1
        self.steps += 1
        
        if self.step_tracker == 500:
            done, rewards = False, 0
            self.step_tracker = 0
            all_predicted_actions.clear()
            observations, _ = self.env.reset(seed=42)
        
            while not done:
                action, _ = self.model.predict(observations)
                all_predicted_actions.append(action)
                observations, reward, done, _, _ = self.env.step(action)
                rewards += reward

            self.env.close()
            max_size = len(all_predicted_actions)
        
            # Append current predictions and true labels for debugging
            debug_predictions.append(all_predicted_actions.copy())
            debug_true_labels.append(y_true[:max_size].copy())
        
            # Calculate and append performance metrics
            accuracy.append(accuracy_score(y_true[:max_size], all_predicted_actions) * 100)
            precisions.append(precision_score(y_true[:max_size], all_predicted_actions, average='weighted') * 100)
            recalls.append(recall_score(y_true[:max_size], all_predicted_actions, average='weighted') * 100)
            f1_scores.append(f1_score(y_true[:max_size], all_predicted_actions, average='weighted') * 100)
            
            all_agent_rewards.append(rewards)
            
            training_predictions = self.model.predict(X_train)[0]
            training_accuracy.append(accuracy_score(y_train, training_predictions) * 100)
        
            print(f"After {self.steps} steps, the accuracy is {accuracy[-1]:.2f}%")
        
        return True

    def _on_training_end(self):
        """ Calculate total training time and store it. """
        self.total_training_time = time.time() - self.start_time
        training_time.append(self.total_training_time)

In [77]:
import torch
from stable_baselines3 import DQN

def train(env, eval_env, steps: int = 1, seed: int | None = 0, **env_kwargs):
    """
    Trains a DQN model on a given environment and evaluates it using a custom callback.
    """
    # Reset the environment with the specified seed
    env.reset(seed=seed)
    print("Starting training")

    # Define a learning rate schedule that decays exponentially
    def learning_rate_schedule(step):
        return 1e-3 * (0.99 ** (step / 1000))

    # Initialize the DQN model with specified parameters and architecture
    model = DQN(
        policy='MlpPolicy',
        env=env,
        verbose=1,
        learning_rate=learning_rate_schedule,
        buffer_size=int(1e5),
        batch_size=256,
        gradient_steps=3,
        tau=0.005,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        gamma=0.99,
        policy_kwargs={
            "net_arch": [1024, 512, 512, 256],
            "activation_fn": torch.nn.LeakyReLU,
        }
    )

    # Create and assign a custom evaluation callback to monitor training
    # custom_callback = CustomEvaluationCallback(model, eval_env)
    custom_callback = CustomEvaluationCallback(model, eval_env)

    # Begin learning and continue for the specified number of timesteps
    model.learn(total_timesteps=steps, callback=custom_callback, reset_num_timesteps=False)
    print("Finished training")

    # Close the environment after training
    env.close()

    return model

In [78]:
# Define environment configuration if any
env_kwargs = {}

# Clear previous training metrics and debug data
accuracy.clear()
precisions.clear()
f1_scores.clear()
recalls.clear()
training_accuracy.clear()

debug_predictions.clear()
debug_true_labels.clear()

# Start training the model with specified environment, steps, and seed
trained_model = train(env, eval_env, steps=20000, seed=1)

Starting training
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
After 500 steps, the accuracy is 19.30%
After 1000 steps, the accuracy is 36.62%
After 1500 steps, the accuracy is 53.26%
After 2000 steps, the accuracy is 69.14%
After 2500 steps, the accuracy is 72.95%
After 3000 steps, the accuracy is 76.71%
After 3500 steps, the accuracy is 79.65%
After 4000 steps, the accuracy is 80.97%
After 4500 steps, the accuracy is 81.97%
After 5000 steps, the accuracy is 82.41%
After 5500 steps, the accuracy is 83.19%
After 6000 steps, the accuracy is 83.41%
After 6500 steps, the accuracy is 84.02%
After 7000 steps, the accuracy is 84.24%
After 7500 steps, the accuracy is 84.35%
After 8000 steps, the accuracy is 85.07%
After 8500 steps, the accuracy is 84.62%
After 9000 steps, the accuracy is 85.45%
After 9500 steps, the accuracy is 85.67%
After 10000 steps, the accuracy is 85.45%
After 10500 steps, the accuracy is 85.90%
--------------------------

<br/><br/>

--------------

<br/><br/>

## Model Evaluation

In [105]:
# Create a dictionary mapping APT groups to unique encoded values
apt_group_codes = df_cleaned.groupby(final_column)[final_column + '_encoded'].unique().apply(lambda x: x[0]).to_dict()

# Reverse the dictionary to map encoded values back to APT group names
reversed_apt_group_codes = {v: k for k, v in apt_group_codes.items()}

In [87]:
import plotly.graph_objects as go

# Create a new figure object
fig = go.Figure()

filtered_training_accuracy = training_accuracy.copy()[:-1]

# Add a line plot for filtered_training_accuracy with markers
fig.add_traces(go.Scatter(
    x=list(range(len(filtered_training_accuracy))), 
    y=filtered_training_accuracy, 
    name='Accuracy', 
    mode='lines+markers',
    line=dict(color='orange')
))

# Set the layout of the figure with titles for axes and the plot
fig.update_layout(
    title='Model Training Accuracy Over Training Steps',
    xaxis_title='Training Steps',
    yaxis_title='Accuracy (%)'
)

# Annotate each point with its percentage value and adjust position based on change
for i, value in enumerate(filtered_training_accuracy):
    # Calculate change direction for annotation positioning
    offset = 1.5 if i > 0 and value > filtered_training_accuracy[i-1] else -1.5
    # Add annotations at each point
    fig.add_annotation(
        y=value + offset, 
        x=i, 
        text=f'{value:.2f}%', 
        showarrow=False, 
        font=dict(color='black', size=6)
    )

# Display the figure
fig.show()

In [155]:
from sklearn.metrics import classification_report

predicted_values = trained_model.predict(X_train)[0]

print(classification_report(y_train.values, predicted_values))

              precision    recall  f1-score   support

           0       0.87      0.80      0.84       164
           1       0.92      0.82      0.87        84
           2       0.92      1.00      0.96       391
           3       0.90      0.84      0.87        45
           4       0.62      0.75      0.68        84
           5       0.85      0.76      0.80        99
           6       0.78      0.80      0.79        64
           7       0.81      0.69      0.74       126
           8       0.98      0.91      0.94        45
           9       1.00      0.98      0.99       165
          10       0.93      0.98      0.95       375
          11       0.89      0.83      0.86       167

    accuracy                           0.89      1809
   macro avg       0.87      0.85      0.86      1809
weighted avg       0.89      0.89      0.89      1809



In [156]:
report = classification_report(y_train.values, predicted_values, output_dict=True)

# Convert the dictionary to a DataFrame
df_report = pd.DataFrame(report).transpose()

# Optional: Convert columns to a more readable format if necessary
# df_report.reset_index(inplace=True)
df_report.rename(columns={'index': 'Class'}, inplace=True)

# remove the last 3 rows
df_report = df_report[:-3]

df_report

Unnamed: 0,precision,recall,f1-score,support
0,0.874172,0.804878,0.838095,164.0
1,0.92,0.821429,0.867925,84.0
2,0.921986,0.997442,0.958231,391.0
3,0.904762,0.844444,0.873563,45.0
4,0.623762,0.75,0.681081,84.0
5,0.852273,0.757576,0.802139,99.0
6,0.784615,0.796875,0.790698,64.0
7,0.805556,0.690476,0.74359,126.0
8,0.97619,0.911111,0.942529,45.0
9,1.0,0.981818,0.990826,165.0


In [162]:
heatmap_data = df_report[df_report.columns[:-1]].copy()
heatmap_data = heatmap_data * 100

# create a heatmap
fig = go.Figure(data=go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    zmin=30, zmax=100,
    colorscale='Viridis'))

fig.update_layout(xaxis_title='Metric', yaxis_title='APT Group', height=800, width=600, title="Model Training")

# show text inside the heatmap
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        if not np.isnan(heatmap_data.values[i, j]):
            value = "{:.2f}%".format(heatmap_data.values[i, j])
            fig.add_annotation(x=heatmap_data.columns[j], y=heatmap_data.index[i], text=value, showarrow=False)


fig.data[0].y = [reversed_apt_group_codes[int(i)] for i in heatmap_data.index]

fig.show()

In [131]:
# Create a DataFrame from the last set of predictions and true labels for debugging
debug_model = pd.DataFrame({'Predictions': debug_predictions[-1], 'True Labels': debug_true_labels[-1]})
debug_model['match'] = debug_model.Predictions == debug_model['True Labels']

# Print the highest and the most recent accuracy from the evaluations
print("Highest Accuracy: ", (max(accuracy)))
print(f"Last Accuracy: {accuracy[-1]:.2f}%")

# Analyze and display count of incorrect predictions
print("\n\n --- Debugging wrong predictions --- ")
df_wrong_predictions = debug_model[debug_model['match'] == False]['True Labels'].value_counts()
display(df_wrong_predictions)

# Analyze and display count of correct predictions
print("\n\n --- Debugging correct predictions --- ")
df_correct_predictions = debug_model[debug_model['match'] == True]['True Labels'].value_counts()
display(df_correct_predictions)

# Identify and display frequently missing APT groups in predictions
print("\n\n --- Debugging missing apt groups --- ")
df_missing_apt_groups = df_wrong_predictions[df_wrong_predictions.values > 40].reset_index()
df_missing_apt_groups['apt_group'] = df_missing_apt_groups['True Labels'].map(reversed_apt_group_codes)
display(df_missing_apt_groups)

Highest Accuracy:  89.26991150442478
Last Accuracy: 88.00%


 --- Debugging wrong predictions --- 


True Labels
7     37
11    32
0     30
4     28
5     23
1     15
6     13
3     10
10     9
2      7
9      7
8      6
Name: count, dtype: int64



 --- Debugging correct predictions --- 


True Labels
2     384
10    366
9     158
11    135
0     134
7      89
5      76
1      69
4      56
6      50
8      39
3      35
Name: count, dtype: int64



 --- Debugging missing apt groups --- 


Unnamed: 0,True Labels,count,apt_group


In [132]:
# Initialize a DataFrame to accumulate prediction matches across debug sessions
hard_to_predict = pd.DataFrame()

# Iterate over each debug session to compute cumulative correct predictions
for i in range(len(debug_predictions)):
    debug_model = pd.DataFrame({'Predictions': debug_predictions[i], 'True Labels': debug_true_labels[i]})
    debug_model['match'] = debug_model.Predictions == debug_model['True Labels']
    
    # Initialize or accumulate match counts
    if 'match' not in hard_to_predict.columns:
        hard_to_predict['match'] = debug_model['match'].astype(int)
    else:
        hard_to_predict['match'] += debug_model['match'].astype(int)

# Set index to align with the original test labels and attach corresponding file names
hard_to_predict.index = y_test.index[:len(debug_predictions[-1])]
hard_to_predict['file_name'] = df_sampled_x_with_origin.loc[hard_to_predict.index, 'file_name']

# Filter hard_to_predict DataFrame for a specific APT group
filter_for_apt_group_code = 0
df_filtered_hard_to_predict = hard_to_predict[hard_to_predict.index.isin(y_test[y_test == filter_for_apt_group_code].index)]

# Group by file name and sum the matches to identify files consistently predicted correctly
df_filtered_hard_to_predict.groupby('file_name').sum().sort_values('match', ascending=False)


Unnamed: 0_level_0,match
file_name,Unnamed: 1_level_1
7b345dcc0dc2a441a00d56feb7a3d0058eb11eaa82f71f12309289f651c30924.json,40
f3c57637808e887d0c6ae4b4e1b63ab201971d6f12eb1858b1550fc8291cdb86.json,40
1e3732e455592c5cd6e787cbba24217856f3156c65b9cba444b258e4414039a7.json,40
1609f20fadda4f1824cf21920778a71b8f3874ab9655d080aa10cf9d17cfd372.json,40
fff5b385418e350dd366d1fd6d34801059304e89896a94ebb03973c48fb6066c.json,39
...,...
1a6a112fa17b49e57ce20abf787054d86f7ec0b52c7728c869db2ff287708e74.json,0
c9b2a0d55146b68ac4480a5d1f3fed19aa2ca271d184b0d5802ba39793d9f299.json,0
1039f5492d975d1f215255397ebc2419fb136623682ee004e39970fe9b84dce3.json,0
e7cfe7169b058b460fd172b59e809779a98ca0f17e4202fd1f0df6795626fac4.json,0


In [133]:
# Filter and rename columns for debugging
df_debugging = df_filtered_hard_to_predict[df_filtered_hard_to_predict['match'] > 20].rename(columns={'file_name': 'file'})
df_debugging['file'] = df_debugging['file'].str.replace('.json', '.zip')

# Merge debugging DataFrame with malware dataset for additional data
df_debugging = pd.merge(df_debugging, pd.read_csv('malware.csv'), on='file', how='left')

# Clean up and create a new directory for debugging reports
os.system('rm -rf debug')
os.system('mkdir debug')

# Copy the first 10 cuckoo JSON reports for in-depth debugging
for cuckoo_id in df_debugging.cuckoo_id.values[:10]:
  os.system('cp "{}" "{}"'.format(os.path.join(base_dir, 'cuckoo', str(int(cuckoo_id)), 'reports', 'report.json'), os.path.join(base_dir, 'debug', f'{int(cuckoo_id)}.json')))

# Display the debugging DataFrame
df_debugging

Unnamed: 0,match,file,apt,cuckoo_id
0,40,1e3732e455592c5cd6e787cbba24217856f3156c65b9cb...,APT 1,5121061.0
1,38,4b1a437fbe161b0f1dd4d9eca647b4b82f89e810f6eede...,APT 1,5121106.0
2,36,f27593fd1d391f9925230a1abc12b8f3791fc43ea980ec...,APT 1,5121822.0
3,38,1d0d00c76353c8a1d2e33af602238244f0e0417193d7f6...,APT 1,5121052.0
4,37,262786523f13a3c6e9f58015dd9bcdeeb927a4622718ca...,APT 1,5121506.0
...,...,...,...,...
126,25,33e6c3f5a66512c136e53ede2095fc240973fa58a9d8a7...,APT 1,5121254.0
127,38,b8eac20bec1c24cfd556f543764c85fb614123b34c98f0...,APT 1,5121580.0
128,37,406e73948c78b1e692646ea0edbadbb366bede04036114...,APT 1,5121358.0
129,38,14a22f11c0121492cfabc529bcffecda5d076e79e459a8...,APT 1,5121232.0


In [134]:
# Create a new figure object
fig = go.Figure()

# Add a line plot for accuracy with markers
fig.add_traces(go.Scatter(
    x=list(range(len(accuracy))), 
    y=accuracy, 
    name='Accuracy', 
    mode='lines+markers'
))

# Set the layout of the figure with titles for axes and the plot
fig.update_layout(
    title='Model Accuracy Over Training Steps',
    xaxis_title='Training Steps',
    yaxis_title='Accuracy (%)'
)

# Annotate each point with its percentage value and adjust position based on change
for i, value in enumerate(accuracy):
    # Calculate change direction for annotation positioning
    offset = 1.5 if i > 0 and value > accuracy[i-1] else -1.5
    # Add annotations at each point
    fig.add_annotation(
        y=value + offset, 
        x=i, 
        text=f'{value:.2f}%', 
        showarrow=False, 
        font=dict(color='black', size=6)
    )

# Display the figure
fig.show()

In [154]:
accuracy

[19.303097345132745,
 36.61504424778761,
 53.26327433628318,
 69.13716814159292,
 72.95353982300885,
 76.71460176991151,
 79.64601769911505,
 80.97345132743364,
 81.96902654867256,
 82.41150442477876,
 83.1858407079646,
 83.4070796460177,
 84.0154867256637,
 84.23672566371681,
 84.34734513274337,
 85.06637168141593,
 84.62389380530973,
 85.45353982300885,
 85.67477876106194,
 85.45353982300885,
 85.89601769911505,
 87.16814159292035,
 86.61504424778761,
 86.67035398230088,
 87.72123893805309,
 87.66592920353983,
 87.11283185840708,
 86.11725663716814,
 88.10840707964603,
 87.16814159292035,
 87.44469026548673,
 87.66592920353983,
 87.16814159292035,
 88.38495575221239,
 88.55088495575221,
 87.99778761061947,
 88.60619469026548,
 89.26991150442478,
 88.10840707964603,
 87.99778761061947]

In [135]:
fig = go.Figure()

for name, data in [
  ['Precision', precisions],
  ['Recall', recalls],
  ['F1 Score', f1_scores]
]:
  fig.add_traces(go.Scatter(
      x=list(range(len(data))), 
      y=data, 
      name=name, 
      mode='lines+markers'
  ))

fig.update_layout(
    title='Model Performance Metrics Over Training Steps',
    xaxis_title='Training Steps',
    yaxis_title='Percentage (%)'
)

fig.show()

In [136]:
df___  = pd.DataFrame({'precison' : precisions, 'recall' : recalls, 'f1_score': f1_scores})

df___

Unnamed: 0,precison,recall,f1_score
0,27.17832,19.303097,21.68016
1,41.203754,36.615044,38.190654
2,53.878737,53.263274,52.590403
3,69.967348,69.137168,67.468271
4,70.820699,72.95354,70.990231
5,75.208718,76.714602,75.27609
6,78.300821,79.646018,78.59736
7,80.358151,80.973451,80.366741
8,81.638103,81.969027,81.602969
9,81.922192,82.411504,81.990793


<br/><br/>

--------------

<br/><br/>

<br/>

## K-Nearest Neighbors

In [137]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize KNN model with number of neighbors equal to the unique count of labels in training data
model = KNeighborsClassifier(n_neighbors=len(y_train.unique()))

# Fit the model to the training data
model.fit(X_train, y_train)

In [138]:
# Predict labels for the training set
y_pred = model.predict(X_train)

# Output the accuracy of predictions on the training set
print('Training Accuracy:', accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

Training Accuracy: 0.8207964601769911
              precision    recall  f1-score   support

           0       0.74      0.85      0.79       240
           1       0.88      0.67      0.76       160
           2       0.78      1.00      0.88       570
           3       0.70      0.57      0.63        61
           4       0.64      0.39      0.48       127
           5       0.91      0.58      0.70       184
           6       0.80      0.71      0.75       100
           7       0.77      0.52      0.62       147
           8       0.83      0.91      0.87        87
           9       0.96      1.00      0.98       230
          10       0.89      0.92      0.91       586
          11       0.74      0.73      0.74       220

    accuracy                           0.82      2712
   macro avg       0.80      0.74      0.76      2712
weighted avg       0.82      0.82      0.81      2712



In [139]:
# Predict labels for the test set
y_pred = model.predict(X_test)

# Output the accuracy of predictions on the test set
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.77      0.71       164
           1       0.85      0.73      0.78        84
           2       0.78      0.99      0.87       391
           3       0.85      0.78      0.81        45
           4       0.62      0.30      0.40        84
           5       0.80      0.56      0.65        99
           6       0.82      0.72      0.77        64
           7       0.74      0.45      0.56       126
           8       0.73      0.82      0.77        45
           9       0.96      0.99      0.98       165
          10       0.88      0.94      0.91       375
          11       0.72      0.64      0.68       167

    accuracy                           0.80      1809
   macro avg       0.79      0.72      0.74      1809
weighted avg       0.80      0.80      0.79      1809

Accuracy: 0.8021006080707573


<br/>

## Support Vector Machines (SVC)

In [140]:
from sklearn.svm import SVC

# Initialize the Support Vector Classifier
model = SVC()

# Fit the model to the training data
model.fit(X_train, y_train)

In [141]:
# Predict labels for the training set
y_pred = model.predict(X_train)

# Output the accuracy of predictions on the training set
print('Training Accuracy:', accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

Training Accuracy: 0.803834808259587
              precision    recall  f1-score   support

           0       0.65      0.82      0.73       240
           1       0.97      0.63      0.77       160
           2       0.76      0.99      0.86       570
           3       0.95      0.30      0.45        61
           4       0.81      0.17      0.27       127
           5       0.80      0.65      0.72       184
           6       0.87      0.65      0.74       100
           7       0.85      0.56      0.68       147
           8       0.79      0.87      0.83        87
           9       0.99      0.97      0.98       230
          10       0.86      0.97      0.91       586
          11       0.67      0.67      0.67       220

    accuracy                           0.80      2712
   macro avg       0.83      0.69      0.72      2712
weighted avg       0.82      0.80      0.79      2712



In [142]:
# Predict labels for the test set
y_pred = model.predict(X_test)

# Output the accuracy of predictions on the test set
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.71      0.65       164
           1       0.91      0.63      0.75        84
           2       0.76      0.97      0.85       391
           3       1.00      0.44      0.62        45
           4       0.75      0.11      0.19        84
           5       0.70      0.61      0.65        99
           6       0.80      0.69      0.74        64
           7       0.74      0.41      0.53       126
           8       0.71      0.82      0.76        45
           9       0.99      0.98      0.98       165
          10       0.82      0.97      0.89       375
          11       0.67      0.63      0.65       167

    accuracy                           0.77      1809
   macro avg       0.79      0.66      0.69      1809
weighted avg       0.78      0.77      0.75      1809

Accuracy: 0.7744610281923715


<br/>

## Decision Tree Classifier

In [143]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier
model = DecisionTreeClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

In [144]:
# Predict labels for the training set
y_pred = model.predict(X_train)

# Output the accuracy of predictions on the training set
print('Training Accuracy:', accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

Training Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       240
           1       1.00      1.00      1.00       160
           2       1.00      1.00      1.00       570
           3       1.00      1.00      1.00        61
           4       1.00      1.00      1.00       127
           5       1.00      1.00      1.00       184
           6       1.00      1.00      1.00       100
           7       1.00      1.00      1.00       147
           8       1.00      1.00      1.00        87
           9       1.00      1.00      1.00       230
          10       1.00      1.00      1.00       586
          11       1.00      1.00      1.00       220

    accuracy                           1.00      2712
   macro avg       1.00      1.00      1.00      2712
weighted avg       1.00      1.00      1.00      2712



In [145]:
# Predict labels for the test set
y_pred = model.predict(X_test)

# Output the accuracy of predictions on the test set
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.79      0.77       164
           1       0.68      0.80      0.73        84
           2       0.93      0.95      0.94       391
           3       0.74      0.76      0.75        45
           4       0.62      0.57      0.59        84
           5       0.70      0.71      0.70        99
           6       0.86      0.78      0.82        64
           7       0.70      0.60      0.64       126
           8       0.64      0.84      0.73        45
           9       0.99      0.98      0.99       165
          10       0.93      0.94      0.93       375
          11       0.82      0.72      0.76       167

    accuracy                           0.84      1809
   macro avg       0.78      0.79      0.78      1809
weighted avg       0.84      0.84      0.84      1809

Accuracy: 0.8385848535102266


<br/>

## Stochastic Gradient Descent

In [146]:
from sklearn.linear_model import SGDClassifier

# Initialize the model
model = SGDClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

In [147]:
# Predict labels for the training set
y_pred = model.predict(X_train)

# Output the accuracy of predictions on the training set
print('Training Accuracy:', accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

Training Accuracy: 0.6707227138643068
              precision    recall  f1-score   support

           0       0.79      0.60      0.68       240
           1       0.91      0.49      0.64       160
           2       0.63      0.93      0.75       570
           3       0.50      0.18      0.27        61
           4       0.67      0.06      0.12       127
           5       0.96      0.14      0.24       184
           6       0.88      0.35      0.50       100
           7       0.78      0.32      0.45       147
           8       0.73      0.40      0.52        87
           9       0.95      0.98      0.97       230
          10       0.77      0.95      0.85       586
          11       0.28      0.56      0.38       220

    accuracy                           0.67      2712
   macro avg       0.74      0.50      0.53      2712
weighted avg       0.73      0.67      0.64      2712



In [148]:
# Predict labels for the test set
y_pred = model.predict(X_test)

# Output the accuracy of predictions on the test set
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.56      0.64       164
           1       0.92      0.57      0.71        84
           2       0.63      0.93      0.75       391
           3       0.67      0.18      0.28        45
           4       0.43      0.04      0.07        84
           5       0.58      0.07      0.13        99
           6       0.88      0.44      0.58        64
           7       0.88      0.29      0.44       126
           8       0.59      0.42      0.49        45
           9       0.95      0.99      0.97       165
          10       0.78      0.94      0.85       375
          11       0.39      0.71      0.51       167

    accuracy                           0.68      1809
   macro avg       0.71      0.51      0.53      1809
weighted avg       0.71      0.68      0.65      1809

Accuracy: 0.6843559977888336


<br/>

## Neural Network Models 

In [149]:
from sklearn.neural_network import MLPClassifier

# Initialize the model
model = MLPClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

In [150]:
# Predict labels for the training set
y_pred = model.predict(X_train)

# Output the accuracy of predictions on the training set
print('Training Accuracy:', accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

Training Accuracy: 0.8484513274336283
              precision    recall  f1-score   support

           0       0.77      0.89      0.83       240
           1       0.94      0.70      0.80       160
           2       0.87      0.98      0.92       570
           3       0.76      0.36      0.49        61
           4       0.63      0.42      0.50       127
           5       0.78      0.72      0.75       184
           6       0.91      0.81      0.86       100
           7       0.81      0.59      0.68       147
           8       0.87      0.92      0.89        87
           9       0.98      1.00      0.99       230
          10       0.89      0.97      0.93       586
          11       0.70      0.75      0.73       220

    accuracy                           0.85      2712
   macro avg       0.83      0.76      0.78      2712
weighted avg       0.85      0.85      0.84      2712



In [151]:
# Predict labels for the test set
y_pred = model.predict(X_test)

# Output the accuracy of predictions on the test set
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.74      0.71       164
           1       0.84      0.68      0.75        84
           2       0.86      0.96      0.91       391
           3       0.88      0.49      0.63        45
           4       0.66      0.37      0.47        84
           5       0.61      0.63      0.62        99
           6       0.73      0.70      0.71        64
           7       0.70      0.48      0.57       126
           8       0.74      0.89      0.81        45
           9       0.99      0.98      0.98       165
          10       0.87      0.96      0.92       375
          11       0.70      0.73      0.72       167

    accuracy                           0.81      1809
   macro avg       0.77      0.72      0.73      1809
weighted avg       0.80      0.81      0.80      1809

Accuracy: 0.806522940851299
