In [1]:
import string, random

MALICIOUS_FILES_EXT = ["ps1", "dll", "exe", "hta", "bat", "vbs", "vbe", "js", "iso"]
MALICIOUS_DOCS_EXT = ["docm", "xlsm"]

WORLD_WRITABLE_DIRECTORIES = [
    "C:\\Windows\\Tasks",
    "C:\\Windows\\Temp",
    "C:\\Windows\\Tracing",
    "C:\\Windows\\System32\\Spool\\Drivers\\color",
    "C:\\Windows\\Registration\\CRMLog"
]

SUSPICIOUS_LNK_DIRECTORIES = [
    "C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\StartUp",
    "C:\\Users\\myuser\\AppData\\Roaming\\\Microsoft\\Windows\\Start Menu\\Programs\\Startup"
]

SUSPICIOUS_USER_DIRECTORIES = [
    "C:\\Users\\myuser",
    "C:\\Users\\myuser\\Documents",
    "C:\\Users\\myuser\\AppData",
    "C:\\Users\\myuser\\AppData\\Local",
    "C:\\Users\\myuser\\AppData\\Local\\Temp",
    "C:\\Users\\myuser\\AppData\\Roaming",
    "C:\\Users\\myuser\\AppData\\Roaming\\\Microsoft\\Windows\\",
    "C:\\Users\\myuser\\AppData\\Roaming\\\Microsoft\\Windows\\Start Menu"
]

SUSPICIOUS_SYSTEM_DIRECTORIES = [
    "C:\\Windows",
    "C:\\ProgramData"
]

FOLDERS = ["Downloads", "Desktop", "Documents"]
USUAL_FOLDERS = ["C:\\Users\\myuser\\" + folder for folder in FOLDERS]

def get_file_name(a):
    return a.split("\\")[-1].strip()

def change_extension(a, newext):
    return ''.join(a.split(".")[:-1]) + "." + newext

with open("../dataset_win10_fullfilesystem.txt") as f:
    win10 = f.readlines()

with open("../dataset_smbmap_cleaned.txt") as f:
    temp = f.readlines()
    smbmap = [x.strip() for x in temp]

syswow64 = [x for x in win10 if "C:\\Windows\\SysWOW64\\" in x]
all_syswow64_files = list(set([get_file_name(x) for x in syswow64]))
pe_syswow64_files = [x for x in all_syswow64_files if (x.endswith("dll") or x.endswith("exe"))]

system32 = [x for x in win10 if "C:\\Windows\\System32\\" in x]
all_system32_files = list(set([get_file_name(x) for x in system32]))
pe_system32_files = [x for x in all_system32_files if (x.endswith("dll") or x.endswith("exe"))]

doc_files = [x for x in smbmap if (x.endswith("doc") or x.endswith("docx") or x.endswith("xls") or x.endswith("xlsx"))]
all_doc_files = list(set([get_file_name(x) for x in doc_files]))

exe_files = [x for x in smbmap if x.endswith("exe")]
all_exe_files = list(set([get_file_name(x) for x in exe_files]))

In [2]:
malicious = []

wow_not_system = [x for x in all_syswow64_files if x not in all_system32_files and (x.endswith("dll") or x.endswith("exe"))]
malicious.extend(["C:\\Windows\\System32\\"+x for x in wow_not_system])

system_not_wow = [x for x in all_system32_files if x not in all_syswow64_files and (x.endswith("dll") or x.endswith("exe"))]
malicious.extend(["C:\\Windows\\SysWOW64\\"+x for x in system_not_wow])

# exe hidden as dlls and vice versa
malicious.extend(["C:\\Windows\\System32\\" + change_extension(x, "dll") for x in pe_system32_files if x.endswith("exe") and change_extension(x, "dll") not in pe_system32_files] + \
                ["C:\\Windows\\System32\\" + change_extension(x, "exe") for x in pe_system32_files if x.endswith("dll") and change_extension(x, "exe") not in pe_system32_files] + \
                ["C:\\Windows\\SysWOW64\\"+change_extension(x, "dll") for x in pe_syswow64_files if x.endswith("exe") and change_extension(x, "dll") not in pe_syswow64_files] + \
                ["C:\\Windows\\SysWOW64\\"+change_extension(x, "exe") for x in pe_syswow64_files if x.endswith("dll") and change_extension(x, "exe") not in pe_syswow64_files])

# DLL order hijacking, SYSTEM persistence, etc. malware
for file in all_exe_files + pe_system32_files + wow_not_system:
    malicious.extend([folder + "\\" + file for folder in WORLD_WRITABLE_DIRECTORIES + SUSPICIOUS_USER_DIRECTORIES + SUSPICIOUS_SYSTEM_DIRECTORIES + USUAL_FOLDERS])

for file in all_exe_files + pe_system32_files + pe_syswow64_files:
    malicious.extend([folder + "\\" + change_extension(file, "lnk") for folder in SUSPICIOUS_LNK_DIRECTORIES])


In [3]:
# generating suspicious names
SUSPICIOUS_NAMES = ["invoice", "transaction", "report", "payment", "sales", "review", "msf", "revision", "cv", "resume"]
YEARS = ["2021", "2020", "2022"]

simple_names = [year + "_" + name for year in YEARS for name in SUSPICIOUS_NAMES]
simple_names.extend([name + "_" + year for year in YEARS for name in SUSPICIOUS_NAMES])

# applying names to suspicious docs
SUSPICIOUS_DOC_NAMES = []
for name in simple_names + SUSPICIOUS_NAMES:
    SUSPICIOUS_DOC_NAMES.extend([name + "." + ext for ext in MALICIOUS_DOCS_EXT])

for name in SUSPICIOUS_DOC_NAMES:
    malicious.extend([folder + "\\" + name for folder in USUAL_FOLDERS])

# applying names to suspicious launchers
SUSPICIOUS_LAUNCHER_NAMES = []
for name in simple_names + SUSPICIOUS_NAMES:
    SUSPICIOUS_LAUNCHER_NAMES.extend([name + "." + ext for ext in MALICIOUS_FILES_EXT])

for name in SUSPICIOUS_LAUNCHER_NAMES:
    malicious.extend([folder + "\\" + name for folder in SUSPICIOUS_USER_DIRECTORIES])

In [4]:
MACRO_DOCS = []
for doc in all_doc_files:
    MACRO_DOCS.extend([change_extension(doc, ext) for ext in MALICIOUS_DOCS_EXT])

for doc in MACRO_DOCS:
    malicious.extend([folder + "\\" + doc for folder in USUAL_FOLDERS])

In [5]:
# applying RANDOM names to suspicious launchers
RANDOM_NAMES = [''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=8)) for _ in range(500)]

SUSPICIOUS_LAUNCHER = []
for name in RANDOM_NAMES:
    SUSPICIOUS_LAUNCHER.extend([name + "." + ext for ext in MALICIOUS_FILES_EXT])

for name in SUSPICIOUS_LAUNCHER:
    malicious.extend([folder + "\\" + name for folder in SUSPICIOUS_USER_DIRECTORIES])

In [6]:
print(len(malicious))
with open("../dataset_malicious_augumented.txt", "w") as f:
    f.writelines([x+"\n" for x in malicious])

224280
