## 1. PE File

### 1.1. Preliminaries

In [None]:
import pefile

ModuleNotFoundError: ignored

### 1.2. Sample Code

In [None]:
import os

# Directory holding the PE samples analysed in this section.
file_path = './sample_data/'
# Names (not full paths) of every entry in that directory.
files = os.listdir(file_path)

In [None]:
filename = files[0]
full_name = os.path.join(file_path, filename)
full_name

In [None]:
# Parse the selected sample into a pefile.PE object used by the cells below.
pe = pefile.PE(full_name)

In [None]:
print(pe)

In [None]:
print(pe.DOS_HEADER)

In [None]:
pe.parse_rich_header()

In [None]:
print(pe.NT_HEADERS)

In [None]:
print(pe.FILE_HEADER)

In [None]:
print(pe.OPTIONAL_HEADER)

In [None]:
for section in pe.sections:
    print(section)

In [None]:
# DIRECTORY_ENTRY_IMPORT is not set on every parsed file (the practice code
# in section 1.3 guards against the resulting AttributeError), so fall back
# to an empty list and print nothing for files without imports.
for entry_import in getattr(pe, 'DIRECTORY_ENTRY_IMPORT', []):
    print(entry_import)

In [None]:
# DIRECTORY_ENTRY_EXPORT is not set on every parsed file (the practice code
# in section 1.3 guards against the resulting AttributeError); print None
# for such files instead of raising.
print(getattr(pe, 'DIRECTORY_ENTRY_EXPORT', None))

### 1.3. Practice

In [None]:
import os
import csv
import pefile
import pandas as pd

In [None]:
# Column names of the PE feature table, in the same order in which
# extract_pe_features() appends its values.
_HEADER_COLS = [
    'filename',
    # DOS_HEADER
    'e_magic', 'e_lfanew', 'e_minalloc', 'e_ovno',
    # NT_HEADERS
    'Signature',
    # FILE_HEADER
    'Machine', 'NumberOfSections', 'TimeDateStamp', 'PointerToSymbolTable',
    'NumberOfSymbols', 'SizeOfOptionalHeader', 'Characteristics',
    # OPTIONAL_HEADER (EntryPoint is AddressOfEntryPoint + ImageBase)
    'Magic', 'SizeOfCode', 'AddressOfEntryPoint', 'BaseOfCode', 'ImageBase',
    'EntryPoint', 'SectionAlignment', 'FileAlignment', 'SizeOfImage',
    'SizeOfHeaders', 'CheckSum', 'Subsystem', 'NumberOfRvaAndSizes',
    # Whether the header's section count matches the parsed sections
    'CompareNumberOfSections',
]

# Every inspected section contributes the same eight columns.
_SECTION_PREFIXES = ['.text', '.data', '.rsrc', '.rdata', '.reloc']
_SECTION_SUFFIXES = [
    'Name', 'VirtualSize', '|VirtualSize-SizeOfRawData|', 'VirtualAddress',
    'SizeOfRawData', 'PointerToRawData', 'Characteristics', 'Entropy',
]

COLS = (
    _HEADER_COLS
    + [prefix + 'Section' + suffix
       for prefix in _SECTION_PREFIXES
       for suffix in _SECTION_SUFFIXES]
    + ['TotalNumberOfFunctionInIAT', 'TotalNumberOfFunctionInEAT']
)

# Placeholder row (all zeros) written for files that cannot be parsed.
NULL_ROW = [0] * len(COLS)


def _section_features(pe, name):
    """Return the eight feature cells for the section called *name*.

    *name* is the section name without null padding (e.g. ``b".data"``).
    The stored name is compared after stripping its trailing null bytes, so
    the match does not depend on spelling the padding out by hand.

    Always returns exactly eight values so the row stays aligned with the
    column list: the section's features when found, ``"Error"`` cells when
    the section was not found and some section lacked an expected attribute,
    and ``"None"`` cells when the section is simply absent.
    """
    had_error = False
    for section in pe.sections:
        try:
            if section.Name.rstrip(b"\x00") != name:
                continue
            return [section.Name, section.Misc_VirtualSize,
                    abs(section.Misc_VirtualSize - section.SizeOfRawData),
                    section.VirtualAddress, section.SizeOfRawData,
                    section.PointerToRawData, section.Characteristics,
                    section.get_entropy()]
        except AttributeError:
            # Remember the problem but keep looking for a usable match.
            had_error = True
    return ["Error"] * 8 if had_error else ["None"] * 8


def _count_imports(pe):
    """Count imported functions; 0 when the import directory is missing."""
    try:
        pe.parse_data_directories()
        return sum(1 for entry in pe.DIRECTORY_ENTRY_IMPORT
                   for _ in entry.imports)
    except AttributeError:
        return 0


def _count_exports(pe):
    """Count exported symbols; 0 when the export directory is missing."""
    try:
        return sum(1 for _ in pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        return 0


def extract_pe_features(pe, filename):
    """Build one feature row for a parsed PE file.

    Args:
        pe: Parsed PE object exposing DOS_HEADER, NT_HEADERS, FILE_HEADER,
            OPTIONAL_HEADER, sections and parse_data_directories().
        filename: Name stored in the first cell of the row.

    Returns:
        A list of 69 values: the filename, header fields, the
        CompareNumberOfSections flag ("1"/"0"), eight cells for each of the
        .text/.data/.rsrc/.rdata/.reloc sections, and the import and export
        function counts.

    Raises:
        AttributeError: If one of the header structures or fields read here
            is missing on *pe*.
    """
    # add filename
    row = [filename]
    # add DOS_HEADER
    dos = pe.DOS_HEADER
    row.extend([dos.e_magic, dos.e_lfanew, dos.e_minalloc, dos.e_ovno])
    # add NT_HEADERS
    row.append(pe.NT_HEADERS.Signature)
    # add FILE_HEADER
    fh = pe.FILE_HEADER
    row.extend([fh.Machine, fh.NumberOfSections, fh.TimeDateStamp,
                fh.PointerToSymbolTable, fh.NumberOfSymbols,
                fh.SizeOfOptionalHeader, fh.Characteristics])
    # add OPTIONAL_HEADER
    oh = pe.OPTIONAL_HEADER
    row.extend([oh.Magic, oh.SizeOfCode, oh.AddressOfEntryPoint, oh.BaseOfCode,
                oh.ImageBase,
                oh.AddressOfEntryPoint + oh.ImageBase,  # EntryPoint
                oh.SectionAlignment, oh.FileAlignment, oh.SizeOfImage,
                oh.SizeOfHeaders, oh.CheckSum, oh.Subsystem,
                oh.NumberOfRvaAndSizes])

    # add CompareNumberOfSections: "1" when the header's section count
    # matches the number of parsed sections, "0" otherwise
    row.append("1" if fh.NumberOfSections == len(pe.sections) else "0")

    # add per-section features; each lookup contributes exactly eight cells
    for name in (b".text", b".data", b".rsrc", b".rdata", b".reloc"):
        row.extend(_section_features(pe, name))

    # add total_iat_number and total_eat_number
    row.append(_count_imports(pe))
    row.append(_count_exports(pe))

    return row

In [None]:
# Extract the PE features of every sample and write them to a CSV file,
# one row per file. newline='' lets the csv module control line endings.
with open('./pe_features.csv', 'w', newline='') as fp:
    wp = csv.writer(fp)
    wp.writerow(COLS)

    filepath = './sample_data'
    files = os.listdir(filepath)

    for filename in files:
        fullname = os.path.join(filepath, filename)
        try:
            pe = pefile.PE(fullname)
            feature = extract_pe_features(pe, filename)
        except Exception:
            # Best effort: report files that cannot be parsed and write an
            # all-zero placeholder row. Catching Exception (not a bare
            # except) keeps Ctrl-C / SystemExit working.
            print('[+] {}'.format(filename))
            # Copy so the shared NULL_ROW constant is never mutated.
            feature = list(NULL_ROW)
            feature[0] = filename
        wp.writerow(feature)

In [None]:
data = pd.read_csv('./pe_features.csv')
data

In [None]:
# Drop every column that holds a placeholder instead of a real value.
# Depending on the pandas version, read_csv may already have converted the
# literal 'None' placeholder to NaN, so missing values are checked as well
# as the 'None' / 'Error' strings.
for col in data.columns:
    values = data[col]
    if values.isna().any() or values.isin(['None', 'Error']).any():
        data = data.drop(columns=col)
        print(col)

In [None]:
data.to_csv('./pe_features.csv', index=False)
data

In [None]:
label = pd.read_csv('./sample_data_label.csv')
label

In [None]:
data = data.join(label.set_index('filename')['label'], on='filename')
data

In [None]:
targets = data['label']
targets

In [None]:
data = data.drop(columns=['filename', 'label'], axis=1)
data

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
tr_X, te_X, tr_y, te_y = train_test_split(data, targets, test_size=0.2, random_state=10)
tr_X.shape, tr_y.shape, te_X.shape, te_y.shape

In [None]:
# Fit a LightGBM classifier (no hyper-parameters passed) on the training split.
model = lgb.LGBMClassifier()
model.fit(tr_X, tr_y)

# Class predictions and per-class probabilities for the held-out split.
pred_y = model.predict(te_X)
score = model.predict_proba(te_X)
# Keep the second probability column — presumably the positive class
# (label 1); confirm that the labels are binary 0/1.
score = score[:, 1]

# Summarise the held-out predictions: classification report and ROC AUC.
report = classification_report(te_y.values, pred_y)
auc = roc_auc_score(te_y.values, score)

In [None]:
print(report)

In [None]:
print('AUC Score: {:.4f}'.format(auc))

## 2. API Call Sequence

### 2.1. Preliminaries

In [None]:
import os
import json
import pickle
import distance

In [None]:
filepath = './json'
files = os.listdir(filepath)
files

['fb969657f18a277bf9373c52f2fca0e7.json',
 '74077c3e366f9fb15a268babc8128dd8.json',
 '68577fea5de0f7b828b797caf5aecff2.json',
 '107440b4d3c67f405721cf37fd723212.json',
 '9d862552b3831b0afdbd3e40a1d4a80b.json',
 '8a1c314d78b585eda995501472fe53df.json',
 'c5dbb0aa83dde12652ca467f82657781.json',
 'a2ce41ae024f8f2d527e06a7e0d4a40c.json',
 '81e69505b593da32940babf5ff4ccfdf.json',
 'ae66adf98b16c3f17af1039f9f371251.json',
 'acf795736753a906f9e4f89d852fab4c.json',
 'c1cdd95aea7665d23cfe52da1e5a9c36.json',
 '122352308fad788779174f5320772a3a.json',
 '4b8bc391024dc6129207641be65301a2.json',
 '18071f53f109e19b71911c14bcc5d4a5.json',
 'f25f96953199aaa49f8f5864be4cf786.json',
 'c9f3e391a95695e8afe542431c8919d1.json',
 '9c6e9451cbceb20d12b8af6f8881a246.json',
 'e5d9165051d34d9d41b0c61d7e076d0f.json',
 'c3e2f29993392a1d8a4c90b90222364a.json',
 'f0b950d82215cec10337ba876aadf1f9.json',
 'e5b64f90213152a370f3f49987ed1049.json',
 '83d0852f2a964750617a6372867c5f3b.json',
 '95a72a8266393f7169a1e99a2ec4dfdc

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load api_dictionary.pkl when it comes from a trusted source.
with open('api_dictionary.pkl', 'rb') as fp:
    api_dictionary = pickle.load(fp)
# Display the loaded mapping (API name -> single-letter code, per the output below).
api_dictionary

{'__process__': 'A',
 '__anomaly__': 'A',
 '__exception__': 'A',
 '__missing__': 'A',
 'CertControlStore': 'B',
 'CertCreateCertificateContext': 'B',
 'CertOpenStore': 'B',
 'CertOpenSystemStoreA': 'B',
 'CertOpenSystemStoreW': 'B',
 'DecryptMessage': 'C',
 'EncryptMessage': 'C',
 'CryptAcquireContextA': 'C',
 'CryptAcquireContextW': 'C',
 'CryptCreateHash': 'C',
 'CryptDecrypt': 'C',
 'CryptEncrypt': 'C',
 'CryptExportKey': 'C',
 'CryptGenKey': 'C',
 'CryptHashData': 'C',
 'CryptDecodeMessage': 'C',
 'CryptDecodeObjectEx': 'C',
 'CryptDecryptMessage': 'C',
 'CryptEncryptMessage': 'C',
 'CryptHashMessage': 'C',
 'CryptProtectData': 'C',
 'CryptProtectMemory': 'C',
 'CryptUnprotectData': 'C',
 'CryptUnprotectMemory': 'C',
 'PRF': 'C',
 'Ssl3GenerateKeyMaterial': 'C',
 'SetUnhandledExceptionFilter': 'D',
 'RtlAddVectoredContinueHandler': 'D',
 'RtlAddVectoredExceptionHandler': 'D',
 'RtlDispatchException': 'D',
 'RtlRemoveVectoredContinueHandler': 'D',
 'RtlRemoveVectoredExceptionHandler

### 2.2. Sample Code

In [None]:
filename = files[0]
fullname = os.path.join(filepath, filename)
fullname

'./json/fb969657f18a277bf9373c52f2fca0e7.json'

In [None]:
with open(fullname) as json_file:
    json_data = json.load(json_file)
json_data

{'info': {'added': 1622733090.113817,
  'started': 1622736015.682853,
  'duration': 60,
  'ended': 1622736075.883966,
  'owner': None,
  'score': 2.4,
  'id': 69,
  'category': 'file',
  'git': {'head': '13cbe0d9e457be3673304533043e992ead1ea9b2',
   'fetch_head': '13cbe0d9e457be3673304533043e992ead1ea9b2'},
  'monitor': '2deb9ccd75d5a7a3fe05b2625b03a8639d6ee36b',
  'package': 'exe',
  'route': 'none',
  'custom': None,
  'machine': {'status': 'stopped',
   'name': 'cuckoo1',
   'label': 'Windows7',
   'manager': 'VirtualBox',
   'started_on': '2021-06-03 16:00:16',
   'shutdown_on': '2021-06-03 16:01:15'},
  'platform': 'windows',
  'version': '2.0.7',
  'options': 'procmemdump=yes,route=none'},
 'procmemory': [{'regions': [{'protect': 'rw',
     'end': '0x00020000',
     'addr': '0x00010000',
     'state': 4096,
     'offset': 24,
     'type': 262144,
     'size': 65536},
    {'protect': 'rw',
     'end': '0x00030000',
     'addr': '0x00020000',
     'state': 4096,
     'offset': 6558

In [None]:
for item in json_data:
    print(item)

info
procmemory
target
virustotal
network
signatures
static
behavior
debug
screenshots
strings
metadata


In [None]:
for item in json_data['behavior']:
    print(item)

generic
apistats
processes
processtree


In [None]:
# Flatten the API names of every recorded call, process by process, in order.
api_sequence_list = []
for call_sequence in json_data['behavior']['processes']:
    api_sequence_list.extend(call['api'] for call in call_sequence['calls'])

In [None]:
api_sequence_list

['NtAllocateVirtualMemory',
 'NtFreeVirtualMemory',
 'NtAllocateVirtualMemory',
 'NtAllocateVirtualMemory',
 'NtAllocateVirtualMemory',
 'GetFileType',
 'GetFileType',
 'GetFileType',
 'LdrGetDllHandle',
 'LdrGetProcedureAddress',
 'SetUnhandledExceptionFilter',
 'WriteConsoleA',
 'SetUnhandledExceptionFilter',
 'NtTerminateProcess',
 'NtTerminateProcess',
 'NtTerminateProcess']

In [None]:
api_strings = ''.join([api_dictionary[api] for api in api_sequence_list])
api_strings

'KKKKKEEEPPDGDKKK'

In [None]:
# NOTE: get_api_strings() is defined in section 2.3 further down; run that
# cell first, otherwise this cell fails with a NameError.
api_string1 = get_api_strings(files[5])
api_string2 = get_api_strings(files[6])

# Raw Levenshtein edit distance between the two encoded API sequences.
print(distance.levenshtein(api_string1, api_string2))

119


### 2.3. Practice

In [None]:
def leven_dist(target, base):
    """Return the normalised Levenshtein similarity of two sequences.

    Despite the name, the result is a similarity rather than a distance:
    the edit distance is divided by the length of the longer input and
    subtracted from 1.0, so identical inputs score 1.0.

    Args:
        target: Sequence (e.g. an encoded API string) to compare.
        base: Reference sequence to compare against.

    Returns:
        ``1.0 - distance / max(len(target), len(base))`` as a float.
        Two empty inputs are treated as identical and score 1.0.
    """
    max_len = max(len(base), len(target))
    if max_len == 0:
        # Both inputs are empty: identical, and dividing would raise
        # ZeroDivisionError.
        return 1.0
    dist = distance.levenshtein(target, base)
    return 1.0 - (dist / max_len)

In [None]:
def get_api_strings(filename, json_path='./json'):
    """Encode the API calls of one JSON report as a single string.

    Args:
        filename: Name of the report file inside *json_path*.
        json_path: Directory that contains the JSON reports.

    Returns:
        The concatenation of ``api_dictionary[api]`` for every call found
        under ``behavior -> processes -> calls``, in file order.

    Raises:
        KeyError: If an API name is not present in ``api_dictionary`` or the
            report lacks one of the expected keys.
    """
    with open(os.path.join(json_path, filename)) as report_file:
        report = json.load(report_file)

    # Collect every API name first, then translate, so the lookup happens
    # only after the whole report has been walked.
    api_names = [
        call['api']
        for process in report['behavior']['processes']
        for call in process['calls']
    ]
    return ''.join(api_dictionary[api] for api in api_names)

In [None]:
label = pd.read_csv('./sample_data_label.csv')
label

Unnamed: 0,filename,label
0,9d862552b3831b0afdbd3e40a1d4a80b.vir,0
1,49ecec06377ef6d2881aabc1d19a8920.vir,0
2,acf795736753a906f9e4f89d852fab4c.vir,1
3,3e2f89c6a7a29eab827c77396ffaf02c.vir,1
4,dbb4bbd127846527c18169a95e874fc4.vir,0
...,...,...
95,33b5fb836887814e400a873e0b66dd7d.vir,1
96,2d1c4d8672a6e242f9f7414e4fc1068f.vir,1
97,fb969657f18a277bf9373c52f2fca0e7.vir,0
98,4b16dbae76d9efd71905ba6a9664c6ae.vir,0


In [None]:
base_file = '70ecc519ea38fd46b83566776a8ffe74.json'
base_string = get_api_strings(base_file)
base_string, label['label'][label.filename==base_file.split('.')[0] + '.vir']

('OOLLPPLPLPLLLLLLPLPPELPLPLPELLLPLLLPELLPLLPPELLLPLLPPPLLPEEPPPEPLLLPPPPPPPPLLPPPOPPEEEKKPPKPEEKKPPKPPLLLLPLLLPEPLLLPEPEKKKPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPKPPKOPPPPPPPPPPEEPEPEPPEPPEPPLLLPLLLPLLLPLLLPPPPPPPPPLLLPLLLLPPPPPPPEOPPPKKPPPPPEEEEEEPEPPPPPPLLELLPPPELLLPLLLLLLLLLPLLPPPKPPPPPPPKKPPKPPPPPPPKKKKPPPKPPPPPPPPPPPKPPKKPPPPPKKPPPPPKKKKOPPPPDDKPPKKKKKKPLLLLLLLPLLLLLPLPKEEEEPPPPLPPPKPPPGEELPPPKKPPPKKKKKKKPKLLLLELLLPPLLLPLLLLLPLLLLLLPLLLLLLLPLLPPPKKPPPPPPPPPPPPPPPKKPPPPPPPPPPPPPPPKKPPPPPPPPPEEKKKKPPEEPEPEPEPPPEPPPPPPKPPPPPJPPPPPJGPPPKEELLPLPPLEKKKPPPPPKKKPPPPEEEKKPPPEEKKPPPPPPPPPPPPPPPPPLPPPKPPKPEPPPPPKKKKKKEPPPPPEPEKKPKPPPEPEKKPKKKPPPPEPLLELLEEKEKEKEEEEPLKKEEEEPLLLLLLEPPPEEEEEEKKPPQQQQELKKEEEEKKKKKKKKKKKKKKKPGKKKKKLLLLPLLLLLPLLLLLLPLLLLLLPLLLLLLPLLLLLLPLLLLLPPPKKPPPPPPPPELKKKKKKKKKKKKKKKKKGKKPPPPPPPPPPPPPPPPPKKPPKKKKPPPPPKEKKKKKKKGGEPPCPGCKKPKKELLELLEEEEPPPKKKKPPEKKKPKKKPPPPPPPPPPPPPPPPPPKPPPPPPPPPPPPPPPPKKJKKOOOOPPOKKKKPKKPPPKKKKKKKPPPPPPPPPPPPPPPPPPPKKPLLLPLL

In [None]:
# Score every report against the base sample and print the result.
for filename in files:
    # NOTE(review): fullname is computed but not used inside this loop.
    fullname = os.path.join(filepath, filename)
    api_string = get_api_strings(filename)
    # Reports are named *.json; the label table lists the samples as *.vir.
    name = filename.split('.')[0] + '.vir'
    dist = leven_dist(api_string, base_string)
    print('[+] {}: {:.4f}'.format(name, dist))

[+] fb969657f18a277bf9373c52f2fca0e7.vir: 0.0126
[+] 74077c3e366f9fb15a268babc8128dd8.vir: 0.0720
[+] 68577fea5de0f7b828b797caf5aecff2.vir: 0.0000
[+] 107440b4d3c67f405721cf37fd723212.vir: 0.0000
[+] 9d862552b3831b0afdbd3e40a1d4a80b.vir: 0.0076
[+] 8a1c314d78b585eda995501472fe53df.vir: 0.1491
[+] c5dbb0aa83dde12652ca467f82657781.vir: 0.0662
[+] a2ce41ae024f8f2d527e06a7e0d4a40c.vir: 0.1404
[+] 81e69505b593da32940babf5ff4ccfdf.vir: 0.0000
[+] ae66adf98b16c3f17af1039f9f371251.vir: 0.0000
[+] acf795736753a906f9e4f89d852fab4c.vir: 0.0000
[+] c1cdd95aea7665d23cfe52da1e5a9c36.vir: 0.3509
[+] 122352308fad788779174f5320772a3a.vir: 0.0229
[+] 4b8bc391024dc6129207641be65301a2.vir: 0.0996
[+] 18071f53f109e19b71911c14bcc5d4a5.vir: 0.0000
[+] f25f96953199aaa49f8f5864be4cf786.vir: 0.0210
[+] c9f3e391a95695e8afe542431c8919d1.vir: 0.0000
[+] 9c6e9451cbceb20d12b8af6f8881a246.vir: 0.2526
[+] e5d9165051d34d9d41b0c61d7e076d0f.vir: 0.0426
[+] c3e2f29993392a1d8a4c90b90222364a.vir: 0.0000
[+] f0b950d82215cec1

## 3. Strings & Byte Histogram

### 3.1. Preliminaries

In [None]:
import re
import os
import numpy as np

### 3.2. Sample Code

In [None]:
filepath = './sample_data'
files = os.listdir(filepath)

In [None]:
filename = files[0]
fullname = os.path.join(filepath, filename)
fullname

'./sample_data/ab415c42e9a35b2670025695ef4dbccc.vir'

In [None]:
with open(fullname, 'rb') as fp:
    byte_data = fp.read()
byte_data

b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\xff\xff\x00\x00\xb8\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xd8\x00\x00\x00\x0e\x1f\xba\x0e\x00\xb4\t\xcd!\xb8\x01L\xcd!This program cannot be run in DOS mode.\r\r\n$\x00\x00\x00\x00\x00\x00\x00\xf2\xfa\x92\xca\xb6\x9b\xfc\x99\xb6\x9b\xfc\x99\xb6\x9b\xfc\x99\xcd\x87\xf0\x99\xbc\x9b\xfc\x995\x87\xf2\x99\xae\x9b\xfc\x99^\x84\xf6\x99\xda\x9b\xfc\x99\xb6\x9b\xfd\x99h\x9b\xfc\x99\xd4\x84\xef\x99\xb9\x9b\xfc\x99^\x84\xf7\x99\xfc\x9b\xfc\x99\x0e\x9d\xfa\x99\xb7\x9b\xfc\x99Rich\xb6\x9b\xfc\x99\x00\x00\x00\x00\x00\x00\x00\x00PE\x00\x00L\x01\x03\x00\t\xc5\xc4<\x00\x00\x00\x00\x00\x00\x00\x00\xe0\x00\x0f\x01\x0b\x01\x06\x00\x00`\x01\x00\x00\x10\x00\x00\x00 \x02\x00`\x88\x03\x00\x000\x02\x00\x00\x90\x03\x00\x00\x00@\x00\x00\x10\x00\x00\x00\x02\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\xa0\x03\

#### String Features

In [None]:
all_strings = re.compile(b'[\x20-\x7f]{2,}')
string_list = all_strings.findall(byte_data)
string_list

[b'MZ',
 b'!This program cannot be run in DOS mode.',
 b'Rich',
 b'PE',
 b'UPX0',
 b'UPX1',
 b'.rsrc',
 b'1.24',
 b'UPX!',
 b'qU',
 b'ZX',
 b'XcB',
 b'D$',
 b'SUVWR',
 b'hp',
 b'F h|',
 b'P$',
 b'$@',
 b'Pl',
 b'LWX',
 b'm[k',
 b'2X',
 b'o2',
 b'l_^][',
 b'VPh',
 b'a\\(',
 b'd| ',
 b'd*',
 b'4|',
 b'_U',
 b'o\x7f',
 b'Qh',
 b'J~',
 b'Rh',
 b'@Jt',
 b'd9@',
 b'dd',
 b'N GH',
 b'l_',
 b'M8',
 b'Cae',
 b'P0',
 b'S$',
 b'd3_',
 b'm%&T',
 b' G',
 b'ko',
 b'lj',
 b"gH>k'",
 b'\\d',
 b'4fd',
 b'\x7fT',
 b'-2',
 b'WF',
 b"'Qh8",
 b'$Rh0@',
 b'j n$',
 b'ZI',
 b'VF',
 b'.&g',
 b')7',
 b'p1I /N',
 b'BF2*,\\',
 b'!C',
 b'Y0d',
 b'~W3',
 b' Q',
 b'$$x',
 b'WP',
 b'so',
 b'x+',
 b'57',
 b'WQ',
 b'TP',
 b"('",
 b'V$R',
 b'=q',
 b'ot',
 b'@F',
 b'd>',
 b'd3',
 b'TW',
 b'_6',
 b'Q8',
 b'$[',
 b'1iQ',
 b"'agh",
 b'54q',
 b"'W,",
 b'XB',
 b'L$',
 b'vX',
 b'TP',
 b'dE![',
 b'Jh',
 b'p&',
 b'|$ ',
 b'\x7f$',
 b'4)',
 b'H&',
 b'RP/',
 b'tp/',
 b'CBH',
 b'BPVw',
 b'gC',
 b'@$9:',
 b't2',
 b'Zr',
 b'%tp',
 b'

In [None]:
string_list = [x.decode('utf-8') for x in string_list]
string_list

['MZ',
 '!This program cannot be run in DOS mode.',
 'Rich',
 'PE',
 'UPX0',
 'UPX1',
 '.rsrc',
 '1.24',
 'UPX!',
 'qU',
 'ZX',
 'XcB',
 'D$',
 'SUVWR',
 'hp',
 'F h|',
 'P$',
 '$@',
 'Pl',
 'LWX',
 'm[k',
 '2X',
 'o2',
 'l_^][',
 'VPh',
 'a\\(',
 'd| ',
 'd*',
 '4|',
 '_U',
 'o\x7f',
 'Qh',
 'J~',
 'Rh',
 '@Jt',
 'd9@',
 'dd',
 'N GH',
 'l_',
 'M8',
 'Cae',
 'P0',
 'S$',
 'd3_',
 'm%&T',
 ' G',
 'ko',
 'lj',
 "gH>k'",
 '\\d',
 '4fd',
 '\x7fT',
 '-2',
 'WF',
 "'Qh8",
 '$Rh0@',
 'j n$',
 'ZI',
 'VF',
 '.&g',
 ')7',
 'p1I /N',
 'BF2*,\\',
 '!C',
 'Y0d',
 '~W3',
 ' Q',
 '$$x',
 'WP',
 'so',
 'x+',
 '57',
 'WQ',
 'TP',
 "('",
 'V$R',
 '=q',
 'ot',
 '@F',
 'd>',
 'd3',
 'TW',
 '_6',
 'Q8',
 '$[',
 '1iQ',
 "'agh",
 '54q',
 "'W,",
 'XB',
 'L$',
 'vX',
 'TP',
 'dE![',
 'Jh',
 'p&',
 '|$ ',
 '\x7f$',
 '4)',
 'H&',
 'RP/',
 'tp/',
 'CBH',
 'BPVw',
 'gC',
 '@$9:',
 't2',
 'Zr',
 '%tp',
 '&]',
 'eU',
 'ls',
 'EZ',
 'tKUa',
 'uDV',
 'VN',
 '4^t',
 ' 98t&',
 'm2',
 '0K',
 '/L$ U',
 'NW',
 't$,c',
 '

In [None]:
strings = ''.join(string_list)
strings

'MZ!This program cannot be run in DOS mode.RichPEUPX0UPX1.rsrc1.24UPX!qUZXXcBD$SUVWRhpF h|P$$@PlLWXm[k2Xo2l_^][VPha\\(d| d*4|_Uo\x7fQhJ~Rh@Jtd9@ddN GHl_M8CaeP0S$d3_m%&T GkoljgH>k\'\\d4fd\x7fT-2WF\'Qh8$Rh0@j n$ZIVF.&g)7p1I /NBF2*,\\!CY0d~W3 Q$$xWPsox+57WQTP(\'V$R=qot@Fd>d3TW_6Q8$[1iQ\'agh54q\'W,XBL$vXTPdE![Jhp&|$ \x7f$4)H&RP/tp/CBHBPVwgC@$9:t2Zr%tp&]eUlsEZtKUauDVVN4^t 98t&m20K/L$ UNWt$,cL$kwo#1W (PWShwg&pbVP*[EFWoFq-6apYgP_hwVCPFzUs]:VZPHfTOWPQVmg? RWu}#_HtBd!BYn+a~Q(aj:6&dk-[v,PtlKI&tfjkx[Yp:c:4P_;\x7fU0zuWf~|Q)<}+$ah<Wxk[[9d?3C>K[Sh[fLW Q2p3J[AVSRQHpSnMfr/{a,<fJX\'Wh&tstm8teS\x7fuv7Ni[3-p^\'v-twAO0MGe(fctP#XJ&>Uj%x,8v`.P;Bso6oEq2n[]r#@>,s}|jU&\'(7$iu4*h$3ad l?"tdnPn5hD(;Dke]abQ{eU#hDXC%*(!p<_HEkGVkO\x7f]6?Td@PdQe(,DYWz2RBV}ld!Itgj {3aepht4m=8Ha7\x7furG$d|Gs q<@(U?h9U4;B"{z]@Z)$@*0OcH[Dwv9hB2JP/^]al_^vCG;\'qp1RZC[`]dC(V\x7f!AGC&\x7f4mu8gDM@ll49u(t,u\\tX8HBl7G{,lo1~@;Vc\'^6>K\x7f,Y\\I"G[m*u(,|ZjIeSC;a(y rU\\N[W.`PJ`{GtE;x^j\'}<&C:T:s4"U9!<0^I<5E(lE,gWxQ-@kG\\s@|0t!Roh]rzH1RM$IHRx\x7fnXL

In [None]:
path = re.compile('c:\\\\', re.IGNORECASE)
url = re.compile('https?://', re.IGNORECASE)
reg = re.compile('HKEY_')
mz = re.compile('MZ')

In [None]:
path_list = path.findall(strings)
path_list

['c:\\']

In [None]:
url_list = url.findall(strings)
url_list

[]

In [None]:
reg_list = reg.findall(strings)
reg_list

[]

In [None]:
mz_list = mz.findall(strings)
mz_list

['MZ']

#### Byte Histogram Feature

In [None]:
min(byte_data), max(byte_data)

(0, 255)

In [None]:
h = np.bincount(np.frombuffer(byte_data, dtype=np.uint8), minlength=256)

In [None]:
ret = list(h.astype(np.float32).flatten() / h.sum())

In [None]:
sum(ret)

0.9999999999999988

In [None]:
ret

[0.5845064529625696,
 0.008602877281065945,
 0.003276367030349844,
 0.007354124424704709,
 0.006488365356763542,
 0.0020501432203662577,
 0.0033857938270412924,
 0.003392230697434907,
 0.00244279231437675,
 0.0032731485951530367,
 0.0016414019503717292,
 0.0016864600431270315,
 0.0023558945640629525,
 0.001680023172733417,
 0.0015738148112387758,
 0.0017797946638344437,
 0.0024009526568182548,
 0.001708989089504683,
 0.002761417398860674,
 0.0013742718290367223,
 0.0019922113868237265,
 0.0015384120240738953,
 0.0014965723665154001,
 0.0014482958385632905,
 0.0018634739789514338,
 0.0013259953010846127,
 0.001174728846834669,
 0.0013485243474622639,
 0.0017894499694248657,
 0.0012326606803772005,
 0.0011361076244729813,
 0.001268063467542081,
 0.002349457693669338,
 0.0013774902642335297,
 0.0013485243474622639,
 0.0013259953010846127,
 0.0016639309967493804,
 0.0011200154484889446,
 0.0011071417077017154,
 0.001123233883685752,
 0.0015319751536802806,
 0.0009204724662868913,
 0.001145

### 3.3. Practice

In [None]:
def get_string_features(fullname):
    """Count string-based indicators in the raw bytes of a file.

    Args:
        fullname: Path of the file to read.

    Returns:
        A list of five integers: the number of extracted strings, followed
        by the number of matches for ``c:\\`` (case-insensitive),
        ``http://`` / ``https://`` (case-insensitive), ``HKEY_`` and ``MZ``.
    """
    with open(fullname, 'rb') as fp:
        raw = fp.read()

    # Runs of at least two bytes in the range 0x20-0x7f, decoded to text.
    found = [m.decode('utf-8') for m in re.findall(b'[\x20-\x7f]{2,}', raw)]

    # The patterns are searched in the concatenation of all runs, so a match
    # may straddle two adjacent runs.
    blob = ''.join(found)

    patterns = (
        re.compile('c:\\\\', re.IGNORECASE),
        re.compile('https?://', re.IGNORECASE),
        re.compile('HKEY_'),
        re.compile('MZ'),
    )
    return [len(found)] + [len(p.findall(blob)) for p in patterns]

In [None]:
get_string_features(fullname)

[13015, 1, 0, 0, 1]

In [None]:
def get_histogram_features(fullname):
    """Compute the normalized byte-value histogram of a file.

    Args:
        fullname: Path of the file to analyse.

    Returns:
        A list of 256 values; entry ``i`` is the fraction of the file's bytes
        that are equal to ``i``. An empty file yields 256 zeros.
    """
    with open(fullname, 'rb') as fp:
        byte_data = fp.read()

    h = np.bincount(np.frombuffer(byte_data, dtype=np.uint8), minlength=256)
    total = h.sum()

    # An empty file has no bytes to count; dividing by zero would fill the
    # feature vector with NaN, so report an all-zero histogram instead.
    if total == 0:
        return [0.0] * len(h)

    return list(h.astype(np.float32).flatten() / total)

In [None]:
# Run the histogram extractor on the file at `fullname`.
get_histogram_features(fullname)

[0.5845064529625696,
 0.008602877281065945,
 0.003276367030349844,
 0.007354124424704709,
 0.006488365356763542,
 0.0020501432203662577,
 0.0033857938270412924,
 0.003392230697434907,
 0.00244279231437675,
 0.0032731485951530367,
 0.0016414019503717292,
 0.0016864600431270315,
 0.0023558945640629525,
 0.001680023172733417,
 0.0015738148112387758,
 0.0017797946638344437,
 0.0024009526568182548,
 0.001708989089504683,
 0.002761417398860674,
 0.0013742718290367223,
 0.0019922113868237265,
 0.0015384120240738953,
 0.0014965723665154001,
 0.0014482958385632905,
 0.0018634739789514338,
 0.0013259953010846127,
 0.001174728846834669,
 0.0013485243474622639,
 0.0017894499694248657,
 0.0012326606803772005,
 0.0011361076244729813,
 0.001268063467542081,
 0.002349457693669338,
 0.0013774902642335297,
 0.0013485243474622639,
 0.0013259953010846127,
 0.0016639309967493804,
 0.0011200154484889446,
 0.0011071417077017154,
 0.001123233883685752,
 0.0015319751536802806,
 0.0009204724662868913,
 0.001145

In [None]:
filepath = './sample_data'
files = os.listdir(filepath)

# Header row: file name, 256 byte-histogram columns, then the five string counts.
cols = ['filename']
cols.extend('byte_histogram_' + str(i) for i in range(256))
cols.extend(['string_total_cnt', 'string_path_cnt', 'string_url_cnt', 'string_reg_cnt', 'string_mz_cnt'])

# newline='' lets the csv module control line endings itself; without it,
# blank rows appear on platforms that translate '\n' on write.
with open('./str_hist_features.csv', 'w', newline='') as fp:
    wr = csv.writer(fp)
    wr.writerow(cols)

    # One CSV row per sample file, in the same column order as the header.
    for filename in files:
        fullname = os.path.join(filepath, filename)
        hist_features = get_histogram_features(fullname)
        str_features = get_string_features(fullname)
        wr.writerow([filename, *hist_features, *str_features])
        print('[+] {}: {}, {}'.format(filename, len(hist_features), str_features))
    

[+] ab415c42e9a35b2670025695ef4dbccc.vir: 256, [13015, 1, 0, 0, 1]
[+] 1fdca501805304b95ce12f82f2432b39.vir: 256, [47919, 0, 0, 0, 14]
[+] aaf6d41eb64696b1bd5d5f838bf57b36.vir: 256, [1924, 0, 0, 0, 3]
[+] 0b30c77aa3fdb808aede01ff94c051af.vir: 256, [17550, 0, 12, 0, 5]
[+] a2ce41ae024f8f2d527e06a7e0d4a40c.vir: 256, [4423, 0, 0, 0, 1]
[+] a9683b3da36bb3e459bf177dd03e1946.vir: 256, [39334, 0, 36, 3, 12]
[+] dbb4bbd127846527c18169a95e874fc4.vir: 256, [5553, 0, 0, 0, 2]
[+] 2f4b80f37f530363749678a26139e94b.vir: 256, [167057, 1, 1, 0, 53]
[+] d7de2bd651270262441727b750666523.vir: 256, [1301, 0, 0, 0, 1]
[+] 052479f42b180d8c740804445366d167.vir: 256, [70018, 0, 29, 0, 27]
[+] 34001238e08e41d24ad1aeb2e2be03af.vir: 256, [389, 0, 0, 0, 1]
[+] f778372694307ea6e38842b2fd86d00d.vir: 256, [23651, 0, 0, 8, 9]
[+] 68577fea5de0f7b828b797caf5aecff2.vir: 256, [1078, 0, 0, 0, 4]
[+] 8840639e630e82d9e64efeffeb5eede5.vir: 256, [47169, 0, 0, 0, 6]
[+] 7766a6d9867a29422b16e3e5c9aaae7b.vir: 256, [13467, 1, 0, 

In [None]:
# Load the string/byte-histogram feature CSV into a DataFrame and display it.
data = pd.read_csv('./str_hist_features.csv')
data

Unnamed: 0,filename,byte_histogram_0,byte_histogram_1,byte_histogram_2,byte_histogram_3,byte_histogram_4,byte_histogram_5,byte_histogram_6,byte_histogram_7,byte_histogram_8,...,byte_histogram_251,byte_histogram_252,byte_histogram_253,byte_histogram_254,byte_histogram_255,string_total_cnt,string_path_cnt,string_url_cnt,string_reg_cnt,string_mz_cnt
0,ab415c42e9a35b2670025695ef4dbccc.vir,0.584506,0.008603,0.003276,0.007354,0.006488,0.002050,0.003386,0.003392,0.002443,...,0.000853,0.000998,0.000682,0.002156,0.001361,13015,1,0,0,1
1,1fdca501805304b95ce12f82f2432b39.vir,0.012801,0.003879,0.003974,0.004082,0.004057,0.004029,0.003908,0.003913,0.003932,...,0.003952,0.003765,0.003831,0.003780,0.000512,47919,0,0,0,14
2,aaf6d41eb64696b1bd5d5f838bf57b36.vir,0.275815,0.003387,0.003326,0.002441,0.003601,0.002411,0.004089,0.003570,0.003601,...,0.002838,0.003173,0.002014,0.003723,0.004028,1924,0,0,0,3
3,0b30c77aa3fdb808aede01ff94c051af.vir,0.062913,0.005790,0.004786,0.006596,0.005107,0.003815,0.003803,0.003503,0.004301,...,0.003351,0.003343,0.003317,0.004465,0.027871,17550,0,12,0,5
4,a2ce41ae024f8f2d527e06a7e0d4a40c.vir,0.200887,0.033337,0.016778,0.015279,0.014160,0.011449,0.006886,0.003283,0.014345,...,0.000977,0.000530,0.000791,0.000278,0.001919,4423,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,25f6e7b5b54b0bb6c396400113efce96.vir,0.323570,0.002465,0.000336,0.000186,0.009339,0.000206,0.000231,0.000557,0.001290,...,0.000005,0.000432,0.000015,0.000020,0.000020,97,0,0,0,1
96,2d109301bbed90fb00600e543bdadb0a.vir,0.189198,0.012158,0.013024,0.007525,0.008783,0.007796,0.005112,0.005433,0.011358,...,0.001214,0.006666,0.002164,0.002976,0.046553,71482,5,0,0,38
97,0a7d4c082deff16b2224d1298d590bb1.vir,0.008960,0.006875,0.006680,0.006629,0.006703,0.006669,0.006649,0.006578,0.006807,...,0.002941,0.002958,0.002896,0.002974,0.003080,57538,1,0,0,11
98,81e69505b593da32940babf5ff4ccfdf.vir,0.435399,0.008522,0.002975,0.002653,0.003814,0.002245,0.001983,0.001645,0.003737,...,0.002838,0.005382,0.002193,0.002995,0.037963,9130,0,0,0,10


In [None]:
# Load the label table (the displayed output shows `filename` and `label` columns).
target = pd.read_csv('./sample_data_label.csv')
target

Unnamed: 0,filename,label
0,9d862552b3831b0afdbd3e40a1d4a80b.vir,0
1,49ecec06377ef6d2881aabc1d19a8920.vir,0
2,acf795736753a906f9e4f89d852fab4c.vir,1
3,3e2f89c6a7a29eab827c77396ffaf02c.vir,1
4,dbb4bbd127846527c18169a95e874fc4.vir,0
...,...,...
95,33b5fb836887814e400a873e0b66dd7d.vir,1
96,2d1c4d8672a6e242f9f7414e4fc1068f.vir,1
97,fb969657f18a277bf9373c52f2fca0e7.vir,0
98,4b16dbae76d9efd71905ba6a9664c6ae.vir,0


In [None]:
# Attach each sample's label by looking up its filename in the label table.
# DataFrame.join is a left join by default, so a filename missing from the
# label table would get NaN as its label.
data = data.join(target.set_index('filename')['label'], on='filename')
data

Unnamed: 0,filename,byte_histogram_0,byte_histogram_1,byte_histogram_2,byte_histogram_3,byte_histogram_4,byte_histogram_5,byte_histogram_6,byte_histogram_7,byte_histogram_8,...,byte_histogram_252,byte_histogram_253,byte_histogram_254,byte_histogram_255,string_total_cnt,string_path_cnt,string_url_cnt,string_reg_cnt,string_mz_cnt,label
0,ab415c42e9a35b2670025695ef4dbccc.vir,0.584506,0.008603,0.003276,0.007354,0.006488,0.002050,0.003386,0.003392,0.002443,...,0.000998,0.000682,0.002156,0.001361,13015,1,0,0,1,1
1,1fdca501805304b95ce12f82f2432b39.vir,0.012801,0.003879,0.003974,0.004082,0.004057,0.004029,0.003908,0.003913,0.003932,...,0.003765,0.003831,0.003780,0.000512,47919,0,0,0,14,1
2,aaf6d41eb64696b1bd5d5f838bf57b36.vir,0.275815,0.003387,0.003326,0.002441,0.003601,0.002411,0.004089,0.003570,0.003601,...,0.003173,0.002014,0.003723,0.004028,1924,0,0,0,3,1
3,0b30c77aa3fdb808aede01ff94c051af.vir,0.062913,0.005790,0.004786,0.006596,0.005107,0.003815,0.003803,0.003503,0.004301,...,0.003343,0.003317,0.004465,0.027871,17550,0,12,0,5,1
4,a2ce41ae024f8f2d527e06a7e0d4a40c.vir,0.200887,0.033337,0.016778,0.015279,0.014160,0.011449,0.006886,0.003283,0.014345,...,0.000530,0.000791,0.000278,0.001919,4423,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,25f6e7b5b54b0bb6c396400113efce96.vir,0.323570,0.002465,0.000336,0.000186,0.009339,0.000206,0.000231,0.000557,0.001290,...,0.000432,0.000015,0.000020,0.000020,97,0,0,0,1,0
96,2d109301bbed90fb00600e543bdadb0a.vir,0.189198,0.012158,0.013024,0.007525,0.008783,0.007796,0.005112,0.005433,0.011358,...,0.006666,0.002164,0.002976,0.046553,71482,5,0,0,38,0
97,0a7d4c082deff16b2224d1298d590bb1.vir,0.008960,0.006875,0.006680,0.006629,0.006703,0.006669,0.006649,0.006578,0.006807,...,0.002958,0.002896,0.002974,0.003080,57538,1,0,0,11,1
98,81e69505b593da32940babf5ff4ccfdf.vir,0.435399,0.008522,0.002975,0.002653,0.003814,0.002245,0.001983,0.001645,0.003737,...,0.005382,0.002193,0.002995,0.037963,9130,0,0,0,10,1


In [None]:
# Keep the label column as a separate Series before it is dropped from the features.
label = data['label']
label

0     1
1     1
2     1
3     1
4     0
     ..
95    0
96    0
97    1
98    1
99    0
Name: label, Length: 100, dtype: int64

In [None]:
# Remove the identifier and label columns, leaving only the feature columns.
data = data.drop(columns=['filename', 'label'], axis=1)
data

Unnamed: 0,byte_histogram_0,byte_histogram_1,byte_histogram_2,byte_histogram_3,byte_histogram_4,byte_histogram_5,byte_histogram_6,byte_histogram_7,byte_histogram_8,byte_histogram_9,...,byte_histogram_251,byte_histogram_252,byte_histogram_253,byte_histogram_254,byte_histogram_255,string_total_cnt,string_path_cnt,string_url_cnt,string_reg_cnt,string_mz_cnt
0,0.584506,0.008603,0.003276,0.007354,0.006488,0.002050,0.003386,0.003392,0.002443,0.003273,...,0.000853,0.000998,0.000682,0.002156,0.001361,13015,1,0,0,1
1,0.012801,0.003879,0.003974,0.004082,0.004057,0.004029,0.003908,0.003913,0.003932,0.003824,...,0.003952,0.003765,0.003831,0.003780,0.000512,47919,0,0,0,14
2,0.275815,0.003387,0.003326,0.002441,0.003601,0.002411,0.004089,0.003570,0.003601,0.002685,...,0.002838,0.003173,0.002014,0.003723,0.004028,1924,0,0,0,3
3,0.062913,0.005790,0.004786,0.006596,0.005107,0.003815,0.003803,0.003503,0.004301,0.003815,...,0.003351,0.003343,0.003317,0.004465,0.027871,17550,0,12,0,5
4,0.200887,0.033337,0.016778,0.015279,0.014160,0.011449,0.006886,0.003283,0.014345,0.006440,...,0.000977,0.000530,0.000791,0.000278,0.001919,4423,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.323570,0.002465,0.000336,0.000186,0.009339,0.000206,0.000231,0.000557,0.001290,0.000914,...,0.000005,0.000432,0.000015,0.000020,0.000020,97,0,0,0,1
96,0.189198,0.012158,0.013024,0.007525,0.008783,0.007796,0.005112,0.005433,0.011358,0.002569,...,0.001214,0.006666,0.002164,0.002976,0.046553,71482,5,0,0,38
97,0.008960,0.006875,0.006680,0.006629,0.006703,0.006669,0.006649,0.006578,0.006807,0.006668,...,0.002941,0.002958,0.002896,0.002974,0.003080,57538,1,0,0,11
98,0.435399,0.008522,0.002975,0.002653,0.003814,0.002245,0.001983,0.001645,0.003737,0.001508,...,0.002838,0.005382,0.002193,0.002995,0.037963,9130,0,0,0,10


In [None]:
# 80/20 train/test split with a fixed seed. Use `label`, the Series taken from
# this same DataFrame, so every feature row is paired with its own label.
tr_X, te_X, tr_y, te_y = train_test_split(data, label, test_size=0.2, random_state=10)
tr_X.shape, tr_y.shape, te_X.shape, te_y.shape

((80, 261), (80,), (20, 261), (20,))

In [None]:
# Fit a LightGBM classifier with default settings on the training split.
model = lgb.LGBMClassifier()
model.fit(tr_X, tr_y)

# The second column of predict_proba is the score passed to roc_auc_score;
# predict supplies the hard labels for the classification report.
score = model.predict_proba(te_X)[:, 1]
pred_y = model.predict(te_X)

auc = roc_auc_score(te_y.values, score)
report = classification_report(te_y.values, pred_y)

In [None]:
# Show the text report produced by classification_report for the test split.
print(report)

              precision    recall  f1-score   support

           0       0.64      0.70      0.67        10
           1       0.67      0.60      0.63        10

    accuracy                           0.65        20
   macro avg       0.65      0.65      0.65        20
weighted avg       0.65      0.65      0.65        20



In [None]:
# Show the ROC AUC computed on the test split.
print(auc)

0.8


#### Concatenate several features

In [None]:
# Load the feature table stored in pe_features.csv and display it.
pe_data = pd.read_csv('./pe_features.csv')
pe_data

NameError: ignored

In [None]:
# Load the string/byte-histogram feature table and display it.
str_hist_data = pd.read_csv('./str_hist_features.csv')
str_hist_data

NameError: ignored

In [None]:
# Every column of the string/histogram table except the `filename` join key.
val_cols = str_hist_data.columns.drop('filename')
val_cols

In [None]:
# Append the string/histogram columns to the PE feature rows, matched on filename.
data = pe_data.join(str_hist_data.set_index('filename')[val_cols], on='filename')
data

In [None]:
# Load the label table from sample_data_label.csv and display it.
target = pd.read_csv('./sample_data_label.csv')
target

In [None]:
# Attach each sample's label by looking up its filename in the label table.
data = data.join(target.set_index('filename')['label'], on='filename')
data

In [None]:
# Keep the label column as a separate Series before it is dropped from the features.
label = data['label']

In [None]:
# Remove the identifier and label columns, leaving only the feature columns.
data = data.drop(columns=['filename', 'label'], axis=1)
data

In [None]:
# 80/20 train/test split with a fixed seed, then show the shapes of the four parts.
tr_X, te_X, tr_y, te_y = train_test_split(data, label, test_size=0.2, random_state=10)
tr_X.shape, tr_y.shape, te_X.shape, te_y.shape

In [None]:
# Fit a LightGBM classifier with default settings on the training split.
model = lgb.LGBMClassifier()
model.fit(tr_X, tr_y)

# The second column of predict_proba is the score passed to roc_auc_score;
# predict supplies the hard labels for the classification report.
score = model.predict_proba(te_X)[:, 1]
pred_y = model.predict(te_X)

auc = roc_auc_score(te_y.values, score)
report = classification_report(te_y.values, pred_y)

In [None]:
# Show the text report produced by classification_report for the test split.
print(report)

In [None]:
# Show the ROC AUC computed on the test split.
print(auc)