Merge pull request #3 from choisungwook/master

add script

choisungwook committed Nov 6, 2018
2 parents cea14c2 + 1ddbbb6 commit add417c

Showing 15 changed files with 187 additions and 10,108 deletions.
117 changes: 117 additions & 0 deletions .gitignore
@@ -0,0 +1,117 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# vscode
.vscode
74 changes: 24 additions & 50 deletions 01_extract.py
@@ -7,71 +7,45 @@
 import jsonlines
 import pandas as pd

-path_label = ''
+parser = argparse.ArgumentParser()
+parser.add_argument("-d", "--dataset", help="Dataset path", required=True)
+parser.add_argument("-o", "--output", help="output path", required=True)
+parser.add_argument("-c", "--csv", help="dataset label", required=True)
+args = parser.parse_args()

-'''
-Get the label value
-'''
-def ExtractLabel(filename):
-    data = pd.read_csv(path_label)
-    filename = filename.split('/')[-1]
-    return data[data.hash == filename].values[0][1]
-
-def GetFileLists(path_datasets):
-    filenames = []
-    for filename in os.listdir(path_datasets):
-        if os.path.isfile(os.path.join(path_datasets, filename)):
-            _filename, extension = os.path.splitext(filename)
-            # if the file extension is .vir
-            if extension == '.vir':
-                filenames.append(filename)
-
-    return filenames
+if not os.path.exists(args.dataset):
+    parser.error("dataset {} does not exist".format(args.dataset))
+if not os.path.exists(args.csv):
+    parser.error("label csv {} does not exist".format(args.csv))
+if not os.path.exists(args.output):
+    os.mkdir(args.output)

-def ExtractFeatures(path_datasets, filesname, path_output):
+data = pd.read_csv(args.csv, names=['hash', 'y'])
+
+def ExtractLabel(filename):
+    return data[data.hash == filename].values[0][1]
+
+def main():
     ErrorCount = 0
     extractor = PEFeatureExtractor()

-    with jsonlines.open(os.path.join(path_output, "features.jsonl"), 'w') as f:
-        for i in tqdm.tqdm(range(len(filesname))):
-            _filename, extension = os.path.splitext(filesname[i])
-            _file = os.path.join(path_datasets, filesname[i])
-            binary = open(_file, 'rb').read()
+    with jsonlines.open(os.path.join(args.output, "features.jsonl"), 'w') as f:
+        for _file in tqdm.tqdm(os.listdir(args.dataset)):
+            path = os.path.join(args.dataset, _file)
+            binary = open(path, 'rb').read()

             try:
                 feature = extractor.raw_features(binary)
-                feature.update({"sha256": _filename})  # hash
-                feature.update({"label": ExtractLabel(filesname[i])})  # label
+                feature.update({"sha256": _file})  # hash
+                feature.update({"label": ExtractLabel(_file)})  # label
                 f.write(feature)
             except KeyboardInterrupt:
                 sys.exit()
             except:
                 ErrorCount += 1

     print("Error : %d" % (ErrorCount))

-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-d", "--dirname", help="Directory name including Datasets", required=True)
-    parser.add_argument("-o", "--output", help="output Directory", required=True)
-    parser.add_argument("-c", "--csv", help="filename including label", required=True)
-    args = parser.parse_args()
-
-    if not os.path.exists(args.dirname):
-        parser.error("ember model {} does not exist".format(args.dirname))
-    if not os.path.exists(args.output):
-        parser.error("ember model {} does not exist".format(args.output))
-    if not os.path.exists(args.csv):
-        parser.error("ember model {} does not exist".format(args.csv))
-
-    path_datasets = args.dirname
-    path_output = args.output
-    global path_label
-    path_label = args.csv
-
-    filenames = GetFileLists(path_datasets)
-    ExtractFeatures(path_datasets, filenames, path_output)
-
 if __name__ == '__main__':
     main()
-    print("\n================ DONE ==================\n")
+    print("Done")
15 changes: 6 additions & 9 deletions 02_learn.py → 02_train.py
@@ -1,12 +1,11 @@
 # -*- coding:utf-8 -*-
 import argparse
 import os
-from ember import features
 import ember
 import sys
 import subprocess

-def clear_data(data_dir):
+def clear(data_dir):
     path_X = os.path.join(data_dir, "X.dat")
     path_y = os.path.join(data_dir, "y.dat")

@@ -22,26 +21,24 @@ def main():

     if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
         parser.error("{} is not a directory".format(args.datadir))
-        sys.exit()

     parameter_popen = ['wc', '-l', os.path.join(args.datadir, 'features.jsonl')]
     result = subprocess.Popen(parameter_popen, stdout=subprocess.PIPE, universal_newlines=True).communicate()[0]
     rows = int(result.split(' ')[0])

     # transform or reduce the dimensionality of the training data
-    clear_data(args.datadir)
+    clear(args.datadir)
     ember.create_vectorized_features(args.datadir, rows)

-    # train
+    # Train and save model
     print("Training LightGBM model")
     lgbm_model = ember.train_model(args.datadir, rows)
-    lgbm_model.save_model(os.path.join(args.datadir, "model.txt"))  # save the trained model
+    lgbm_model.save_model(os.path.join(args.datadir, "model.txt"))

     # cross validation
     # print("Training LightGBM model with cross validation")
     # lgbm_model = ember.cross_validation(args.datadir, rows)
     # lgbm_model.save_model(os.path.join(args.datadir, "model.txt"))  # save model

 if __name__ == '__main__':
     main()
-    print("\n================ DONE ==================\n")
+    print("Done")
20 changes: 5 additions & 15 deletions 03_predict.py
@@ -19,19 +19,19 @@ def main():
     args = parser.parse_args()

     if not os.path.exists(args.modelpath):
         parser.error("ember model {} does not exist".format(args.modelpath))
-    if not os.path.exists(args.output):
-        os.mkdir(args.output)
     if not os.path.exists(args.csv):
         parser.error("label csv {} does not exist".format(args.csv))
     if not os.path.exists(args.datadir):
         parser.error("data directory {} does not exist".format(args.datadir))
+    if not os.path.exists(args.output):
+        os.mkdir(args.output)

     model_path = os.path.join(args.modelpath, "model.txt")
     lgbm_model = lgb.Booster(model_file=model_path)

     # read answer sheet
-    data = pd.read_csv(args.csv)
+    data = pd.read_csv(args.csv, names=['hash', 'y'])

     errorcount = 0
     y_pred = []
@@ -57,16 +57,6 @@ def main():
     # print and save accuracy
     y_pred_01 = np.array(y_pred)
     y_pred_01 = np.where(y_pred_01 > 0.75, 1, 0)
-    # acc_lgbm = accuracy_score(y, y_pred_01)
-    # print("accuracy : ", acc_lgbm)
-    # with open(os.path.join(args.output, 'accuarcy.txt'), 'w') as f:
-    #     acc_lgbm.tofile(f, format='%s', sep='str')
-
-    # print and save matrix
-    # mt = confusion_matrix(y, y_pred_01)
-    # print(mt)
-    # print("Error : %d" % (errorcount))
-    # np.savetxt(os.path.join(args.output, 'matrix.txt'), mt)

     # save csv
     raw_predict = pd.DataFrame({'hash': _name, 'y': y, 'ypred': y_pred_01})
@@ -75,7 +65,7 @@ def main():
     r = pd.DataFrame({'hash': _name, 'y_pred': y_pred_01})
     r.to_csv(os.path.join(args.output, 'result.csv'), index=False, header=None)

-    # print errorcount
+    # print error count
     print("Error : %d" % (errorcount))

 if __name__ == "__main__":
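The per-file scoring loop sits in the collapsed part of this diff. As a rough sketch of that stage — assuming EMBER's `ember.predict_sample(model, file_data)` helper and hypothetical `model_dir`/`datasets` paths, not a verbatim copy of the hidden lines:

```python
import os
import ember
import lightgbm as lgb

# Hypothetical paths; adjust to the model/data arguments of 03_predict.py.
lgbm_model = lgb.Booster(model_file=os.path.join("model_dir", "model.txt"))

scores = {}
for name in os.listdir("datasets"):
    with open(os.path.join("datasets", name), 'rb') as f:
        binary = f.read()
    # predict_sample extracts features from the raw PE bytes and
    # returns a malware score in [0, 1].
    scores[name] = ember.predict_sample(lgbm_model, binary)
```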
1 change: 0 additions & 1 deletion 03_predict.sh

This file was deleted.

10 changes: 9 additions & 1 deletion 04_get_accuarcy.py
100644 → 100755
@@ -2,10 +2,12 @@
 import pandas as pd
 import numpy as np
 import argparse
+import os

 parser = argparse.ArgumentParser()
 parser.add_argument('-c', '--csv', type=str, required=True, help='csv file for getting accuracy')
 parser.add_argument('-t', '--threshold', type=float, default=0.75, help='threshold for predicting')
+parser.add_argument('-o', '--output', default=None, help="output directory to save results [optional]")
 args = parser.parse_args()

 def main():
@@ -17,7 +19,7 @@ def main():

     # get and print accuracy
     accuracy = accuracy_score(y, ypred)
     print("accuracy : %.0f%%" % (np.round(accuracy, decimals=2) * 100))

     # get and print matrix
     mt = confusion_matrix(y, ypred)
     t = mt[0][0]
@@ -29,5 +31,11 @@ def main():
     print("False Positive : %.0f%%" % (round(mt[0][1] / (mt[0][1] + mt[1][1]), 2) * 100))
     print("False Negative : %.0f%%" % (round(mt[1][0] / (mt[0][0] + mt[1][0]), 2) * 100))

+    # save accuracy and confusion matrix [optional]
+    if args.output:
+        with open(os.path.join(args.output, 'accuarcy.txt'), 'w') as f:
+            accuracy.tofile(f, format='%s', sep='str')
+        np.savetxt(os.path.join(args.output, 'matrix.txt'), mt)

 if __name__ == '__main__':
     main()
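For context on those percentages: scikit-learn lays out `confusion_matrix(y, ypred)` as `[[TN, FP], [FN, TP]]`, so `mt[0][1]/(mt[0][1]+mt[1][1])` is FP/(FP+TP) and `mt[1][0]/(mt[0][0]+mt[1][0])` is FN/(TN+FN). A small self-contained check with hypothetical labels:

```python
from sklearn.metrics import confusion_matrix

y     = [0, 0, 1, 1, 1, 0]  # ground truth (hypothetical)
ypred = [0, 1, 1, 1, 0, 0]  # thresholded predictions (hypothetical)

mt = confusion_matrix(y, ypred)   # [[TN, FP], [FN, TP]] -> [[2, 1], [1, 2]]
tn, fp, fn, tp = mt.ravel()
print("FP / (FP + TP):", fp / (fp + tp))  # share of positives that are false
print("FN / (TN + FN):", fn / (tn + fn))  # share of negatives that are false
```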
20 changes: 17 additions & 3 deletions README.md
@@ -18,16 +18,30 @@
 }
 ```

-# install
+# Install
 Requires Python 3.5 or higher.
 ```
-; install virtualenv
+;install virtualenv
 $ virtualenv emberenv -p python3
 $ . ./emberenv/bin/activate
 ```

 ```
-; install python modules
+;install python modules
 (emberenv)$ pip install -r requirements.txt
 (emberenv)$ pip install -U scikit-learn
 ```

+# Input file structure (label CSV)
+The CSV has no header row (columns: hash, label).
+![traindata_label](screenshot/traindata_label.png)
+
+# Summary of run
+```
+script/01_extract.sh -> script/02_learn.sh -> script/03_predict.sh -> script/04_get_accuracy.sh
+```
+or
+
+```
+script/run.sh
+```
