diff --git a/03_predict.py b/03_predict.py
index 4662b9f..a58e69a 100755
--- a/03_predict.py
+++ b/03_predict.py
@@ -9,7 +9,6 @@
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import confusion_matrix, accuracy_score
 
 def main():
     parser = argparse.ArgumentParser()
@@ -21,8 +20,8 @@ def main():
 
     if not os.path.exists(args.modelpath):
         parser.error("ember model {} does not exist".format(args.modelpath))
-    if not os.path.exists(args.modelpath):
-        parser.error("ember model {} does not exist".format(args.output))
+    if not os.path.exists(args.output):
+        os.mkdir(args.output)
     if not os.path.exists(args.csv):
         parser.error("ember model {} does not exist".format(args.csv))
     if not os.path.exists(args.datadir):
@@ -55,7 +54,6 @@ def main():
             y_pred.append(0)
             errorcount += 1
 
-    #print and save accuracy
     y_pred_01 = np.array(y_pred)
     y_pred_01 = np.where(y_pred_01 > 0.75, 1, 0)
 
@@ -72,10 +70,14 @@ def main():
 
     #save csv
     raw_predict = pd.DataFrame({'hash': _name, 'y': y, 'ypred': y_pred_01})
-    raw_predict.to_csv(os.path.join(args.output, 'predict_with_label.csv'), index=False)
+    raw_predict.to_csv(os.path.join(args.output, 'predict_with_label.csv'), index=False, header=None)
 
     r = pd.DataFrame({'hash': _name, 'y_pred': y_pred_01})
-    r.to_csv(os.path.join(args.output, 'result.csv'), index=False)
+    r.to_csv(os.path.join(args.output, 'result.csv'), index=False, header=None)
+
+    #print errorcount
+    print("Error : %d" % (errorcount))
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
+    print("Done")
\ No newline at end of file
diff --git a/04_get_accuarcy.py b/04_get_accuarcy.py
index baf1858..e2e1868 100644
--- a/04_get_accuarcy.py
+++ b/04_get_accuarcy.py
@@ -5,15 +5,29 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument('-c', '--csv', type=str, required=True, help='csv file for getting accuracy')
+parser.add_argument('-t', '--threshold', type=float, default=0.75, help='threshold for predicting')
 args = parser.parse_args()
 
 def main():
-    data = pd.read_csv(args.csv)
-
+    data = pd.read_csv(args.csv, names=['hash', 'y', 'ypred'])
+
     y = data.y
-    ypred = np.where(np.array(data.ypred) > 0.75, 1, 0)
+    ypred = np.where(np.array(data.ypred) > args.threshold, 1, 0)
+
+    #get and print accuracy
     accuracy = accuracy_score(y, ypred)
-    print("accuracy : %.2f%%" % (np.round(accuracy, decimals=2)*100))
+    print("accuracy : %.0f%%" % (np.round(accuracy, decimals=2)*100))
+
+    #get and print matrix
+    mt = confusion_matrix(y, ypred)
+    t = mt[0][0]
+    mt[0][0] = mt[1][1]
+    mt[1][1] = t
+    print(mt)
+
+    #print FP, FN
+    print("False Positive : %.0f%%" % (round(mt[0][1]/(mt[0][1]+mt[1][1]), 2)*100))
+    print("False Negative : %.0f%%" % (round(mt[1][0]/(mt[0][0]+mt[1][0]), 2)*100))
 
 if __name__=='__main__':
     main()
\ No newline at end of file
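Note on the new metrics code in `04_get_accuarcy.py`: after the manual diagonal swap, the two ratios it prints correspond to the usual false positive rate FP/(FP+TN) and false negative rate FN/(FN+TP). The same numbers can be read straight off `sklearn.metrics.confusion_matrix` via `ravel()`, with no in-place swapping. A minimal cross-check sketch, assuming the headerless `hash,y,ypred` layout written by `03_predict.py` and 0/1 labels with 1 = malicious; the file path below is only an example:

```python
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

# headerless CSV in the column order that 03_predict.py writes (assumed path)
data = pd.read_csv('output/predict_with_label.csv', names=['hash', 'y', 'ypred'])

# ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp for labels [0, 1]
tn, fp, fn, tp = confusion_matrix(data.y, data.ypred, labels=[0, 1]).ravel()

print("accuracy       : %.2f%%" % (100 * accuracy_score(data.y, data.ypred)))
print("false positive : %.2f%%" % (100 * fp / (fp + tn)))  # FP / (FP + TN)
print("false negative : %.2f%%" % (100 * fn / (fn + tp)))  # FN / (FN + TP)
```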
diff --git a/README.ME b/README.ME
deleted file mode 100644
index 78afce6..0000000
--- a/README.ME
+++ /dev/null
@@ -1,32 +0,0 @@
-# Reference
-https://github.com/endgameinc/ember
-
-H. Anderson and P. Roth, "EMBER: An Open Dataset for Training Static PE Malware Machine Learning Models", in ArXiv e-prints. Apr. 2018.
-
-```
-@ARTICLE{2018arXiv180404637A,
-  author = {{Anderson}, H.~S. and {Roth}, P.},
-  title = "{EMBER: An Open Dataset for Training Static PE Malware Machine Learning Models}",
-  journal = {ArXiv e-prints},
-  archivePrefix = "arXiv",
-  eprint = {1804.04637},
-  primaryClass = "cs.CR",
-  keywords = {Computer Science - Cryptography and Security},
-  year = 2018,
-  month = apr,
-  adsurl = {http://adsabs.harvard.edu/abs/2018arXiv180404637A},
-}
-```
-
-# install
-above python 3.5
-```
-#install virtualenv
-$ virtualenv emberenv -p python3
-$ . ./emberenv/bin/activate
-```
-
-```
-(emberenv)$ pip instal -r requirements.txt
-(emberenv)$ pip install -U scikit-learn
-```
\ No newline at end of file
diff --git a/utils/upx_packer.py b/utils/upx_packer.py
new file mode 100755
index 0000000..9dd6278
--- /dev/null
+++ b/utils/upx_packer.py
@@ -0,0 +1,32 @@
+#require sudo apt install upx
+import os
+import argparse
+import subprocess
+import tqdm
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-d", "--datadir", required=True, help="directory for packing")
+parser.add_argument("-o", "--output", required=True, help="output directory")
+args = parser.parse_args()
+
+if not os.path.exists(args.datadir):
+    parser.error("data directory {} does not exist".format(args.datadir))
+if not os.path.exists(args.output):
+    os.makedirs(args.output)
+
+for _file in tqdm.tqdm(os.listdir(args.datadir)):
+    path = os.path.join(args.datadir, _file)
+    output = os.path.join(args.output, _file)
+
+    command = ['upx -l ' + path]
+    r = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+    NotPacker = r.communicate()[0].decode('utf-8')
+
+    if "NotPackedException" in NotPacker:
+        command = ['upx -o ' + output + ' ' + path]
+        subprocess.Popen(command, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+    else:
+        command = ['cp ' + path + ' ' + output]
+        subprocess.Popen(command, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+
+print("Done")
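One caveat on `utils/upx_packer.py`: `subprocess.Popen` returns immediately, so the script can reach `print("Done")` while some `upx`/`cp` jobs are still running. A blocking variant of the per-file step is sketched below, under the assumption that `upx` is on `PATH`, with `shutil.copy` standing in for `cp`; it waits for each command to finish before moving on.

```python
import shutil
import subprocess

def pack_or_copy(path, output):
    """Pack `path` with UPX if it is not packed yet; otherwise copy it unchanged."""
    # `upx -l` reports NotPackedException for files that are not UPX-packed
    listing = subprocess.run(['upx', '-l', path],
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    if b'NotPackedException' in listing.stdout:
        # subprocess.run blocks until upx has written the packed copy
        subprocess.run(['upx', '-o', output, path],
                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    else:
        shutil.copy(path, output)
```

Called inside the existing `tqdm` loop in place of the three `Popen` calls, this keeps the progress bar in step with the packing that has actually completed.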