# 개요

Random Forest를 사용하여 전통적인 머신러닝 학습방법에서 재학습에 걸리는 시간과 성능을 체크한다.

* 특징정보 : api
* 수집한 apk : 2014~2020

In [1]:
from tqdm import tqdm
import numpy as np
import os
import pandas as pd
from itertools import product
import time
import warnings
import joblib
import shutil


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

In [2]:
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation/convergence warnings from pandas
# and scikit-learn -- consider narrowing the filter by category.
warnings.filterwarnings(action='ignore')

# 전처리

In [3]:
# Build per-year train/test splits from the benign and malware API-feature CSVs.
# For each year and each class, the first TEST_ROWS_PER_CLASS rows become the
# test split and all remaining rows become the training split.
TEST_ROWS_PER_CLASS = 1000
DATA_DIR = './csv_5000/csv_5000'

train_X_dict, train_y_dict, test_X_dict, test_y_dict = dict(), dict(), dict(), dict()
for year in range(2014, 2021):
    df1 = pd.read_csv(f'{DATA_DIR}/{year}_benign_api.csv', index_col=0)
    df2 = pd.read_csv(f'{DATA_DIR}/{year}_malware_api.csv', index_col=0)
    # The two metadata columns are added last so they can be separated from
    # the feature columns positionally (last two columns = 'year', 'b/m').
    df1['year'] = year
    df1['b/m'] = 0  # rows loaded from the benign file
    df2['year'] = year
    df2['b/m'] = 1  # rows loaded from the malware file

    train_X_dict[year] = pd.concat([
        df1.iloc[TEST_ROWS_PER_CLASS:, :-2],
        df2.iloc[TEST_ROWS_PER_CLASS:, :-2],
    ])
    train_y_dict[year] = pd.concat([
        df1.iloc[TEST_ROWS_PER_CLASS:, -2:],
        df2.iloc[TEST_ROWS_PER_CLASS:, -2:],
    ])
    test_X_dict[year] = pd.concat([
        df1.iloc[:TEST_ROWS_PER_CLASS, :-2],
        df2.iloc[:TEST_ROWS_PER_CLASS, :-2],
    ])
    test_y_dict[year] = pd.concat([
        df1.iloc[:TEST_ROWS_PER_CLASS, -2:],
        df2.iloc[:TEST_ROWS_PER_CLASS, -2:],
    ])

In [4]:
def make_model(df_x, df_y):
    """Fit a RandomForestClassifier and measure how long the fit takes.

    Args:
        df_x: Training features, passed unchanged to ``fit``.
        df_y: Training labels, passed unchanged to ``fit``.

    Returns:
        A ``(model, elapsed_ms)`` tuple: the fitted classifier and the
        duration of the ``fit`` call in milliseconds.
    """
    model = RandomForestClassifier(random_state=42)
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be skewed by system clock adjustments as with time.time().
    start = time.perf_counter()
    model.fit(df_x, df_y)
    elapsed_ms = (time.perf_counter() - start) * 1000
    return model, elapsed_ms

In [5]:
def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes.

    Returns:
        ``"0B"`` for zero, otherwise ``"<value> <unit>"`` where the value is
        rounded to two decimals, e.g. ``"1.5 KB"``. Values beyond the largest
        unit are expressed in that unit (``"1024.0 YB"``).

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative")
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Step through the units by division instead of taking a floating-point
    # logarithm: the unit index can then neither run past the table (very
    # large sizes) nor become negative (sizes between 0 and 1).
    value = float(size_bytes)
    unit = 0
    while value >= 1024 and unit < len(size_name) - 1:
        value /= 1024
        unit += 1
    return "%s %s" % (round(value, 2), size_name[unit])

In [6]:
def AUT(list_):
    """Return the mean of the midpoints of consecutive values in ``list_``.

    Each neighbouring pair contributes ``(a + b) / 2``; the sum of those
    midpoints is divided by the number of intervals, ``len(list_) - 1``.
    """
    interval_count = len(list_) - 1
    midpoints = [(left + right) / 2 for left, right in zip(list_, list_[1:])]
    return sum(midpoints) / interval_count

# 일반 실험

In [7]:
# Baseline: train on one year's training split and score the model on the
# test split of that same year.
for year in range(2014, 2021):
    features, labels = train_X_dict[year], train_y_dict[year]['b/m']
    model, t_time = make_model(features, labels)
    holdout_acc = model.score(test_X_dict[year], test_y_dict[year]['b/m'])
    print(year, t_time, holdout_acc)

# Sustainability 실험

In [8]:
# Result accumulators for the sustainability experiment:
#   time_list / size_list -> (year, value) tuples, one per retraining step
#   score_dict / f1_dict  -> training end-year -> list of per-test-year metrics
time_list, size_list = [], []
score_dict, f1_dict = {}, {}

In [9]:
dir_path = './ex1_model'

# Start every run with an empty model directory: drop whatever a previous run
# left behind, then (re)create the directory.
if os.path.exists(dir_path):
    shutil.rmtree(dir_path)
os.makedirs(dir_path, exist_ok=True)

## 14년도

In [10]:
# Train on the 2014 training data only, then record the fit time and the size
# of the pickled model.
year = 14
train_years = range(2014, 2000 + year + 1)

df = pd.concat([train_X_dict[y] for y in train_years])
label = pd.concat([train_y_dict[y]['b/m'] for y in train_years])

# The model is bound to an explicit name rather than through exec() on a
# generated statement.
model14, train_time = make_model(df, label)
time_list.append((2000 + year, train_time))

model_path = f'./ex1_model/model{year}.pkl'
joblib.dump(model14, model_path)
f_size = convert_size(os.path.getsize(model_path))
size_list.append((2000 + year, f_size))

print(train_time)
print(f_size)

In [11]:
# Score the model selected by `year` (model{year}) on every yearly test split.
# Each split is predicted once and the predictions feed both metrics; for a
# classifier, accuracy_score on the predictions is what model.score() reports.
trained_model = globals()[f'model{year}']  # namespace lookup instead of eval()
test_years = range(2014, 2021)
predictions = [trained_model.predict(test_X_dict[i]) for i in test_years]
score_list = [accuracy_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
f1_list = [f1_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
score_dict[year + 2000] = score_list
f1_dict[year + 2000] = f1_list

## 15년도

In [12]:
# Retrain from scratch on the cumulative 2014-2015 training data, then record
# the fit time and the size of the pickled model.
year = 15
train_years = range(2014, 2000 + year + 1)

df = pd.concat([train_X_dict[y] for y in train_years])
label = pd.concat([train_y_dict[y]['b/m'] for y in train_years])

# The model is bound to an explicit name rather than through exec() on a
# generated statement.
model15, train_time = make_model(df, label)
time_list.append((2000 + year, train_time))

model_path = f'./ex1_model/model{year}.pkl'
joblib.dump(model15, model_path)
f_size = convert_size(os.path.getsize(model_path))
size_list.append((2000 + year, f_size))

print(train_time)
print(f_size)

In [13]:
# Score the model selected by `year` (model{year}) on every yearly test split.
# Each split is predicted once and the predictions feed both metrics; for a
# classifier, accuracy_score on the predictions is what model.score() reports.
trained_model = globals()[f'model{year}']  # namespace lookup instead of eval()
test_years = range(2014, 2021)
predictions = [trained_model.predict(test_X_dict[i]) for i in test_years]
score_list = [accuracy_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
f1_list = [f1_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
score_dict[year + 2000] = score_list
f1_dict[year + 2000] = f1_list

## 16년도

In [14]:
# Retrain from scratch on the cumulative 2014-2016 training data, then record
# the fit time and the size of the pickled model.
year = 16
train_years = range(2014, 2000 + year + 1)

df = pd.concat([train_X_dict[y] for y in train_years])
label = pd.concat([train_y_dict[y]['b/m'] for y in train_years])

# The model is bound to an explicit name rather than through exec() on a
# generated statement.
model16, train_time = make_model(df, label)
time_list.append((2000 + year, train_time))

model_path = f'./ex1_model/model{year}.pkl'
joblib.dump(model16, model_path)
f_size = convert_size(os.path.getsize(model_path))
size_list.append((2000 + year, f_size))

print(train_time)
print(f_size)

In [15]:
# Score the model selected by `year` (model{year}) on every yearly test split.
# Each split is predicted once and the predictions feed both metrics; for a
# classifier, accuracy_score on the predictions is what model.score() reports.
trained_model = globals()[f'model{year}']  # namespace lookup instead of eval()
test_years = range(2014, 2021)
predictions = [trained_model.predict(test_X_dict[i]) for i in test_years]
score_list = [accuracy_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
f1_list = [f1_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
score_dict[year + 2000] = score_list
f1_dict[year + 2000] = f1_list

## 17년도

In [16]:
# Retrain from scratch on the cumulative 2014-2017 training data, then record
# the fit time and the size of the pickled model.
year = 17
train_years = range(2014, 2000 + year + 1)

df = pd.concat([train_X_dict[y] for y in train_years])
label = pd.concat([train_y_dict[y]['b/m'] for y in train_years])

# The model is bound to an explicit name rather than through exec() on a
# generated statement.
model17, train_time = make_model(df, label)
time_list.append((2000 + year, train_time))

model_path = f'./ex1_model/model{year}.pkl'
joblib.dump(model17, model_path)
f_size = convert_size(os.path.getsize(model_path))
size_list.append((2000 + year, f_size))

print(train_time)
print(f_size)

In [17]:
# Score the model selected by `year` (model{year}) on every yearly test split.
# Each split is predicted once and the predictions feed both metrics; for a
# classifier, accuracy_score on the predictions is what model.score() reports.
trained_model = globals()[f'model{year}']  # namespace lookup instead of eval()
test_years = range(2014, 2021)
predictions = [trained_model.predict(test_X_dict[i]) for i in test_years]
score_list = [accuracy_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
f1_list = [f1_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
score_dict[year + 2000] = score_list
f1_dict[year + 2000] = f1_list

## 18년도

In [18]:
# Retrain from scratch on the cumulative 2014-2018 training data, then record
# the fit time and the size of the pickled model.
year = 18
train_years = range(2014, 2000 + year + 1)

df = pd.concat([train_X_dict[y] for y in train_years])
label = pd.concat([train_y_dict[y]['b/m'] for y in train_years])

# The model is bound to an explicit name rather than through exec() on a
# generated statement.
model18, train_time = make_model(df, label)
time_list.append((2000 + year, train_time))

model_path = f'./ex1_model/model{year}.pkl'
joblib.dump(model18, model_path)
f_size = convert_size(os.path.getsize(model_path))
size_list.append((2000 + year, f_size))

print(train_time)
print(f_size)

In [19]:
# Score the model selected by `year` (model{year}) on every yearly test split.
# Each split is predicted once and the predictions feed both metrics; for a
# classifier, accuracy_score on the predictions is what model.score() reports.
trained_model = globals()[f'model{year}']  # namespace lookup instead of eval()
test_years = range(2014, 2021)
predictions = [trained_model.predict(test_X_dict[i]) for i in test_years]
score_list = [accuracy_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
f1_list = [f1_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
score_dict[year + 2000] = score_list
f1_dict[year + 2000] = f1_list

## 19년도

In [20]:
# Retrain from scratch on the cumulative 2014-2019 training data, then record
# the fit time and the size of the pickled model.
year = 19
train_years = range(2014, 2000 + year + 1)

df = pd.concat([train_X_dict[y] for y in train_years])
label = pd.concat([train_y_dict[y]['b/m'] for y in train_years])

# The model is bound to an explicit name rather than through exec() on a
# generated statement.
model19, train_time = make_model(df, label)
time_list.append((2000 + year, train_time))

model_path = f'./ex1_model/model{year}.pkl'
joblib.dump(model19, model_path)
f_size = convert_size(os.path.getsize(model_path))
size_list.append((2000 + year, f_size))

print(train_time)
print(f_size)

In [21]:
# Score the model selected by `year` (model{year}) on every yearly test split.
# Each split is predicted once and the predictions feed both metrics; for a
# classifier, accuracy_score on the predictions is what model.score() reports.
trained_model = globals()[f'model{year}']  # namespace lookup instead of eval()
test_years = range(2014, 2021)
predictions = [trained_model.predict(test_X_dict[i]) for i in test_years]
score_list = [accuracy_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
f1_list = [f1_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
score_dict[year + 2000] = score_list
f1_dict[year + 2000] = f1_list

## 20년도

In [22]:
# Retrain from scratch on the cumulative 2014-2020 training data, then record
# the fit time and the size of the pickled model.
year = 20
train_years = range(2014, 2000 + year + 1)

df = pd.concat([train_X_dict[y] for y in train_years])
label = pd.concat([train_y_dict[y]['b/m'] for y in train_years])

# The model is bound to an explicit name rather than through exec() on a
# generated statement.
model20, train_time = make_model(df, label)
time_list.append((2000 + year, train_time))

model_path = f'./ex1_model/model{year}.pkl'
joblib.dump(model20, model_path)
f_size = convert_size(os.path.getsize(model_path))
size_list.append((2000 + year, f_size))

print(train_time)
print(f_size)

In [23]:
# Score the model selected by `year` (model{year}) on every yearly test split.
# Each split is predicted once and the predictions feed both metrics; for a
# classifier, accuracy_score on the predictions is what model.score() reports.
trained_model = globals()[f'model{year}']  # namespace lookup instead of eval()
test_years = range(2014, 2021)
predictions = [trained_model.predict(test_X_dict[i]) for i in test_years]
score_list = [accuracy_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
f1_list = [f1_score(test_y_dict[i]['b/m'], pred) for i, pred in zip(test_years, predictions)]
score_dict[year + 2000] = score_list
f1_dict[year + 2000] = f1_list

# 결과

In [24]:
# Make sure the output directory for the saved figures exists.
os.makedirs('./visual', exist_ok=True)

In [25]:
# One grouped bar chart per training end-year: accuracy and F1 of that model
# on every test year, with a dashed vertical line drawn half a slot to the
# right of the last year included in training.
years = range(2014, 2021)
x = np.arange(len(years))

for i in years:
    title = 'ex1_train 2014' if i == 2014 else f'ex1_train 2014~{i}'

    plt.bar(x - 0.2, score_dict[i], width=0.4, color='cornflowerblue', label="acc")
    plt.bar(x + 0.2, f1_dict[i], width=0.4, color='sandybrown', label="f1")
    plt.axvline(i - 2013.5, 0, 1, color='lightgray', linestyle='--', linewidth=2)
    plt.title(title)
    plt.xticks(x, years)
    plt.ylim(0.5, 1)
    plt.legend()
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig(f"./visual/ex1_{i}_acc_f1.png")
    plt.show()

In [26]:
# Summary table: one row per retraining step with the fit time, pickled model
# size, and the AUT of accuracy and F1 across all test years.
eval_years = range(2014, 2021)
timing = pd.DataFrame(time_list)       # columns: year, train_time
sizes = pd.DataFrame(size_list)[1]     # keep only the size column
acc_aut = pd.DataFrame([AUT(score_dict[i]) for i in eval_years])
f1_aut = pd.DataFrame([AUT(f1_dict[i]) for i in eval_years])

df = pd.concat([timing, sizes, acc_aut, f1_aut], axis=1)
df.columns = ['year', 'train_time', 'model_file_size', 'accuracy', 'f1 score']
df

In [27]:
# Write the summary table to a CSV file in the working directory.
result_csv = './ex1_result.csv'
df.to_csv(result_csv)

In [28]:
# Sanity check: all four accumulators should hold one entry per training year.
print(*map(len, (time_list, size_list, score_dict, f1_dict)))

In [33]:
# Accuracy matrix: rows are the last training year, columns are the position
# of the test year (0 = 2014 ... 6 = 2020).
pd.DataFrame(score_dict).T

Unnamed: 0,0,1,2,3,4,5,6
2014,0.9785,0.927,0.903,0.8205,0.896,0.586,0.5945
2015,0.972,0.9835,0.959,0.832,0.9025,0.6145,0.586
2016,0.9725,0.9855,0.985,0.95,0.941,0.562,0.548
2017,0.9705,0.979,0.9835,0.9905,0.9645,0.563,0.5475
2018,0.972,0.981,0.984,0.99,0.969,0.5585,0.53
2019,0.972,0.976,0.9685,0.9835,0.9565,0.9665,0.9525
2020,0.9695,0.9725,0.9685,0.984,0.9545,0.973,0.9725


In [35]:
# Per-test-year accuracies (2014..2020) stored under the 2019 training end-year.
score_dict[2019]

[0.972, 0.976, 0.9685, 0.9835, 0.9565, 0.9665, 0.9525]