In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import joblib
import os
import matplotlib.pyplot as plt

# 데이터 준비

In [2]:
years = list(range(2014,2022))
apk_types = ['benign', 'malware']

In [3]:
df_ben_list = []
df_mal_list = []

for year in years:
    df_ben_list.append(pd.read_csv(f'./csv/{apk_types[0]}_{year}.csv', index_col=0))
    df_mal_list.append(pd.read_csv(f'./csv/{apk_types[1]}_{year}.csv', index_col=0))

# 성능 보기

In [4]:
j, test_idx, df_ben_list[j].shape

NameError: name 'j' is not defined

In [None]:
df_ben_list[j].iloc[test_idx:]

In [None]:
acc_table=[]

for i, year in enumerate(years):
    model=joblib.load(f'./model/model_{year}.pk1')
    
    print('model :', year)
    acc_list=[]
    for j in range(len(years)):
        len_ = len(df_ben_list[j])
        if j==i:
            test_idx = int(len_ * 0.8)
        else:
            test_idx=0
        data = pd.concat([df_ben_list[j].iloc[test_idx:], df_mal_list[j].iloc[test_idx:]])
        label = (len_-test_idx)*[1] + (len_-test_idx)*[0]
        accuracy = model.score(data, label)
        f1 = f1_score(label, model.predict(data))
        
        
        print(years[j],"\'s acc : ", accuracy)
        print(years[j],"\'s f1 : ", f1)
        
        acc_list.append(accuracy)
    
    print()
    acc_table.append(acc_list)

# 도표 표현

In [None]:
acc_table

In [None]:
if not os.path.isdir('./visual'):
    os.mkdir('./visual')


    
for i, year in enumerate(years):
    x = np.arange(len(years))
    bars = plt.bar(x, acc_table[i], label="acc", width=0.5)
    bars[i].set_hatch('//')
    plt.xticks(x, years)
    plt.ylim(0.5, 1)
    plt.legend()
    plt.title(f"model : {years[i]}")
    plt.savefig(f"./visual/{years[i]}_model.png")
    plt.show()

# 추가 실험

In [None]:
acc_table2=[]

for i, year in enumerate(years):
    model=joblib.load(f'./model/model_{year}.pk1')
    
    print('model :', year)
    acc_list=[]
    for j in range(len(years)):
        len_ = len(df_ben_list[j])
        if j==i:
            test_idx = int(len_ * 0.8)
        else:
            test_idx=0
        ben_data = df_ben_list[j].iloc[test_idx:]
        mal_data = df_mal_list[j].iloc[test_idx:]
        all_data = pd.concat([ben_data, mal_data])
        
        label = (len_-test_idx)*[1] + (len_-test_idx)*[0]
        
        accuracy = [model.score(all_data, label), 
                    model.score(ben_data, label[:len(label)//2]), 
                    model.score(mal_data, label[len(label)//2:])]
        
        print(years[j],"\'s acc : ", *accuracy)
        acc_list.append(accuracy)
    
    print()
    acc_table2.append(acc_list)

# 추가 도표

In [None]:
acc_table2[0][0][1]

In [None]:
if not os.path.isdir('./visual'):
    os.mkdir('./visual')


    
for i, year in enumerate(years):
    x = np.arange(len(years))
    plt.bar(x, [acc_table2[i][j][1] for j in range(len(years))], label="benign", width=0.5)
    plt.xticks(x, years)
    plt.ylim(0, 1)
    plt.legend()
    plt.title(f"model : {years[i]}")
    plt.savefig(f"./visual/{years[i]}_model_ben.png")
    plt.show()
    
    x = np.arange(len(years))
    plt.bar(x, [acc_table2[i][j][2] for j in range(len(years))], label="malware", width=0.5)
    plt.xticks(x, years)
    plt.ylim(0, 1)
    plt.legend()
    plt.title(f"model : {years[i]}")
    plt.savefig(f"./visual/{years[i]}_model_mal.png")
    plt.show()
    

In [None]:
if not os.path.isdir('./visual'):
    os.mkdir('./visual')


    
for i, year in enumerate(years):
    x = np.arange(len(years))
    plt.bar(x, [acc_table2[i][j][1] for j in range(len(years))], label="benign", width=0.3, color='b')
    plt.bar(x+0.3, [acc_table2[i][j][2] for j in range(len(years))], label="malware", width=0.3, color='r')
    plt.xticks(x, years)
    plt.ylim(0, 1)
    plt.legend(loc=(1.0, 0.8))
    plt.title(f"model : {years[i]}")
    plt.savefig(f"./visual/{years[i]}_model_ben_mal.png")
    plt.show()
    
    

In [None]:
if not os.path.isdir('./visual'):
    os.mkdir('./visual')


    
for i, year in enumerate(years):
    x = np.arange(len(years))
    plt.bar(x, [acc_table2[i][j][0] for j in range(len(years))], label="all", width=0.2, color='g')
    plt.bar(x+0.2, [acc_table2[i][j][1] for j in range(len(years))], label="benign", width=0.2, color='b')
    plt.bar(x+0.4, [acc_table2[i][j][2] for j in range(len(years))], label="malware", width=0.2, color='r')
    plt.xticks(x, years)
    plt.ylim(0, 1)
    plt.legend(loc=(1.0, 0.8))
    plt.title(f"model : {years[i]}")
    plt.savefig(f"./visual/{years[i]}_model_all.png")
    plt.show()
    
    

In [None]:
x = np.arange(len(years))
plt.bar(x, [acc_table2[j][j][0] for j in range(len(years))], label="acc", width=0.5)
plt.xticks(x, years)
plt.ylim(0.5, 1)
plt.savefig(f"./visual/self_model.png")
plt.show()

In [None]:
[acc_table2[j][j][0] for j in range(len(years))]

In [None]:
f1_score(label, model.predict(data))