# MalConv on PDF

### Preprocessing

In [1]:
from globals import DATADIR, DATAMALDIR, LOCAL_BUCKET, BENIGN_LIST, MALICIOUS_LIST, BENIGN, MALICIOUS
from storage import FsStorage
import pandas as pd

In [2]:
def files_df():
    """Build the sample index as a two-column DataFrame.

    Reads the whitespace-separated entries of ``BENIGN_LIST`` and then
    ``MALICIOUS_LIST``; every entry becomes one row.

    Returns:
        pd.DataFrame: column ``'hash'`` holds the entry read from the list
        file, column ``'verdict'`` holds ``BENIGN`` or ``MALICIOUS``
        depending on which list the entry came from. Benign rows come first.
    """
    hashes, labels = [], []

    # Same procedure for both lists; only the path and the label differ.
    for list_path, label in ((BENIGN_LIST, BENIGN), (MALICIOUS_LIST, MALICIOUS)):
        with open(list_path) as handle:
            names = handle.read().split()
        hashes += names
        labels += [label] * len(names)

    return pd.DataFrame({'hash': hashes, 'verdict': labels})

In [3]:
def get_file(name, label):
    """Fetch the content of one sample from storage.

    Args:
        name: entry name, appended directly to the directory prefix.
        label: ``BENIGN`` or ``MALICIOUS``; selects ``DATADIR`` or
            ``DATAMALDIR`` respectively as the prefix.

    Returns:
        Whatever ``FsStorage.get`` returns for the resulting path.

    Raises:
        ValueError: if ``label`` is neither ``BENIGN`` nor ``MALICIOUS``.
    """
    store = FsStorage()

    if label == BENIGN:
        folder = DATADIR
    elif label == MALICIOUS:
        folder = DATAMALDIR
    else:
        raise ValueError("Bad label")

    return store.get(folder + name)

In [4]:
# Shuffle the whole index once with a fixed seed so the split is reproducible.
df = files_df().sample(frac=1, random_state=42)

# Split boundaries: first 90% -> train, next 2% -> validation, remaining 8% -> test.
n_rows = df.shape[0]
cut_index = (int(0.9 * n_rows), int(0.92 * n_rows))
train_end, valid_end = cut_index

df_train = df.iloc[:train_end]
df_valid = df.iloc[train_end:valid_end]
df_test = df.iloc[valid_end:]

### Defining the network

In [5]:
# Number of leading bytes of each file fed to the network (200 KiB);
# also the default sequence length used to size the pooling layer.
INPUT_LENGTH = 200 * 1024

# Embedding vocabulary size: the 256 possible byte values (stored shifted
# by +1 by the dataset) plus index 0, which the dataset uses for padding.
INPUT_HEIGHT = 256 + 1

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [7]:
class MalConv(nn.Module):
    """Gated 1-D convolutional classifier over a sequence of byte tokens.

    Pipeline: embedding -> two parallel strided convolutions (one is passed
    through a sigmoid and used as a gate for the other) -> max pooling over
    the sequence axis -> two linear layers -> one raw logit per sample.

    Args:
        input_height: number of distinct token ids (embedding vocabulary size).
        input_length: expected sequence length; used only to size the
            pooling window so that pooling collapses the whole sequence.
        window_size: convolution kernel size and stride (non-overlapping
            windows).
    """

    def __init__(self,input_height=INPUT_HEIGHT, input_length=INPUT_LENGTH, window_size=500):
        super().__init__()
        embedding_size = 16
        n_filters = 128

        self.embed = nn.Embedding(input_height, embedding_size)
        self.conv_1 = nn.Conv1d(embedding_size, n_filters, window_size, stride=window_size, bias=True)
        self.conv_2 = nn.Conv1d(embedding_size, n_filters, window_size, stride=window_size, bias=True)

        # With kernel == stride the convolutions emit input_length // window_size
        # positions; pooling over exactly that many reduces the sequence to 1.
        self.pooling = nn.MaxPool1d(input_length // window_size)

        # NOTE(review): there is no non-linearity between fc_1 and fc_2, so the
        # two layers compose into a single affine map — confirm this is intended.
        self.fc_1 = nn.Linear(n_filters, n_filters)
        self.fc_2 = nn.Linear(n_filters, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        """Return raw (pre-sigmoid) logits for a batch of token sequences.

        Args:
            x: integer token ids, laid out as (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, 1) holding one logit per sample.
            No sigmoid is applied here; apply it (or use a logit-based loss)
            downstream.
        """
        x = self.embed(x)  # Output batch_size, flength, n_embed
        # Channel first, as required by Conv1d.
        x = torch.transpose(x, 1, 2)  # Output batch_size, n_embed, flength

        cnn_value = self.conv_1(x)
        gating_weight = self.sigmoid(self.conv_2(x))

        x = self.pooling(cnn_value * gating_weight)

        # Flatten per sample. Keeping the batch axis fixed means a sequence
        # whose length does not match `input_length` fails with a shape error
        # in fc_1 instead of silently spilling extra positions into the
        # batch dimension.
        x = x.view(x.size(0), -1)
        x = self.fc_1(x)
        x = self.fc_2(x)

        return x

In [8]:
class PDFDataSet(Dataset):
    """Dataset yielding fixed-length token arrays built from file contents.

    Args:
        df: DataFrame with a ``'hash'`` and a ``'verdict'`` column; one row
            per sample.
        first_n_byte: number of leading elements of each file to keep.
            Shorter files are right-padded with 0.
    """

    def __init__(self, df, first_n_byte=INPUT_LENGTH):
        self.df = df
        self.first_n_byte = first_n_byte

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            tuple: ``(tokens, label)`` where ``tokens`` is an int64 array of
            length ``first_n_byte`` (each content value shifted by +1 so that
            0 is free to mean "padding") and ``label`` is a one-element array
            holding the row's verdict.
        """
        row = self.df.iloc[idx]
        cnt = get_file(row['hash'], row['verdict'])
        # assumes iterating `cnt` yields integers (e.g. a bytes object) — TODO confirm
        head = cnt[:self.first_n_byte]

        # Fill a preallocated, zero-padded buffer instead of building two
        # Python lists of `first_n_byte` elements for every sample.
        tokens = np.zeros(self.first_n_byte, dtype=np.int64)
        used = len(head)
        tokens[:used] = np.fromiter(head, dtype=np.int64, count=used)
        tokens[:used] += 1

        return tokens, np.array([row['verdict']])

### Running 

In [9]:
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import time

In [10]:
BATCH_SIZE = 32
NUM_EPOCHS = 4
# Only take the CUDA code paths when a GPU is actually present, so the
# notebook also runs on CPU-only machines.
use_gpu = torch.cuda.is_available()
learning_rate = 1e-3
# Total number of optimisation steps: batches per pass over the training
# set (derived from BATCH_SIZE rather than a repeated literal) times the
# number of epochs.
max_step = (df_train.shape[0] // BATCH_SIZE + 1) * NUM_EPOCHS
display_step = 15   # print training metrics every `display_step` batches of a pass
test_step = 300     # run validation every `test_step` total steps

In [11]:
dataloader = DataLoader(PDFDataSet(df_train), batch_size=BATCH_SIZE, shuffle=True)
# Validation needs no shuffling: a fixed order keeps the per-batch loss mean
# and the order of the collected predictions reproducible between runs.
validloader = DataLoader(PDFDataSet(df_valid), batch_size=BATCH_SIZE, shuffle=False)

malconv = MalConv()
# The model outputs raw logits, so the loss applies the sigmoid itself;
# the separate `sigmoid` module is only used to turn logits into
# probabilities for accuracy/prediction reporting.
bce_loss = nn.BCEWithLogitsLoss()
sigmoid = nn.Sigmoid()

if use_gpu:
    malconv = malconv.cuda()
    bce_loss = bce_loss.cuda()
    sigmoid = sigmoid.cuda()

# Build the optimizer only after the model sits on its final device, as the
# torch.optim documentation recommends.
adam_optim = optim.Adam(malconv.parameters(), lr=learning_rate)


step_msg = 'step-{}-loss-{:.6f}-acc-{:.4f}-time-{:.2f}'
valid_msg = 'step-{}-tr_loss-{:.6f}-tr_acc-{:.4f}-val_loss-{:.6f}-val_acc-{:.4f}'
log_msg = '{}, {:.6f}, {:.4f}, {:.6f}, {:.4f}, {:.2f}'
# Running training metrics; emptied after every validation round.
history = {'tr_loss': [], 'tr_acc': []}

print('step,tr_loss, tr_acc, val_loss, val_acc, time\n')

valid_best_acc = 0.0    # best validation accuracy seen so far (checkpoint criterion)
total_step = 0          # optimisation steps performed so far
step_cost_time = 0      # wall-clock duration of the most recent training step

# Alternate between training and a full validation pass until `max_step`
# optimisation steps have been performed.
while total_step < max_step:

    # Training: iterate the loader until it is exhausted or a validation
    # round is due. After a `break`, the next outer iteration starts a fresh
    # (reshuffled) pass over the training loader.
    malconv.train()
    for step, batch_data in enumerate(dataloader):
        start = time.time()

        adam_optim.zero_grad()

        exe_input = batch_data[0].cuda() if use_gpu else batch_data[0]
        exe_input = exe_input.long()
        label = batch_data[1].cuda() if use_gpu else batch_data[1]
        label = label.float()

        pred = malconv(exe_input)
        loss = bce_loss(pred, label)
        loss.backward()
        adam_optim.step()

        history['tr_loss'].append(loss.item())
        # Adding 0.5 and truncating rounds the probability to a 0/1 decision.
        probs = sigmoid(pred).detach().cpu().numpy()
        hits = label.cpu().numpy().astype(int) == (probs + 0.5).astype(int)
        history['tr_acc'].extend(list(hits))

        step_cost_time = time.time() - start

        if step % display_step == 0:
            print(step_msg.format(total_step, np.mean(history['tr_loss']),
                                  np.mean(history['tr_acc']), step_cost_time))
        total_step += 1

        # Interrupt for validation
        if total_step % test_step == 0:
            break

    # Validation: no parameter updates, so gradient tracking is switched off
    # to avoid building autograd graphs for every batch.
    malconv.eval()
    history['val_loss'] = []
    history['val_acc'] = []
    history['val_pred'] = []

    with torch.no_grad():
        for val_batch_data in validloader:
            exe_input = val_batch_data[0].cuda() if use_gpu else val_batch_data[0]
            exe_input = exe_input.long()

            label = val_batch_data[1].cuda() if use_gpu else val_batch_data[1]
            label = label.float()

            pred = malconv(exe_input)
            loss = bce_loss(pred, label)

            probs = sigmoid(pred).cpu().numpy()
            hits = label.cpu().numpy().astype(int) == (probs + 0.5).astype(int)

            history['val_loss'].append(loss.item())
            history['val_acc'].extend(list(hits))
            history['val_pred'].append(list(probs))

    tr_loss = np.mean(history['tr_loss'])
    tr_acc = np.mean(history['tr_acc'])
    val_loss = np.mean(history['val_loss'])
    val_acc = np.mean(history['val_acc'])

    print(log_msg.format(total_step, tr_loss, tr_acc, val_loss, val_acc, step_cost_time))
    print(valid_msg.format(total_step, tr_loss, tr_acc, val_loss, val_acc))

    # Keep only the model with the best validation accuracy so far.
    # NOTE: torch.save does not create directories; 'Chkp/' must already exist.
    if valid_best_acc < val_acc:
        valid_best_acc = val_acc
        torch.save(malconv,'Chkp/chkp2.txt')
        print('Checkpoint saved at','Chkp/chkp2.txt')
        # TODO: persist history['val_pred'] next to the checkpoint; the
        # previously referenced write_pred/valid_idx are not defined in the
        # visible code.

    # Start the next training stretch with empty running metrics.
    history['tr_loss'] = []
    history['tr_acc'] = []

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/specific/scratches/scratch/miniconda3/envs/thesis/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-b1ff4e265746>", line 10, in <module>
    malconv = malconv.cuda()
  File "/specific/scratches/scratch/miniconda3/envs/thesis/lib/python3.7/site-packages/torch/nn/modules/module.py", line 258, in cuda
    return self._apply(lambda t: t.cuda(device))
  File "/specific/scratches/scratch/miniconda3/envs/thesis/lib/python3.7/site-packages/torch/nn/modules/module.py", line 185, in _apply
    module._apply(fn)
  File "/specific/scratches/scratch/miniconda3/envs/thesis/lib/python3.7/site-packages/torch/nn/modules/module.py", line 191, in _apply
    param.data = fn(param.data)
  File "/specific/scratches/scratch/miniconda3/envs/thesis/lib/python3.7/site-packages/torch/nn/modules/module.py", line 258, in <lambda>
    return self._apply(lamb

KeyboardInterrupt: 

### Testing