In [1]:
from bashprocessing import Parser
import numpy as np

from sklearn.utils import shuffle

# Single seed passed as `random_state` to the shuffle and oversampling calls below.
RANDOM = 33

In [2]:
# Read both corpora into lists of raw lines (line terminators are kept).
with open('data/bashlex.cm') as benign_file:
    benign = list(benign_file)

with open('data/malicious.cm') as malicious_file:
    malicious = list(malicious_file)

In [3]:
# Class sizes, benign first: the data is heavily imbalanced, as usual in security.
print(len(benign), len(malicious), sep="\n")

12607
123


# Creating test and validation sets

We can't simply use sklearn's `train_test_split()` as is, because many malicious commands have totally different patterns and purposes. Therefore, we need to find specific command subsets that each characterize a family of malicious commands and that occur often enough to be present in both the train and test sets.

The *nix `/usr/bin/find` binary is often used to query the filesystem for specific files and their attributes. It is heavily used by both system administrators and threat actors during system enumeration. A subset of `find` commands is present in both datasets:

In [4]:
# Number of commands per corpus that contain the substring 'find'.
print(sum('find' in cmd for cmd in benign))

print(sum('find' in cmd for cmd in malicious))

7826
64


The situation is similar for network connectivity: both legitimate commands and malicious callbacks (called *reverse shells*) contain network information (in our dataset every remote host is normalized to `example.com`), and such commands are present in both datasets:

In [5]:
# Commands that connect to a remote host; hosts are normalized to example.com.
print(sum('example.com' in cmd for cmd in benign))
print(sum('example.com' in cmd for cmd in malicious))

77
30


Both of these command patterns will be split so that one subset ends up in the train set and the other subset in the test set.

In [6]:

def give_test_by_pattern(inputlist, pattern, size=None):
    """Hold out a test subset from the commands that contain ``pattern``.

    Commands containing ``pattern`` (plain substring match) are shuffled with
    the notebook-wide ``RANDOM`` seed. The first ``size`` of them become the
    test subset; the remaining matches are returned as part of the train set
    together with every command that does not contain the pattern.

    Parameters
    ----------
    inputlist : list of str
        Commands to split.
    pattern : str
        Substring that selects the command family.
    size : int, optional
        Number of matching commands to hold out. ``None`` (default) holds out
        half of the matches, rounded down. An explicit ``0`` holds out nothing.

    Returns
    -------
    trainset : list of str
        Matching commands that were not held out, followed by all
        non-matching commands.
    pattern_testset : list of str
        The held-out matching commands (at most ``size`` items).
    """
    pattern_list = shuffle([x for x in inputlist if pattern in x], random_state=RANDOM)
    non_pattern_list = [x for x in inputlist if pattern not in x]

    # Compare against None explicitly: `if not size` would treat a requested
    # size of 0 as "not given" and silently hold out half of the matches.
    if size is None:
        size = len(pattern_list) // 2

    pattern_testset = pattern_list[:size]
    pattern_trainset = pattern_list[size:]

    trainset = pattern_trainset + non_pattern_list
    return trainset, pattern_testset

In [7]:
def create_datasets(benign, malicious, patterns):
    """Build train/test splits by holding out commands per substring pattern.

    For each pattern, in order, part of the matching malicious commands is
    moved to the test set via ``give_test_by_pattern``. The same call is then
    made on the benign commands with ``size`` set to the number of malicious
    commands just held out. Commands that are not held out stay in the train
    pool and are visible to later patterns.

    Parameters
    ----------
    benign, malicious : list of str
        Commands of each class. Benign is labelled ``0``, malicious ``1``.
    patterns : iterable of str
        Substrings that define the command families to hold out.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Commands and labels for both splits, each pair shuffled together with
        the ``RANDOM`` seed.
    """
    benign_pool, malicious_pool = benign, malicious
    X_test, y_test = [], []

    for pattern in patterns:
        # Malicious goes first: its held-out count sets the benign request.
        malicious_pool, malicious_held = give_test_by_pattern(malicious_pool, pattern)
        X_test.extend(malicious_held)
        y_test.extend([1] * len(malicious_held))

        benign_pool, benign_held = give_test_by_pattern(
            benign_pool, pattern, size=len(malicious_held)
        )
        X_test.extend(benign_held)
        y_test.extend([0] * len(benign_held))

    train_commands = benign_pool + malicious_pool
    train_labels = [0] * len(benign_pool) + [1] * len(malicious_pool)

    X_train, y_train = shuffle(train_commands, train_labels, random_state=RANDOM)
    X_test, y_test = shuffle(X_test, y_test, random_state=RANDOM)
    return X_train, X_test, y_train, y_test


In [8]:
# Hold out part of each command family (substring match) for the test set.
X_train_cm, X_test_cm, y_train, y_test = create_datasets(
    benign,
    malicious,
    patterns=["find", "example.com", "python", "php"],
)
print(len(X_train_cm), len(X_test_cm))

12626 104


In [9]:
# sanity check: the three totals printed below should be equal
print(len(X_train_cm) + len(X_test_cm))
print(len(y_train) + len(y_test))
print(len(malicious) + len(benign))

12730
12730
12730


# Encoding using `bashprocessing`

In [29]:
from bashprocessing import Parser

# Parse the training commands, then one-hot encode them into a matrix with
# 1000 feature columns (see the printed shape).
# NOTE(review): presumably `top_tokens=1000` keeps the 1000 most frequent tokens
# seen by this parser instance; confirm against the bashprocessing docs.
p = Parser(verbose=True)
cntr , corpus = p.parse(X_train_cm)
X_train = p.encode(mode="onehot", top_tokens=1000)
print("\n",X_train.shape)


 (12626, 1000)


In [28]:
# Encode the test commands with a *new* Parser that has only parsed X_test_cm.
# NOTE(review): if `encode(top_tokens=1000)` selects tokens from whatever this
# instance parsed (TODO confirm), column i of X_test need not be the same token
# as column i of X_train, and a model fitted on X_train would be scored on
# misaligned features. That may explain the near-chance accuracy further down.
# The test set should probably be encoded with the training vocabulary.
p = Parser(verbose=True)
_,_ = p.parse(X_test_cm)
X_test = p.encode(mode="onehot", top_tokens=1000)
print(X_test.shape)

(104, 1000)


# RandomForestClassifier and Oversampling

If we train the model on the data as is, all predictions come out as benign and the accuracy is exactly 50% (due to the fact that our test set has exactly the same number of samples from both classes).

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: fit on the raw, heavily imbalanced training matrix.
# The forest is seeded with the notebook-wide RANDOM, like the shuffles and the
# oversampler, so re-running the notebook reproduces the same score.
rfc = RandomForestClassifier(n_estimators=100, random_state=RANDOM)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(y_pred)


0.5
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


This is due to the fact that we have a negligible number of malicious examples in our training set.  

In [31]:
def class_percentage(y):
    """Print the share of malicious (label ``1``) samples in ``y``.

    Parameters
    ----------
    y : array-like of int
        Class labels. ``1`` marks a malicious command; every other value is
        counted as non-malicious.

    Raises
    ------
    ValueError
        If ``y`` is empty, because the percentage is undefined.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        raise ValueError("cannot compute class percentage of an empty label set")

    # Count label 1 directly. Unpacking np.unique counts into (neg, pos)
    # raises as soon as only one class is present in `y`.
    pos = int(np.count_nonzero(labels == 1))
    print(f"Malicious commands: {round(pos * 100 / labels.size, 4)} %")

# Report the malicious share of each split under its own heading.
for heading, split_labels in (("In test set:\n\t", y_test), ("\nIn train set:\n\t", y_train)):
    print(heading, end="")
    class_percentage(split_labels)

In test set:
	Malicious commands: 50.0 %

In train set:
	Malicious commands: 0.5623 %


Therefore, we need to implement oversampling, so that the model is able to learn from malicious examples.

In [32]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the minority class of the training data, seeded with
# RANDOM. Only the train split is resampled; X_test / y_test are left as they are.
ros = RandomOverSampler(sampling_strategy='minority', random_state=RANDOM)

X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [33]:
# Matrix shape before and after oversampling, one per line.
print(X_train.shape, X_train_resampled.shape, sep="\n")

(12626, 1000)
(25110, 1000)


In [34]:
# Share of malicious labels after oversampling; the heading is printed without
# a trailing newline so the percentage follows it on the indented line.
print("\nIn resampled train set:\n\t", end="")
class_percentage(y_train_resampled)


In resampled train set:
	Malicious commands: 50.0 %


In [35]:
# Refit on the oversampled training data and score on the untouched test set.
# The forest is seeded with the notebook-wide RANDOM, like the shuffles and the
# oversampler, so re-running the notebook reproduces the same score.
rfc = RandomForestClassifier(n_estimators=200, random_state=RANDOM)
rfc.fit(X_train_resampled, y_train_resampled)

y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))


0.5096153846153846


In [38]:
# Predicted labels first, then the true labels, for a side-by-side look.
print(y_pred, np.array(y_test), sep="\n")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 1 0 0 1 1 0 1 1 1 0
 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 1 1 1
 1 1 0 0 1 1 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 1 0 1 1 1 0 1 1]


Still, the prediction is really poor and obviously biased towards the benign class ...  
On the train set we have pretty good pattern learning:

In [19]:
# Accuracy on the same resampled data the forest was fitted on. This shows how
# well the training patterns were memorised, not how well the model generalises.
y_train_pred = rfc.predict(X_train_resampled)
print(accuracy_score(y_train_resampled, y_train_pred))

0.9988849064117882
