This notebook is adapted from code given in `https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook`

# PART 1 Getting the data

In [None]:
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Benign%201.7z
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Benign%202.7z
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Benign%203.7z
    
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Malware%201.7z
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Malware%202.7z
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Malware%203.7z
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Malware%204.7z
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Malware%205.7z
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/Resources/DA%20Logs%20Malware%206.7z

In [None]:
# List the working directory, e.g. to check that the downloaded archives are present.
!dir

In [None]:
# Install the 7-Zip command-line tools used below to extract the .7z archives
# (-y answers apt's confirmation prompt automatically).
!apt install p7zip-full p7zip-rar -y 

In [None]:
!mkdir DA_logs_Benign
!mkdir DA_logs_Malware

In [None]:
# Work inside the benign directory: `7z e` (next cell) extracts into the
# current working directory.
%cd DA_logs_Benign/

In [None]:
# Extract the three benign archives from the parent directory into the current
# directory. `e` extracts without recreating archive sub-directories and `-y`
# answers "yes" to every prompt; {i} is interpolated by IPython.
for i in range(1,4):
  !7z e -y ../DA\ Logs\ Benign\ {i}.7z 

In [None]:
# Switch to the malware directory so the next extraction lands there.
%cd ../DA_logs_Malware/

In [None]:
# Extract the six malware archives into the current directory.
# -pinfected supplies the archive password ("infected"); `e` and `-y` behave
# as in the benign extraction (flat extraction, answer "yes" to prompts).
for i in range(1,7):
  !7z e -y ../DA\ Logs\ Malware\ {i}.7z -pinfected

In [None]:
# Return to the parent directory; the relative paths used later
# ("DA_logs_Benign", "DA_logs_Malware") are resolved from here.
%cd ..

In [None]:
# will deal with the empty directories later

# for i in range(1,3):
#     !rm -rf "/content/Malicious_PE_samples/Malicious PE Samples {i}"

# for i in range(1,7):
#      !rm -rf "/content/Benign_PE_samples/Benign PE Samples {i}"

# PART 2 creating a malware detection model based on API calls

In [None]:
import numpy as np
import os
import json

# Each entry pairs a directory of logs with its class label
# (0 for the benign logs, 1 for the malware logs).
directories_with_labels = [
    ("DA_logs_Benign", 0),
    ("DA_logs_Malware", 1),
]

In [None]:
def get_API_class_method_type_from_log(log):
    """Extract the sequence of API calls from one behavioral log file.

    The log is read as JSON. Its ``"api_calls"`` entry is a string that is
    wrapped in square brackets so that it can be decoded as a JSON array of
    call records.

    Args:
        log: Path of the JSON log file to read.

    Returns:
        A list with one ``"class:method:type"`` string per call record, in
        the order the records appear in the log.
    """
    with open(log) as handle:
        report = json.load(handle)
    call_records = json.loads("[" + report["api_calls"] + "]")
    return [
        ":".join((record["class"], record["method"], record["type"]))
        for record in call_records
    ]
        

In [None]:
# Build the corpus: one API-call sequence per log, with a parallel list of labels.
data_corpus = []
labels = []
skipped_logs = []  # (path, error) for every log that could not be parsed

for directory, label in directories_with_labels:
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus
    # order, and therefore the seeded train/test split, reproducible.
    for log_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, log_name)
        try:
            api_sequence = get_API_class_method_type_from_log(file_path)
        except (OSError, ValueError, KeyError, TypeError) as error:
            # Still best effort (unreadable or malformed logs are skipped), but
            # only for expected parsing failures, and each skip is recorded.
            # ValueError covers JSON and text-decoding errors.
            skipped_logs.append((file_path, error))
            continue
        data_corpus.append(api_sequence)
        labels.append(label)

print("Parsed", len(data_corpus), "logs; skipped", len(skipped_logs), "unreadable or malformed logs.")

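We will do a train/test split; first, we take a quick look at the first API call of the first log to check the parsed format.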

In [None]:
# Sanity check: show the first "class:method:type" string of the first parsed log.
print(data_corpus[0][0])

We use N-grams, so we will load our N-gram extraction functions, with a slight modification for the current data format; first, we split the data into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the logs for testing; the fixed random_state seeds the shuffle.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    data_corpus,
    labels,
    test_size=0.2,
    random_state=11,
)

Our approach is to use N-grams, so we load our N-gram extraction functions, with a slight modification for the current data format.

In [None]:

import collections
from nltk import ngrams
import numpy as np


def read_file(file_path):
    """Return the entire contents of ``file_path`` as a ``bytes`` object.

    Args:
        file_path: Path of the file to read; it is opened in binary mode.
    """
    with open(file_path, "rb") as handle:
        return handle.read()


def text_to_Ngrams(text, n):
    """Return every run of ``n`` consecutive items of ``text`` as a list.

    Args:
        text: The sequence to slide over (e.g. a list of API-call strings).
        n: Number of items per N-gram.

    Returns:
        The N-grams produced by ``nltk.ngrams``, materialized into a list.
    """
    return list(ngrams(text, n))


def get_Ngram_counts(text, N):
    """Count how many times each N-gram occurs in ``text``.

    Args:
        text: The sequence to extract N-grams from.
        N: Number of items per N-gram.

    Returns:
        A ``collections.Counter`` mapping each N-gram to its frequency.
    """
    return collections.Counter(text_to_Ngrams(text, N))

In [None]:
# Count every N-gram over the training corpus only, so that feature selection
# never sees the test samples.
N = 4  # number of consecutive API calls per N-gram
total_Ngram_count = collections.Counter()
for sample in corpus_train:
    # update() adds the counts in place. `+=` gives the same totals here but
    # also rescans the whole accumulated counter on every iteration to drop
    # non-positive entries, which is wasted work because all counts are positive.
    total_Ngram_count.update(get_Ngram_counts(sample, N))

In [None]:
# Number of distinct N-grams observed in the training corpus.
len(total_Ngram_count)

In [None]:
# Keep only the K1 most frequent training N-grams as candidate features.
K1 = 3000
K1_most_frequent_Ngrams = total_Ngram_count.most_common(K1)  # (N-gram, count) pairs
K1_most_frequent_Ngrams_list = [
    ngram for ngram, _count in K1_most_frequent_Ngrams
]


In [None]:
# The single most frequent N-gram (most_common orders by descending count).
K1_most_frequent_Ngrams_list[0]

In [None]:
def featurize_sample(file, Ngrams_list, n=None):
    """Turn one sample into a vector of N-gram counts.

    Args:
        file: The sample's sequence of items (here, a list of API-call
            strings) from which N-grams are extracted.
        Ngrams_list: The N-grams to count. Element ``i`` of the result is the
            number of times ``Ngrams_list[i]`` occurs in ``file``.
        n: Number of items per N-gram extracted from ``file``. When omitted it
            is taken from the length of the first entry of ``Ngrams_list``, so
            the function no longer depends on a notebook-level ``N`` that may
            have been changed or not yet defined when this is called.

    Returns:
        A list of ``len(Ngrams_list)`` integer counts; N-grams that do not
        occur in ``file`` get 0.
    """
    if len(Ngrams_list) == 0:
        # Nothing to count, and no N-gram length to infer.
        return []
    if n is None:
        n = len(Ngrams_list[0])
    sample_counts = get_Ngram_counts(file, n)
    # A Counter returns 0 for missing keys, so absent N-grams need no special case.
    return [sample_counts[ngram] for ngram in Ngrams_list]

In [None]:
# Featurize both splits with the N-grams selected from the training data.
# Errors are deliberately not caught here: a partially built matrix would no
# longer line up row-for-row with its labels and would only fail later, less
# clearly, when the model is fitted.
X_train = np.asarray(
    [featurize_sample(sample, K1_most_frequent_Ngrams_list) for sample in corpus_train]
)
X_test = np.asarray(
    [featurize_sample(sample, K1_most_frequent_Ngrams_list) for sample in corpus_test]
)

assert len(X_train) == len(y_train), "X_train and y_train are misaligned"
assert len(X_test) == len(y_test), "X_test and y_test are misaligned"

In [None]:
# Show (n_samples, n_features) for the training and then the test matrix.
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Second-stage feature selection: of the K1 frequency-selected N-grams, keep
# the K2 that score highest on mutual information with the label, then
# classify with XGBoost.
K2 = 500
mi_pipeline = Pipeline(
    [
        ("mutual_information", SelectKBest(mutual_info_classif, k=K2)),
        ("xgb", XGBClassifier())
    ]
)

In [None]:
# Fit the selection + classification pipeline on the training split only,
# then report mean accuracy on the training and held-out test splits.
mi_pipeline.fit(X_train, y_train)
train_accuracy = mi_pipeline.score(X_train, y_train)
test_accuracy = mi_pipeline.score(X_test, y_test)
print("accuracy training and testing : ")
print(train_accuracy)
print(test_accuracy)