In [ ]:
import os, glob, json, collections, itertools, pickle, nltk, numpy, scipy
from nltk.util import ngrams
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import *
from sklearn.metrics import plot_confusion_matrix

In [ ]:
# Reading raw data from files and converting to json database

def ft_load_raw_data_from_fs():
    """Load the raw token dataset, caching it as ``./raw_data.json``.

    When ``./raw_data.json`` does not exist in the working directory, every
    ``*.txt`` file under ``./Dataset/<family>/`` is parsed: the text before
    the first run of five spaces is the entry name and the remainder is split
    on whitespace into a list of tokens. The parsed data is then written to
    ``raw_data.json``. When the cache already exists it is simply read back.

    Returns:
        dict: ``{family: {entry_name: [token, ...]}}``.

    Raises:
        ValueError: if a dataset file contains no five-space separator.
    """
    cache_path = "./raw_data.json"
    separator = "     "

    if os.path.isfile(cache_path):
        # The cache is written as UTF-8 with ensure_ascii=False, so it has to
        # be read back as UTF-8 rather than with the platform default.
        with open(cache_path, encoding="utf-8") as json_file:
            return json.load(json_file)

    dataset_root = "./Dataset/"
    raw_data = {}
    # sorted() keeps family and sample order stable across runs and platforms
    for dir_ in sorted(os.listdir(dataset_root)):
        family_dir = os.path.join(dataset_root, dir_)
        if not os.path.isdir(family_dir):
            # stray files next to the family folders are not ransomware families
            continue
        samples = {}
        for fl in sorted(glob.glob(os.path.join(family_dir, "*.txt"))):
            with open(fl) as fp:
                text = fp.read()
            try:
                offset = text.index(separator)
            except ValueError as err:
                raise ValueError(
                    "no entry-name separator found in {}".format(fl)
                ) from err
            entry_name = text[:offset]
            samples[entry_name] = text[offset + len(separator):].split()
        raw_data[dir_] = samples

    with open(cache_path, 'w', encoding="utf-8") as outfile:
        json.dump(raw_data, outfile, ensure_ascii=False, indent=4)

    return raw_data
                     

In [ ]:
# Getting count for all assembly instructions from the dataset
def ft_get_top_opcodes(raw_data):
    """Rank all tokens by frequency and pick the most common ones.

    Args:
        raw_data: mapping ``{family: {entry: [token, ...]}}``.

    Returns:
        tuple: ``(count, top)``. ``count`` is a dict of token -> number of
        occurrences, ordered from most to least frequent. ``top`` holds the
        leading tokens of that ordering, added one by one until the tokens
        already taken cover at least 90% of all occurrences.
    """
    tally = collections.Counter(
        token
        for samples in raw_data.values()
        for tokens in samples.values()
        for token in tokens
    )
    # most_common() orders by descending count; ties keep first-seen order
    count = dict(tally.most_common())
    total = sum(count.values())

    # Getting the top instructions to cover 90% of the data
    top = []
    covered = 0
    for token, occurrences in count.items():
        if covered / total >= 0.9:
            break
        top.append(token)
        covered += occurrences

    return (count, top)

In [ ]:
def ft_vectorize(count, top, raw_data):
    """Turn every entry's token list into unigram, bigram and trigram counts.

    Args:
        count: dict whose keys are the unigram features (all known tokens).
        top: list of tokens from which bigram and trigram features are built.
        raw_data: mapping ``{family: {entry: [token, ...]}}``.

    Returns:
        tuple: ``(columns, univecs, bivecs, trivecs)``. ``columns`` lists the
        unigram, bigram and trigram features followed by the label column
        name ``"Ransomware Type"``. Each of the three dicts maps an entry name
        to its count vector, ordered like the matching slice of ``columns``.

    NOTE(review): ``itertools.permutations`` never repeats an element, so
    n-grams such as ``(x, x)`` are not features - confirm this is intended.
    NOTE(review): vectors are keyed by entry name only; an entry name that
    occurs in two families keeps just the later one.
    """
    # creating features based on ngrams
    unigrams = list(count.keys())
    bigrams = list(itertools.permutations(top, 2))
    trigrams = list(itertools.permutations(top, 3))

    # creating column labels
    columns = unigrams + bigrams + trigrams
    columns.append("Ransomware Type")

    univecs = {}
    bivecs = {}
    trivecs = {}
    for samples in raw_data.values():
        for entry, tokens in samples.items():
            # Count every n-gram of the entry once, then look each feature up
            # in O(1) instead of scanning the token/feature lists per feature.
            uni_counts = collections.Counter(tokens)
            bi_counts = collections.Counter(zip(tokens, tokens[1:]))
            tri_counts = collections.Counter(
                zip(tokens, tokens[1:], tokens[2:])
            )
            # a Counter answers 0 for n-grams the entry does not contain
            univecs[entry] = [uni_counts[gram] for gram in unigrams]
            bivecs[entry] = [bi_counts[gram] for gram in bigrams]
            trivecs[entry] = [tri_counts[gram] for gram in trigrams]

    return (columns, univecs, bivecs, trivecs)

In [ ]:
def ft_get_vectors():
    """Return the feature/label table, building and caching it when needed.

    If ``./Data/data_vectors.csv`` exists it is read and returned without its
    first column (the index that ``to_csv`` wrote). Otherwise the raw data is
    loaded, vectorised into n-gram counts, labelled with an integer per
    ransomware family, stripped of all-zero columns, saved to that CSV, and
    the list of top tokens is written to ``Results/top-opcodes.txt``.

    Returns:
        pandas.DataFrame: one row per entry; feature count columns plus the
        ``"Ransomware Type"`` label column.
    """
    cache_path = "./Data/data_vectors.csv"
    if os.path.isfile(cache_path):
        df = pd.read_csv(cache_path)
        # the first CSV column is the saved index, not a feature
        return df.drop(df.columns[0], axis=1)

    raw_data = ft_load_raw_data_from_fs()
    count, top = ft_get_top_opcodes(raw_data)
    columns, univecs, bivecs, trivecs = ft_vectorize(count, top, raw_data)

    # NOTE(review): rows are keyed by entry name, so an entry name shared by
    # two families yields a single row - confirm names are unique.
    data = {}
    for rware in raw_data:
        for entry in raw_data[rware]:
            data[entry] = univecs[entry] + bivecs[entry] + trivecs[entry] + [rware]

    df = pd.DataFrame(list(data.values()), columns=columns)

    # replacing ransomware names with integers; only the label column holds
    # names, so it is mapped directly instead of scanning the whole frame.
    # Unknown family names are left unchanged.
    label_codes = {
        "Cerber": 1,
        "CryptoWall": 2,
        "CTB-Locker": 3,
        "Locky": 4,
        "Sage": 5,
        "TeslaCrypt": 6,
    }
    df["Ransomware Type"] = df["Ransomware Type"].map(
        lambda name: label_codes.get(name, name)
    )

    # dropping column where all counts are 0
    df = df.loc[:, (df != 0).any(axis=0)]

    # create the output folders so a fresh checkout does not fail on save
    os.makedirs("./Data", exist_ok=True)
    df.to_csv(cache_path)
    os.makedirs("Results", exist_ok=True)
    with open('Results/top-opcodes.txt', 'w') as fp:
        fp.write(str(top))

    return df

In [ ]:
# Integer class label -> ransomware family name, followed by loading the
# count vectors (one row per entry) into df.
map_ = dict(enumerate(
    ("Cerber", "CryptoWall", "CTB-Locker", "Locky", "Sage", "TeslaCrypt"),
    start=1,
))
df = ft_get_vectors()

In [ ]:
# Stratified train/test split: 33% of the rows are held out for testing
# (test_size=0.33), with class proportions preserved via stratify.
# The label column is removed from df and kept separately as y.
# NOTE(review): no random_state is set, so the split differs on every run.
y = df.pop('Ransomware Type')
_count_split = skl.model_selection.train_test_split(
    df, y, test_size=0.33, stratify=y
)
X_train_counts, X_test_counts, Y_train_counts, Y_test_counts = _count_split

In [ ]:
# getting per-feature relative frequencies for the train and test sets
#
# Every count is divided by that feature's total over the training rows.
# The test rows are scaled with the same training totals: a model fitted on
# scaled features cannot be scored meaningfully on raw counts.
# NOTE(review): this normalises per feature column, not per sample row -
# confirm that is the intended meaning of "probabilities" here.
train_totals = X_train_counts.sum()
# A feature absent from every training row has total 0; masking it to NaN
# makes the division give NaN (turned into 0 below) rather than inf.
train_totals = train_totals.where(train_totals != 0)
X_train_norm_all = (X_train_counts / train_totals).fillna(0)
X_test_norm_all = (X_test_counts / train_totals).fillna(0)
Y_train_norm_all = Y_train_counts
Y_test_norm_all = Y_test_counts

In [ ]:
# getting metrics for features as counts
#
# Each classifier is fitted on X_train_counts / Y_train_counts and scored on
# X_test_counts / Y_test_counts. Its confusion matrix is saved under Figures/
# and its weighted precision, recall and F-score become one row of the table
# written to Results/Count_metrics.txt.
# NOTE(review): skl.metrics.plot_confusion_matrix only exists in older
# scikit-learn releases (removed in 1.2) - confirm the pinned version.

# make sure the output folders exist before anything is saved
os.makedirs('Figures', exist_ok=True)
os.makedirs('Results', exist_ok=True)

# tick labels for the confusion matrices: the integer keys of map_
tick_labels = list(map_)
# class labels included in the weighted averages
score_labels = [1, 2, 3, 4, 5, 6]

# classification labels
metrics = [['Classifier\t', 'Precision\t', 'Recall\t\t', 'F-Score']]

# logistic regression on counts
lr_counts = skl.linear_model.LogisticRegression(penalty='l2', solver='newton-cg', fit_intercept=True).fit(X_train_counts, Y_train_counts)
lr_counts_pred = lr_counts.predict(X_test_counts)  # predict once, reuse below
lr_counts_acc = skl.metrics.accuracy_score(Y_test_counts, lr_counts_pred)
plot_cm_lr_counts = skl.metrics.plot_confusion_matrix(lr_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_lr_counts.figure_.savefig('Figures/LR-Counts-CM.png')
prf_lr_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, lr_counts_pred, labels=score_labels, average='weighted')
# the last element of the result is the support, which is not reported
metrics.append(['Logistic Regression'] + list(prf_lr_counts[:-1]))

# svm on counts
svm_counts = skl.svm.LinearSVC(penalty='l2').fit(X_train_counts, Y_train_counts)
svm_counts_pred = svm_counts.predict(X_test_counts)
svm_counts_acc = skl.metrics.accuracy_score(Y_test_counts, svm_counts_pred)
plot_cm_svm_counts = skl.metrics.plot_confusion_matrix(svm_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_svm_counts.figure_.savefig('Figures/SVM-Counts-CM.png')
prf_svm_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, svm_counts_pred, labels=score_labels, average='weighted')
metrics.append(['SVM\t\t'] + list(prf_svm_counts[:-1]))

# random forest classifier on counts
rf_counts = skl.ensemble.RandomForestClassifier(criterion='gini', n_estimators=100).fit(X_train_counts, Y_train_counts)
rf_counts_pred = rf_counts.predict(X_test_counts)
rf_counts_acc = skl.metrics.accuracy_score(Y_test_counts, rf_counts_pred)
plot_cm_rf_counts = skl.metrics.plot_confusion_matrix(rf_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_rf_counts.figure_.savefig('Figures/RF-Counts-CM.png')
prf_rf_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, rf_counts_pred, labels=score_labels, average='weighted')
metrics.append(['Random Forest\t'] + list(prf_rf_counts[:-1]))

# decision tree for counts
dtree_counts = skl.tree.DecisionTreeClassifier(criterion='gini').fit(X_train_counts, Y_train_counts)
dtree_counts_pred = dtree_counts.predict(X_test_counts)
dtree_counts_acc = skl.metrics.accuracy_score(Y_test_counts, dtree_counts_pred)
plot_cm_dtree_counts = skl.metrics.plot_confusion_matrix(dtree_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_dtree_counts.figure_.savefig('Figures/dtree-Counts-CM.png')
prf_dtree_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, dtree_counts_pred, labels=score_labels, average='weighted')
# report the decision tree's own scores (not the logistic regression ones)
metrics.append(['Decision Tree\t'] + list(prf_dtree_counts[:-1]))

# knn classifier on counts
knn_counts = skl.neighbors.KNeighborsClassifier(algorithm='auto', metric='l1', n_neighbors=3).fit(X_train_counts, Y_train_counts)
knn_counts_pred = knn_counts.predict(X_test_counts)
knn_counts_acc = skl.metrics.accuracy_score(Y_test_counts, knn_counts_pred)
plot_cm_knn_counts = skl.metrics.plot_confusion_matrix(knn_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_knn_counts.figure_.savefig('Figures/KNN-Counts-CM.png')
prf_knn_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, knn_counts_pred, labels=score_labels, average='weighted')
metrics.append(['KNN\t\t'] + list(prf_knn_counts[:-1]))

# Render the table: tab-separated cells, one row per line. isinstance() also
# matches numpy.float64 (a float subclass), which an exact type() comparison
# would miss, so scores are really written with four decimals.
op_counts = ''.join(
    ''.join(
        ('{:.4f}\t' if isinstance(col, float) else '{}\t').format(col)
        for col in row
    ) + '\n'
    for row in metrics
)

with open('Results/Count_metrics.txt', 'w') as fp:
    fp.write(op_counts)


In [ ]:
# getting metrics for features as probabilities
#
# Each classifier is fitted on X_train_norm_all / Y_train_norm_all and scored
# on X_test_norm_all / Y_test_norm_all. Its confusion matrix is saved under
# Figures/ and its weighted precision, recall and F-score become one row of
# the table written to Results/Normalized_metrics.txt.
# NOTE(review): skl.metrics.plot_confusion_matrix only exists in older
# scikit-learn releases (removed in 1.2) - confirm the pinned version.

# `import scipy` alone does not guarantee the spatial.distance submodule is
# loaded; import it explicitly for the KNN metric used below
import scipy.spatial.distance

# make sure the output folders exist before anything is saved
os.makedirs('Figures', exist_ok=True)
os.makedirs('Results', exist_ok=True)

# tick labels for the confusion matrices: the integer keys of map_
tick_labels = list(map_)
# class labels included in the weighted averages
score_labels = [1, 2, 3, 4, 5, 6]

# classification labels
metrics = [['Classifier\t', 'Precision\t', 'Recall\t\t', 'F-Score']]

# logistic regression on normalized probabilities
# NOTE(review): penalty='none' is the spelling accepted by older scikit-learn
# releases; newer ones expect penalty=None - confirm the pinned version.
lr_norm_all = skl.linear_model.LogisticRegression(penalty='none', solver='newton-cg', fit_intercept=True).fit(X_train_norm_all, Y_train_norm_all)
lr_norm_all_pred = lr_norm_all.predict(X_test_norm_all)  # predict once, reuse below
lr_norm_all_acc = skl.metrics.accuracy_score(Y_test_norm_all, lr_norm_all_pred)
plot_cm_lr_norm_all = skl.metrics.plot_confusion_matrix(lr_norm_all, X_test_norm_all, Y_test_norm_all, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_lr_norm_all.figure_.savefig('Figures/LR-Normalized-CM.png')
prf_lr_norm_all = skl.metrics.precision_recall_fscore_support(Y_test_norm_all, lr_norm_all_pred, labels=score_labels, average='weighted')
# the last element of the result is the support, which is not reported
metrics.append(['Logistic Regression'] + list(prf_lr_norm_all[:-1]))

# svm on normalized probabilities
svm_norm_all = skl.svm.LinearSVC().fit(X_train_norm_all, Y_train_norm_all)
svm_norm_all_pred = svm_norm_all.predict(X_test_norm_all)
svm_norm_all_acc = skl.metrics.accuracy_score(Y_test_norm_all, svm_norm_all_pred)
plot_cm_svm_norm_all = skl.metrics.plot_confusion_matrix(svm_norm_all, X_test_norm_all, Y_test_norm_all, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_svm_norm_all.figure_.savefig('Figures/SVM-Normalized-CM.png')
prf_svm_norm_all = skl.metrics.precision_recall_fscore_support(Y_test_norm_all, svm_norm_all_pred, labels=score_labels, average='weighted')
metrics.append(['SVM\t\t'] + list(prf_svm_norm_all[:-1]))

# random forest classifier on normalized probabilities
rf_norm_all = skl.ensemble.RandomForestClassifier(criterion='gini', n_estimators=100).fit(X_train_norm_all, Y_train_norm_all)
rf_norm_all_pred = rf_norm_all.predict(X_test_norm_all)
rf_norm_all_acc = skl.metrics.accuracy_score(Y_test_norm_all, rf_norm_all_pred)
plot_cm_rf_norm_all = skl.metrics.plot_confusion_matrix(rf_norm_all, X_test_norm_all, Y_test_norm_all, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_rf_norm_all.figure_.savefig('Figures/RF-Normalized-CM.png')
prf_rf_norm_all = skl.metrics.precision_recall_fscore_support(Y_test_norm_all, rf_norm_all_pred, labels=score_labels, average='weighted')
metrics.append(['Random Forest\t'] + list(prf_rf_norm_all[:-1]))

# decision tree on normalized probabilities
dtree_norm_all = skl.tree.DecisionTreeClassifier(criterion='gini').fit(X_train_norm_all, Y_train_norm_all)
dtree_norm_all_pred = dtree_norm_all.predict(X_test_norm_all)
dtree_norm_all_acc = skl.metrics.accuracy_score(Y_test_norm_all, dtree_norm_all_pred)
plot_cm_dtree_norm_all = skl.metrics.plot_confusion_matrix(dtree_norm_all, X_test_norm_all, Y_test_norm_all, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_dtree_norm_all.figure_.savefig('Figures/dtree-Normalized-CM.png')
prf_dtree_norm_all = skl.metrics.precision_recall_fscore_support(Y_test_norm_all, dtree_norm_all_pred, labels=score_labels, average='weighted')
# report the decision tree's own scores (not the logistic regression ones)
metrics.append(['Decision Tree\t'] + list(prf_dtree_norm_all[:-1]))

# knn classifier on normalized probabilities (Jensen-Shannon distance)
knn_norm_all = skl.neighbors.KNeighborsClassifier(algorithm='auto', metric=scipy.spatial.distance.jensenshannon, n_neighbors=3).fit(X_train_norm_all, Y_train_norm_all)
knn_norm_all_pred = knn_norm_all.predict(X_test_norm_all)
knn_norm_all_acc = skl.metrics.accuracy_score(Y_test_norm_all, knn_norm_all_pred)
plot_cm_knn_norm_all = skl.metrics.plot_confusion_matrix(knn_norm_all, X_test_norm_all, Y_test_norm_all, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_knn_norm_all.figure_.savefig('Figures/KNN-Normalized-CM.png')
prf_knn_norm_all = skl.metrics.precision_recall_fscore_support(Y_test_norm_all, knn_norm_all_pred, labels=score_labels, average='weighted')
metrics.append(['KNN\t\t'] + list(prf_knn_norm_all[:-1]))

# Render the table: tab-separated cells, one row per line. isinstance() also
# matches numpy.float64 (a float subclass), which an exact type() comparison
# would miss, so scores are really written with four decimals.
op_counts = ''.join(
    ''.join(
        ('{:.4f}\t' if isinstance(col, float) else '{}\t').format(col)
        for col in row
    ) + '\n'
    for row in metrics
)

with open('Results/Normalized_metrics.txt', 'w') as fp:
    fp.write(op_counts)
                                             

In [ ]:
# training lasso regression model to obtain weights for columns
# features whose lasso weight is exactly zero are marked for dropping
# (drop_zero holds their column positions); the remaining feature names are
# saved to Results/selected_features.txt
# NOTE(review): Lasso treats the integer class codes as a numeric regression
# target - confirm this is the intended selection criterion.

lasso = skl.linear_model.Lasso(max_iter=10000).fit(X_train_counts, Y_train_counts)
lasso_coeff = lasso.coef_.tolist()
drop_zero = [pos for pos, weight in enumerate(lasso_coeff) if weight == 0]

# keep only the columns with a non-zero weight; writing every column would
# make the saved list identical to the unfiltered feature set
selected_features = [
    name
    for name, weight in zip(X_train_counts.columns.to_list(), lasso_coeff)
    if weight != 0
]

os.makedirs('Results', exist_ok=True)
with open('Results/selected_features.txt', 'w') as fp:
    fp.write(str(selected_features))

In [ ]:
# Remove, from both splits, the columns at the positions listed in drop_zero.
# The positions are turned into column labels first, then dropped by label.
# NOTE(review): re-running this cell drops further columns, because the
# positions in drop_zero then point at different labels.
_train_zero_cols = [X_train_counts.columns[pos] for pos in drop_zero]
_test_zero_cols = [X_test_counts.columns[pos] for pos in drop_zero]
X_train_counts = X_train_counts.drop(columns=_train_zero_cols)
X_test_counts = X_test_counts.drop(columns=_test_zero_cols)

In [ ]:
# getting metrics for selected features as counts
#
# Each classifier is fitted on X_train_counts / Y_train_counts and scored on
# X_test_counts / Y_test_counts as they stand when this cell runs. Its
# confusion matrix is saved under Figures/ and its weighted precision, recall
# and F-score become one row of Results/Count_selected_metrics.txt.
# NOTE(review): skl.metrics.plot_confusion_matrix only exists in older
# scikit-learn releases (removed in 1.2) - confirm the pinned version.

# make sure the output folders exist before anything is saved
os.makedirs('Figures', exist_ok=True)
os.makedirs('Results', exist_ok=True)

# tick labels for the confusion matrices: the integer keys of map_
tick_labels = list(map_)
# class labels included in the weighted averages
score_labels = [1, 2, 3, 4, 5, 6]

# classification labels
metrics = [['Classifier\t', 'Precision\t', 'Recall\t\t', 'F-Score']]

# logistic regression on counts
lr_counts = skl.linear_model.LogisticRegression(penalty='l2', solver='newton-cg', fit_intercept=True).fit(X_train_counts, Y_train_counts)
lr_counts_pred = lr_counts.predict(X_test_counts)  # predict once, reuse below
lr_counts_acc = skl.metrics.accuracy_score(Y_test_counts, lr_counts_pred)
plot_cm_lr_counts = skl.metrics.plot_confusion_matrix(lr_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_lr_counts.figure_.savefig('Figures/LR-Counts-selected-CM.png')
prf_lr_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, lr_counts_pred, labels=score_labels, average='weighted')
# the last element of the result is the support, which is not reported
metrics.append(['Logistic Regression'] + list(prf_lr_counts[:-1]))

# svm on counts
svm_counts = skl.svm.LinearSVC(penalty='l2').fit(X_train_counts, Y_train_counts)
svm_counts_pred = svm_counts.predict(X_test_counts)
svm_counts_acc = skl.metrics.accuracy_score(Y_test_counts, svm_counts_pred)
plot_cm_svm_counts = skl.metrics.plot_confusion_matrix(svm_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_svm_counts.figure_.savefig('Figures/SVM-Counts-selected-CM.png')
prf_svm_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, svm_counts_pred, labels=score_labels, average='weighted')
metrics.append(['SVM\t\t'] + list(prf_svm_counts[:-1]))

# random forest classifier on counts
rf_counts = skl.ensemble.RandomForestClassifier(criterion='gini', n_estimators=100).fit(X_train_counts, Y_train_counts)
rf_counts_pred = rf_counts.predict(X_test_counts)
rf_counts_acc = skl.metrics.accuracy_score(Y_test_counts, rf_counts_pred)
plot_cm_rf_counts = skl.metrics.plot_confusion_matrix(rf_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_rf_counts.figure_.savefig('Figures/RF-Counts-selected-CM.png')
prf_rf_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, rf_counts_pred, labels=score_labels, average='weighted')
metrics.append(['Random Forest\t'] + list(prf_rf_counts[:-1]))

# decision tree for counts
dtree_counts = skl.tree.DecisionTreeClassifier(criterion='gini').fit(X_train_counts, Y_train_counts)
dtree_counts_pred = dtree_counts.predict(X_test_counts)
dtree_counts_acc = skl.metrics.accuracy_score(Y_test_counts, dtree_counts_pred)
plot_cm_dtree_counts = skl.metrics.plot_confusion_matrix(dtree_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_dtree_counts.figure_.savefig('Figures/dtree-Counts-selected-CM.png')
prf_dtree_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, dtree_counts_pred, labels=score_labels, average='weighted')
# report the decision tree's own scores (not the logistic regression ones)
metrics.append(['Decision Tree\t'] + list(prf_dtree_counts[:-1]))

# knn classifier on counts
knn_counts = skl.neighbors.KNeighborsClassifier(algorithm='auto', metric='l1', n_neighbors=3).fit(X_train_counts, Y_train_counts)
knn_counts_pred = knn_counts.predict(X_test_counts)
knn_counts_acc = skl.metrics.accuracy_score(Y_test_counts, knn_counts_pred)
plot_cm_knn_counts = skl.metrics.plot_confusion_matrix(knn_counts, X_test_counts, Y_test_counts, display_labels=tick_labels, cmap=plt.cm.BuGn)
plot_cm_knn_counts.figure_.savefig('Figures/KNN-Counts-selected-CM.png')
prf_knn_counts = skl.metrics.precision_recall_fscore_support(Y_test_counts, knn_counts_pred, labels=score_labels, average='weighted')
metrics.append(['KNN\t\t'] + list(prf_knn_counts[:-1]))

# Render the table: tab-separated cells, one row per line. isinstance() also
# matches numpy.float64 (a float subclass), which an exact type() comparison
# would miss, so scores are really written with four decimals.
op_counts = ''.join(
    ''.join(
        ('{:.4f}\t' if isinstance(col, float) else '{}\t').format(col)
        for col in row
    ) + '\n'
    for row in metrics
)

with open('Results/Count_selected_metrics.txt', 'w') as fp:
    fp.write(op_counts)
