In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
%matplotlib notebook
sys.path.append("..")
from data.loaders import load_capas_from_jsons, get_train_test_data

In [2]:
def _prepare_dfs_for_model(cats_df, column):
    cats_df['dummy'] = 1
    features_df = pd.pivot_table(cats_df, values='dummy', index=['uid'], columns=[column], aggfunc=np.sum, fill_value=0)
    features_df = (features_df>=1).astype(int)
    features_df['label'] = capas_df.groupby('uid').label.first()
    return features_df

In [3]:
base_dir = r"C:\Users\stav\data\whodis\parsed\CAPAs"
train_test_split_dir = r'C:\Users\stav\data\whodis\train_test_split'

In [17]:
cat_dfs= load_capas_from_jsons(base_dir)
capas_df = cat_dfs['capas']
mbcs_df = cat_dfs['mbc']

Loading json files: 100%|█████████████████████████████████████████████████████████| 4506/4506 [00:05<00:00, 798.00it/s]


In [18]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

features_encoder = LabelEncoder(); features_encoder.fit(capas_df['rule'].append(mbcs_df['objective']))

capas_df['rule']=features_encoder.transform(capas_df.rule)
mbcs_df['objective']=features_encoder.transform(mbcs_df.objective)

  features_encoder = LabelEncoder(); features_encoder.fit(capas_df['rule'].append(mbcs_df['objective']))


In [19]:
# Prepare 
capas_features_df = _prepare_dfs_for_model(capas_df, column='rule')
mbcs_features_df = _prepare_dfs_for_model(mbcs_df,column='objective')

In [20]:
train_split, test_split = get_train_test_data(train_test_split_dir, ver='v2')

In [21]:
features_opts = 'both' #['capas','mbc','both']
if features_opts=='both':
    features_df = pd.concat([capas_features_df,mbcs_features_df.drop(columns=['label'])],axis=1).fillna(int(0))
    feature_names = features_df.drop(columns=['label']).columns
    features_df[feature_names]=features_df[feature_names].astype(int)
elif features_opts=='capas':
    features_df = capas_features_df
elif features_opts=='mbc':
    features_df = mbcs_features_df

In [22]:
split_type= 'family_as_bg' #['flat','family_as_bg']
if split_type=='flat':
    pass
elif split_type=='family_as_bg':
    families = ['orcus', '7ev3n', 'Emotet', 'Conti', 'SugarRansomware']
    apts = ['cozy','veno']
    
    transform_dict = {f:'family' for f in families}
    for apt in apts:
        transform_dict[apt]=apt
    features_df['label'] = features_df.label.apply(transform_dict.get)

In [23]:
label_names = features_df['label'].unique()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(features_df.label)
features_df['label'] = le.transform(features_df.label)
label_names

array(['veno', 'family', 'cozy'], dtype=object)

In [24]:
train_index  = np.intersect1d(train_split.index, features_df.index)
test_index  = np.intersect1d(test_split.index, features_df.index)
features_train = features_df.loc[train_index]
features_test = features_df.loc[test_index]

In [25]:
from catboost import CatBoost, CatBoostRegressor, Pool
train_pool = Pool(features_train.drop(['label'], 1), 
                  label=features_train.label,
                 cat_features=features_train.drop(columns=['label']).columns.to_list())
test_pool = Pool(features_test.drop(['label'], 1),
                 label=features_test.label,
                cat_features=features_train.drop(columns=['label']).columns.to_list())

  train_pool = Pool(features_train.drop(['label'], 1),
  test_pool = Pool(features_test.drop(['label'], 1),


In [44]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.1,
    #auto_class_weights='Balanced',
    reg_lambda=6.0,
    #bootstrap_type='MVS',
    grow_policy='Lossguide',
    depth=12,
    max_leaves=24,
    colsample_bylevel=0.5,
    loss_function='MultiClassOneVsAll'
)

In [None]:
from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm
summary = model.select_features(
        train_pool,
        eval_set=test_pool,
        features_for_select=features_train.drop(columns=['label']).columns.to_list(),     # we will select from all features
        num_features_to_select=30,  # we want to select exactly important features
        steps=5,                                     # more steps - more accurate selection
        algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
        shap_calc_type=EShapCalcType.Regular,            # can be Approximate, Regular and Exact
        train_final_model=True,                          # to train model with selected features
        logging_level='Verbose',
        plot=True
    )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Step #1 out of 5
0:	learn: 0.6626223	test: 0.6639333	best: 0.6639333 (0)	total: 12.1ms	remaining: 36.2s
1:	learn: 0.6332182	test: 0.6361876	best: 0.6361876 (1)	total: 27.9ms	remaining: 41.9s
2:	learn: 0.6040280	test: 0.6077185	best: 0.6077185 (2)	total: 43.2ms	remaining: 43.2s
3:	learn: 0.5835824	test: 0.5874348	best: 0.5874348 (3)	total: 56.1ms	remaining: 42s
4:	learn: 0.5630997	test: 0.5703363	best: 0.5703363 (4)	total: 69.9ms	remaining: 41.9s
5:	learn: 0.5484143	test: 0.5563438	best: 0.5563438 (5)	total: 81.2ms	remaining: 40.5s
6:	learn: 0.5335825	test: 0.5422385	best: 0.5422385 (6)	total: 91.8ms	remaining: 39.2s
7:	learn: 0.5214708	test: 0.5308149	best: 0.5308149 (7)	total: 106ms	remaining: 39.7s
8:	learn: 0.5084989	test: 0.5185812	best: 0.5185812 (8)	total: 118ms	remaining: 39.1s
9:	learn: 0.4959825	test: 0.5066066	best: 0.5066066 (9)	total: 128ms	remaining: 38.3s
10:	learn: 0.4884116	test: 0.4992304	best: 0.4992304 (10)	total: 140ms	remaining: 38.2s
11:	learn: 0.4812027	test: 0.4

In [36]:
selected_features = summary['selected_features']
X_train = features_train[selected_features]
y_train = features_train.label
X_test = features_test[selected_features]
y_test = features_test.label

[16,
 27,
 35,
 43,
 44,
 67,
 70,
 74,
 81,
 90,
 114,
 130,
 166,
 167,
 175,
 176,
 181,
 208,
 235,
 238,
 245,
 255,
 256,
 264,
 294,
 296,
 310,
 328,
 330,
 332]

In [41]:
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=True,
    plot=True
)
print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6622266	test: 0.6626331	best: 0.6626331 (0)	total: 3.38ms	remaining: 10.1s
1:	learn: 0.6363851	test: 0.6392119	best: 0.6392119 (1)	total: 7.09ms	remaining: 10.6s
2:	learn: 0.6141137	test: 0.6185392	best: 0.6185392 (2)	total: 10.7ms	remaining: 10.7s
3:	learn: 0.5929788	test: 0.5991047	best: 0.5991047 (3)	total: 13.4ms	remaining: 10.1s
4:	learn: 0.5797650	test: 0.5866447	best: 0.5866447 (4)	total: 17ms	remaining: 10.2s
5:	learn: 0.5661114	test: 0.5743218	best: 0.5743218 (5)	total: 20.1ms	remaining: 10.1s
6:	learn: 0.5539928	test: 0.5626957	best: 0.5626957 (6)	total: 23.4ms	remaining: 10s
7:	learn: 0.5440635	test: 0.5535383	best: 0.5535383 (7)	total: 26.2ms	remaining: 9.8s
8:	learn: 0.5329475	test: 0.5441342	best: 0.5441342 (8)	total: 29.8ms	remaining: 9.89s
9:	learn: 0.5241037	test: 0.5366149	best: 0.5366149 (9)	total: 33.3ms	remaining: 9.94s
10:	learn: 0.5172603	test: 0.5306914	best: 0.5306914 (10)	total: 35.9ms	remaining: 9.75s
11:	learn: 0.5117388	test: 0.5260949	best: 0.5

In [42]:
predictions = model.predict(features_test.drop(columns=['label']))
labels = features_test.label.values[:,np.newaxis]

print('Test Accuracy {}'.format((predictions==labels).sum()/labels.shape[0]))

Test Accuracy 0.7350427350427351


In [94]:
from catboost import Pool, CatBoostClassifier
from catboost.utils import get_confusion_matrix
cm = get_confusion_matrix(model, test_pool)
print(cm)

CatBoostError: Model was already fitted. Set train_final_model to False or use not fitted model.

In [12]:
import time
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=60, n_iter=3000)
tsne_results = tsne.fit_transform(features_test.drop(columns=['label']))



[t-SNE] Computing 181 nearest neighbors...
[t-SNE] Indexed 545 samples in 0.000s...
[t-SNE] Computed neighbors for 545 samples in 0.009s...
[t-SNE] Computed conditional probabilities for sample 545 / 545
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 43.673325
[t-SNE] KL divergence after 3000 iterations: 0.055862


In [13]:

features_test['tsne-2d-one'] = tsne_results[:,0]
features_test['tsne-2d-two'] = tsne_results[:,1]
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="label",
    #palette=sns.color_palette("hls", 5),
    data=features_test,
    legend="full",
    alpha=0.3
)

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='tsne-2d-one', ylabel='tsne-2d-two'>

In [82]:
sns.color_palette("hls", 5)

In [103]:
import time
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=60, n_iter=3000)
tsne_results = tsne.fit_transform(features_df.drop(columns=['label']))



[t-SNE] Computing 181 nearest neighbors...
[t-SNE] Indexed 2147 samples in 0.001s...
[t-SNE] Computed neighbors for 2147 samples in 0.083s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2147
[t-SNE] Computed conditional probabilities for sample 2000 / 2147
[t-SNE] Computed conditional probabilities for sample 2147 / 2147
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 38.791409
[t-SNE] KL divergence after 2950 iterations: 0.087856


In [107]:

features_df['tsne-2d-one'] = tsne_results[:,0]
features_df['tsne-2d-two'] = tsne_results[:,1]
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="label",
    #palette=sns.color_palette("hls", 5),
    data=features_df,
    legend="full",
    alpha=0.3
)

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='tsne-2d-one', ylabel='tsne-2d-two'>

In [106]:
!pip install ipython

