In [1]:
import pandas as pd

## Load dataset

In [2]:
df = pd.read_csv("exploit_hacking_set Data Scientist.csv",
                 converters={"type": str, "label": str, "sub_label": str, "text": str, "title": str, "site": str})


In [3]:
df.dtypes

type            object
label           object
sub_label       object
text            object
title           object
site            object
spont_score    float64
dtype: object

In [4]:
df

Unnamed: 0,type,label,sub_label,text,title,site,spont_score
0,post,Hacking,,i haven t seen a post about this rat lately . ...,spynote v android rat,forum_sinisterly,60.869565
1,post,Fraud,,looking for us partner simple extra money add ...,looking for us partner simple extra money,forum_alphabay,-30.434783
2,post,Fraud,Carding,i have drop address in us eu for who need for ...,for who needs drop address in us eu,forum_alphabay,-108.695652
3,post,,,have a closed prefix on the thread names after...,suggestions,forum_sinisterly,-26.086957
4,post,Computer,Coding,hey guys . i was working on an updated mass tw...,php leaks / downloads,forum_leakforums,-4.347826
...,...,...,...,...,...,...,...
10330,,,,,,,
10331,,,,,,,
10332,,,,,,,
10333,,,,,,,


### Convert categorical features to one hot

In [5]:
one_hot_type = pd.get_dummies(df['type'])
one_hot_site = pd.get_dummies(df['site'])
one_hot_spont_score = pd.get_dummies(df['spont_score'])

df_X = pd.concat([one_hot_type, one_hot_site, one_hot_spont_score], axis=1)

df_X["spont_score"] = df['spont_score']

# add simple text-based features
df_X["text_len"] = df['text'].apply(len)
df_X["title_len"] = df['title'].apply(len)

# create targets for Naive and Flat approaches
y_hac = df['label'] == 'Hacking'
y_hac = y_hac.astype(int)
y_exp = df['sub_label'] == 'Exploit'
y_exp = y_exp.astype(int)

y_hac_exp = df['sub_label'] == 'Exploit'
y_hac_exp[y_hac == 1] = 1
y_hac_exp[y_exp == 1] = 2
y_hac_exp = y_hac_exp.astype(int)

In [6]:
y_hac.value_counts()

0    8590
1    1745
Name: label, dtype: int64

In [7]:
y_exp.value_counts()

0    9581
1     754
Name: sub_label, dtype: int64

In [8]:
y_hac_exp.value_counts()

0    8590
1     991
2     754
Name: sub_label, dtype: int64

In [9]:
# fill missing values
df_X_isna = df_X.isna()
for name in df_X.columns:
    print(f"{name}: {df_X_isna[name].sum()}")

df_X["spont_score"] = df_X["spont_score"].fillna(value=0.239)
print(f"spont_score: {df_X.isna()['spont_score'].sum()}")

:     0
    0
dtype: int64
post: 0
product: 0
reply: 0
:     0
    0
dtype: int64
cc_market_approved_su: 0
cc_market_bigbase: 0
cc_market_bigmoney: 0
cc_market_bingo: 0
cc_market_cv2: 0
cc_market_ferumshop: 0
cc_market_goldenshop: 0
cc_market_goswipe: 0
cc_market_hustlerbank: 0
cc_market_jokersstash: 0
cc_market_tormarket: 0
cc_market_validcc: 0
cc_market_wuzzup: 0
dread: 0
forum_0day: 0
forum_0x00sec: 0
forum_2cto: 0
forum_51cto: 0
forum_52pojie: 0
forum_8chan: 0
forum_agora: 0
forum_aljyyosh: 0
forum_alkrsan: 0
forum_alligatorcash: 0
forum_alphabay: 0
forum_altenen: 0
forum_antichat: 0
forum_antionline: 0
forum_ashiyane: 0
forum_bestblackhat: 0
forum_bestblackhat_vip: 0
forum_bhf: 0
forum_bitcointalk: 0
forum_bitshacking: 0
forum_blackhacker: 0
forum_blackhatworld: 0
forum_blackstuff: 0
forum_blackworld: 0
forum_bpcsquad: 0
forum_breachforums: 0
forum_breachteam: 0
forum_brutezone: 0
forum_carder: 0
forum_cardingmafia: 0
forum_cardvilla: 0
forum_ccc: 0
forum_cctry: 0
forum_cebulka: 0

In [10]:
X = df_X.values

### Cross-validation able to work with Naive and Flat approaches

In [11]:
import numpy as np
from cross_val import cross_val

In [12]:
from sklearn.ensemble import RandomForestClassifier

### RandomForest (Hacking)

In [13]:
clf_factory = lambda: RandomForestClassifier(n_estimators=1000, random_state=0)
cross_val(clf_factory, X, y_hac)

Cross Validation k_folds=5, n_classes=2:
	0 pr: 0.673728813559322 rec: 0.452991452991453 f1: 0.5417376490630325
	1 pr: 0.625 rec: 0.44072948328267475 f1: 0.5169340463458111
	2 pr: 0.6383763837638377 rec: 0.4914772727272727 f1: 0.5553772070626003
	3 pr: 0.6752136752136753 rec: 0.45272206303724927 f1: 0.5420240137221269
	4 pr: 0.6940298507462687 rec: 0.510989010989011 f1: 0.5886075949367089
avg: pr=0.6612697446566207 rec=0.4697818566055322 f1=0.5489361022260559


### RandomForest (Exploit)

In [50]:
clf_factory = lambda: RandomForestClassifier(n_estimators=1000, random_state=0)
cross_val(clf_factory, X, y_exp)

Cross Validation k_folds=5, n_classes=2:
	0 pr: 0.8214285714285714 rec: 0.6433566433566433 f1: 0.7215686274509804
	1 pr: 0.8245614035087719 rec: 0.6438356164383562 f1: 0.7230769230769231
	2 pr: 0.7622950819672131 rec: 0.6118421052631579 f1: 0.6788321167883211
	3 pr: 0.8214285714285714 rec: 0.6216216216216216 f1: 0.7076923076923075
	4 pr: 0.8148148148148148 rec: 0.6666666666666666 f1: 0.7333333333333333
avg: pr=0.8089056886295886 rec=0.6374645306692891 f1=0.7129006616683731


### RandomForest (Hacking & Exploit - Flat approach)

In [51]:
clf_factory = lambda: RandomForestClassifier(n_estimators=1000, random_state=0)
cross_val(clf_factory, X, y_hac_exp)

Cross Validation k_folds=5, n_classes=3:
	0 pr: [0.89142237 0.48571429 0.8       ] rec: [0.95687646 0.24519231 0.67132867] f1: [0.92299044 0.32587859 0.73003802]
	1 pr: [0.90054348 0.38461538 0.81300813] rec: [0.95339471 0.21857923 0.68493151] f1: [0.92621576 0.27874564 0.74349442]
	2 pr: [0.90083102 0.47244094 0.74074074] rec: [0.94810496 0.3        0.65789474] f1: [0.92386364 0.36697248 0.69686411]
	3 pr: [0.89202387 0.42857143 0.80357143] rec: [0.95692666 0.23880597 0.60810811] f1: [0.92333614 0.30670927 0.69230769]
	4 pr: [0.89767699 0.46153846 0.8028169 ] rec: [0.95302408 0.27135678 0.69090909] f1: [0.92452293 0.34177215 0.74267101]
avg: pr=[0.89649955 0.4465761  0.79202744] rec=[0.95366537 0.25478686 0.66263442] f1=[0.92418578 0.32401563 0.72107505]


### LogisticRegression (Hacking)

In [52]:
from sklearn.linear_model import LogisticRegression

clf_factory = lambda: LogisticRegression(solver='liblinear')
cross_val(clf_factory, X, y_hac)

Cross Validation k_folds=5, n_classes=2:
	0 pr: 0.725 rec: 0.4131054131054131 f1: 0.5263157894736842
	1 pr: 0.7643979057591623 rec: 0.44376899696048633 f1: 0.5615384615384615
	2 pr: 0.6995884773662552 rec: 0.48295454545454547 f1: 0.5714285714285714
	3 pr: 0.7603686635944701 rec: 0.47277936962750716 f1: 0.5830388692579506
	4 pr: 0.7261904761904762 rec: 0.5027472527472527 f1: 0.594155844155844
avg: pr=0.7351091045820728 rec=0.46307111557904096 f1=0.5672955071709024


### CatBoost (Hacking)

In [53]:
from catboost import CatBoostClassifier

clf_factory = lambda: CatBoostClassifier(iterations=2000,
                                         learning_rate=0.1,
                                         loss_function='Logloss',
                                         verbose=False)

cross_val(clf_factory, X, y_hac)

Cross Validation k_folds=5, n_classes=2:
	0 pr: 0.7283950617283951 rec: 0.5042735042735043 f1: 0.5959595959595959
	1 pr: 0.6824034334763949 rec: 0.48328267477203646 f1: 0.5658362989323843
	2 pr: 0.6690647482014388 rec: 0.5284090909090909 f1: 0.5904761904761905
	3 pr: 0.7191489361702128 rec: 0.48424068767908307 f1: 0.5787671232876712
	4 pr: 0.6872727272727273 rec: 0.5192307692307693 f1: 0.5915492957746479
avg: pr=0.6972569813698337 rec=0.5038873453728968 f1=0.584517700886098


### CatBoost (Exploit)

In [54]:
clf_factory = lambda: CatBoostClassifier(iterations=2000,
                                         learning_rate=0.1,
                                         loss_function='Logloss',
                                         verbose=False)

cross_val(clf_factory, X, y_exp)

Cross Validation k_folds=5, n_classes=2:
	0 pr: 0.8091603053435115 rec: 0.7412587412587412 f1: 0.7737226277372262
	1 pr: 0.7737226277372263 rec: 0.726027397260274 f1: 0.7491166077738517
	2 pr: 0.7171052631578947 rec: 0.7171052631578947 f1: 0.7171052631578947
	3 pr: 0.7910447761194029 rec: 0.7162162162162162 f1: 0.7517730496453902
	4 pr: 0.7770700636942676 rec: 0.7393939393939394 f1: 0.7577639751552795
avg: pr=0.7736206072104606 rec=0.7280003114574132 f1=0.7498963046939284


### CatBoost (Hacking & Exploit - Flat approach)

In [56]:
clf_factory = lambda: CatBoostClassifier(iterations=2000,
                                         learning_rate=0.1,
                                         # loss_function='Logloss',
                                         verbose=False)

cross_val(clf_factory, X, y_hac_exp)

Cross Validation k_folds=5, n_classes=3:
	0 pr: [0.89658906 0.57777778 0.77692308] rec: [0.96503497 0.25       0.70629371] f1: [0.92955375 0.34899329 0.73992674]
	1 pr: [0.90830168 0.5        0.77536232] rec: [0.96317606 0.23497268 0.73287671] f1: [0.93493438 0.3197026  0.75352113]
	2 pr: [0.9044975  0.51260504 0.72108844] rec: [0.94985423 0.305      0.69736842] f1: [0.92662116 0.38244514 0.7090301 ]
	3 pr: [0.900271   0.53608247 0.824     ] rec: [0.96682189 0.25870647 0.69594595] f1: [0.93236037 0.34899329 0.75457875]
	4 pr: [0.9016031  0.51546392 0.77018634] rec: [0.95772167 0.25125628 0.75151515] f1: [0.92881549 0.33783784 0.7607362 ]
avg: pr=[0.90225247 0.52838584 0.77351203] rec=[0.96052176 0.25998709 0.71679999] f1=[0.93045703 0.34759443 0.74355858]
