In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
# Root folder of the prepared datasets, relative to this notebook.
DATA_FOLDER = "../../../DATA/"

# Read the three prepared splits (train, test, validation) in that order.
train, test, val = (
    pd.read_parquet(DATA_FOLDER + parquet_name)
    for parquet_name in (
        'FINAL/TRAIN_FINISHED.parq',
        'FINAL/TEST_FINISHED.parq',
        'FINAL/VAL_FINISHED.parq',
    )
)

In [3]:
# Show the column labels of the training frame.
train.columns

Index(['port_count', 'HAS_TCP?', 'HAS_UDP?', 'risky_ports', 'label',
       'attacker_as_name', 'watcher_as_name', 'attacker_country',
       'joined_countries_0', 'joined_countries_1', 'joined_countries_2',
       'joined_countries_3', 'joined_countries_4', 'joined_countries_5',
       'joined_countries_6', 'joined_countries_7', 'joined_countries_8',
       'joined_countries_9', 'joined_countries_10', 'joined_countries_11',
       'joined_countries_12', 'protocol_0', 'protocol_1', 'protocol_2',
       'protocol_3', 'crawl', 'exploit', 'scan', 'spam', 'unknown', 'nan'],
      dtype='object')

In [4]:
# Columns that are never used as model inputs: the target itself plus the
# columns this notebook excludes from every split.
NON_FEATURE_COLUMNS = ['label', "attacker_ip_enum", 'HAS_TCP?', 'HAS_UDP?', 'watcher_as_name']


def _features_and_label(frame):
    """Return ``(features, label)`` for one split.

    ``features`` keeps every column of ``frame`` that is not listed in
    ``NON_FEATURE_COLUMNS``; ``label`` is the ``'label'`` column.
    """
    feature_mask = ~frame.columns.isin(NON_FEATURE_COLUMNS)
    return frame.loc[:, feature_mask], frame.loc[:, 'label']


X_train, Y_train = _features_and_label(train)
X_test, Y_test = _features_and_label(test)
X_val, Y_val = _features_and_label(val)

# The full train/test frames are no longer needed once they are split.
del train, test

In [5]:
# NOTE(review): `model` is never fitted or referenced again in the visible cells.
model = RandomForestClassifier()

# One fixed hyper-parameter configuration (a single setting, despite the "grid" name).
param_grid = dict(
    n_estimators=50,
    criterion='entropy',
    max_depth=None,
    min_samples_split=6,
    min_samples_leaf=3,
    max_features='log2',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight='balanced',
    random_state=1996,
)

# Fit the forest on the training split; verbose=3 logs per-tree progress.
best_model = RandomForestClassifier(verbose=3, **param_grid)
best_model.fit(X_train, Y_train)

# Score the validation split first, then the test split.
y_val_pred = best_model.predict(X_val)
f1_val = f1_score(Y_val, y_val_pred)

y_test_pred = best_model.predict(X_test)
f1_test = f1_score(Y_test, y_test_pred)

# Report both scores together with the configuration and the feature columns used.
for report_line in (
    f'Validation F1 Score: {f1_val}',
    f'Test F1 Score: {f1_test}',
    param_grid,
    X_train.columns,
):
    print(report_line)

building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50


[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:  2.2min


building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:    9.6s
[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:   20.9s


Validation F1 Score: 0.6530388411936009
Test F1 Score: 0.6627821706414998
{'n_estimators': 50, 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'class_weight': 'balanced', 'random_state': 1996}
Index(['port_count', 'risky_ports', 'attacker_as_name', 'attacker_country',
       'joined_countries_0', 'joined_countries_1', 'joined_countries_2',
       'joined_countries_3', 'joined_countries_4', 'joined_countries_5',
       'joined_countries_6', 'joined_countries_7', 'joined_countries_8',
       'joined_countries_9', 'joined_countries_10', 'joined_countries_11',
       'joined_countries_12', 'protocol_0', 'protocol_1', 'protocol_2',
       'protocol_3', 'crawl', 'exploit', 'scan', 'spam', 'unknown', 'nan'],
      dtype='object')


Validation F1 Score: 0.65312683194342
Test F1 Score: 0.6616880889122925
{'n_estimators': 50, 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'class_weight': 'balanced', 'random_state': 1996}
Index(['port_count', 'risky_ports', 'attacker_as_name', 'attacker_country',
       'joined_countries_0', 'joined_countries_1', 'joined_countries_2',
       'joined_countries_3', 'joined_countries_4', 'joined_countries_5',
       'joined_countries_6', 'joined_countries_7', 'joined_countries_8',
       'joined_countries_9', 'joined_countries_10', 'joined_countries_11',
       'joined_countries_12', 'protocol_0', 'protocol_1', 'protocol_2',
       'protocol_3', 'crawl', 'exploit', 'scan', 'spam', 'unknown', 'nan'],
      dtype='object')

      

In [6]:

# Load the competition set to be predicted and show how many rows it has.
FINAL_TEST = pd.read_parquet(DATA_FOLDER + 'FINAL/COMP_TEST_FINISHED.parq')
FINAL_TEST.shape[0]

18682297

In [7]:
# Display the competition frame as the cell output.
FINAL_TEST

Unnamed: 0,port_count,risky_ports,attacker_ip_enum,attacker_as_name,attacker_country,joined_countries_0,joined_countries_1,joined_countries_2,joined_countries_3,joined_countries_4,...,protocol_0,protocol_1,protocol_2,protocol_3,crawl,exploit,scan,spam,unknown,nan
0,17,True,7696,0.490258,0.159045,0,0,0,0,0,...,0,0,0,1,False,False,False,True,False,False
1,17,True,7696,0.490258,0.159045,0,0,0,0,0,...,0,0,0,1,False,True,False,False,False,False
2,17,True,7696,0.490258,0.159045,0,0,0,0,0,...,0,0,0,1,False,False,False,False,False,False
3,17,True,7696,0.490258,0.159045,0,0,0,0,0,...,0,0,0,1,False,False,False,False,False,False
4,1,False,7543,0.399279,0.304671,0,0,0,0,0,...,0,0,0,1,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18682292,0,False,198081,0.000000,0.216196,0,0,0,0,0,...,0,0,0,1,False,False,True,False,False,False
18682293,0,False,198081,0.000000,0.216196,0,0,0,0,0,...,0,0,0,1,False,False,True,False,False,False
18682294,0,False,198081,0.000000,0.216196,0,0,0,0,0,...,0,0,0,1,False,False,True,False,False,False
18682295,0,False,198081,0.000000,0.216196,0,0,0,0,0,...,0,0,0,1,False,False,True,False,False,False


In [8]:
# Count the missing values in each column of the competition frame.
nan_count_per_column = FINAL_TEST.isnull().sum(axis=0)
nan_count_per_column


port_count             0
risky_ports            0
attacker_ip_enum       0
attacker_as_name       0
attacker_country       0
joined_countries_0     0
joined_countries_1     0
joined_countries_2     0
joined_countries_3     0
joined_countries_4     0
joined_countries_5     0
joined_countries_6     0
joined_countries_7     0
joined_countries_8     0
joined_countries_9     0
joined_countries_10    0
joined_countries_11    0
joined_countries_12    0
protocol_0             0
protocol_1             0
protocol_2             0
protocol_3             0
crawl                  0
exploit                0
scan                   0
spam                   0
unknown                0
nan                    0
dtype: int64

In [9]:
# Feed the model exactly the columns it was fitted on, in the same order.
# Selecting by X_train.columns (instead of "everything except attacker_ip_enum")
# raises a KeyError if a training feature is missing and ignores any extra
# column, rather than depending on FINAL_TEST happening to have a matching layout.
FINAL_PREDICT = best_model.predict(FINAL_TEST.loc[:, X_train.columns])

[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:   59.3s


In [10]:
# Attach the predictions to the rows they were computed from. The predictions
# are positional, so the Series is given FINAL_TEST's own index; with a default
# RangeIndex, pd.concat(axis=1) would align on labels and could mis-pair rows
# (and introduce NaN) whenever FINAL_TEST's index is not 0..n-1.
FINAL_PREDICT = pd.Series(FINAL_PREDICT, index=FINAL_TEST.index, name='prediction')
FINAL_PREDICT = pd.concat([FINAL_TEST['attacker_ip_enum'], FINAL_PREDICT], axis=1)

In [11]:
# Collapse the row-level predictions to one row per attacker IP.
# For each predicted class, keep the first row seen for every IP.
flagged_ips = FINAL_PREDICT.loc[FINAL_PREDICT['prediction'] == 1].drop_duplicates(subset='attacker_ip_enum')
clean_ips = FINAL_PREDICT.loc[FINAL_PREDICT['prediction'] == 0].drop_duplicates(subset='attacker_ip_enum')

# The row-level frame is not needed past this point.
del FINAL_PREDICT

# An IP that has at least one row predicted 1 stays 1, so remove it from the 0 set.
already_flagged = clean_ips['attacker_ip_enum'].isin(flagged_ips['attacker_ip_enum'])
clean_ips = clean_ips[~already_flagged]

# Stack the IPs predicted 1 first, followed by the IPs predicted only 0.
final_result = pd.concat([flagged_ips, clean_ips])

del flagged_ips, clean_ips, already_flagged

final_result

Unnamed: 0,attacker_ip_enum,prediction
0,7696,1
5,7543,1
189,2083,1
337,6232,1
573,8206,1
...,...,...
18626746,196304,0
18638235,199918,0
18639976,192446,0
18662560,192056,0


In [12]:
# Sanity check: list rows whose IP identifier is missing. Comparing the raw
# column to the string "nan" never matches a real NaN in a numeric column, so
# compare the string form instead, which catches both a real NaN and a literal
# "nan" string.
final_result[final_result["attacker_ip_enum"].astype(str) == "nan"]

Unnamed: 0,attacker_ip_enum,prediction


In [13]:
# Number of rows in the per-IP result.
len(final_result)

49420

In [14]:
# Order the per-IP result by the IP identifier, ascending.
final_result = final_result.sort_values(by='attacker_ip_enum')

final_result

Unnamed: 0,attacker_ip_enum,prediction
21502,5,0
191,7,0
665517,21,0
72758,29,0
37390,33,0
...,...,...
17869691,199947,0
17977618,199949,0
17862033,199962,0
17857346,199964,0


In [15]:
# Count the missing values in each column of the per-IP result.
final_result.isna().sum()


attacker_ip_enum    0
prediction          0
dtype: int64

In [16]:
# Rename the prediction column to 'label'.
final_result = final_result.rename({'prediction': 'label'}, axis='columns')

In [17]:
# Replace the index left over from the row-level frame with a fresh 0..n-1 one.
final_result = final_result.reset_index(drop=True)
# Selecting an empty list of columns displays the index only.
final_result[[]]

0
1
2
3
4
...
49415
49416
49417
49418
49419


In [18]:
# Preview the two columns that are written to the CSV in the last cell.
final_result[["attacker_ip_enum", "label"]]

Unnamed: 0,attacker_ip_enum,label
0,5,0
1,7,0
2,21,0
3,29,0
4,33,0
...,...,...
49415,199947,0
49416,199949,0
49417,199962,0
49418,199964,0


In [19]:
# Display only: the result is not assigned, so final_result itself is unchanged.
final_result.reset_index()

Unnamed: 0,index,attacker_ip_enum,label
0,0,5,0
1,1,7,0
2,2,21,0
3,3,29,0
4,4,33,0
...,...,...,...
49415,49415,199947,0
49416,49416,199949,0
49417,49417,199962,0
49418,49418,199964,0


In [20]:
# Write the IP identifier and its label to CSV (the index is written as well).
final_result.to_csv('./FINAL_PREDICT.csv', columns=["attacker_ip_enum", "label"])