In [1]:
import argparse
import os
import sys

# The project package lives one directory up; this must run before the
# `tikuna` imports below.
sys.path.append("../")

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch.utils.data import DataLoader

from tikuna.models import Transformer
from tikuna.common.preprocess import FeatureExtractor
from tikuna.common.dataloader import load_sessions, log_dataset
from tikuna.common.utils import seed_everything, dump_final_results, dump_params

In [2]:
# Root directory of the captured mainnet logs. The default matches the
# original deployment layout; set TIKUNA_DATA_DIR to run the notebook against
# a copy of the data stored elsewhere.
DATA_DIR = os.environ.get("TIKUNA_DATA_DIR", "/home/tikuna/app/data/mainnet")

# Log captured under normal conditions and log captured during the
# single-node eclipse scenario, respectively.
NORMAL_LOG_DATA = os.path.join(DATA_DIR, "normal", "log", "normal.log")
ECLIPSE_LOG_DATA = os.path.join(DATA_DIR, "eclipse-single", "log", "eclipse.log")

In [3]:
# Column layout of the whitespace-separated log files (they carry no header).
LOG_COLUMNS = ["Timestamp", "Removed IP", "Removed Port",
               "Added IP", "Added Port", "Bucket", "label"]
# Columns handed to the feature extractor: everything except timestamp/label.
FEATURE_COLUMNS = LOG_COLUMNS[1:6]
TIMESTAMP_FORMAT = "[%Y-%m-%d|%H:%M:%S.%f]"
LABEL_TO_ID = {"normal": 0, "abnormal": 1}

# Positional split sizes: the first N_TRAIN_ROWS rows are used for training,
# the following N_TEST_NORMAL_ROWS rows join the eclipse log in the test set.
N_TRAIN_ROWS = 100000
N_TEST_NORMAL_ROWS = 1000


def read_peer_log(path):
    """Read one log file into a DataFrame and parse its ``Timestamp`` column.

    Args:
        path: Location of a whitespace-separated log file laid out as
            ``LOG_COLUMNS``.

    Returns:
        A DataFrame with columns ``LOG_COLUMNS`` whose ``Timestamp`` column
        has been converted with ``TIMESTAMP_FORMAT``.
    """
    # Raw string: '\s' is not a valid escape in a plain string literal.
    frame = pd.read_csv(path, sep=r"\s+", names=LOG_COLUMNS)
    frame["Timestamp"] = pd.to_datetime(frame["Timestamp"], format=TIMESTAMP_FORMAT)
    return frame


def encode_labels(frame):
    """Map the textual ``label`` column of ``frame`` to integer class ids.

    Both classes are encoded in one pass, so a slice that contains a mix of
    normal and abnormal rows never ends up with strings next to integers.

    Args:
        frame: DataFrame containing a ``label`` column.

    Returns:
        A one-column DataFrame named ``label`` (int64) sharing ``frame``'s index.

    Raises:
        ValueError: If ``frame`` contains a label missing from ``LABEL_TO_ID``.
    """
    unknown = set(frame["label"].unique()) - set(LABEL_TO_ID)
    if unknown:
        raise ValueError(
            "Unexpected label value(s): {}".format(sorted(unknown, key=str))
        )
    return frame["label"].map(LABEL_TO_ID).astype("int64").to_frame()


normal_only_data = read_peer_log(NORMAL_LOG_DATA)
abnormal_data = read_peer_log(ECLIPSE_LOG_DATA)

# Add abnormal data to the normal so we add the vocabulary.
# NOTE(review): the eclipse rows are appended at the END, and training only
# takes the first N_TRAIN_ROWS rows by position, so they only reach the
# training slice when the normal log is shorter than that — confirm this is
# the intended way to extend the vocabulary.
normal_data = pd.concat([normal_only_data, abnormal_data])
display(normal_data)
display(abnormal_data)

train_rows = normal_data.iloc[:N_TRAIN_ROWS]
test_normal_rows = normal_data.iloc[N_TRAIN_ROWS:N_TRAIN_ROWS + N_TEST_NORMAL_ROWS]

training_data = {
    "features": train_rows[FEATURE_COLUMNS],
    "label": encode_labels(train_rows),
}

# Test set: every eclipse row followed by the held-out slice after training.
testing_data = {
    "features": pd.concat([abnormal_data[FEATURE_COLUMNS],
                           test_normal_rows[FEATURE_COLUMNS]]),
    "label": pd.concat([encode_labels(abnormal_data),
                        encode_labels(test_normal_rows)]),
}

Unnamed: 0,Timestamp,Removed IP,Removed Port,Added IP,Added Port,Bucket,label
0,2023-01-28 01:13:41.042,162.55.1.114,11900,83.151.202.176,1024,256,normal
1,2023-01-28 01:13:41.042,3.80.226.93,9000,114.156.141.196,12000,256,normal
2,2023-01-28 01:13:41.042,68.170.92.177,52594,139.144.21.173,9000,256,normal
3,2023-01-28 01:13:41.042,54.209.226.233,9000,135.181.178.95,9000,256,normal
4,2023-01-28 01:13:41.042,18.224.59.77,9000,34.234.215.244,9000,256,normal
...,...,...,...,...,...,...,...
1420,2023-02-04 05:07:00.092,16.0.61.29,12000,205.185.120.171,12651,254,abnormal
1421,2023-02-04 07:51:06.105,54.238.108.184,33311,16.0.61.29,12000,254,abnormal
1422,2023-02-04 07:51:38.907,16.0.61.29,12000,162.55.134.100,49429,254,abnormal
1423,2023-02-04 08:50:19.653,34.147.78.9,12000,16.0.61.29,12000,254,abnormal


Unnamed: 0,Timestamp,Removed IP,Removed Port,Added IP,Added Port,Bucket,label
0,2023-02-03 21:53:11.906,149.56.240.35,9000,16.0.186.130,9000,252,abnormal
1,2023-02-03 21:53:11.907,35.207.99.26,9000,16.0.53.120,9000,255,abnormal
2,2023-02-03 21:53:11.909,43.135.40.73,12000,16.0.160.76,9000,252,abnormal
3,2023-02-03 21:53:11.909,100.27.30.226,9000,16.0.65.129,9000,256,abnormal
4,2023-02-03 21:53:11.910,34.229.79.57,39085,16.0.170.200,9000,255,abnormal
...,...,...,...,...,...,...,...
1420,2023-02-04 05:07:00.092,16.0.61.29,12000,205.185.120.171,12651,254,abnormal
1421,2023-02-04 07:51:06.105,54.238.108.184,33311,16.0.61.29,12000,254,abnormal
1422,2023-02-04 07:51:38.907,16.0.61.29,12000,162.55.134.100,49429,254,abnormal
1423,2023-02-04 08:50:19.653,34.147.78.9,12000,16.0.61.29,12000,254,abnormal


In [None]:
# Hyper-parameters are declared through argparse so every setting has a typed
# default; they are then flattened into the plain dict `params`.
parser = argparse.ArgumentParser()

##### Model params
parser.add_argument("--model_name", default="Transformer", type=str)
parser.add_argument("--hidden_size", default=20, type=int)
parser.add_argument("--num_layers", default=2, type=int)
parser.add_argument("--embedding_dim", default=10, type=int)
parser.add_argument("--nhead", default=2, type=int)

##### Dataset params
parser.add_argument("--dataset", default="Ethereum Mainnet", type=str)
parser.add_argument("--window_size", default=30, type=int)
parser.add_argument("--stride", default=1, type=int)

##### Input params
parser.add_argument("--feature_type", default="sequentials", type=str)
parser.add_argument("--use_attention", action="store_true")
parser.add_argument("--label_type", default="next_log", type=str)
parser.add_argument("--use_tfidf", action="store_true")
parser.add_argument("--max_token_len", default=50, type=int)
parser.add_argument("--min_token_count", default=1, type=int)

##### Training params
parser.add_argument("--epoches", default=10, type=int)
parser.add_argument("--batch_size", default=10, type=int)
parser.add_argument("--learning_rate", default=0.01, type=float)
parser.add_argument("--topk", default=5, type=int)
parser.add_argument("--patience", default=10, type=int)

##### Others
parser.add_argument("--random_seed", default=42, type=int)
parser.add_argument("--gpu", default=0, type=int)

# parse_known_args (rather than parse_args) returns unrecognised command-line
# arguments in `unknown` instead of exiting, so extra arguments on the
# process's command line do not abort the cell.
args, unknown = parser.parse_known_args()
params = vars(args)

# dump_params returns the location that is later handed to the model as
# `model_save_path`.
model_save_path = dump_params(params)

seed_everything(params["random_seed"])

ext = FeatureExtractor(**params)

session_train = ext.fit_transform(training_data, datatype="train")
# NOTE(review): this calls fit_transform (not a transform-only method) on the
# TEST split. The run log prints "Cannot load cached feature extractor." a
# second time right after the train extraction finishes, which suggests the
# extractor is re-fitted here; if so, `ext.meta_data` passed to the model
# below reflects the test data. Confirm against FeatureExtractor.
session_test = ext.fit_transform(testing_data, datatype="test")

dataset_train = log_dataset(session_train, feature_type=params["feature_type"])
dataloader_train = DataLoader(
    dataset_train, batch_size=params["batch_size"], shuffle=True, pin_memory=True
)

dataset_test = log_dataset(session_test, feature_type=params["feature_type"])
# Use the configured batch size (same value as the former literal 10 under the
# default) so --batch_size governs both loaders; order is kept for evaluation.
dataloader_test = DataLoader(
    dataset_test, batch_size=params["batch_size"], shuffle=False, pin_memory=True
)

model = Transformer(
    meta_data=ext.meta_data, model_save_path=model_save_path, **params
)

eval_results = model.fit(
    dataloader_train,
    test_loader=dataloader_test,
    epoches=params["epoches"],
    learning_rate=params["learning_rate"],
)

# Tab-separated "metric-value" summary of the evaluation results.
# Not consumed anywhere else in this cell.
result_str = "\t".join(["{}-{:.4f}".format(k, v) for k, v in eval_results.items()])

# Subset of parameters worth reporting next to the results.
# "train_anomaly_ratio" is not defined by the parser above, so it only shows
# up if something else adds that key to `params`.
key_info = [
    "dataset",
    "train_anomaly_ratio",
    "feature_type",
    "label_type",
    "use_attention",
]

# Tab-separated "name:value" summary of the parameters listed in key_info.
# Not consumed anywhere else in this cell.
args_str = "\t".join(
    ["{}:{}".format(k, v) for k, v in params.items() if k in key_info]
)

dump_final_results(params, eval_results, model)

2023-02-10 01:06:59,828 P2293669 INFO {
    "model_name": "Transformer",
    "hidden_size": 20,
    "num_layers": 2,
    "embedding_dim": 10,
    "nhead": 2,
    "dataset": "Ethereum Mainnet",
    "window_size": 30,
    "stride": 1,
    "feature_type": "sequentials",
    "use_attention": false,
    "label_type": "next_log",
    "use_tfidf": false,
    "max_token_len": 50,
    "min_token_count": 1,
    "epoches": 10,
    "batch_size": 10,
    "learning_rate": 0.01,
    "topk": 5,
    "patience": 10,
    "random_seed": 42,
    "gpu": 0,
    "hash_id": "a7bbf3be"
}
2023-02-10 01:06:59,830 P2293669 INFO Cannot load cached feature extractor.
2023-02-10 01:07:04,463 P2293669 INFO 22355 words are found.
2023-02-10 01:07:04,466 P2293669 INFO Transforming train data.
2023-02-10 01:09:31,041 P2293669 INFO 499940 sliding windows generated.
2023-02-10 01:09:34,796 P2293669 INFO Finish feature extraction (train).
2023-02-10 01:09:34,798 P2293669 INFO Cannot load cached feature extractor.
2023-02-10