In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC, OneClassSVM
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix

In [2]:
# Sizes of the two contiguous label segments in the file: the first
# N_POSITIVE_ROWS rows are labelled 1 and the remaining N_NEGATIVE_ROWS rows -1.
# NOTE(review): which traffic type each label denotes is not visible here;
# presumably 1 = normal and -1 = anomalous -- confirm against the data source.
N_POSITIVE_ROWS = 71_000
N_NEGATIVE_ROWS = 29_000

# The CSV has no header row, so pandas names the feature columns 0..n-1.
df = pd.read_csv('mirai3.csv', header=None)

# The labels are positional, so they are only meaningful when the file has
# exactly the expected number of rows. Check up front for a readable error.
expected_rows = N_POSITIVE_ROWS + N_NEGATIVE_ROWS
if len(df) != expected_rows:
    raise ValueError(
        f"mirai3.csv has {len(df)} rows, but the positional labels "
        f"assume exactly {expected_rows}"
    )

labels = [1] * N_POSITIVE_ROWS + [-1] * N_NEGATIVE_ROWS
df['label'] = labels

In [3]:
# Split the labelled frame into the feature matrix (every column except
# 'label') and the target vector (the 'label' column itself).
y = df['label']
X = df.drop(columns='label')

In [4]:
from lightgbm import LGBMClassifier

# Hold out half of the rows for evaluation. Stratifying on y keeps the class
# proportions equal in both halves, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
    stratify=y,
)

# Gradient-boosted classifier with the binary objective, fitted on the
# training half only. The fit call stays last so the cell output is unchanged.
model = LGBMClassifier(objective='binary')
model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 35500, number of negative: 14500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.145348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29030
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.710000 -> initscore=0.895384
[LightGBM] [Info] Start training from score 0.895384


predict on training set

In [5]:
# Predict on the same rows the model was fitted on, so the rates below are
# training-set figures rather than held-out ones.
y_pred = model.predict(X_train)

# Pin the label order explicitly: -1 is the negative class and 1 the positive
# class. With `labels` given, the matrix is always 2x2 in that order, so the
# 4-way unpack below cannot fail or silently change meaning if one class is
# missing from the labels or the predictions.
cm = confusion_matrix(y_train, y_pred, labels=[-1, 1])

# Rows are true labels and columns are predicted labels, so flattening the
# matrix row by row yields TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# Each rate falls back to 0 when its denominator is empty (no samples of the
# relevant true class), which avoids a division by zero.
TPR = TP / (TP + FN) if (TP + FN) > 0 else 0  # Sensitivity/Recall
FPR = FP / (FP + TN) if (FP + TN) > 0 else 0  # Probability of false alarm
FNR = FN / (TP + FN) if (TP + FN) > 0 else 0  # Miss Rate
TNR = TN / (TN + FP) if (TN + FP) > 0 else 0  # Specificity

print(f"True Positive Rate (TPR): {TPR}")
print(f"False Positive Rate (FPR): {FPR}")
print(f"False Negative Rate (FNR): {FNR}")
print(f"True Negative Rate (TNR): {TNR}")

True Positive Rate (TPR): 1.0
False Positive Rate (FPR): 0.0
False Negative Rate (FNR): 0.0
True Negative Rate (TNR): 1.0


predict on testing set

In [6]:
# Score the held-out half: predict its labels, then show the weighted F1
# (per-class F1 averaged by class support) as the cell's output.
y_pred = model.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.9987195813337367

Wow, what a score. Now I need to pass the same test set to OcSVM and then to Kitsune.

The latter are outlier-detection methods: they do not need to see the negative class in order to make a decision, and that's an advantage.

We're proceeding with LightGBM for the Golang implementation: 0.99 (LightGBM, LOL) vs 0.9655 (Kitsune) vs 0.9377 (OcSVM).

This analysis is only for comparison with the outlier detection methods

There is further analysis in timeseries.ipynb, where I look at the cross-validation score for LightGBM; the results there are far from the ones displayed here.

So I treat the dataset as a time series (which it is) and do some feature engineering.

In [21]:
# Show the per-class row counts of the training and test labels as one tuple,
# to confirm that both halves carry the same class balance.
(
    y_train.value_counts(),
    y_test.value_counts(),
)

(label
  1    35500
 -1    14500
 Name: count, dtype: int64,
 label
  1    35500
 -1    14500
 Name: count, dtype: int64)

Dump the train/test splits to pickle files so I can read them from other notebooks.

In [20]:
# Write each split to its own pickle file in the working directory.
# The (filename, object) pairs are listed in the order the files are written.
split_files = (
    ("X_train.pkl", X_train),
    ("X_test.pkl", X_test),
    ("y_test.pkl", y_test),
    ("y_train.pkl", y_train),
)

for pickle_path, split in split_files:
    with open(pickle_path, "wb") as out_file:
        pickle.dump(split, out_file)

Dump the model as JSON for parsing in Golang.

In [6]:

import json

# Take the underlying Booster from the fitted sklearn-style wrapper and
# export its structure as a plain Python dict.
booster = model.booster_
model_dump = booster.dump_model()

# Serialise that dict to a JSON file so a non-Python consumer can load it.
with open('lightgbm_model.json', 'w') as json_file:
    json.dump(model_dump, json_file)