In [None]:
%load_ext nb_black

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
    recall_score,
    f1_score,
)
import warnings

# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(seed=42)

# Silence every warning category for the remainder of the session
warnings.simplefilter("ignore")

# Function to load and clean data
def load_and_clean_data(folder, fname_benign, fname_malicious):
    """Read the benign and malicious CSV files and return them as one shuffled frame.

    Each file is read from ``folder + <file name>`` (plain string
    concatenation, so ``folder`` must already end with a path separator).
    Rows containing any missing value are dropped, and a ``"Type"`` column
    holding ``"Benign"`` or ``"Malicious"`` is attached to every row.

    Args:
        folder: Directory prefix prepended to both file names.
        fname_benign: File name of the benign CSV.
        fname_malicious: File name of the malicious CSV.

    Returns:
        A DataFrame with the malicious and benign rows concatenated and then
        randomly reordered. The original per-file row index is kept, so index
        values may repeat.
    """
    labelled = {}
    # Benign is read first, malicious second.
    for label, fname in (("Benign", fname_benign), ("Malicious", fname_malicious)):
        labelled[label] = pd.read_csv(folder + fname).dropna().assign(Type=label)

    # Malicious rows go first in the concatenation; the full-fraction sample
    # then shuffles all rows.
    stacked = pd.concat([labelled["Malicious"], labelled["Benign"]])
    return stacked.sample(frac=1)

# Function to add throughput columns
def add_throughput_columns(df):
    """Add a ``<column>PerTime`` rate column to ``df`` for each size/length column.

    For each of the six length and packet-size columns listed below, the
    column is divided element-wise by ``df["flowDuration"]`` and stored under
    the original name with a ``"PerTime"`` suffix. The name of every new
    column is printed as it is created.

    Args:
        df: DataFrame containing ``flowDuration`` and the six source columns.
            It is modified in place.

    Returns:
        None.
    """
    source_columns = (
        "flowLength",
        "fwdFlowLength",
        "bwdFlowLength",
        "packetSizeTotal",
        "fwdPacketSizeTotal",
        "bwdPacketSizeTotal",
    )

    for source in source_columns:
        rate_column = f"{source}PerTime"
        df[rate_column] = df[source] / df["flowDuration"]
        print(rate_column)

# Function to clean the dataset
def clean_dataset(df):
    """Split ``df`` into a float64 feature matrix and a label vector.

    The last column of ``df`` is treated as the label; every column before it
    is treated as a feature. Rows with a missing value in any column, or with
    an infinite value in any feature column, are discarded.

    Note:
        Rows with missing values are dropped from ``df`` in place, so pass a
        copy if the original frame is still needed afterwards.

    Args:
        df: DataFrame whose last column holds the labels.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds the remaining feature
        rows cast to ``float64`` and ``y`` the matching labels.
    """
    df.dropna(inplace=True)
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]

    # `axis` must be passed by keyword: the positional form `.any(1)` is
    # deprecated in pandas 1.x and raises TypeError in pandas 2.x.
    has_bad_value = features.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    rows_to_keep = ~has_bad_value

    return (
        features[rows_to_keep].astype(np.float64).values,
        labels[rows_to_keep].values,
    )

# Load and clean the data
# NOTE: load_and_clean_data builds each path as `folder + file name`, so
# `folder` must end with a path separator.
folder = "../pkg/flowOutput/"
fname_benign = "2017-05-02_kali-normal22_flow_stats.csv"
fname_malicious = "webgoat_flow_stats.csv"

combined_data = load_and_clean_data(folder, fname_benign, fname_malicious)

# Add throughput columns (the "<name>PerTime" columns are added in place)
add_throughput_columns(combined_data)

# Define feature columns
# The label column must be listed LAST: clean_dataset treats the final column
# as the target and every preceding column as a feature.
feature_cols = [
    # Your feature columns here
]

# Fail fast with a clear message instead of an opaque IndexError raised later
# inside clean_dataset when the placeholder above has not been filled in.
if len(feature_cols) < 2:
    raise ValueError(
        "feature_cols must list at least one feature column followed by the "
        "label column"
    )

# Select feature columns in datasets
pd_comb_features = combined_data[feature_cols]

# Get feature and class arrays
# A deep copy is passed because clean_dataset drops rows from its argument
# in place.
X, y = clean_dataset(pd_comb_features.copy(deep=True))

# Split data into training and testing sets (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Scale the data
# The scaler is fitted on the training split only and then reused for the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

# Weighted Logistic Regression
# Define hyperparameter grid
hyperparam_grid = {
    # Your hyperparameters here
}


def _encode_labels(labels):
    """Return ``labels`` as an int32 array usable by the metrics below.

    String labels are mapped ``"Malicious"`` -> 1 and anything else -> 0;
    labels that are already numeric are only cast to int32.
    """
    labels = np.asarray(labels)
    if labels.dtype.kind in "OU":
        return (labels == "Malicious").astype("int32")
    return labels.astype("int32")


# Integer-encoded targets, positive class (1) = malicious.
# NOTE(review): assumes the label column selected via feature_cols is the
# "Type" column ("Benign"/"Malicious") -- confirm once feature_cols is filled in.
y_train2 = _encode_labels(y_train)
y_test2 = _encode_labels(y_test)

# Model fitting
lg = LogisticRegression(random_state=13)
grid = GridSearchCV(lg, hyperparam_grid, scoring="roc_auc", cv=10, n_jobs=-1, refit=True)
grid.fit(X_train_scale, y_train2)

# With refit=True the best configuration is retrained on the whole training
# set and exposed as best_estimator_.
best_fit_model = grid.best_estimator_

# Print best score and parameters
print(f"Best score: {grid.best_score_} with param: {grid.best_params_}")

# Test performance
y_pred_wt = grid.predict(X_test_scale)
# ROC AUC is computed from continuous scores (probability of class 1), not
# from the thresholded predictions.
y_score_wt = grid.predict_proba(X_test_scale)[:, 1]

# Performance metrics
# labels=[0, 1] forces a 2x2 matrix even if a class is absent from the test
# split, so the column indexing below cannot fail.
conf_mat = confusion_matrix(y_test2, y_pred_wt, labels=[0, 1])
# Column 1 of the confusion matrix = samples predicted as class 1.
predicted_positive = conf_mat.T[1]
print(f"Accuracy Score: {accuracy_score(y_test2, y_pred_wt)}")
print(f"Confusion Matrix:\n{conf_mat}")
print(f"Area Under Curve: {roc_auc_score(y_test2, y_score_wt)}")
print(f"Recall score (Pct of true malicious detected): {100 * recall_score(y_test2, y_pred_wt)}")
print(f"Data reduction: {np.round(100.0 * predicted_positive.sum() / conf_mat.sum(), 2)} percent")
print(f"Pct malicious in data sent to console: {np.round(100.0 * predicted_positive[1] / predicted_positive.sum(), 2)} percent")
print("F1 score:", f1_score(y_test2, y_pred_wt, average="weighted"))

# Save parameters
np.savetxt("mean.txt", scaler.mean_, delimiter=",")
np.savetxt("std.txt", scaler.scale_, delimiter=",")
np.savetxt("weights.txt", best_fit_model.coef_[0], delimiter=",")
np.savetxt("intercept.txt", best_fit_model.intercept_, delimiter=",")

# Feature importance scores
# Feature names (all selected columns except the trailing label column),
# ordered by decreasing absolute coefficient.
feature_names = pd_comb_features.columns[:-1].values
important_features = feature_names[np.argsort(-1 * np.abs(best_fit_model.coef_[0]))]
