In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from collections import Counter
import torch
import torch.nn as nn
import torch.utils.data as data_utils
import torch.nn.functional as F
import torch.optim as optim
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score, 
    roc_curve
)



from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm



import warnings
# Install a blanket "ignore" filter so no warning category is shown in cell output.
# NOTE(review): this also hides pandas SettingWithCopyWarning and sklearn
# ConvergenceWarning, which are relevant to later cells.
warnings.simplefilter("ignore")

# Show every column when a DataFrame is rendered (no "..." truncation).
pd.options.display.max_columns = None


In [None]:
# Machine-specific absolute location of the input parquet file.
# NOTE(review): hard-coded local path; the notebook will not run elsewhere
# until this is made configurable.
parquet_path = r"E:\Thesis\Defence\DLL_Named\3rd_Implementation\UNSW_NB15_multiclass_label_is_label.parquet"

# df1 is the raw, unmodified frame; later cells derive cleaned copies from it.
df1 = pd.read_parquet(parquet_path)

# Quick sanity check: overall size and the first ten rows.
print(f"Dataset Shape: {df1.shape}")
display(df1.head(10))

In [None]:
# Data-quality summary of the raw frame: rows per label, nulls per column,
# and the number of fully duplicated rows.
class_distribution = df1['label'].value_counts()
null_values = df1.isnull().sum()
duplicate_values = df1.duplicated().sum()

print(f'Class Distribution:\n{class_distribution}, \nNull Values in Each Column:\n{null_values}, \nNumber of Duplicate Rows:\n{duplicate_values}')

# De-duplicate into a new frame so df1 remains the untouched raw data.
df2 = df1.drop_duplicates()

# Report both shapes with labels. The previous `print({df1.shape})` built a
# one-element set literal and printed `{(rows, cols)}` instead of the shape.
print(f"Shape before dropping duplicates: {df1.shape}")
print(f"Shape after dropping duplicates:  {df2.shape}")

# Class balance after de-duplication. This value was previously computed
# mid-cell and silently discarded, so it is displayed explicitly here.
display(df2['label'].value_counts())


In [None]:
# Feature matrix: every column of the de-duplicated frame except the target.
x = df2.drop(columns="label")
# Target vector, taken from the "label" column.
y = df2["label"].values

# 80/20 hold-out split; stratifying on y keeps the class proportions the same
# in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [None]:
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
from imblearn.over_sampling import SMOTE
import pandas as pd

def _plot_feature_bars(features, values, title, ylabel):
    """Draw one bar per feature on a 12x6 figure with vertical tick labels.

    features: iterable of feature names (x axis, in the given order).
    values:   numeric values of the same length (bar heights).
    title:    figure title.
    ylabel:   label for the y axis.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # Plain list/array inputs are matched positionally, so no index alignment is involved.
    sns.barplot(x=list(features), y=np.asarray(values), ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(title)
    ax.set_xlabel('Features')
    ax.set_ylabel(ylabel)
    plt.show()


# Number of features each selection method contributes.
top_k = 20

# --- Chi-square ---------------------------------------------------------------
# chi2 only accepts non-negative inputs. Boolean-mask indexing
# (`X_train[X_train >= 0]`) does not remove negatives from a DataFrame; it turns
# them into NaN, which chi2 cannot score. Clipping at 0 leaves non-negative data
# unchanged and keeps the matrix NaN-free.
X_train_non_negative = X_train.clip(lower=0)

chi2_selector = SelectKBest(chi2, k=top_k)
X_train_chi2 = chi2_selector.fit_transform(X_train_non_negative, y_train)
chi2_top_features = pd.DataFrame(chi2_selector.scores_, index=X_train.columns, columns=["Chi-Square Score"])
chi2_top_features = chi2_top_features.sort_values(by="Chi-Square Score", ascending=False)

# --- Recursive feature elimination ---------------------------------------------
# NOTE(review): X_train is passed unscaled; logistic regression may stop at
# max_iter without converging, and that warning is suppressed notebook-wide.
model = LogisticRegression(max_iter=1000)
rfe_selector = RFE(estimator=model, n_features_to_select=top_k)
rfe_selector = rfe_selector.fit(X_train, y_train)
rfe_top_features = pd.DataFrame(rfe_selector.ranking_, index=X_train.columns, columns=["RFE Rank"])
rfe_top_features = rfe_top_features.sort_values(by="RFE Rank")

# --- Pearson correlation with the label ----------------------------------------
# NOTE(review): assumes y_train is numeric; a linear correlation against an
# arbitrary class coding is only a rough relevance signal - confirm intent.
pearson_corr = []
for column in X_train.columns:
    corr, _ = pearsonr(X_train[column], y_train)
    pearson_corr.append((column, abs(corr)))

# Sort by descending |r|. A constant column yields NaN, and NaN compares False
# against everything, which makes a plain sort order unreliable; the leading
# isnan flag pushes such columns to the end deterministically.
pearson_corr = sorted(pearson_corr, key=lambda item: (np.isnan(item[1]), -item[1]))
pearson_top_features = pd.DataFrame(pearson_corr, columns=["Feature", "Pearson Correlation"])

# --- Combine the three selections ----------------------------------------------
chi2_top = set(chi2_top_features.head(top_k).index)
# support_ marks exactly the features RFE kept (the rank-1 features).
rfe_top = set(X_train.columns[rfe_selector.support_])
pearson_top = set(pearson_top_features['Feature'][:top_k])

top_features = chi2_top.union(rfe_top).union(pearson_top)

print("Combined Top Features from Chi², RFE, and Pearson Correlation:")
# Sorted so the printed output is stable between runs (set order is arbitrary).
print(sorted(top_features))

# --- Plots -----------------------------------------------------------------------
# The chi-square and Pearson charts are restricted to the top_k rows so they
# match their "Top N" titles (previously every feature was plotted).
chi2_head = chi2_top_features.head(top_k)
_plot_feature_bars(
    chi2_head.index,
    chi2_head['Chi-Square Score'],
    f'Top {top_k} Features based on Chi-Square Scores',
    'Chi-Square Score',
)

# All selected features share rank 1, so a top_k-only chart would be flat.
# The full ranking is shown instead and the title says so.
_plot_feature_bars(
    rfe_top_features.index,
    rfe_top_features['RFE Rank'],
    'RFE Rank of All Features (rank 1 = selected)',
    'RFE Rank',
)

pearson_head = pearson_top_features.head(top_k)
_plot_feature_bars(
    pearson_head['Feature'],
    pearson_head['Pearson Correlation'],
    f'Top {top_k} Features based on Pearson Correlation',
    'Pearson Correlation',
)

# Long-form table with one row per (feature, method) pair, used to show which
# methods agree on which features.
top_features_combined = pd.DataFrame({
    'Feature': list(chi2_top) + list(rfe_top) + list(pearson_top),
    'Method': ['Chi²'] * len(chi2_top) + ['RFE'] * len(rfe_top) + ['Pearson'] * len(pearson_top)
})

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Feature', hue='Method', data=top_features_combined, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Top Features from Chi², RFE, and Pearson')
ax.set_xlabel('Features')
ax.set_ylabel('Count')
plt.show()





In [None]:
# Samples per class in the training labels, most frequent class first.
print("Class distribution in y_train:", pd.Series(y_train).value_counts(), sep="\n")

In [None]:
import pandas as pd
# Hand-picked final feature list (20 columns) used for modelling.
top_features = ['state', 'ct_dst_ltm', 'dwin', 'swin',
                'ct_dst_sport_ltm', 'dload', 'dtcpb', 'ct_state_ttl', 
                'ct_srv_src', 'ct_src_ltm', 'stcpb', 'sttl', 'rate', 'ct_srv_dst', 
                'ct_src_dport_ltm', 'ct_dst_src_ltm', 'dur', 'service', 
                'dmean', 'dttl']

# Build the modelling frame from df2, the result of drop_duplicates(), not from
# the raw df1. Using df1 here discarded the de-duplication step and allowed
# identical rows to land in both the train and the test partition.
# Selecting features and label in one indexing call and taking an explicit
# .copy() avoids the chained assignment of the earlier
# `n_df = df1[top_features]; n_df['label'] = ...` pattern, which writes to a
# slice and triggers SettingWithCopyWarning.
n_df = df2[top_features + ['label']].copy()
display(n_df.head(10))


# Features / target for the reduced column set.
x = n_df.drop(columns=["label"])
y = n_df["label"].values


# Same split settings as the full-feature split: 80/20, seed 42, stratified on y.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)




In [None]:
# Samples per class in the current training labels, most frequent class first.
print("Class distribution in y_train:", pd.Series(y_train).value_counts(), sep="\n")