### IMPORTING DATASET

In [None]:
import pandas as pd
import numpy as np
import math
from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the raw train CSV into `data` and the raw test CSV into `data2`
# (train is read first, then test).
data, data2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\User\Desktop\Network-Intrusion-Detection\data\raw-data\Train_data.csv",
        r"C:\Users\User\Desktop\Network-Intrusion-Detection\data\raw-data\Test_data.csv",
    )
)

# Preview the first rows of each frame.
data.head(20)
data2.head(5)

### Data preprocessing part 1

In [None]:
# Report the (rows, columns) shape of the train frame and of the test frame.
print("The row and columns of the dataset", data.shape)
print("the row and columns of the dataset2", data2.shape)

In [None]:
# Summary statistics of `data`; with no arguments pandas describes the numeric columns.
data.describe()

In [None]:
# Summary of the object-dtype columns of `data`.
data.describe(include="object")

In [None]:
# checking for missing value
#
# Count the nulls per column once, then either list the affected columns or
# print a single "no missing value" line. (Printing that line inside the
# per-column loop repeated it for every clean column, even when other
# columns did contain nulls.)
missing_counts = data.isnull().sum()
missing_counts = missing_counts[missing_counts > 0]

if missing_counts.empty:
    print("There are no missing value")
else:
    for col, missing_value in missing_counts.items():
        print(f"{col}: {missing_value} missing values")

In [None]:
# Percentage of missing values per affected column, rendered as text ("x.xxx %").
print(
    data.isnull().sum()
    .loc[lambda counts: counts > 0]
    .div(len(data))
    .mul(100)
    .round(3)
    .astype(str)
    .add(' %')
)

# Same information as a table: absolute count plus percentage of all rows.
missing_info = (
    data.isnull().sum()
    .pipe(lambda counts: counts[counts > 0])
    .to_frame('Missing Count')
)
missing_info['Missing %'] = (
    missing_info['Missing Count'].div(len(data)).mul(100).round(3)
)
print(missing_info)


In [None]:
# check for duplicates: count the rows of `data` that repeat an earlier row in full

print(f"number of duplicated rows: {data.duplicated().sum()}")

In [None]:
# outliers
#
# For every numeric feature draw a boxplot (left) and a scatterplot of the
# feature against the 'class' column (right), one feature per grid row.

# Filter numeric columns except 'class'
numeric_cols = [col for col in data.columns if col != 'class' and is_numeric_dtype(data[col])]

# Grid setup
n_cols = 2  # Boxplot + Scatterplot side by side for each feature
n_rows = len(numeric_cols)

if n_rows == 0:
    # plt.subplots rejects a grid with zero rows, so there is nothing to draw.
    print("No numeric columns to plot")
else:
    # squeeze=False keeps `axes` two-dimensional even for a single feature,
    # so the axes[i, j] indexing below is always valid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)

    for i, col in enumerate(numeric_cols):
        # Boxplot
        sns.boxplot(x=data[col], ax=axes[i, 0])
        axes[i, 0].set_title(f"Boxplot of {col}")

        # Scatterplot vs class
        sns.scatterplot(data=data, x=data[col], y=data['class'], ax=axes[i, 1])
        axes[i, 1].set_title(f"Scatterplot of {col} vs class")

    plt.tight_layout()
    plt.show()


In [None]:
# Keep only the numeric columns; correlation is computed on those alone.
numeric_train = data.select_dtypes(include="number")

# Annotated correlation heatmap with the colour scale pinned to [-1, 1].
plt.figure(figsize=(40, 30))
sns.heatmap(
    data=numeric_train.corr(),
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
)
plt.show()

In [None]:
# these two columns are redundant: print their value counts for the train
# frame first, then for the test frame
print(
    *(
        frame[name].value_counts()
        for frame in (data, data2)
        for name in ("is_host_login", "num_outbound_cmds")
    ),
    sep="\n",
)

In [None]:
# Remove the "num_outbound_cmds" column from both frames in place.
data.drop(columns=["num_outbound_cmds"], inplace=True)
data2.drop(columns=["num_outbound_cmds"], inplace=True)


In [None]:
# Remove the "is_host_login" column from both frames in place.
data.drop(columns=["is_host_login"], inplace=True)
data2.drop(columns=["is_host_login"], inplace=True)

In [None]:
# checking attack class distribution: number of rows per value of the "class" column
print("value count for class",data["class"].value_counts())


In [None]:
# Names of the float64/int64 columns of `data`.
data.select_dtypes(include=["float64", "int64"]).columns

In [None]:
# Names of the object-dtype columns of `data`.
data.select_dtypes(include=["object"]).columns

### Scaling Numerical Attributes

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric (float64/int64) columns of the train frame are the ones that get scaled.
cols = data.select_dtypes(include=['float64', 'int64']).columns

# Learn the scaling statistics from the train frame only, then apply the very
# same fitted scaler to both frames so the test frame never influences the fit.
scaler = StandardScaler()
sc_data = scaler.fit(data[cols]).transform(data[cols])
sc_test = scaler.transform(data2[cols])

# Wrap the scaled arrays back into DataFrames that keep the original column
# names and row index.
sc_traindf, sc_testdf = (
    pd.DataFrame(scaled, columns=cols, index=source.index)
    for scaled, source in ((sc_data, data), (sc_test, data2))
)


### Encoding Categorical Attributes


In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_categoricals(data, data2, target_col):
    """Label-encode the object-dtype columns of a train/test pair.

    Only the object-dtype columns of each frame are kept. One
    ``LabelEncoder`` per train column is fitted on the train values (cast to
    ``str``) and then applied to the matching test column; test values the
    encoder never saw are encoded as ``-1``.

    Args:
        data: Training DataFrame. ``target_col`` must be one of its
            object-dtype columns.
        data2: Test DataFrame. Train columns that are not among its
            object-dtype columns (for example ``target_col`` in an unlabeled
            test set) are skipped instead of raising ``KeyError``.
        target_col: Name of the label column.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, encoders)``. ``encoders``
        maps each train column name to its fitted ``LabelEncoder``;
        ``y_test`` is ``None`` when ``data2`` carries no ``target_col``.
    """
    encoders = {}

    # Work on copies so the caller's frames are left untouched.
    cat_data = data.select_dtypes(include=['object']).copy()
    cat_data2 = data2.select_dtypes(include=['object']).copy()

    for col in cat_data.columns:
        le = LabelEncoder()

        # Fit only on train data
        cat_data[col] = le.fit_transform(cat_data[col].astype(str))
        encoders[col] = le

        if col not in cat_data2.columns:
            # Nothing to transform on the test side for this column.
            continue

        # One vectorised lookup instead of one le.transform call per row;
        # labels missing from the mapping become NaN and then -1.
        codes = {label: code for code, label in enumerate(le.classes_)}
        cat_data2[col] = (
            cat_data2[col].astype(str).map(codes).fillna(-1).astype(int)
        )

    # Separate features and target
    X_train = cat_data.drop(columns=[target_col])
    y_train = cat_data[target_col]

    if target_col in cat_data2.columns:
        X_test = cat_data2.drop(columns=[target_col])
        y_test = cat_data2[target_col]
    else:
        # Unlabeled test set: every remaining column is a feature.
        X_test = cat_data2
        y_test = None

    return X_train, y_train, X_test, y_test, encoders




In [None]:

# Encode the categorical columns of the train/test frames and report the shapes.
X_train, y_train, X_test, y_test, encoders = encode_categoricals(data, data2, target_col="class")
# y_test can be None (the later encode_categoricals definition in this notebook
# returns None for it), so read .shape defensively instead of assuming a Series.
print("Encoding features shape:", X_train.shape, y_train.shape, X_test.shape, getattr(y_test, "shape", None))

In [121]:
from sklearn.preprocessing import LabelEncoder

def encode_categoricals(train, test, target_col):
    """Label-encode the categorical feature columns of a train/test pair.

    All columns are kept; only the object-dtype feature columns of ``train``
    are replaced by integer codes. Each encoder is fitted on the train values
    (cast to ``str``) and test values it never saw are encoded as ``-1``.
    The target column itself is returned as-is, not encoded.

    Args:
        train: Training DataFrame; must contain ``target_col``.
        test: Test DataFrame. It may or may not contain ``target_col``.
        target_col: Name of the label column.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, encoders)``. ``encoders``
        maps each encoded column name to its fitted ``LabelEncoder``;
        ``y_test`` is ``None`` when ``test`` has no ``target_col``.
    """
    encoders = {}

    # 1. Separate the target column from the features
    y_train = train[target_col].copy()
    X_train = train.drop(columns=[target_col]).copy()

    if target_col in test.columns:
        # Labeled test set: keep the labels out of the feature matrix.
        y_test = test[target_col].copy()
        X_test = test.drop(columns=[target_col]).copy()
    else:
        y_test = None
        X_test = test.copy()

    # 2. Encode only categorical (object) columns
    cat_cols = X_train.select_dtypes(include=['object']).columns
    for col in cat_cols:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col].astype(str))

        # Handle unseen categories in test -> -1
        mapping = {cls: i for i, cls in enumerate(le.classes_)}
        X_test[col] = X_test[col].astype(str).map(mapping).fillna(-1).astype(int)

        encoders[col] = le

    return X_train, y_train, X_test, y_test, encoders
# Run the encoder on the train/test frames and report the resulting shapes
# (y_test is not printed).
X_train, y_train, X_test, y_test, encoders = encode_categoricals(data, data2, target_col="class")
print("Shapes:", X_train.shape, y_train.shape, X_test.shape)




Shapes: (25192, 39) (25192,) (22544, 39)


In [120]:
# (rows, columns) of the test feature frame.
X_test.shape

(22544, 39)

### Using RandomForestClassifier for feature selection

In [None]:
# scikit-learn pieces for model-based feature selection.
# (The package name is `sklearn`; `sklearm` raised ModuleNotFoundError.)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

