In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

## step 0

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dhoogla/unswnb15")

df=pd.read_parquet(path)
df.head()


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [5]:
# Drop non-numeric columns before calculating the correlation matrix
numeric_df = df.select_dtypes(include=['number'])

# Calculate the correlation matrix
correlation_matrix = numeric_df.corr()

# Sort correlations with the 'label' column
if 'label' in correlation_matrix:
    print(correlation_matrix['label'].sort_values(ascending=False))
else:
    print("The 'label' column is not numeric or is missing in the correlation matrix.")



label                1.000000
ct_dst_sport_ltm     0.371672
rate                 0.335883
ct_src_dport_ltm     0.318518
sload                0.165249
dur                  0.029096
sbytes               0.019376
trans_depth          0.002246
sloss                0.001828
ackdat               0.000817
is_ftp_login        -0.008762
ct_ftp_cmd          -0.009092
ct_flw_http_mthd    -0.012237
sjit                -0.016436
response_body_len   -0.018930
tcprtt              -0.024668
smean               -0.028372
dinpkt              -0.030136
spkts               -0.043040
synack              -0.043250
djit                -0.048819
dbytes              -0.060403
dloss               -0.075961
dpkts               -0.097394
sinpkt              -0.155454
is_sm_ips_ports     -0.160126
dtcpb               -0.263543
stcpb               -0.266585
dmean               -0.295173
dwin                -0.339166
dload               -0.352169
swin                -0.364877
Name: label, dtype: float64


## stage 1

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import logging

class NetworkIntrusionClassifier:
    def __init__(self, random_state=42):
        """
        Initialize the Network Intrusion Detection classifier.
        
        Parameters:
        random_state (int): Random seed for reproducibility
        """
        self.random_state = random_state
        self.encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        self.scaler = StandardScaler()
        self.model = DecisionTreeClassifier(random_state=random_state)
        self.categorical_cols = None
        self.numerical_cols = None
        
    def _is_numeric_column(self, series):
        """
        Check if a column is truly numeric by attempting to convert it to float.
        
        Parameters:
        series (pd.Series): Column to check
        
        Returns:
        bool: True if the column can be converted to float, False otherwise

        bhut saare dtype hote hai numerics me, for eg scientific notation, complex, float, int so this method is suitable for this
        """
        try:
            pd.to_numeric(series, errors='raise')
            return True
        except (ValueError, TypeError):
            return False
            
    def preprocess_data(self, data, target_col="label", attack_cat_col="attack_cat"):
        """
        Preprocess the network traffic data with mode/median imputation.
        """
        logging.info("Starting data preprocessing...")
        df = data.copy()
        
        # Split features and target
        y = df[target_col]
        X = df.drop([target_col, attack_cat_col], axis=1)
    
        # Identify column types
        self.categorical_cols = [col for col in X.columns 
                                if not self._is_numeric_column(X[col])]
        self.numerical_cols = [col for col in X.columns 
                              if col not in self.categorical_cols]
    
        logging.info(f"Categorical: {self.categorical_cols}")
        logging.info(f"Numerical: {self.numerical_cols}")
    
        # --- New Missing Value Handling ---
        # Fill categorical with mode
        for col in self.categorical_cols:
            mode_val = X[col].mode()[0]
            X[col].fillna(mode_val, inplace=True)
        
        # Fill numerical with median
        for col in self.numerical_cols:
            median_val = X[col].median()
            X[col].fillna(median_val, inplace=True)
        # --- End of New Handling ---
    
        # Convert categorical to string for consistent encoding
        if self.categorical_cols:
            X[self.categorical_cols] = X[self.categorical_cols].astype(str)
            encoded = pd.DataFrame(
                self.encoder.fit_transform(X[self.categorical_cols]),
                columns=self.encoder.get_feature_names_out(self.categorical_cols)
            )
            X = X.drop(self.categorical_cols, axis=1)
            X = pd.concat([X, encoded], axis=1)
    
        # Scale numerical features
        if self.numerical_cols:
            X[self.numerical_cols] = self.scaler.fit_transform(X[self.numerical_cols])
    
        logging.info(f"Preprocessed shape: {X.shape}")
        return X, y

    
    def train(self, X, y, test_size=0.2):
        """
        Train with proper test_size parameter usage
        """
        logging.info("Starting training...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=test_size,  # Now using parameter instead of hardcoded 0.2
            random_state=self.random_state
        )
        
        self.model.fit(X_train, y_train)
        
        results = {
            'train_score': self.model.score(X_train, y_train),
            'test_score': self.model.score(X_test, y_test),
            'report': classification_report(y_test, self.model.predict(X_test)),
            'confusion': confusion_matrix(y_test, self.model.predict(X_test))
        }
        
        logging.info(f"Training complete. Test accuracy: {results['test_score']:.4f}")
        return results
    
    def predict(self, X):
        """
        Make predictions on new data.
        
        Parameters:
        X (pd.DataFrame): New data to predict on
        
        Returns:
        np.array: Predicted labels
        """
        return self.model.predict(X)

# Example usage
if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    
    # Initialize classifier
    classifier = NetworkIntrusionClassifier()

In [9]:
classifier = NetworkIntrusionClassifier()
X, y = classifier.preprocess_data(df)
results = classifier.train(X, y)

INFO:root:Starting data preprocessing...
INFO:root:Categorical: ['proto', 'service', 'state']
INFO:root:Numerical: ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports']
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on