In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, classification_report
import socket, struct
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

In [16]:
available_features = [
    # Flow identification
    'Source_IP', 
    'Destination_IP',
    'Source_port',
    'Destination_port',
    'Protocol',
    'Timestamp',
    
    # Packet counts
    'fwd_packets_amount',
    'bwd_packets_amount',
    'fwd_packets_length',
    'bwd_packets_length',
    
    # Packet sizes
    'max_fwd_packet ',
    'min_fwd_packet',
    'max_bwd_packet',
    'min_bwd_packet',
    
    # Flag counts
    'FIN_count',
    'SYN_count',
    'RST_count',
    'PSH_count',
    'silence_windows',
    
    # Inter-arrival times
    'min_fwd_inter_arrival_time',
    'max_fwd_inter_arrival_time',
    'mean_fwd_inter_arrival_time',
    'min_bwd_inter_arrival_time',
    'max_bwd_inter_arrival_time',
    'mean_bwd_inter_arrival_time',
    'min_bib_inter_arrival_time',
    'max_bib_inter_arrival_time',
    'mean_bib_inter_arrival_time',
    
    # First packet sizes 
    *[f'first_packet_sizes_{i}' for i in range(30)],
    
    # Statistical features
    'min_packet_size',
    'max_packet_size',
    'mean_packet_size',
    'STD_packet_size',
    'mean_delta_byte',
    'STD_delta_byte',
    
    # Bandwidth features
    *[f'bandwidth_{i}' for i in range(20)],
    
    # Bytes per packet
    *[f'bpp_{i}' for i in range(3)],
    
    # Beaconing features
    *[f'beaconning_{i}' for i in range(20)],
    
    # Packets per second
    'pps_fwd',
    'pps_bwd',
    
    # Additional features
    'count_big_requests',
    'ACK_count'
]

# Print the resulting DataFrames to verify
# print("X_train shape:", X_train.shape)
# print("X_test shape:", X_test.shape)

In [61]:
import pandas as pd
import numpy as np

def create_important_features(df):
    """
    Create and transform features to match the important features list
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame with original features
    
    Returns:
    pandas.DataFrame: DataFrame with new and transformed features
    """
    df_new = df.copy()
    
    # 1. Basic Flow Features (Already available but need renaming)
    mapping = {
        'Source_IP': 'Source.IP',
        'Destination_IP': 'Destination.IP',
        'Source_port': 'Source.Port',
        'Destination_port': 'Destination.Port'
    }
    df_new = df_new.rename(columns=mapping)
    
    # 2. Packet Length Features
    df_new['Fwd.Packet.Length.Max'] = df_new['max_fwd_packet ']
    df_new['Fwd.Packet.Length.Min'] = df_new['min_fwd_packet']
    df_new['Min.Packet.Length'] = df_new['min_packet_size']
    df_new['Max.Packet.Length'] = df_new['max_packet_size']
    df_new['Average.Packet.Size'] = df_new['mean_packet_size']
    df_new['Packet.Length.Mean'] = df_new['mean_packet_size']
    df_new['Packet.Length.Std'] = df_new['STD_packet_size']
    df_new['Packet.Length.Variance'] = df_new['STD_packet_size'] ** 2
    
    # 3. Forward Packet Statistics
    df_new['Total.Length.of.Fwd.Packets'] = df_new['fwd_packets_length']
    df_new['Fwd.Packets.s'] = df_new['pps_fwd']
    df_new['Total.Fwd.Packets'] = df_new['fwd_packets_amount']
    
    # 4. Backward Packet Statistics
    df_new['Total.Length.of.Bwd.Packets'] = df_new['bwd_packets_length']
    df_new['Bwd.Packets.s'] = df_new['pps_bwd']
    df_new['Total.Backward.Packets'] = df_new['bwd_packets_amount']
    df_new['Bwd.Packet.Length.Max'] = df_new['max_bwd_packet']
    df_new['Bwd.Packet.Length.Mean'] = df_new['bwd_packets_length'] / (df_new['bwd_packets_amount'] + 1e-6)
    df_new['Bwd.Packet.Length.Std'] = np.sqrt(df_new['STD_packet_size'])  # Approximation
    
    # 5. Inter-Arrival Time (IAT) Features
    df_new['Flow.IAT.Max'] = df_new['max_bib_inter_arrival_time']
    df_new['Flow.IAT.Mean'] = df_new['mean_bib_inter_arrival_time']
    df_new['Flow.IAT.Std'] = np.sqrt(
        (df_new['max_bib_inter_arrival_time'] - df_new['min_bib_inter_arrival_time'])**2 / 4
    )
    df_new['Flow.IAT.Min'] = df_new['min_bib_inter_arrival_time']
    
    # Forward IAT
    df_new['Fwd.IAT.Max'] = df_new['max_fwd_inter_arrival_time']
    df_new['Fwd.IAT.Mean'] = df_new['mean_fwd_inter_arrival_time']
    df_new['Fwd.IAT.Min'] = df_new['min_fwd_inter_arrival_time']
    df_new['Fwd.IAT.Total'] = df_new['Fwd.IAT.Mean'] * (df_new['fwd_packets_amount'] - 1)
    df_new['Fwd.IAT.Std'] = np.sqrt(
        (df_new['max_fwd_inter_arrival_time'] - df_new['min_fwd_inter_arrival_time'])**2 / 4
    )
    
    # Backward IAT
    df_new['Bwd.IAT.Max'] = df_new['max_bwd_inter_arrival_time']
    df_new['Bwd.IAT.Mean'] = df_new['mean_bwd_inter_arrival_time']
    df_new['Bwd.IAT.Total'] = df_new['Bwd.IAT.Mean'] * (df_new['bwd_packets_amount'] - 1)
    df_new['Bwd.IAT.Std'] = np.sqrt(
        (df_new['max_bwd_inter_arrival_time'] - df_new['min_bwd_inter_arrival_time'])**2 / 4
    )
    
    # 6. Derived Flow Features
    df_new['Flow.Duration'] = df_new['max_bib_inter_arrival_time'] * (df_new['fwd_packets_amount'] + df_new['bwd_packets_amount'])
    df_new['Flow.Bytes.s'] = (df_new['fwd_packets_length'] + df_new['bwd_packets_length']) / (df_new['Flow.Duration'] + 1e-6)
    df_new['Flow.Packets.s'] = (df_new['fwd_packets_amount'] + df_new['bwd_packets_amount']) / (df_new['Flow.Duration'] + 1e-6)
    
    # 7. Subflow Features
    df_new['Subflow.Fwd.Packets'] = df_new['fwd_packets_amount']
    df_new['Subflow.Fwd.Bytes'] = df_new['fwd_packets_length']
    df_new['Subflow.Bwd.Packets'] = df_new['bwd_packets_amount']
    df_new['Subflow.Bwd.Bytes'] = df_new['bwd_packets_length']
    
    # 8. Segment Size Features
    df_new['Avg.Fwd.Segment.Size'] = df_new['fwd_packets_length'] / (df_new['fwd_packets_amount'] + 1e-6)
    df_new['Avg.Bwd.Segment.Size'] = df_new['bwd_packets_length'] / (df_new['bwd_packets_amount'] + 1e-6)
    
    # 9. Additional Features
    df_new['Down.Up.Ratio'] = df_new['bwd_packets_amount'] / (df_new['fwd_packets_amount'] + 1e-6)
    
    # 10. Idle Time Features (approximated from silence_windows)
    if 'silence_windows' in df_new.columns:
        df_new['Idle.Max'] = df_new['silence_windows']
        df_new['Idle.Mean'] = df_new['silence_windows'] / 2
        df_new['Idle.Min'] = df_new['silence_windows'] / 4
    
    # 11. Header Length Features (approximated)
    df_new['Fwd.Header.Length'] = df_new['fwd_packets_amount'] * 20  # Assuming minimum TCP header size
    df_new['Fwd.Header.Length.1'] = df_new['Fwd.Header.Length']  # Same as above
    df_new['Bwd.Header.Length'] = df_new['bwd_packets_amount'] * 20
    
    # 12. Additional Required Features (approximated)
    df_new['Init_Win_bytes_forward'] = 0  # Would need TCP header info
    df_new['Init_Win_bytes_backward'] = 0  # Would need TCP header info
    df_new['min_seg_size_forward'] = df_new['min_fwd_packet']
    df_new['act_data_pkt_fwd'] = df_new['fwd_packets_amount']
    
    # Select only the important features
    important_features = [
        'Destination.IP', 'Destination.Port', 'Source.IP', 'Init_Win_bytes_forward',
        'min_seg_size_forward', 'Fwd.Packet.Length.Max', 'Init_Win_bytes_backward',
        'Flow.IAT.Max', 'Source.Port', 'Flow.Duration',
        'Bwd.IAT.Total', 'Avg.Fwd.Segment.Size', 'Fwd.Packets.s', 'Fwd.IAT.Total',
        'Fwd.IAT.Max', 'Subflow.Fwd.Bytes', 'Flow.Bytes.s',
        'Min.Packet.Length', 'Total.Length.of.Fwd.Packets', 'Bwd.IAT.Max',
        'Packet.Length.Variance', 'Bwd.Packets.s', 'Flow.IAT.Mean', 'Fwd.Header.Length',
        'act_data_pkt_fwd', 'Max.Packet.Length', 'Flow.Packets.s', 'Flow.IAT.Std',
        'Packet.Length.Std', 'Idle.Max', 'Fwd.Header.Length.1', 'Bwd.Packet.Length.Mean',
        'Bwd.IAT.Std', 'Fwd.Packet.Length.Min', 'Bwd.Packet.Length.Std',
        'Avg.Bwd.Segment.Size', 'Average.Packet.Size', 'Total.Length.of.Bwd.Packets',
        'Packet.Length.Mean', 'Fwd.IAT.Mean', 'Fwd.IAT.Std', 'Flow.IAT.Min',
        'Bwd.IAT.Mean', 'Bwd.Packet.Length.Max', 'Subflow.Fwd.Packets',
        'Total.Fwd.Packets', 'Total.Backward.Packets', 'Bwd.Header.Length',
        'Subflow.Bwd.Bytes', 'Subflow.Bwd.Packets', 'Idle.Mean', 'Fwd.IAT.Min',
        'Down.Up.Ratio', 'Idle.Min'
    ]
    
    return df_new[important_features]

In [62]:
# Function that changes IP addresses into numbers

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

In [63]:
# Load train, test, and validation datasets
train_path = '../APP-1/train.csv'
test_path = '../APP-1/test.csv'
val_path = '../APP-1/val_without_labels.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
validation_data = pd.read_csv(val_path)

In [64]:
# Prepare features and target
y_train = train_df['label']
y_test = test_df['label']

In [65]:
# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [66]:
X_train = X_train = train_df.drop(columns='label')
X_test = test_df.drop(columns='label')

X_train.shape

(640, 111)

In [67]:
# Converting IP addresses into numbers
X_train['Source_IP'] = X_train['Source_IP'].apply(ip2int)
X_train['Destination_IP'] = X_train['Destination_IP'].apply(ip2int)

# Converting IP addresses into numbers
X_test['Source_IP'] = X_test['Source_IP'].apply(ip2int)
X_test['Destination_IP'] = X_test['Destination_IP'].apply(ip2int)

In [68]:
# X_train

In [69]:
X_train_sel = create_important_features(X_train)
X_test_sel = create_important_features(X_test)

In [70]:
len(available_features)

111

In [71]:
X_train_sel.shape
X_test_sel.shape

(640, 54)

In [72]:
# X_train_sel

In [73]:
# Identify columns with string data
string_columns = X_train_sel.select_dtypes(include=['object', 'string']).columns

print(string_columns)

# Initialize a dictionary to store LabelEncoders
label_encoders = {}

# Apply LabelEncoder to each string column in both X_train and X_test
for column in string_columns:
    
    # Fit on X_train and transform both X_train and X_test
    X_train_sel[column] = label_encoder.fit_transform(X_train_sel[column])
    X_test_sel[column] = label_encoder.transform(X_test_sel[column])
    
    # Store the encoder for future reference
    label_encoders[column] = label_encoder



Index([], dtype='object')


In [74]:
# Find columns with non-numeric data in X_test
# non_numeric_columns = X_test_drop.select_dtypes(include=['object', 'string']).columns
non_numeric_columns = X_test_sel.select_dtypes(include=['object', 'string']).columns

# Print the non-numeric columns
print("Non-numeric columns in X_test:")
print(non_numeric_columns)

Non-numeric columns in X_test:
Index([], dtype='object')


In [75]:
for column in non_numeric_columns:
    if column in label_encoders:  # Check if this column was encoded in X_train
        le = label_encoders[column]
        X_test_sel[column] = le.transform(X_test_sel[column])
    else:
        print(f"Warning: {column} was not encoded in X_train!")

In [76]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sel)
X_test_scaled = scaler.transform(X_test_sel)

In [77]:
# BaggingClassifier with DecisionTree as base estimator
bagging_clf = BaggingClassifier(random_state=28, n_estimators=150,max_samples=0.75, max_features=0.55)
bagging_clf.fit(X_train_sel, y_train_encoded)
y_pred_bagging = bagging_clf.predict(X_test_sel)
accuracy_bagging = accuracy_score(y_test_encoded, y_pred_bagging)
print(f"BaggingClassifier Accuracy on Test Dataset: {accuracy_bagging:.4f}")
print("Classification Report (BaggingClassifier):\n", classification_report(y_test_encoded, y_pred_bagging))

BaggingClassifier Accuracy on Test Dataset: 0.5641
Classification Report (BaggingClassifier):
               precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.75      0.60      0.67         5
           2       1.00      0.20      0.33         5
           3       0.60      0.60      0.60         5
           4       0.44      0.80      0.57         5
           5       0.71      1.00      0.83         5
           6       0.50      0.20      0.29         5
           7       0.71      1.00      0.83         5
           8       0.50      0.60      0.55         5
           9       0.75      0.60      0.67         5
          10       1.00      0.80      0.89         5
          11       0.20      0.20      0.20         5
          12       0.67      0.40      0.50         5
          13       0.75      0.60      0.67         5
          14       0.00      0.00      0.00         5
          15       1.00      1.00      1

In [47]:
# important_features = [
#         'Destination.IP', 'Destination.Port', 'Source.IP', 'Init_Win_bytes_forward',
#         'min_seg_size_forward', 'Fwd.Packet.Length.Max', 'Init_Win_bytes_backward',
#         'Flow.IAT.Max', 'Source.Port', 'Flow.Duration', 'Fwd.Packet.Length.Std',
#         'Bwd.IAT.Total', 'Avg.Fwd.Segment.Size', 'Fwd.Packets.s', 'Fwd.IAT.Total',
#         'Fwd.IAT.Max', 'Fwd.Packet.Length.Mean', 'Subflow.Fwd.Bytes', 'Flow.Bytes.s',
#         'Min.Packet.Length', 'Total.Length.of.Fwd.Packets', 'Bwd.IAT.Max',
#         'Packet.Length.Variance', 'Bwd.Packets.s', 'Flow.IAT.Mean', 'Fwd.Header.Length',
#         'act_data_pkt_fwd', 'Max.Packet.Length', 'Flow.Packets.s', 'Flow.IAT.Std',
#         'Packet.Length.Std', 'Idle.Max', 'Fwd.Header.Length.1', 'Bwd.Packet.Length.Mean',
#         'Bwd.IAT.Std', 'Fwd.Packet.Length.Min', 'Bwd.Packet.Length.Std',
#         'Avg.Bwd.Segment.Size', 'Average.Packet.Size', 'Total.Length.of.Bwd.Packets',
#         'Packet.Length.Mean', 'Fwd.IAT.Mean', 'Fwd.IAT.Std', 'Flow.IAT.Min',
#         'Bwd.IAT.Mean', 'Bwd.Packet.Length.Max', 'Subflow.Fwd.Packets',
#         'Total.Fwd.Packets', 'Total.Backward.Packets', 'Bwd.Header.Length',
#         'Subflow.Bwd.Bytes', 'Subflow.Bwd.Packets', 'Idle.Mean', 'Fwd.IAT.Min',
#         'Down.Up.Ratio', 'Idle.Min'
#     ]

In [48]:
len(important_features)

56

In [78]:
rf = RandomForestClassifier(random_state=267, n_jobs=-1,n_estimators = 100)
rf.fit(X_train_scaled, y_train_encoded)
y_pred_rf = rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test_encoded, y_pred_rf)
print(f"RadomForest Accuracy on Test Dataset: {accuracy_rf:.4f}")
print("Classification Report (RadomForest):\n", classification_report(y_test_encoded, y_pred_rf))


RadomForest Accuracy on Test Dataset: 0.4750
Classification Report (RadomForest):
               precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.50      0.60      0.55         5
           2       1.00      0.20      0.33         5
           3       0.75      0.60      0.67         5
           4       0.30      0.60      0.40         5
           5       0.50      1.00      0.67         5
           6       0.50      0.20      0.29         5
           7       0.62      1.00      0.77         5
           8       0.40      0.40      0.40         5
           9       0.50      0.60      0.55         5
          10       1.00      0.80      0.89         5
          11       0.14      0.20      0.17         5
          12       0.50      0.40      0.44         5
          13       0.20      0.20      0.20         5
          14       0.00      0.00      0.00         5
          15       1.00      0.60      0.75         