In [1]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides warnings raised by pandas and
# scikit-learn — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


In [3]:
# Read the flow dataset from its CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/modified_utc_dataset.csv')

# Initial data exploration: plain-text preview of the first five rows.
print(data.head(n=5))

   Duration Protocol Direction State  Source_Type_of_Service  \
0  1.026539      tcp        ->  S_RA                     0.0   
1  1.009595      tcp        ->  S_RA                     0.0   
2  3.056586      tcp        ->  SR_A                     0.0   
3  3.111769      tcp        ->  SR_A                     0.0   
4  3.083411      tcp        ->  SR_A                     0.0   

   Destination_Type_of_Service  Total_Packets  Total_Bytes  Source_Bytes  \
0                          0.0              4          276           156   
1                          0.0              4          276           156   
2                          0.0              3          182           122   
3                          0.0              3          182           122   
4                          0.0              3          182           122   

                                    Label  
0  flow=Background-Established-cmpgw-CVUT  
1  flow=Background-Established-cmpgw-CVUT  
2             flow=Backgro

**Feature selection**
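- We have seen that the correlation between 'Total_Packets' and 'Total_Bytes' is approximately one, so the two columns carry nearly the same information; 'Total_Bytes' is dropped.
- The only purpose of 'Label' is to tell us whether an observation comes from a botnet or not; it is encoded as the binary 'botnet' target and then dropped.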

In [4]:
# Build the binary 'botnet' target and remove columns not used as features.
#
# The target is derived only while 'Label' is still present, so this cell can
# be re-run after 'Label' has already been dropped without raising a KeyError.
if 'Label' in data.columns:
    # 1 when the label text contains 'flow=From-Botnet', otherwise 0.
    # Vectorised literal substring match (regex=False); a missing label is
    # treated as "not botnet" (na=False) instead of raising a TypeError.
    data['botnet'] = (
        data['Label']
        .str.contains('flow=From-Botnet', regex=False, na=False)
        .astype('int64')
    )
elif 'botnet' not in data.columns:
    # Neither the raw label nor a previously derived target exists.
    raise KeyError("'Label' column not found; cannot derive the 'botnet' target")

# 'Label' is now encoded in 'botnet'; 'Total_Bytes' is removed as a feature.
# Only drop the columns that are still present (keeps the cell idempotent).
columns_to_drop = ['Label', 'Total_Bytes']
columns_to_drop = [col for col in columns_to_drop if col in data.columns]

data = data.drop(columns=columns_to_drop)

In [5]:
# One-hot encode the categorical (non-numeric) columns, if there are any;
# the remaining columns are carried over as they are.
data = pd.get_dummies(data=data)

In [6]:
# Inspect the encoded frame (the notebook renders a truncated head/tail preview).
data

Unnamed: 0,Duration,Source_Type_of_Service,Destination_Type_of_Service,Total_Packets,Source_Bytes,botnet,Protocol_arp,Protocol_esp,Protocol_icmp,Protocol_igmp,...,State__FSA,State__FSPA,State__FSRPA,State__PA,State__R,State__RA,State__SA,State__SPA,State__SRA,State__SRPA
0,1.026539,0.0,0.0,4,156,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1.009595,0.0,0.0,4,156,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3.056586,0.0,0.0,3,122,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3.111769,0.0,0.0,3,122,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,3.083411,0.0,0.0,3,122,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1621168,0.002661,0.0,0.0,2,474,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1621169,0.000870,0.0,0.0,2,93,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1621170,0.021536,0.0,0.0,5,506,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1621171,0.001034,0.0,0.0,2,82,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Target vector: the binary botnet flag.
y = data['botnet']
# Feature matrix: every column except the target.
X = data.drop(columns=['botnet'])

**Some features do not have even a single observation representing the other class**

In [None]:
# Chi-squared feature selection: keep the features whose association with the
# 'botnet' target is statistically significant.

# Rescale every feature to the [0, 1] range. chi2 rejects negative inputs, so
# StandardScaler (whose zero-mean output contains negative values) cannot be
# used in front of it; MinMaxScaler keeps all values non-negative.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Apply the chi-squared test of each feature against the target
chi_scores, p_values = chi2(X_scaled, y)

# Create a DataFrame to hold the results, one row per feature
chi2_df = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi_scores, 'P-Value': p_values})

# Set a significance level (e.g., 0.05)
significance_level = 0.05

# Keep the features whose p-value is below the significance level. A NaN
# p-value compares False here, so such a feature is excluded as well
# (presumably this is what happens for constant columns — TODO confirm).
significant_features = chi2_df.loc[chi2_df['P-Value'] < significance_level, 'Feature']

# Select only the significant features (original, unscaled values)
X_significant = X[significant_features]


In [None]:
# Display the feature columns that passed the chi-squared significance filter.
X_significant