In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

print("Libraries imported successfully.")


Libraries imported successfully.


In [3]:
# Load the dataset with full path
df = pd.read_csv(r"C:\Users\DELL\Desktop\machine learning\Train_Test_Network.csv")

# Display first 5 rows
print(df.head())

# Show dataset info
print(df.info())

# Check for missing values
print(df.isnull().sum())



           ts         src_ip  src_port         dst_ip  dst_port proto service  \
0  1554198358    3.122.49.24      1883  192.168.1.152     52976   tcp       -   
1  1554198358   192.168.1.79     47260  192.168.1.255     15600   udp       -   
2  1554198359  192.168.1.152      1880  192.168.1.152     51782   tcp       -   
3  1554198359  192.168.1.152     34296  192.168.1.152     10502   tcp       -   
4  1554198362  192.168.1.152     46608  192.168.1.190        53   udp     dns   

       duration  src_bytes  dst_bytes  ... http_response_body_len  \
0  80549.530260    1762852   41933215  ...                      0   
1      0.000000          0          0  ...                      0   
2      0.000000          0          0  ...                      0   
3      0.000000          0          0  ...                      0   
4      0.000549          0        298  ...                      0   

   http_status_code  http_user_agent  http_orig_mime_types  \
0                 0                -

In [4]:
df.drop(['src_ip', 'dst_ip'], axis=1, inplace=True)


In [5]:
# List of categorical columns
categorical_cols = ['proto', 'service', 'conn_state', 'weird_name']

# Create a dictionary to store encoders for each column
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Encode categorical values
    label_encoders[col] = le  # Store encoders for future use

In [6]:
from sklearn.preprocessing import MinMaxScaler 

# List of numerical columns to normalize
num_cols = ['duration', 'src_bytes', 'dst_bytes', 'src_pkts', 'dst_pkts', 'dst_ip_bytes', 'src_ip_bytes', 'missed_bytes']

# Initialize and apply the scaler
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Display the first few rows
print(df.head())


           ts  src_port  dst_port  proto  service      duration  src_bytes  \
0  1554198358      1883     52976      1        0  8.613363e-01   0.000453   
1  1554198358     47260     15600      2        0  0.000000e+00   0.000000   
2  1554198359      1880     51782      1        0  0.000000e+00   0.000000   
3  1554198359     34296     10502      1        0  0.000000e+00   0.000000   
4  1554198362     46608        53      2        3  5.870595e-09   0.000000   

      dst_bytes  conn_state  missed_bytes  ...  http_response_body_len  \
0  1.071405e-02           0           0.0  ...                       0   
1  0.000000e+00           6           0.0  ...                       0   
2  0.000000e+00           0           0.0  ...                       0   
3  0.000000e+00           0           0.0  ...                       0   
4  7.613979e-08          12           0.0  ...                       0   

   http_status_code  http_user_agent  http_orig_mime_types  \
0                 0     

In [8]:
from sklearn.model_selection import train_test_split


X = df.drop(columns=['label', 'type',])  # Exclude target & non-numeric
y = df['label']  # Target variable

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display dataset sizes
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}")


Training set size: (368834, 41), Testing set size: (92209, 41)


In [9]:
print(X_train.shape, y_train.shape)


(368834, 41) (368834,)


In [34]:
non_numeric_cols = X.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_cols.tolist())


Non-numeric columns: ['ssl_version', 'ssl_cipher', 'http_trans_depth', 'http_method', 'http_version', 'http_user_agent', 'http_orig_mime_types', 'http_resp_mime_types', 'weird_addl']


In [35]:
cols_to_drop = ['dns_query', 'ssl_subject', 'ssl_issuer', 'http_uri']
X = X.drop(columns=cols_to_drop)


KeyError: "['dns_query', 'ssl_subject', 'ssl_issuer', 'http_uri'] not found in axis"

In [36]:
import pandas as pd

binary_cols = ['dns_AA', 'dns_RD', 'dns_RA', 'dns_rejected', 
               'ssl_resumed', 'ssl_established', 'weird_notice']

for col in binary_cols:
    X[col] = X[col].replace({'F': 0, 'T': 1, '-': 0}).infer_objects(copy=False).astype(int)



In [37]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['ssl_version', 'ssl_cipher', 'http_method', 
                    'http_version', 'http_user_agent', 
                    'http_orig_mime_types', 'http_resp_mime_types', 
                    'weird_addl']

encoder = LabelEncoder()
for col in categorical_cols:
    X[col] = encoder.fit_transform(X[col].astype(str))


In [38]:
non_numeric_cols = X.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_cols.tolist())


Non-numeric columns: ['http_trans_depth']


In [39]:
cols_to_drop = ['http_trans_depth']
X = X.drop(columns=cols_to_drop)


In [40]:
from sklearn.model_selection import train_test_split


X = df.drop(columns=['label', 'type',])  # Exclude target & non-numeric
y = df['label']  # Target variable

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display dataset sizes
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}")


Training set size: (368834, 41), Testing set size: (92209, 41)


In [42]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize SVM model with RBF kernel
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ValueError: could not convert string to float: '-'