In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)

In [45]:
df = pd.read_csv("../datasets/7-cyber_attack_data.csv")

In [46]:
df.head()

Unnamed: 0,src_packet_rate,dst_packet_rate,avg_payload_size,connection_duration,tcp_flag_count,avg_interarrival_time,failed_login_attempts,unusual_port_activity_score,session_entropy,avg_response_delay,attack_type
0,-1.286132,-0.648334,1.044115,-0.469715,0.789859,-0.083727,-1.647309,-1.316412,1.01191,-0.898063,2
1,-0.222224,2.083232,1.191114,-1.354527,-0.956992,1.696028,-1.070406,0.981403,-1.628798,1.377594,0
2,-0.431963,0.375745,-1.370334,0.819214,0.345243,1.389447,-1.90413,1.292602,0.925545,0.232705,0
3,-0.912633,0.986988,-0.690042,2.014628,-0.44226,0.590347,-1.819353,1.560938,0.823755,0.517762,0
4,-0.367056,1.667892,0.879172,2.214276,1.846338,-0.894047,1.543838,0.931103,-1.01521,1.061845,1


In [47]:
df.columns 

Index(['src_packet_rate', 'dst_packet_rate', 'avg_payload_size',
       'connection_duration', 'tcp_flag_count', 'avg_interarrival_time',
       'failed_login_attempts', 'unusual_port_activity_score',
       'session_entropy', 'avg_response_delay', 'attack_type'],
      dtype='object')

* 👉🏻 Dependent variable: atteck type: 3 class (Normal, DDoS, Port Scan)
* src_packet_rate -> Source-side packet transmission rate - gelen istek
* dst_packet_rate -> Destination-side packet reception rate - alınan istek
* avg_payload_size -> Average size of payload in packets - gelen istek büyüklüğü
* connection_duration -> Duration of the connection (in seconds) 
* tcp_flag_count -> Number of TCP flag occurrences - TCP detaylarından saldırı tespiti
* avg_interarrival_time -> Time between packet arrivals 
* failed_login_attempts -> Number of failed login attempts 
* unusual_port_activity_score -> Score representing unusual port usage - anormal port kullanımı
* session_entropy -> Entropy of session behavior (for anomaly detection) - bağlantı açıldıktan sonra geçen süredeki garip durumlar
* avg_response_delay -> Average delay in server response (in ms)
* attack_type -> 0 = Normal, 1 = DDoS, 2 = Port Scan


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   src_packet_rate              1000 non-null   float64
 1   dst_packet_rate              1000 non-null   float64
 2   avg_payload_size             1000 non-null   float64
 3   connection_duration          1000 non-null   float64
 4   tcp_flag_count               1000 non-null   float64
 5   avg_interarrival_time        1000 non-null   float64
 6   failed_login_attempts        1000 non-null   float64
 7   unusual_port_activity_score  1000 non-null   float64
 8   session_entropy              1000 non-null   float64
 9   avg_response_delay           1000 non-null   float64
 10  attack_type                  1000 non-null   int64  
dtypes: float64(10), int64(1)
memory usage: 86.1 KB


In [49]:
df.describe() # scaled data

Unnamed: 0,src_packet_rate,dst_packet_rate,avg_payload_size,connection_duration,tcp_flag_count,avg_interarrival_time,failed_login_attempts,unusual_port_activity_score,session_entropy,avg_response_delay,attack_type
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.012296,0.239737,-0.031142,0.013329,0.0089,0.016681,0.026614,0.315347,-0.045636,0.17953,1.003
std,1.406123,1.635646,0.989677,1.001178,1.000905,1.027824,1.022217,1.364824,1.472316,0.960834,0.817104
min,-4.267039,-7.960328,-3.718638,-3.250031,-3.288725,-3.17879,-3.057529,-4.045045,-5.869039,-4.409592,0.0
25%,-1.007421,-0.87125,-0.657668,-0.672964,-0.654218,-0.697799,-0.653787,-0.712015,-1.074499,-0.591831,0.0
50%,0.085888,0.296278,-0.008968,0.045505,-0.032894,0.014639,0.049283,0.567003,0.12158,0.24892,1.0
75%,1.080743,1.518676,0.615381,0.678219,0.687831,0.703139,0.713809,1.23589,1.013049,0.983239,2.0
max,3.874738,7.168331,3.206344,2.741943,3.477044,3.600187,3.357941,4.020627,4.986178,4.112542,2.0


In [50]:
X = df.drop("attack_type", axis=1)
y = df["attack_type"] 

### Model 

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Logistic Regression without hyperparameter tuning

In [52]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [53]:
y_pred

array([2, 1, 2, 1, 1, 0, 0, 0, 1, 0, 2, 1, 2, 1, 2, 2, 2, 0, 0, 2, 2, 1,
       1, 1, 1, 0, 0, 1, 2, 1, 0, 2, 2, 1, 2, 0, 0, 2, 2, 1, 2, 2, 2, 1,
       2, 0, 1, 2, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       2, 1, 0, 1, 0, 1, 2, 1, 2, 1, 1, 0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 2,
       2, 1, 1, 2, 2, 0, 0, 0, 2, 2, 0, 1, 2, 1, 2, 1, 0, 2, 0, 2, 0, 1,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 0, 2, 0, 1, 2, 0, 0, 2, 2, 2, 1, 2, 0,
       2, 2, 0, 0, 0, 2, 0, 2, 0, 1, 2, 1, 1, 2, 0, 0, 1, 1, 2, 2, 2, 1,
       2, 0, 2, 2, 2, 1, 0, 2, 0, 0, 2, 0, 2, 0, 0, 1, 2, 0, 1, 1, 1, 1,
       0, 2, 1, 0, 2, 1, 2, 2, 2, 2, 2, 0, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1,
       0, 0])

#### Metrics

In [54]:
from sklearn.metrics import accuracy_score ,classification_report, confusion_matrix

score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.81      0.84        63
           1       0.78      0.77      0.77        69
           2       0.74      0.79      0.77        68

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200

Confusion Matrix:
[[51  7  5]
 [ 2 53 14]
 [ 6  8 54]]


#### Hyperparameter Tuning

In [55]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [56]:
c_values = [0.01, 0.1, 1, 10, 100] # Regularization strength

In [57]:
param_grid = [
    # 'l1' penalty
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'], 
        'C': c_values
    },
    # 'l2' penalty
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
        'C': c_values
    },
    # 'elasticnet' penalty
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'], 
        'C': c_values
    }
]

In [58]:
cv = StratifiedKFold()

StratifiedKFold: Veri setini parçalara (fold) ayırırken, target'in oranlarını her bir parçada korumaktır.
* Veriyi 5 parçaya bölerken şunu garanti eder:
Her bir parçanın içindeki sınıf oranları, orijinal veri setindeki oranlarla aynıdır.
örneğin StratifiedKFold(n_splits=5) kullandığınızda:
Orijinal set: %99 "sağlam", %1 "dolandırıcılık" ise
    - Oluşturulan 5 parçanın her birinde yine yaklaşık olarak %99 "sağlam" ve %1 "dolandırıcılık" örneği bulunur.

In [59]:
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=cv, n_jobs=-1, verbose=1)

In [60]:
grid_cv.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


 0.78625 0.78875 0.78875 0.78375 0.78875 0.78875 0.785   0.785   0.7875
 0.785   0.785   0.785   0.785   0.7875  0.785   0.785   0.78625 0.78625
 0.78625 0.78625 0.78625 0.78625 0.78625 0.7875  0.78625 0.78625     nan
     nan     nan     nan     nan]


0,1,2
,estimator,LogisticRegression()
,param_grid,"[{'C': [0.01, 0.1, ...], 'penalty': ['l1'], 'solver': ['liblinear', 'saga']}, {'C': [0.01, 0.1, ...], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', ...]}, ...]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100


In [61]:
best_params = grid_cv.best_params_
best_score = grid_cv.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best Score: 0.7925000000000001


In [62]:
y_pred = grid_cv.predict(X_test)
y_pred

array([2, 1, 2, 1, 1, 0, 0, 0, 1, 0, 2, 1, 2, 1, 2, 2, 2, 0, 0, 2, 2, 1,
       1, 1, 1, 0, 0, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, 2, 2, 1, 2, 2, 2, 1,
       2, 0, 1, 2, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       2, 1, 0, 1, 0, 1, 2, 1, 2, 2, 1, 0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 2,
       2, 1, 1, 2, 2, 0, 0, 0, 2, 2, 0, 1, 2, 1, 2, 1, 0, 1, 0, 2, 0, 1,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 0, 2, 0, 1, 2, 0, 0, 2, 2, 2, 1, 2, 0,
       2, 2, 0, 0, 0, 2, 0, 2, 0, 1, 2, 1, 1, 2, 0, 0, 1, 1, 2, 2, 2, 1,
       2, 0, 2, 2, 2, 1, 0, 2, 0, 0, 2, 0, 2, 0, 0, 1, 2, 0, 1, 1, 1, 1,
       0, 2, 1, 0, 0, 1, 2, 2, 2, 2, 2, 0, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1,
       0, 0])

##### Tuning Scores

In [63]:
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.81      0.83        63
           1       0.77      0.77      0.77        69
           2       0.75      0.78      0.76        68

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200

Confusion Matrix:
[[51  7  5]
 [ 3 53 13]
 [ 6  9 53]]


### One vs Rest

In [64]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

In [65]:
onevsonemodel = OneVsOneClassifier(LogisticRegression())

In [66]:
onevsrestmodel = OneVsRestClassifier(LogisticRegression())

In [67]:
onevsonemodel.fit(X_train, y_train)
y_pred_onevsone = onevsonemodel.predict(X_test)

print("One vs One Classifier Predictions:")
score = accuracy_score(y_test, y_pred_onevsone)
print(f"Accuracy: {score:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred_onevsone))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_onevsone))

One vs One Classifier Predictions:
Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.83        63
           1       0.79      0.75      0.77        69
           2       0.72      0.81      0.76        68

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200

Confusion Matrix:
[[50  6  7]
 [ 3 52 14]
 [ 5  8 55]]


In [68]:
onevsrestmodel.fit(X_train, y_train)
y_pred_onevsrest = onevsrestmodel.predict(X_test)

print("One vs Rest Classifier Predictions:")
score = accuracy_score(y_test, y_pred_onevsrest)
print(f"Accuracy: {score:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred_onevsrest))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_onevsrest))

One vs Rest Classifier Predictions:
Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.81      0.84        63
           1       0.82      0.72      0.77        69
           2       0.71      0.84      0.77        68

    accuracy                           0.79       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.80      0.79      0.79       200

Confusion Matrix:
[[51  6  6]
 [ 2 50 17]
 [ 6  5 57]]
