In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('../data/CIDDS-001-external-week1.csv')

In [3]:
# Check the first few rows of the dataframe
df.head()

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,class,attackType,attackID,attackDescription
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,0,normal,---,---,---
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2.5 M,1,.AP...,0,normal,---,---,---
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9.1 M,1,.AP...,0,normal,---,---,---
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,0,normal,---,---,---
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27.2 M,1,.AP.S.,0,normal,---,---,---


In [4]:
# Check the shape of the dataframe(number of rows and columns)
df.shape

(172838, 16)

In [5]:
# check the data types of the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172838 entries, 0 to 172837
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date first seen    172838 non-null  object 
 1   Duration           172838 non-null  float64
 2   Proto              172838 non-null  object 
 3   Src IP Addr        172838 non-null  object 
 4   Src Pt             172838 non-null  int64  
 5   Dst IP Addr        172838 non-null  object 
 6   Dst Pt             172838 non-null  float64
 7   Packets            172838 non-null  int64  
 8   Bytes              172838 non-null  object 
 9   Flows              172838 non-null  int64  
 10  Flags              172838 non-null  object 
 11  Tos                172838 non-null  int64  
 12  class              172838 non-null  object 
 13  attackType         172838 non-null  object 
 14  attackID           172838 non-null  object 
 15  attackDescription  172838 non-null  object 
dtypes:

In [6]:
# print the description of the dataframe
df.describe()

Unnamed: 0,Duration,Src Pt,Dst Pt,Packets,Flows,Tos
count,172838.0,172838.0,172838.0,172838.0,172838.0,172838.0
mean,136.290559,22445.197526,22309.580341,13.809799,1.0,0.0
std,5215.645494,23810.373191,23811.494655,232.725151,0.0,0.0
min,0.0,0.0,0.0,1.0,1.0,0.0
25%,0.057,23.0,23.0,5.0,1.0,0.0
50%,5.93,8000.0,8000.0,7.0,1.0,0.0
75%,16.84825,49144.75,49001.5,17.0,1.0,0.0
max,519611.231,65535.0,65535.0,34136.0,1.0,0.0


In [7]:
# print the value of the dataframe
df.values

array([['2017-03-14 17:43:57.172', 81412.697, 'TCP  ', ..., '---', '---',
        '---'],
       ['2017-03-14 17:43:57.172', 81412.697, 'TCP  ', ..., '---', '---',
        '---'],
       ['2017-03-14 17:43:26.135', 81504.787, 'TCP  ', ..., '---', '---',
        '---'],
       ...,
       ['2017-03-16 12:10:17.340', 517292.865, 'TCP  ', ..., '---',
        '---', '---'],
       ['2017-03-16 15:52:58.342', 503931.863, 'TCP  ', ..., '---',
        '---', '---'],
       ['2017-03-16 15:52:58.342', 503931.863, 'TCP  ', ..., '---',
        '---', '---']], shape=(172838, 16), dtype=object)

In [8]:
# print the columns of the dataframe
df.columns

Index(['Date first seen', 'Duration', 'Proto', 'Src IP Addr', 'Src Pt',
       'Dst IP Addr', 'Dst Pt', 'Packets', 'Bytes', 'Flows', 'Flags', 'Tos',
       'class', 'attackType', 'attackID', 'attackDescription'],
      dtype='object')

In [9]:
# print the row index of the dataframe
df.index

RangeIndex(start=0, stop=172838, step=1)

In [10]:
#checking the missing value in the dataframe
df.isnull().sum()

Date first seen      0
Duration             0
Proto                0
Src IP Addr          0
Src Pt               0
Dst IP Addr          0
Dst Pt               0
Packets              0
Bytes                0
Flows                0
Flags                0
Tos                  0
class                0
attackType           0
attackID             0
attackDescription    0
dtype: int64

In [11]:
# droping the row with missing values
df.dropna(inplace=True)

In [12]:
#dropping the columns which are not needed for the model
df.drop(columns=["attackDescription", "attackType","attackID"], inplace=True)

In [13]:
new = df.columns
print(new)
df.head()

Index(['Date first seen', 'Duration', 'Proto', 'Src IP Addr', 'Src Pt',
       'Dst IP Addr', 'Dst Pt', 'Packets', 'Bytes', 'Flows', 'Flags', 'Tos',
       'class'],
      dtype='object')


Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,class
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,0,normal
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2.5 M,1,.AP...,0,normal
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9.1 M,1,.AP...,0,normal
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,0,normal
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27.2 M,1,.AP.S.,0,normal


In [14]:
df["Duration"].value_counts().head(5)

Duration
0.000     21447
30.997     3270
30.998     3176
30.999     3104
30.996     2466
Name: count, dtype: int64

In [15]:
df[(df['Proto'].isin(['TCP','UDP'])) & (df["class"]=='suspicious')].value_counts()

Series([], Name: count, dtype: int64)

In [16]:
df['Src Pt'].value_counts()

Src Pt
22       40388
8000     24911
80        8136
23        5483
0         1807
         ...  
10126        1
2040         1
43207        1
46380        1
50509        1
Name: count, Length: 42293, dtype: int64

In [17]:
df['Tos'].value_counts()

Tos
0    172838
Name: count, dtype: int64

In [18]:
df.drop(columns=['Tos'], inplace=True)

In [19]:
df.head()

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,class
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,normal
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2.5 M,1,.AP...,normal
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9.1 M,1,.AP...,normal
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,normal
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27.2 M,1,.AP.S.,normal


In [20]:
df['class'].value_counts()

class
suspicious    107344
normal         49606
unknown        15888
Name: count, dtype: int64

In [21]:
# Drop rows where the 'class' column is 'unknown'
df = df[df['class'] != 'unknown']

In [22]:
df.index

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,
       ...
       172828, 172829, 172830, 172831, 172832, 172833, 172834, 172835, 172836,
       172837],
      dtype='int64', length=156950)

In [23]:
df["Packets"].value_counts()

Packets
6        28507
7        27533
1        25088
19       12195
17        8224
         ...  
17446        1
9406         1
15510        1
13362        1
20262        1
Name: count, Length: 202, dtype: int64

Packets more than 10 are mostly suspicious

In [24]:
df[(df["class"] =="suspicious") & (df["Packets"]>10)]

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,class
18,2017-03-15 00:00:09.156,15.420,TCP,EXT_SERVER,22,10001_123,4589.0,19,3093,1,.AP.S.,suspicious
19,2017-03-15 00:00:09.156,15.420,TCP,10001_123,4589,EXT_SERVER,22.0,13,2843,1,.APRS.,suspicious
22,2017-03-15 00:00:27.659,14.644,TCP,EXT_SERVER,22,10001_123,3477.0,18,2985,1,.AP.S.,suspicious
23,2017-03-15 00:00:27.659,14.644,TCP,10001_123,3477,EXT_SERVER,22.0,13,2843,1,.APRS.,suspicious
27,2017-03-15 00:00:02.296,62.046,TCP,EXT_SERVER,22,10004_35,57382.0,12,4773,1,.AP.SF,suspicious
...,...,...,...,...,...,...,...,...,...,...,...,...
172813,2017-03-21 23:58:29.347,15.890,TCP,EXT_SERVER,22,18851_139,40446.0,28,4849,1,.AP.SF,suspicious
172814,2017-03-21 23:59:17.705,11.037,TCP,10006_27,54306,EXT_SERVER,22.0,16,2231,1,.AP.SF,suspicious
172815,2017-03-21 23:59:17.705,11.037,TCP,EXT_SERVER,22,10006_27,54306.0,17,3081,1,.AP.SF,suspicious
172816,2017-03-21 23:59:30.671,12.660,TCP,18851_139,14878,EXT_SERVER,22.0,18,2531,1,.AP.SF,suspicious


in comparision, the total of 172838 rows, packets less than 10 has normal class 49360 and packets  more than 10  has suspicious class is 59939

In [25]:
df[(df["class"] =="normal") & (df["Packets"]<10)]

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,class
68,2017-03-15 00:01:39.467,0.075,TCP,EXT_SERVER,8000,OPENSTACK_NET,49835.0,7,556,1,.AP.SF,normal
69,2017-03-15 00:01:39.467,0.075,TCP,OPENSTACK_NET,49835,EXT_SERVER,8000.0,6,515,1,.AP.SF,normal
70,2017-03-15 00:01:39.467,0.079,TCP,EXT_SERVER,8000,OPENSTACK_NET,49834.0,7,561,1,.AP.SF,normal
71,2017-03-15 00:01:39.467,0.079,TCP,OPENSTACK_NET,49834,EXT_SERVER,8000.0,6,589,1,.AP.SF,normal
72,2017-03-15 00:01:39.523,0.065,TCP,EXT_SERVER,8000,OPENSTACK_NET,49836.0,7,702,1,.AP.SF,normal
...,...,...,...,...,...,...,...,...,...,...,...,...
172807,2017-03-21 23:58:40.059,0.089,TCP,OPENSTACK_NET,60703,EXT_SERVER,8000.0,6,586,1,.AP.SF,normal
172808,2017-03-21 23:58:40.558,0.059,TCP,EXT_SERVER,8000,OPENSTACK_NET,51076.0,7,702,1,.AP.SF,normal
172809,2017-03-21 23:58:40.558,0.059,TCP,OPENSTACK_NET,51076,EXT_SERVER,8000.0,6,586,1,.AP.SF,normal
172810,2017-03-21 23:58:40.558,0.045,TCP,EXT_SERVER,8000,OPENSTACK_NET,51075.0,7,556,1,.AP.SF,normal


In [26]:
df['Packets'].value_counts()

Packets
6        28507
7        27533
1        25088
19       12195
17        8224
         ...  
17446        1
9406         1
15510        1
13362        1
20262        1
Name: count, Length: 202, dtype: int64

Packets = 6 is mostly normal
,Packets = 1,5 is mostly suspicious, Packets = 7 is mostly normal
packets 19 ,17, 26, 15,24,20,12,2,18,8,13,4 are  all suspicious


In [27]:
df[(df["Packets"]==534) & (df["class"]=="normal")].value_counts()

Date first seen          Duration   Proto  Src IP Addr  Src Pt  Dst IP Addr    Dst Pt   Packets  Bytes  Flows  Flags   class 
2017-03-15 03:29:33.187  16291.778  TCP    EXT_SERVER   8082    OPENSTACK_NET  49195.0  534      98910  1      .AP.S.  normal    1
Name: count, dtype: int64

In [28]:
df[(df["Packets"]==21) & (df["class"]=="suspicious")].value_counts()

Date first seen          Duration  Proto  Src IP Addr  Src Pt  Dst IP Addr  Dst Pt   Packets  Bytes  Flows  Flags   class     
2017-03-21 23:52:59.704  10.564    TCP    EXT_SERVER   22      10006_27     59472.0  21       3369   1      .AP.SF  suspicious    1
2017-03-15 00:04:47.728  11.030    TCP    EXT_SERVER   22      10008_109    33056.0  21       3369   1      .AP.SF  suspicious    1
2017-03-15 00:06:57.407  17.283    TCP    EXT_SERVER   22      10001_123    1646.0   21       3309   1      .AP.S.  suspicious    1
2017-03-15 00:09:51.954  16.822    TCP    EXT_SERVER   22      10001_123    4093.0   21       3173   1      .AP.S.  suspicious    1
2017-03-15 00:15:09.451  10.413    TCP    EXT_SERVER   22      10006_27     50078.0  21       3369   1      .AP.SF  suspicious    1
                                                                                                                                 ..
2017-03-15 00:47:55.269  15.104    TCP    EXT_SERVER   22      10001_123    2246.

In [29]:
df['Src IP Addr'].value_counts()

Src IP Addr
EXT_SERVER       78312
OPENSTACK_NET    24803
10004_36          7404
10006_27          4557
10767_152         2871
                 ...  
19359_155            1
19360_95             1
19361_140            1
19362_254            1
19339_251            1
Name: count, Length: 10081, dtype: int64

In [30]:
# covert source and destination port number to integer
df['Src Pt'] = pd.to_numeric(df['Src Pt'], errors='coerce')

In [31]:
df.head()

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,class
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,normal
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2.5 M,1,.AP...,normal
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9.1 M,1,.AP...,normal
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,normal
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27.2 M,1,.AP.S.,normal


In [32]:
df['Bytes'].value_counts()

Bytes
3185    10882
  40    10383
  46    10303
 702     6686
 556     6682
        ...  
6509        1
5897        1
2263        1
1057        1
5586        1
Name: count, Length: 4567, dtype: int64

In [33]:
#preprocessing: clean the numerical colums
def clean_bytes(x, *args):
    if " M" in x:
        return float(x.replace(" M", "")) *1e6
    elif " K" in x:
        return float(x.replace(" K", "")) *1e3
    return float(x)

In [34]:
df['Bytes'] = df['Bytes'].astype(str).apply(clean_bytes)
df['Packets'] = df['Packets'].astype(str).str.strip().astype(int)

In [35]:
df .head()

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,class
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2100000.0,1,.AP...,normal
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2500000.0,1,.AP...,normal
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9100000.0,1,.AP...,normal
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10300000.0,1,.AP...,normal
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27200000.0,1,.AP.S.,normal


In [36]:
#X = df.drop('class', axis = 1).values
#y = df['class'].values
#print(type(X), type(y))

features columns must be an 2D array to fit the model in scikit-learn

## Feature Engineering

In [37]:
df[(df["Dst Pt"].isin([8080, 22, 23, 7547, 5385])) & (df["class"] == "suspicious")].value_counts()

Date first seen          Duration  Proto  Src IP Addr  Src Pt  Dst IP Addr  Dst Pt  Packets  Bytes   Flows  Flags   class     
2017-03-21 23:59:30.671  12.660    TCP    18851_139    14878   EXT_SERVER   22.0    18       2531.0  1      .AP.SF  suspicious    1
2017-03-15 00:00:02.296  62.046    TCP    10004_35     57382   EXT_SERVER   22.0    7        420.0   1      .APRSF  suspicious    1
2017-03-15 00:00:09.156  15.420    TCP    10001_123    4589    EXT_SERVER   22.0    13       2843.0  1      .APRS.  suspicious    1
2017-03-15 00:00:10.677  0.000     TCP    10000_214    8830    EXT_SERVER   23.0    1        46.0    1      ....S.  suspicious    1
2017-03-15 00:00:27.441  0.000     TCP    10002_148    18816   EXT_SERVER   23.0    1        46.0    1      ....S.  suspicious    1
                                                                                                                                 ..
2017-03-15 00:03:19.407  2.534     TCP    10016_97     55502   EXT_SERVER   22.0 

In [38]:
vulnerable_ports = {22, 23, 5385, 8080, 7547}
df['dst_port_risk'] = df['Dst Pt'].apply(
    lambda x: 'high' if x in vulnerable_ports or x < 1024 else 'low')

In [39]:
# feature engineering for packets
# Map major packets based on the analysis
def map_packets(packet):
    if packet == 6 or packet == 7:
        return 'normal'
    elif packet in {1, 5, 19, 17, 26, 15, 24, 20,12,2,18,8,13,4,3,27,16,9,11,10,25,21,28,23,22,29,30,31,32,33,34,35,36,37,40 }:
        return 'suspicious'
    return "normal"


df['packet_class'] =  df['Packets'].apply(map_packets).astype('category')

In [40]:
df[(df["dst_port_risk"] == "high") & (df["class"] == "suspicious")].value_counts()

Date first seen          Duration  Proto  Src IP Addr  Src Pt  Dst IP Addr  Dst Pt  Packets  Bytes   Flows  Flags   class       dst_port_risk  packet_class
2017-03-21 23:59:30.671  12.660    TCP    18851_139    14878   EXT_SERVER   22.0    18       2531.0  1      .AP.SF  suspicious  high           suspicious      1
2017-03-15 00:00:02.296  62.046    TCP    10004_35     57382   EXT_SERVER   22.0    7        420.0   1      .APRSF  suspicious  high           normal          1
2017-03-15 00:00:09.156  15.420    TCP    10001_123    4589    EXT_SERVER   22.0    13       2843.0  1      .APRS.  suspicious  high           suspicious      1
2017-03-15 00:00:10.677  0.000     TCP    10000_214    8830    EXT_SERVER   23.0    1        46.0    1      ....S.  suspicious  high           suspicious      1
2017-03-15 00:00:27.441  0.000     TCP    10002_148    18816   EXT_SERVER   23.0    1        46.0    1      ....S.  suspicious  high           suspicious      1
                                       

In [41]:
df[(df["dst_port_risk"] == "low") & (df["class"] =="normal")].value_counts()

Date first seen          Duration    Proto  Src IP Addr    Src Pt  Dst IP Addr    Dst Pt   Packets  Bytes       Flows  Flags   class   dst_port_risk  packet_class
2017-03-21 23:58:40.558  0.059       TCP    OPENSTACK_NET  51076   EXT_SERVER     8000.0   6        586.0       1      .AP.SF  normal  low            normal          1
2017-03-14 17:43:26.135  81504.787   TCP    EXT_SERVER     8082    OPENSTACK_NET  56979.0  8639     9100000.0   1      .AP...  normal  low            normal          1
                                            OPENSTACK_NET  56979   EXT_SERVER     8082.0   12024    10300000.0  1      .AP...  normal  low            normal          1
2017-03-14 17:43:39.011  183418.493  TCP    EXT_SERVER     8082    OPENSTACK_NET  60802.0  13266    33000000.0  1      .AP...  normal  low            normal          1
                                            OPENSTACK_NET  60802   EXT_SERVER     8082.0   20751    5800000.0   1      .AP...  normal  low            normal         

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156950 entries, 0 to 172837
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   Date first seen  156950 non-null  object  
 1   Duration         156950 non-null  float64 
 2   Proto            156950 non-null  object  
 3   Src IP Addr      156950 non-null  object  
 4   Src Pt           156950 non-null  int64   
 5   Dst IP Addr      156950 non-null  object  
 6   Dst Pt           156950 non-null  float64 
 7   Packets          156950 non-null  int64   
 8   Bytes            156950 non-null  float64 
 9   Flows            156950 non-null  int64   
 10  Flags            156950 non-null  object  
 11  class            156950 non-null  object  
 12  dst_port_risk    156950 non-null  object  
 13  packet_class     156950 non-null  category
dtypes: category(1), float64(3), int64(3), object(7)
memory usage: 16.9+ MB


In [43]:
df.columns

Index(['Date first seen', 'Duration', 'Proto', 'Src IP Addr', 'Src Pt',
       'Dst IP Addr', 'Dst Pt', 'Packets', 'Bytes', 'Flows', 'Flags', 'class',
       'dst_port_risk', 'packet_class'],
      dtype='object')

In [44]:
df.set_index("Date first seen")

Unnamed: 0_level_0,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,class,dst_port_risk,packet_class
Date first seen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2100000.0,1,.AP...,normal,low,normal
2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2500000.0,1,.AP...,normal,low,normal
2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9100000.0,1,.AP...,normal,low,normal
2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10300000.0,1,.AP...,normal,low,normal
2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27200000.0,1,.AP.S.,normal,low,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-03-16 11:42:01.298,518988.907,TCP,OPENSTACK_NET,49939,EXT_SERVER,8082.0,17446,10600000.0,1,.APRS.,normal,low,normal
2017-03-16 12:10:17.340,517292.865,TCP,EXT_SERVER,8082,OPENSTACK_NET,58749.0,9406,3400000.0,1,.AP.S.,normal,low,normal
2017-03-16 12:10:17.340,517292.865,TCP,OPENSTACK_NET,58749,EXT_SERVER,8082.0,15510,10300000.0,1,.APRS.,normal,low,normal
2017-03-16 15:52:58.342,503931.863,TCP,EXT_SERVER,8082,OPENSTACK_NET,62605.0,13362,5200000.0,1,.AP.S.,normal,low,normal


In [45]:
# Convert the specified columns to 'category' type
df[["class", "dst_port_risk", "packet_class"]] = df[["class", "dst_port_risk", "packet_class"]].astype("category")

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156950 entries, 0 to 172837
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   Date first seen  156950 non-null  object  
 1   Duration         156950 non-null  float64 
 2   Proto            156950 non-null  object  
 3   Src IP Addr      156950 non-null  object  
 4   Src Pt           156950 non-null  int64   
 5   Dst IP Addr      156950 non-null  object  
 6   Dst Pt           156950 non-null  float64 
 7   Packets          156950 non-null  int64   
 8   Bytes            156950 non-null  float64 
 9   Flows            156950 non-null  int64   
 10  Flags            156950 non-null  object  
 11  class            156950 non-null  category
 12  dst_port_risk    156950 non-null  category
 13  packet_class     156950 non-null  category
dtypes: category(3), float64(3), int64(3), object(5)
memory usage: 14.8+ MB


In [47]:
# 2. TCP Flag Features
flag_mapping = {
    'S': 'SYN',
    'A': 'ACK',
    'P': 'PUSH',
    'R': 'RST',
    'F': 'FIN',
    'U': 'URG',
    'E': 'ECE',
    'C': 'CWR'
}
for flag in flag_mapping.values():
    df[f'flag_{flag}'] = 0

for idx, flags in enumerate(df['Flags']):
    for i, char in enumerate(flags):
        if char != '.' and char in flag_mapping:
            flag_name = flag_mapping[char]
            df.at[idx, f'flag_{flag_name}'] = 1

In [48]:
df.head()

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,...,dst_port_risk,packet_class,flag_SYN,flag_ACK,flag_PUSH,flag_RST,flag_FIN,flag_URG,flag_ECE,flag_CWR
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082.0,OPENSTACK_NET,56978.0,3057.0,2100000.0,1.0,...,low,normal,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978.0,EXT_SERVER,8082.0,4748.0,2500000.0,1.0,...,low,normal,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082.0,OPENSTACK_NET,56979.0,8639.0,9100000.0,1.0,...,low,normal,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979.0,EXT_SERVER,8082.0,12024.0,10300000.0,1.0,...,low,normal,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082.0,OPENSTACK_NET,51649.0,11012.0,27200000.0,1.0,...,low,normal,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# 3. Traffic Volume Features
df['bytes_per_packet'] = df['Bytes'] / df['Packets']
df['duration_category'] = pd.cut(df['Duration'],
                                 bins=[-1, 0.001, 1, 60, np.inf],
                                 labels=['instant', 'short', 'medium', 'long'])

In [50]:
df['src_internal'] = df['Src IP Addr'].astype(str).apply(lambda x: 1 if x.startswith('100') else 0)
df['direction'] = df.apply(lambda x: 'internal_to_external' if x['src_internal'] else 'external_to_internal', axis=1)

In [51]:
# 5. Time-Based Features
df['timestamp'] = pd.to_datetime(df['Date first seen'])
df['hour'] = df['timestamp'].dt.hour
df['is_night'] = df['hour'].apply(lambda x: 1 if x < 6 or x >= 18 else 0)

In [52]:
# 6. Flow Symmetry Features
# Create bidirectional flow identifier
df['flow_key'] = df.apply(lambda x: tuple(sorted([(x['Src IP Addr'], x['Src Pt']),
                                                 (x['Dst IP Addr'], x['Dst Pt'])])), axis=1)

In [53]:
flow_counts = df.groupby('flow_key').size().reset_index(name='flow_count')
df = df.merge(flow_counts, on='flow_key')
df['bidirectional'] = df['flow_count'].apply(lambda x: 1 if x >= 2 else 0)

In [54]:
# 7. Attack Pattern Features
df['mixed_protocol'] = df.groupby('flow_key')['Proto'].transform(
    lambda x: 1 if len(set(x)) > 1 else 0)

In [55]:
# 8. Small Packet Detection
df['small_packet_alert'] = df.apply(lambda x: 1 if (x['Packets'] == 1 and x['Bytes'] < 100)
                                   else 0, axis=1)

In [56]:
df.head()

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,...,src_internal,direction,timestamp,hour,is_night,flow_key,flow_count,bidirectional,mixed_protocol,small_packet_alert
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082.0,OPENSTACK_NET,56978.0,3057.0,2100000.0,1.0,...,0,external_to_internal,2017-03-14 17:43:57.172,17.0,0,"((EXT_SERVER, 8082.0), (OPENSTACK_NET, 56978.0))",4,1,0,0
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978.0,EXT_SERVER,8082.0,4748.0,2500000.0,1.0,...,0,external_to_internal,2017-03-14 17:43:57.172,17.0,0,"((EXT_SERVER, 8082.0), (OPENSTACK_NET, 56978.0))",4,1,0,0
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082.0,OPENSTACK_NET,56979.0,8639.0,9100000.0,1.0,...,0,external_to_internal,2017-03-14 17:43:26.135,17.0,0,"((EXT_SERVER, 8082.0), (OPENSTACK_NET, 56979.0))",4,1,0,0
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979.0,EXT_SERVER,8082.0,12024.0,10300000.0,1.0,...,0,external_to_internal,2017-03-14 17:43:26.135,17.0,0,"((EXT_SERVER, 8082.0), (OPENSTACK_NET, 56979.0))",4,1,0,0
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082.0,OPENSTACK_NET,51649.0,11012.0,27200000.0,1.0,...,0,external_to_internal,2017-03-14 18:17:09.005,18.0,1,"((EXT_SERVER, 8082.0), (OPENSTACK_NET, 51649.0))",4,1,0,0


In [57]:
# Final Feature Selection
feature_columns = [
    'Proto', 'dst_port_risk', 'flag_SYN', 'flag_ACK', 'flag_RST',
    'bytes_per_packet', 'duration_category', 'src_internal', 'direction',
    'hour', 'is_night', 'bidirectional', 'mixed_protocol', 'small_packet_alert',
    'Packets', 'Bytes', 'Flows', 'packet_class'
]

In [58]:
target_columns = ['class']

In [59]:

feature_df = df[feature_columns + ['class']]

In [60]:
feature_df.head()

Unnamed: 0,Proto,dst_port_risk,flag_SYN,flag_ACK,flag_RST,bytes_per_packet,duration_category,src_internal,direction,hour,is_night,bidirectional,mixed_protocol,small_packet_alert,Packets,Bytes,Flows,packet_class,class
0,TCP,low,0.0,1.0,0.0,686.947988,long,0,external_to_internal,17.0,0,1,0,0,3057.0,2100000.0,1.0,normal,normal
1,TCP,low,0.0,1.0,0.0,526.537489,long,0,external_to_internal,17.0,0,1,0,0,4748.0,2500000.0,1.0,normal,normal
2,TCP,low,0.0,1.0,0.0,1053.362658,long,0,external_to_internal,17.0,0,1,0,0,8639.0,9100000.0,1.0,normal,normal
3,TCP,low,0.0,1.0,0.0,856.620093,long,0,external_to_internal,17.0,0,1,0,0,12024.0,10300000.0,1.0,normal,normal
4,TCP,low,1.0,1.0,0.0,2470.032692,long,0,external_to_internal,18.0,1,1,0,0,11012.0,27200000.0,1.0,normal,normal


In [61]:
feature_df.to_csv('../data/CIDDS-001-feature_engineered.csv', index=False)

In [62]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172202 entries, 0 to 172201
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   Proto               156950 non-null  object  
 1   dst_port_risk       156950 non-null  category
 2   flag_SYN            170806 non-null  float64 
 3   flag_ACK            170637 non-null  float64 
 4   flag_RST            159924 non-null  float64 
 5   bytes_per_packet    156950 non-null  float64 
 6   duration_category   156950 non-null  category
 7   src_internal        172202 non-null  int64   
 8   direction           172202 non-null  object  
 9   hour                156950 non-null  float64 
 10  is_night            172202 non-null  int64   
 11  bidirectional       172202 non-null  int64   
 12  mixed_protocol      172202 non-null  int64   
 13  small_packet_alert  172202 non-null  int64   
 14  Packets             156950 non-null  float64 
 15  Bytes            

In [63]:
X = feature_df.drop('class', axis=1)
y = feature_df['class']

In [64]:
# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [65]:
# Define categorical and numerical features
categorical_features = ['Proto', 'dst_port_risk', 'duration_category', 'direction', 'packet_class']
numerical_features = ['bytes_per_packet', 'Packets', 'Bytes', 'Flows',
                      'flag_SYN', 'flag_ACK', 'flag_RST',
                      'src_internal', 'hour', 'is_night',
                      'bidirectional', 'mixed_protocol', 'small_packet_alert']

In [66]:
# Update numerical transformer to include imputation
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [67]:
# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num',numerical_transformer, numerical_features)
    ])

In [68]:
# Create full pipeline with logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        solver='saga',
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ))
])

In [78]:
from scipy.stats import loguniform

# Define hyperparameter grid
param_dist = {
    'classifier__C': loguniform(1e-3, 1e3),  # Log-uniform distribution
    'classifier__penalty': ['l1', 'l2'],
    'classifier__fit_intercept': [True, False],
    'classifier__l1_ratio': [None, 0.3, 0.5, 0.7]
}

In [79]:
# Set up stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [83]:
from sklearn.model_selection import RandomizedSearchCV

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,  # Number of parameter combinations to try
    cv=cv,
    scoring='f1_weighted',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

In [84]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [85]:
# Perform random search
print("Starting randomized search...")
random_search.fit(X_train, y_train)
print("Randomized search complete!")

Starting randomized search...
Fitting 3 folds for each of 20 candidates, totalling 60 fits




Randomized search complete!


In [73]:
# Train model
pipeline.fit(X_train, y_train)



In [86]:
# Best model
best_model = random_search.best_estimator_

In [87]:
# Evaluation
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)

print("\nBest Parameters:", random_search.best_params_)
print("Best CV Score (F1-weighted):", random_search.best_score_)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Best Parameters: {'classifier__C': np.float64(0.008629132190071854), 'classifier__fit_intercept': True, 'classifier__l1_ratio': 0.5, 'classifier__penalty': 'l2'}
Best CV Score (F1-weighted): 0.9822376923940718

Classification Report:


TypeError: object of type 'float' has no len()

In [None]:
feature_names = (
    best_model.named_steps['preprocessor']
    .transformers_[0][1]
    .get_feature_names_out(categorical_features)
).tolist() + numerical_features

coefficients = best_model.named_steps['classifier'].coef_

print("\nTop 10 Most Important Features:")
for i, class_name in enumerate(le.classes_):
    print(f"\nClass: {class_name}")
    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients[i]
    }).sort_values(by='coefficient', key=abs, ascending=False)
    print(coef_df.head(10).to_string(index=False))

In [74]:
# Evaluate
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=[str(c) for c in le.classes_]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

      normal       0.96      0.98      0.97      9921
  suspicious       0.99      0.98      0.98     21469
         nan       1.00      1.00      1.00      3051

    accuracy                           0.98     34441
   macro avg       0.98      0.99      0.98     34441
weighted avg       0.98      0.98      0.98     34441


Confusion Matrix:
[[ 9681   240     0]
 [  407 21062     0]
 [    0     0  3051]]


In [75]:
# For multiclass ROC-AUC (requires One-vs-Rest approach)
try:
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
    print(f"\nROC-AUC Score: {roc_auc:.3f}")
except Exception as e:
    print(f"\nROC-AUC calculation skipped: {str(e)}")


ROC-AUC Score: 0.999


In [76]:
feature_names = (
    pipeline.named_steps['preprocessor']
    .transformers_[0][1]
    .get_feature_names_out(categorical_features)
).tolist() + numerical_features

coefficients = pipeline.named_steps['classifier'].coef_

print("\nTop 10 Most Important Features:")
for i, class_name in enumerate(le.classes_):
    print(f"\nClass: {class_name}")
    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients[i]
    }).sort_values(by='coefficient', ascending=False)
    print(coef_df.head(10).to_string(index=False))


Top 10 Most Important Features:

Class: normal
                feature  coefficient
      dst_port_risk_low     2.549675
duration_category_short     1.729736
    packet_class_normal     1.234148
                  Bytes     0.846221
                Packets     0.632467
 duration_category_long     0.368132
            Proto_TCP       0.324255
               is_night     0.308930
          bidirectional     0.158950
                  Flows     0.000000

Class: suspicious
                       feature  coefficient
            dst_port_risk_high     2.908589
       packet_class_suspicious     2.076437
      duration_category_medium     1.912148
                   Proto_TCP       1.383344
                  src_internal     0.892214
     duration_category_instant     0.891477
            small_packet_alert     0.851858
              bytes_per_packet     0.628726
direction_external_to_internal     0.496123
direction_internal_to_external     0.459613

Class: nan
              feature  coeffic

In [88]:
# 1. Create the "models" directory if it doesn't exist
os.makedirs('models', exist_ok=True)  # <- This creates the folder automatically

# 2. Save the model to the "models" directory
with open('models/logistic_regression_anamolydetectionl.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print("Model saved to: models/trained_model.pkl")

Model saved to: models/trained_model.pkl
