# Botnet detection with machine learning

# Converting the netflow file to a vector data

In [1]:
!pip install xgboost



In [69]:
#Importing packages
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')


In [3]:
import pandas as pd
from keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix, precision_score, f1_score, accuracy_score, recall_score
from numpy import argmax
from tensorflow.keras.models import save_model, load_model

In [4]:

binetflow_file_path = 'capture20110810.binetflow'

# Read the binary flow file into a pandas DataFrame
df = pd.read_csv(binetflow_file_path, delimiter=',')
df.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/10 09:46:53.047277,3550.182373,udp,212.50.71.179,39678,<->,147.32.84.229,13363,CON,0.0,0.0,12,875,413,flow=Background-UDP-Established
1,2011/08/10 09:46:53.048843,0.000883,udp,84.13.246.132,28431,<->,147.32.84.229,13363,CON,0.0,0.0,2,135,75,flow=Background-UDP-Established
2,2011/08/10 09:46:53.049895,0.000326,tcp,217.163.21.35,80,<?>,147.32.86.194,2063,FA_A,0.0,0.0,2,120,60,flow=Background
3,2011/08/10 09:46:53.053771,0.056966,tcp,83.3.77.74,32882,<?>,147.32.85.5,21857,FA_FA,0.0,0.0,3,180,120,flow=Background
4,2011/08/10 09:46:53.053937,3427.768066,udp,74.89.223.204,21278,<->,147.32.84.229,13363,CON,0.0,0.0,42,2856,1596,flow=Background-UDP-Established


In [5]:
#Dropping unneccesary columns
df.drop(['StartTime'],axis=1,inplace = True)

In [6]:
df.State.nunique()

230

In [7]:
df.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,3550.182373,udp,212.50.71.179,39678,<->,147.32.84.229,13363,CON,0.0,0.0,12,875,413,flow=Background-UDP-Established
1,0.000883,udp,84.13.246.132,28431,<->,147.32.84.229,13363,CON,0.0,0.0,2,135,75,flow=Background-UDP-Established
2,0.000326,tcp,217.163.21.35,80,<?>,147.32.86.194,2063,FA_A,0.0,0.0,2,120,60,flow=Background
3,0.056966,tcp,83.3.77.74,32882,<?>,147.32.85.5,21857,FA_FA,0.0,0.0,3,180,120,flow=Background
4,3427.768066,udp,74.89.223.204,21278,<->,147.32.84.229,13363,CON,0.0,0.0,42,2856,1596,flow=Background-UDP-Established


In [8]:
#Renaming the columns using their proper names from the previous dataset
df.rename(columns = {list(df)[0]:'Duration',list(df)[1]:'Protocol',list(df)[2]:'Source_IP',list(df)[3]:'Source_Port',
                    list(df)[5]:'Destination_IP',list(df)[6]:'Destination_Port',list(df)[7]:'Flags',list(df)[8]:'Source_Type_of_Service',
                    list(df)[9]:'Dest_Type_of_Service',list(df)[10]:'Packets',list(df)[11]:'Total_Bytes',list(df)[12]:'Source_Bytes'}, inplace = True)

In [9]:
df.head()

Unnamed: 0,Duration,Protocol,Source_IP,Source_Port,Dir,Destination_IP,Destination_Port,Flags,Source_Type_of_Service,Dest_Type_of_Service,Packets,Total_Bytes,Source_Bytes,Label
0,3550.182373,udp,212.50.71.179,39678,<->,147.32.84.229,13363,CON,0.0,0.0,12,875,413,flow=Background-UDP-Established
1,0.000883,udp,84.13.246.132,28431,<->,147.32.84.229,13363,CON,0.0,0.0,2,135,75,flow=Background-UDP-Established
2,0.000326,tcp,217.163.21.35,80,<?>,147.32.86.194,2063,FA_A,0.0,0.0,2,120,60,flow=Background
3,0.056966,tcp,83.3.77.74,32882,<?>,147.32.85.5,21857,FA_FA,0.0,0.0,3,180,120,flow=Background
4,3427.768066,udp,74.89.223.204,21278,<->,147.32.84.229,13363,CON,0.0,0.0,42,2856,1596,flow=Background-UDP-Established


In [10]:
df.Dir.unique()

array(['  <->', '  <?>', '   ->', '   ?>', '  who', '  <-', '  <?'],
      dtype=object)

In [11]:
direction = ['   ->','  <->','  <-']
df = df.loc[df['Dir'].isin(direction)]

In [12]:
df #Now 2.813Million rows - Only about 10,000 values removed from 2.824Million rows

Unnamed: 0,Duration,Protocol,Source_IP,Source_Port,Dir,Destination_IP,Destination_Port,Flags,Source_Type_of_Service,Dest_Type_of_Service,Packets,Total_Bytes,Source_Bytes,Label
0,3550.182373,udp,212.50.71.179,39678,<->,147.32.84.229,13363,CON,0.0,0.0,12,875,413,flow=Background-UDP-Established
1,0.000883,udp,84.13.246.132,28431,<->,147.32.84.229,13363,CON,0.0,0.0,2,135,75,flow=Background-UDP-Established
4,3427.768066,udp,74.89.223.204,21278,<->,147.32.84.229,13363,CON,0.0,0.0,42,2856,1596,flow=Background-UDP-Established
6,3589.631348,udp,182.239.167.121,49649,<->,147.32.84.229,13363,CON,0.0,0.0,12,1494,1122,flow=Background-UDP-Established
8,3118.470947,udp,24.117.206.20,8697,<->,147.32.84.229,13363,CON,0.0,0.0,13,4328,840,flow=Background-UDP-Established
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824631,0.000393,udp,147.32.86.92,36363,<->,147.32.80.9,53,CON,0.0,0.0,2,208,79,flow=To-Background-UDP-CVUT-DNS-Server
2824632,0.000935,udp,58.165.41.84,60122,<->,147.32.84.229,13363,CON,0.0,0.0,2,539,75,flow=Background-UDP-Established
2824633,0.000000,tcp,147.32.84.171,47077,->,78.191.168.43,13754,S_,0.0,,1,74,74,flow=Background-TCP-Attempt
2824634,0.002618,udp,93.79.39.15,10520,<->,147.32.84.229,13363,CON,0.0,0.0,2,520,460,flow=Background-UDP-Established


In [13]:
df.replace(['   ->','  <->','  <-'],  
                            ['outgoing','two-way','incoming'],inplace=True)
# df.replace({'Dir':{'   ->': 'outgoing', '  <->': 'two-way', '  <-': 'incoming'}})

In [14]:
df.Dir.unique()

array(['two-way', 'outgoing', 'incoming'], dtype=object)

In [15]:
df.rename(columns = {list(df)[4]:'Direction'}, inplace = True)

In [16]:
df

Unnamed: 0,Duration,Protocol,Source_IP,Source_Port,Direction,Destination_IP,Destination_Port,Flags,Source_Type_of_Service,Dest_Type_of_Service,Packets,Total_Bytes,Source_Bytes,Label
0,3550.182373,udp,212.50.71.179,39678,two-way,147.32.84.229,13363,CON,0.0,0.0,12,875,413,flow=Background-UDP-Established
1,0.000883,udp,84.13.246.132,28431,two-way,147.32.84.229,13363,CON,0.0,0.0,2,135,75,flow=Background-UDP-Established
4,3427.768066,udp,74.89.223.204,21278,two-way,147.32.84.229,13363,CON,0.0,0.0,42,2856,1596,flow=Background-UDP-Established
6,3589.631348,udp,182.239.167.121,49649,two-way,147.32.84.229,13363,CON,0.0,0.0,12,1494,1122,flow=Background-UDP-Established
8,3118.470947,udp,24.117.206.20,8697,two-way,147.32.84.229,13363,CON,0.0,0.0,13,4328,840,flow=Background-UDP-Established
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824631,0.000393,udp,147.32.86.92,36363,two-way,147.32.80.9,53,CON,0.0,0.0,2,208,79,flow=To-Background-UDP-CVUT-DNS-Server
2824632,0.000935,udp,58.165.41.84,60122,two-way,147.32.84.229,13363,CON,0.0,0.0,2,539,75,flow=Background-UDP-Established
2824633,0.000000,tcp,147.32.84.171,47077,outgoing,78.191.168.43,13754,S_,0.0,,1,74,74,flow=Background-TCP-Attempt
2824634,0.002618,udp,93.79.39.15,10520,two-way,147.32.84.229,13363,CON,0.0,0.0,2,520,460,flow=Background-UDP-Established


In [17]:
df.Label.unique()

array(['flow=Background-UDP-Established',
       'flow=Background-TCP-Established',
       'flow=To-Background-UDP-CVUT-DNS-Server',
       'flow=Background-UDP-Attempt',
       'flow=From-Normal-V42-UDP-CVUT-DNS-Server',
       'flow=Background-Established-cmpgw-CVUT',
       'flow=Background-UDP-NTP-Established-1',
       'flow=To-Background-CVUT-WebServer',
       'flow=Background-Attempt-cmpgw-CVUT', 'flow=Background',
       'flow=From-Background-CVUT-Proxy', 'flow=To-Background-CVUT-Proxy',
       'flow=Background-google-analytics3', 'flow=From-Normal-V42-Jist',
       'flow=Background-TCP-Attempt',
       'flow=Background-google-analytics14',
       'flow=Background-ajax.google', 'flow=Background-google-analytics6',
       'flow=Background-google-analytics13', 'flow=Background-google-pop',
       'flow=To-Normal-V42-UDP-NTP-server',
       'flow=From-Normal-V42-Stribrek',
       'flow=Background-google-analytics1',
       'flow=Background-google-analytics16',
       'flow=Backgr

In [18]:
df['Label'] = df['Label'].str.replace(r'flow=', '')

In [19]:
df['Label'] = df['Label'].str.replace(r'From-', '')

In [20]:
df['Label'] = df['Label'].str.replace(r'To-', '')

In [21]:
df['Label'] = df['Label'].str.extract(r'(\w+)')

In [22]:
df.Label.unique()

array(['Background', 'Normal', 'Botnet'], dtype=object)

In [23]:
df

Unnamed: 0,Duration,Protocol,Source_IP,Source_Port,Direction,Destination_IP,Destination_Port,Flags,Source_Type_of_Service,Dest_Type_of_Service,Packets,Total_Bytes,Source_Bytes,Label
0,3550.182373,udp,212.50.71.179,39678,two-way,147.32.84.229,13363,CON,0.0,0.0,12,875,413,Background
1,0.000883,udp,84.13.246.132,28431,two-way,147.32.84.229,13363,CON,0.0,0.0,2,135,75,Background
4,3427.768066,udp,74.89.223.204,21278,two-way,147.32.84.229,13363,CON,0.0,0.0,42,2856,1596,Background
6,3589.631348,udp,182.239.167.121,49649,two-way,147.32.84.229,13363,CON,0.0,0.0,12,1494,1122,Background
8,3118.470947,udp,24.117.206.20,8697,two-way,147.32.84.229,13363,CON,0.0,0.0,13,4328,840,Background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824631,0.000393,udp,147.32.86.92,36363,two-way,147.32.80.9,53,CON,0.0,0.0,2,208,79,Background
2824632,0.000935,udp,58.165.41.84,60122,two-way,147.32.84.229,13363,CON,0.0,0.0,2,539,75,Background
2824633,0.000000,tcp,147.32.84.171,47077,outgoing,78.191.168.43,13754,S_,0.0,,1,74,74,Background
2824634,0.002618,udp,93.79.39.15,10520,two-way,147.32.84.229,13363,CON,0.0,0.0,2,520,460,Background


In [24]:
df.dropna(inplace=True)


In [25]:
df #dropping all null values removes 200k records = 2.613Million rows

Unnamed: 0,Duration,Protocol,Source_IP,Source_Port,Direction,Destination_IP,Destination_Port,Flags,Source_Type_of_Service,Dest_Type_of_Service,Packets,Total_Bytes,Source_Bytes,Label
0,3550.182373,udp,212.50.71.179,39678,two-way,147.32.84.229,13363,CON,0.0,0.0,12,875,413,Background
1,0.000883,udp,84.13.246.132,28431,two-way,147.32.84.229,13363,CON,0.0,0.0,2,135,75,Background
4,3427.768066,udp,74.89.223.204,21278,two-way,147.32.84.229,13363,CON,0.0,0.0,42,2856,1596,Background
6,3589.631348,udp,182.239.167.121,49649,two-way,147.32.84.229,13363,CON,0.0,0.0,12,1494,1122,Background
8,3118.470947,udp,24.117.206.20,8697,two-way,147.32.84.229,13363,CON,0.0,0.0,13,4328,840,Background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824630,0.000330,udp,147.32.86.92,36363,two-way,147.32.80.9,53,CON,0.0,0.0,2,296,79,Background
2824631,0.000393,udp,147.32.86.92,36363,two-way,147.32.80.9,53,CON,0.0,0.0,2,208,79,Background
2824632,0.000935,udp,58.165.41.84,60122,two-way,147.32.84.229,13363,CON,0.0,0.0,2,539,75,Background
2824634,0.002618,udp,93.79.39.15,10520,two-way,147.32.84.229,13363,CON,0.0,0.0,2,520,460,Background


In [26]:
df['Source_Type_of_Service'].unique()

array([0., 3., 2., 1.])

In [27]:
df['Dest_Type_of_Service'].unique()

array([0., 2., 3., 1.])

# Label Encoding all Categorical Variables 

### Direction

In [28]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
cat_dir_data= label_encoder.fit_transform(df[['Direction']]) 
New_cat_dir = pd.DataFrame(cat_dir_data)
New_cat_dir=New_cat_dir.rename(columns={0:'Direction'})


In [29]:
New_cat_dir.Direction.nunique()

3

### Protocol

In [30]:
df.Protocol.unique()

array(['udp', 'tcp', 'rtp', 'icmp', 'rtcp', 'udt'], dtype=object)

In [31]:
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
Protocol_Copy= label_encoder.fit_transform(df[['Protocol']]) 
New_cat_protocol = pd.DataFrame(Protocol_Copy)
New_cat_protocol=New_cat_protocol.rename(columns={0:'Protocol'})

In [32]:
New_cat_protocol.Protocol.nunique()

6

### Flags

In [33]:
df.Flags.nunique()

169

In [34]:
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
cat_flags_data= label_encoder.fit_transform(df[['Flags']]) 
New_cat_flags = pd.DataFrame(cat_flags_data)
New_cat_flags=New_cat_flags.rename(columns={0:'Flags'})

In [35]:
New_cat_flags.nunique()

Flags    169
dtype: int64

### Ports

In [36]:
imp_ports = ['22', '443', '80', '53', '389', '25', '113', '123', '554', '520', '161', '995', '67', '993', '631', '110',
 '143', '0', '445', '137', '427', '138', '524', '514', '139', '1000', '784', '12', '465', '592', '587', '88', '2', '888', '21',
 '500', '544', '81', '418', '294', '34', '98', '68', '709', '23', '8', '625', '768', '579', '135', '104', '916', '877', '310',
 '490', '1', '82', '369', '1013', '83', '832', '843', '471', '118']

In [37]:
len(imp_ports)

64

In [38]:
df['ports'] = df['Destination_Port'].where(df['Destination_Port'].isin(imp_ports), 'NaN')

In [39]:
df.ports.value_counts()

ports
NaN     1279389
53       989469
80       254554
443       68655
22        11002
123        2743
993        2316
110        1340
25         1214
995         999
113         378
21          322
161         292
143         113
587          71
427          66
465          44
81           34
631          28
843          24
389          16
524          15
139          14
500          11
23           10
888          10
82            7
118           4
1             3
88            2
12            2
1000          2
554           2
544           1
1013          1
916           1
8             1
592           1
Name: count, dtype: int64

In [40]:
df = df[df.ports != 'NaN']

In [41]:
df

Unnamed: 0,Duration,Protocol,Source_IP,Source_Port,Direction,Destination_IP,Destination_Port,Flags,Source_Type_of_Service,Dest_Type_of_Service,Packets,Total_Bytes,Source_Bytes,Label,ports
11,0.187434,tcp,147.32.86.194,2065,outgoing,217.163.21.35,80,FSPA_FSPA,0.0,0.0,11,3872,1147,Background,80
29,0.000368,udp,147.32.84.138,42315,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
30,0.000225,udp,147.32.84.138,42626,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
33,0.000227,udp,147.32.84.138,58276,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
34,0.000272,udp,147.32.84.138,58867,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824625,0.000271,udp,147.32.84.138,52130,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
2824626,0.000207,udp,147.32.84.138,35552,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
2824627,0.000207,udp,147.32.84.138,58286,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
2824630,0.000330,udp,147.32.86.92,36363,two-way,147.32.80.9,53,CON,0.0,0.0,2,296,79,Background,53


# Merging everything

In [42]:
Interval_Variables = df[['Source_IP','Destination_IP','Duration','Packets','Total_Bytes','Source_Bytes','ports']]
Interval_Variables.head()

Unnamed: 0,Source_IP,Destination_IP,Duration,Packets,Total_Bytes,Source_Bytes,ports
11,147.32.86.194,217.163.21.35,0.187434,11,3872,1147,80
29,147.32.84.138,147.32.80.9,0.000368,2,214,81,53
30,147.32.84.138,147.32.80.9,0.000225,2,214,81,53
33,147.32.84.138,147.32.80.9,0.000227,2,214,81,53
34,147.32.84.138,147.32.80.9,0.000272,2,214,81,53


In [43]:
a = pd.merge(Interval_Variables, New_cat_flags, right_index=True,left_index=True)
b = pd.merge(a, New_cat_protocol, right_index=True,left_index=True)
final_X = pd.merge(b, New_cat_dir, right_index=True,left_index=True)



In [44]:
final_X

Unnamed: 0,Source_IP,Destination_IP,Duration,Packets,Total_Bytes,Source_Bytes,ports,Flags,Protocol,Direction
11,147.32.86.194,217.163.21.35,0.187434,11,3872,1147,80,4,4,2
29,147.32.84.138,147.32.80.9,0.000368,2,214,81,53,4,4,2
30,147.32.84.138,147.32.80.9,0.000225,2,214,81,53,4,4,2
33,147.32.84.138,147.32.80.9,0.000227,2,214,81,53,4,4,2
34,147.32.84.138,147.32.80.9,0.000272,2,214,81,53,4,4,2
...,...,...,...,...,...,...,...,...,...,...
2613143,147.32.84.170,69.63.189.59,10.794330,7,482,272,80,4,4,2
2613144,147.32.84.170,69.63.189.59,10.794090,7,482,272,80,4,4,2
2613145,147.32.84.170,66.220.153.15,10.794067,7,482,272,80,4,4,2
2613146,147.32.84.170,66.220.153.15,10.793828,7,482,272,80,4,4,2


In [45]:
final_X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1233207 entries, 11 to 2613147
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   Source_IP       1233207 non-null  object 
 1   Destination_IP  1233207 non-null  object 
 2   Duration        1233207 non-null  float64
 3   Packets         1233207 non-null  int64  
 4   Total_Bytes     1233207 non-null  int64  
 5   Source_Bytes    1233207 non-null  int64  
 6   ports           1233207 non-null  object 
 7   Flags           1233207 non-null  int32  
 8   Protocol        1233207 non-null  int32  
 9   Direction       1233207 non-null  int32  
dtypes: float64(1), int32(3), int64(3), object(3)
memory usage: 89.4+ MB


# Defining X and y variables
## Target

In [46]:
df

Unnamed: 0,Duration,Protocol,Source_IP,Source_Port,Direction,Destination_IP,Destination_Port,Flags,Source_Type_of_Service,Dest_Type_of_Service,Packets,Total_Bytes,Source_Bytes,Label,ports
11,0.187434,tcp,147.32.86.194,2065,outgoing,217.163.21.35,80,FSPA_FSPA,0.0,0.0,11,3872,1147,Background,80
29,0.000368,udp,147.32.84.138,42315,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
30,0.000225,udp,147.32.84.138,42626,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
33,0.000227,udp,147.32.84.138,58276,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
34,0.000272,udp,147.32.84.138,58867,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824625,0.000271,udp,147.32.84.138,52130,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
2824626,0.000207,udp,147.32.84.138,35552,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
2824627,0.000207,udp,147.32.84.138,58286,two-way,147.32.80.9,53,CON,0.0,0.0,2,214,81,Background,53
2824630,0.000330,udp,147.32.86.92,36363,two-way,147.32.80.9,53,CON,0.0,0.0,2,296,79,Background,53


In [47]:
Target = df.iloc[:,-2:-1]


In [48]:
Target.rename(columns = {-2:'label'}, inplace = True) 
Target['Label'].value_counts()

Label
Background    1276112
Normal          30144
Botnet          27511
Name: count, dtype: int64

In [49]:
#Classifying labels into two categories
#botnet = Yes
#Background,Normal = No
a = ['Botnet']
Target['yes/no'] = (df['Label'].isin(a)).astype(int)

In [50]:
Target['yes/no'].value_counts()

yes/no
0    1306256
1      27511
Name: count, dtype: int64

## Downsampling

In [51]:
from sklearn.utils import resample

d = pd.merge(Target, final_X, right_index=True, left_index=True)

d.head()

Unnamed: 0,Label,yes/no,Source_IP,Destination_IP,Duration,Packets,Total_Bytes,Source_Bytes,ports,Flags,Protocol,Direction
11,Background,0,147.32.86.194,217.163.21.35,0.187434,11,3872,1147,80,4,4,2
29,Background,0,147.32.84.138,147.32.80.9,0.000368,2,214,81,53,4,4,2
30,Background,0,147.32.84.138,147.32.80.9,0.000225,2,214,81,53,4,4,2
33,Background,0,147.32.84.138,147.32.80.9,0.000227,2,214,81,53,4,4,2
34,Background,0,147.32.84.138,147.32.80.9,0.000272,2,214,81,53,4,4,2


In [52]:
d['yes/no'].value_counts()

yes/no
0    1210617
1      22590
Name: count, dtype: int64

In [53]:
majo = d.loc[d['yes/no']== 0]
mino = d.loc[d['yes/no']== 1]

In [54]:
# downsample majority class
maj_dsampled = resample(majo, 
                        replace=False,     # sample with replacement
                        n_samples=20515,   # to match minority class
                        random_state=1)    # reproducible results

In [55]:
dsampled = pd.concat([mino, maj_dsampled])
dsampled['yes/no'].value_counts()

yes/no
1    22590
0    20515
Name: count, dtype: int64

## X and Y variables

In [56]:
y=dsampled[['yes/no']]
X=dsampled.drop(['yes/no','Label','Source_IP','Destination_IP'],axis=1)


X

Unnamed: 0,Duration,Packets,Total_Bytes,Source_Bytes,ports,Flags,Protocol,Direction
675537,0.000278,2,203,64,53,4,4,2
675872,0.020525,2,590,87,53,4,4,2
675877,0.045125,7,882,629,80,4,4,2
689920,0.336250,2,215,71,53,4,4,2
691663,0.459301,2,212,77,53,4,4,2
...,...,...,...,...,...,...,...,...
856611,0.000129,2,214,81,53,4,4,2
2564490,0.000272,2,214,81,53,4,4,2
561572,0.000249,2,208,79,53,4,4,2
1405041,45.757549,174,136019,18555,80,4,4,2


In [57]:
y

Unnamed: 0,yes/no
675537,1
675872,1
675877,1
689920,1
691663,1
...,...
856611,0
2564490,0
561572,0
1405041,0


We have 41,030 records now.

# Applying machine learning algorithms 

In [58]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,random_state=42)
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(X_train)
scaler_minmax.fit(X_test)

X_train = scaler_minmax.transform(X_train)
X_test = scaler_minmax.transform(X_test)

## Decision Tree

In [59]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='gini')
dtree.fit(X_train,y_train)
dtree.score(X_test, y_test)

0.9273894215898546

## Bagging Classifier

In [60]:
from sklearn.metrics import confusion_matrix as cm
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(n_estimators=10).fit(X_train, y_train)

clf.score(X_test, y_test)

0.9378286421280544

## XGBoost

In [61]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Fit and transform the training labels
y_train_encoded = label_encoder.fit_transform(y_train)

# Transform the test labels
y_test_encoded = label_encoder.transform(y_test)

# Fitting the model. 
model = XGBClassifier()
model.fit(X_train, y_train_encoded)

# Making predictions for the test data. 
y_pred_encoded = model.predict(X_test)
predictions = y_pred_encoded

# Inverse transform the predictions to get original labels for evaluation
predictions_original = label_encoder.inverse_transform(predictions)

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions_original)
f1 = f1_score(y_test, predictions_original, average='binary')  # Use 'micro', 'macro', or 'weighted' for multiclass

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.9541
F1 Score: 0.9566


In [71]:
!pip install tabulate
from tabulate import tabulate

results = [
    ["Accuracy", f"{accuracy:.4f}"],
    ["F1 Score", f"{f1:.4f}"],
]

print(tabulate(results, headers=["Metric", "Score"]))


Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Metric      Score
--------  -------
Accuracy   0.9541
F1 Score   0.9566


# Neural network

## NN architecture 1

In [72]:
# Define the model
model = Sequential()
model.add(Dense(12, input_dim=X.shape[1], activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))



In [73]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])



In [74]:
model.fit(X_train, y_train, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1c223694100>

In [75]:
# Predict the test set results
y_pred = model.predict(X_test)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred]  # Threshold can be adjusted





In [76]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n {cm}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Confusion Matrix:
 [[ 620 5533]
 [ 288 6491]]
Precision: 0.539836992681304
F1 Score: 0.6904217412115088
Accuracy: 0.5498762759047324


## NN architecture 2 

In [77]:
# Define the model
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [78]:
# Compile the model with an appropriate learning rate
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])




In [79]:
# Train the model with your data
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1c224412050>

In [80]:
# Save the trained model to an HDF5 file
save_model(model, 'raw_data_model.h5')


In [81]:
# Predict the test set results
y_pred = model.predict(X_test)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred]  # Threshold can be adjusted

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n {cm}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Confusion Matrix:
 [[1534 4619]
 [ 415 6364]]
Precision: 0.5794409542019484
F1 Score: 0.716585970048418
Accuracy: 0.6107330652644603


In [82]:
d.rename(columns={'yes/no': 'lablel'}, inplace=True)

In [83]:
d.drop(columns=['Label'], inplace=True)

In [84]:
d

Unnamed: 0,lablel,Source_IP,Destination_IP,Duration,Packets,Total_Bytes,Source_Bytes,ports,Flags,Protocol,Direction
11,0,147.32.86.194,217.163.21.35,0.187434,11,3872,1147,80,4,4,2
29,0,147.32.84.138,147.32.80.9,0.000368,2,214,81,53,4,4,2
30,0,147.32.84.138,147.32.80.9,0.000225,2,214,81,53,4,4,2
33,0,147.32.84.138,147.32.80.9,0.000227,2,214,81,53,4,4,2
34,0,147.32.84.138,147.32.80.9,0.000272,2,214,81,53,4,4,2
...,...,...,...,...,...,...,...,...,...,...,...
2613143,0,147.32.84.170,69.63.189.59,10.794330,7,482,272,80,4,4,2
2613144,0,147.32.84.170,69.63.189.59,10.794090,7,482,272,80,4,4,2
2613145,0,147.32.84.170,66.220.153.15,10.794067,7,482,272,80,4,4,2
2613146,0,147.32.84.170,66.220.153.15,10.793828,7,482,272,80,4,4,2


In [85]:
# Save the DataFrame to a CSV file
df.to_csv('raw data.csv', index=False)