In [216]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
from sklearn.preprocessing import LabelEncoder

Import the data

In [217]:
#load datasets
training_df = pd.read_csv("training_data_with_labels.csv", encoding="utf8", names = ['timestamp', 'duration', 'protocol', 'src_ip', 'src_port', 'direction', 'dst_ip', 'dst_port', 'state', 'src_type_service', 'dst_type_service', 'number_total_packets', 'bytes_both_directions', 'bytes_src_to_dst','label'], dtype = {'timestamp':object, 'duration':float, 'protocol':object, 'src_ip':object, 'src_port':object, 'direction':object, 'dst_ip':object, 'dst_port':object, 'state':object, 'src_type_service':object, 'dst_type_service':object, 'number_total_packets':int, 'bytes_both_directions':int, 'bytes_src_to_dst':int}, low_memory = False)
test_df = pd.read_csv("test_data_with_labels.csv", encoding="utf8", names = ['timestamp', 'duration', 'protocol', 'src_ip', 'src_port', 'direction', 'dst_ip', 'dst_port', 'state', 'src_type_service', 'dst_type_service', 'number_total_packets', 'bytes_both_directions', 'bytes_src_to_dst','label'], dtype = {'timestamp':object, 'duration':float, 'protocol':object, 'src_ip':object, 'src_port':object, 'direction':object, 'dst_ip':object, 'dst_port':object, 'state':object, 'src_type_service':object, 'dst_type_service':object, 'number_total_packets':int, 'bytes_both_directions':int, 'bytes_src_to_dst':int}, low_memory = False)
valid_df = pd.read_csv("valid_data_with_labels.csv", encoding="utf8", names = ['timestamp', 'duration', 'protocol', 'src_ip', 'src_port', 'direction', 'dst_ip', 'dst_port', 'state', 'src_type_service', 'dst_type_service', 'number_total_packets', 'bytes_both_directions', 'bytes_src_to_dst','label'], dtype = {'timestamp':object, 'duration':float, 'protocol':object, 'src_ip':object, 'src_port':object, 'direction':object, 'dst_ip':object, 'dst_port':object, 'state':object, 'src_type_service':object, 'dst_type_service':object, 'number_total_packets':int, 'bytes_both_directions':int, 'bytes_src_to_dst':int, 'label':object}, low_memory = False)

In [218]:
#add ID
training_df['streamID'] = training_df.index + 1
test_df['streamID'] = test_df.index + 1
valid_df['streamID'] = valid_df.index + 1

In [219]:
traindata = copy.deepcopy(training_df)
testdata = copy.deepcopy(test_df)
validdata = copy.deepcopy(valid_df)

Fix data

In [220]:
#fix data
traindata['src_port'].fillna('None', inplace = True)
traindata['direction'] = traindata['direction'].str.strip()
traindata['dst_port'].fillna('None', inplace = True)
traindata['state'].fillna('None', inplace = True)
traindata['src_type_service'].fillna('None', inplace = True)
traindata['dst_type_service'].fillna('None', inplace = True)

In [221]:
#fix data
testdata['src_port'].fillna('None', inplace = True)
testdata['direction'] = testdata['direction'].str.strip()
testdata['dst_port'].fillna('None', inplace = True)
testdata['state'].fillna('None', inplace = True)
testdata['src_type_service'].fillna('None', inplace = True)
testdata['dst_type_service'].fillna('None', inplace = True)

In [222]:
#fix data
validdata['src_port'].fillna('None', inplace = True)
validdata['direction'] = validdata['direction'].str.strip()
validdata['dst_port'].fillna('None', inplace = True)
validdata['state'].fillna('None', inplace = True)
validdata['src_type_service'].fillna('None', inplace = True)
validdata['dst_type_service'].fillna('None', inplace = True)

Add labels to botnet and normal classes, Botnet = 1, Normal = 0

In [223]:
#add labels - botnet is 1
truelabels = traindata[['label']].copy()
truelabels['labelvalues'] = np.where(truelabels['label'].str.contains('Botnet', case = False, na = False), 1, 0)
labelvalues = truelabels[['labelvalues']].copy()
traindata_pr = traindata.join(labelvalues)

In [224]:
#add labels - botnet is 1
truelabels = testdata[['label']].copy()
truelabels['labelvalues'] = np.where(truelabels['label'].str.contains('Botnet', case = False, na = False), 1, 0)
labelvalues = truelabels[['labelvalues']].copy()
testdata_pr = testdata.join(labelvalues)

In [225]:
#add labels - botnet is 1
truelabels = validdata[['label']].copy()
truelabels['labelvalues'] = np.where(truelabels['label'].str.contains('Botnet', case = False, na = False), 1, 0)
labelvalues = truelabels[['labelvalues']].copy()
validdata_pr = validdata.join(labelvalues)

Calculate features from the report

In [226]:
#calculate report features
testdata_pr.loc[:, 'pps'] = testdata_pr.number_total_packets/testdata_pr.duration.replace({0: np.inf})
testdata_pr.loc[:, 'bps_oneway'] = testdata_pr.bytes_src_to_dst/testdata_pr.duration.replace({0: np.inf})
testdata_pr.loc[:, 'bpp_oneway'] = testdata_pr.bytes_src_to_dst/testdata_pr.number_total_packets.replace({0: np.inf})
testdata_pr.loc[:, 'bps_twoway'] = testdata_pr.bytes_both_directions/testdata_pr.duration.replace({0: np.inf})
testdata_pr.loc[:, 'bpp_twoway'] = testdata_pr.bytes_both_directions/testdata_pr.number_total_packets.replace({0: np.inf})

In [227]:
#calculate report features
validdata_pr.loc[:, 'pps'] = validdata_pr.number_total_packets/validdata_pr.duration.replace({0: np.inf})
validdata_pr.loc[:, 'bps_oneway'] = validdata_pr.bytes_src_to_dst/validdata_pr.duration.replace({0: np.inf})
validdata_pr.loc[:, 'bpp_oneway'] = validdata_pr.bytes_src_to_dst/validdata_pr.number_total_packets.replace({0: np.inf})
validdata_pr.loc[:, 'bps_twoway'] = validdata_pr.bytes_both_directions/validdata_pr.duration.replace({0: np.inf})
validdata_pr.loc[:, 'bpp_twoway'] = validdata_pr.bytes_both_directions/validdata_pr.number_total_packets.replace({0: np.inf})

In [228]:
#calculate report features
traindata_pr.loc[:, 'pps'] = traindata_pr.number_total_packets/traindata_pr.duration.replace({0: np.inf})
traindata_pr.loc[:, 'bps_oneway'] = traindata_pr.bytes_src_to_dst/traindata_pr.duration.replace({0: np.inf})
traindata_pr.loc[:, 'bpp_oneway'] = traindata_pr.bytes_src_to_dst/traindata_pr.number_total_packets.replace({0: np.inf})
traindata_pr.loc[:, 'bps_twoway'] = traindata_pr.bytes_both_directions/traindata_pr.duration.replace({0: np.inf})
traindata_pr.loc[:, 'bpp_twoway'] = traindata_pr.bytes_both_directions/traindata_pr.number_total_packets.replace({0: np.inf})

In [229]:
#merge columns
traindata_num = traindata_pr[['src_ip','duration','pps','bps_oneway','bpp_oneway','bps_twoway','bpp_twoway','labelvalues']]
testdata_num = testdata_pr[['src_ip','duration','pps','bps_oneway','bpp_oneway','bps_twoway','bpp_twoway','labelvalues']]
validdata_num = validdata_pr[['src_ip','duration','pps','bps_oneway','bpp_oneway','bps_twoway','bpp_twoway','labelvalues']]

In [230]:
traindata_num_noip = traindata_num.drop(['src_ip'],1)
testdata_num_noip = testdata_num.drop(['src_ip'],1)
validdata_num_noip = validdata_num.drop(['src_ip'],1)

Train and Test the model with all the numerical features

In [231]:
#try the model to check accuracy
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

ytrain = traindata_num_noip.iloc[:, 6].values
Xtrain = traindata_num_noip.iloc[:, 0:6]

ytest = testdata_num_noip.iloc[:, 6].values
Xtest = testdata_num_noip.iloc[:, 0:6]

In [232]:
sc = StandardScaler()
sc.fit(Xtrain)
Xtrain = pd.DataFrame(sc.transform(Xtrain), columns = Xtrain.columns, index = Xtrain.index)
Xtest = pd.DataFrame(sc.transform(Xtest), columns = Xtest.columns, index = Xtest.index)

In [233]:
c = LogisticRegression(random_state = 0, solver = 'newton-cg',max_iter = 100000).fit(Xtrain, ytrain)
ypred = c.predict(Xtest)

First attempt -> 77% accuracy

In [234]:
print(metrics.classification_report(ytest, ypred, digits=4))

              precision    recall  f1-score   support

           0     0.2428    0.5250    0.3321     60641
           1     0.9350    0.8067    0.8662    513742

    accuracy                         0.7770    574383
   macro avg     0.5889    0.6659    0.5991    574383
weighted avg     0.8619    0.7770    0.8098    574383



Perform Feature selection with ANOVA test

In [235]:
#MI
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

#apply SelectKBest class to extract best features
bestfeatures = SelectKBest(score_func = f_classif, k = 6)
fit = bestfeatures.fit(Xtrain, ytrain)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(Xtrain.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
print(featureScores.nlargest(6, 'Score'))  #print 20 best features

        Specs         Score
4  bps_twoway  92153.664519
1         pps  60813.221138
2  bps_oneway  53772.078221
0    duration  37207.533752
5  bpp_twoway  36316.858475
3  bpp_oneway  14185.250969


In [236]:
bestFeatures = featureScores.nlargest(3,'Score')
Xtrain1 = Xtrain[bestFeatures['Specs']]
Xtest1 = Xtest[bestFeatures['Specs']]

Train and Test with the best 3 features from our dataset

In [237]:
c1 = LogisticRegression(random_state = 0, solver = 'newton-cg',max_iter = 100000).fit(Xtrain1, ytrain)
ypred = c1.predict(Xtest1)

Second attempt -> 79.84% accuracy

In [238]:
print(metrics.classification_report(ytest, ypred, digits=4))

              precision    recall  f1-score   support

           0     0.1969    0.2955    0.2363     60641
           1     0.9116    0.8578    0.8839    513742

    accuracy                         0.7984    574383
   macro avg     0.5543    0.5766    0.5601    574383
weighted avg     0.8362    0.7984    0.8155    574383



Sort values to select IP

In [239]:
ypred_df = pd.DataFrame(data=ypred,columns=['predictedlabel'])
ytest_df = pd.DataFrame(data=ytest,columns=['truelabel'])
srcip_df = testdata[['src_ip','streamID']]
Xcomparison = pd.concat([Xtest1,srcip_df,ypred_df,ytest_df], axis=1)
Xcomparison_sort = Xcomparison.sort_values(by=['src_ip'])

In [240]:
Xcomparison_sort['predictedlabel'].value_counts()

1    483385
0     90998
Name: predictedlabel, dtype: int64

In [241]:
Xcomparison_sort['truelabel'].value_counts()

1    513742
0     60641
Name: truelabel, dtype: int64

Select IP that contains over 85% as attack class -> 147.101.94.182

In the final lines of code we confirm this IP to be an attack by using group by

In [242]:
XselectedIP = Xcomparison.loc[Xcomparison['src_ip'] == '147.101.94.182']

In [243]:
XselectedIP['predictedlabel'].value_counts()

1    17462
0     2843
Name: predictedlabel, dtype: int64

In [244]:
XselectedIP['truelabel'].value_counts()

1    20305
Name: truelabel, dtype: int64

Start the FGSM method

Copy test set with 3 features to keep track, also the values

In [245]:
selectedrow1 = Xtest1
labelvalue1 = testdata_num[['labelvalues']]

Get the probability estimates from Attack class from column with index = 1 and subtract true values to calculate the gradient of cost with respect to test data

In [246]:
delta1 = c1.predict_proba(selectedrow1.values).T[[1]].T - labelvalue1.values

Get the LR model weights

In [247]:
weight1 = c1.coef_

Get the direction matrix

In [248]:
direction1 = np.sign(np.matmul(delta1,weight1))
direction1_df = pd.DataFrame(data = direction1,columns=['d1','d2','d3'])
direction1_df1 = pd.concat([testdata['streamID'],direction1_df], axis=1)
direction1_df1_sel = direction1_df1[direction1_df1.streamID.isin(XselectedIP.streamID)]

Use auxiliary numpy arrays to calculate FGSM

In [249]:
XselectedIP_aux = XselectedIP.drop(['src_ip','streamID','predictedlabel','truelabel'],1)
direction1_df1_sel_aux = direction1_df1_sel.drop(['streamID'],1)
XselectedIP_aux = XselectedIP_aux.to_numpy()
direction1_df1_sel_aux = direction1_df1_sel_aux.to_numpy()

Calculate Adversarial Samples with FGSM step (Epsilon = 0.8)

In [250]:
Xadv1 = XselectedIP_aux + 0.8*direction1_df1_sel_aux

Set streamID as index

In [251]:
Xadv1_df = pd.DataFrame(data=Xadv1,columns=['bps_twoway','pps','bps_oneway'])
XselectedIP_res = XselectedIP.reset_index(drop=True)
Xadv1_df_res = Xadv1_df.reset_index(drop=True)
Xadv1_df_1_aux = XselectedIP_res.drop(['bps_twoway','pps','bps_oneway'],1)
Xadv1_df_1 = pd.concat([Xadv1_df_1_aux,Xadv1_df_res],axis=1)
Xadv1_df_1 = Xadv1_df_1.set_index('streamID')
Xadv1_df_2 = Xadv1_df_1.drop(['src_ip','predictedlabel','truelabel'],1)

(*****) Get first and final values of Adversarial Samples to keep track and assess at the end

In [252]:
Xadv1_df_2.iloc[0]

bps_twoway   -0.416293
pps           1.193303
bps_oneway    1.049458
Name: 31, dtype: float64

In [253]:
Xadv1_df_2.iloc[20304]

bps_twoway   -1.250522
pps           0.481799
bps_oneway    0.537201
Name: 479629, dtype: float64

Put streamID as index in the test set to update the attacked features' values

In [254]:
XtestNew = pd.concat([Xtest1,testdata['streamID']],axis=1)
XtestNew = XtestNew.set_index('streamID')

(*****) These are the first and final values of the selected IP from the original test set according to the streamID

In [255]:
XtestNew.iloc[30]

bps_twoway    0.383707
pps           0.393303
bps_oneway    0.249458
Name: 31, dtype: float64

In [256]:
XtestNew.iloc[479628]

bps_twoway   -0.450522
pps          -0.318201
bps_oneway   -0.262799
Name: 479629, dtype: float64

Hold the dataframes for comparison and apply update

In [257]:
XtestNew1 = XtestNew.copy()
XtestNew1.update(Xadv1_df_2)

In [258]:
XtestNew1.iloc[30]

bps_twoway   -0.416293
pps           1.193303
bps_oneway    1.049458
Name: 31, dtype: float64

In [259]:
XtestNew1.iloc[479628]

bps_twoway   -1.250522
pps           0.481799
bps_oneway    0.537201
Name: 479629, dtype: float64

Values are correctly updated, OK to proceed

Now we test the model again with the adversarial samples set for IP = 147.101.94.182

In [260]:
ypred1 = c1.predict(XtestNew1)

Third attemot: Accuracy goes down from 79.84% to 76.80%

In [261]:
print(metrics.classification_report(ytest, ypred1, digits=4))

              precision    recall  f1-score   support

           0     0.1652    0.2955    0.2119     60641
           1     0.9083    0.8238    0.8640    513742

    accuracy                         0.7680    574383
   macro avg     0.5368    0.5596    0.5380    574383
weighted avg     0.8299    0.7680    0.7951    574383



Now we check that labels of IP = 147.101.94.182 are all classified as Normal

In [262]:
ypred_df = pd.DataFrame(data=ypred1,columns=['predictedlabel'])
ytest_df = pd.DataFrame(data=ytest,columns=['truelabel'])
srcip_df = testdata[['src_ip','streamID']]
Xcomparison = pd.concat([XtestNew1,srcip_df,ypred_df,ytest_df], axis=1)
Xcomparison_sort = Xcomparison.sort_values(by=['src_ip'])
XselectedIP = Xcomparison.loc[Xcomparison['src_ip'] == '147.101.94.182']

Now the targeted IP classified all as Normal as expected by the FGSM model, in the first attempt we predicted 17462 Attack labels, now we have all the 20305 samples predicted as Normal

In [263]:
XselectedIP['predictedlabel'].value_counts()

0.0    20305
Name: predictedlabel, dtype: int64

In [264]:
XselectedIP['truelabel'].value_counts()

1.0    20305
Name: truelabel, dtype: int64

Export

In [265]:
XselectedIP = XselectedIP.rename(columns={'bps_twoway':'bps_twoway_advSamp','pps':'pps_advSamp','bps_oneway':'bps_oneway_advSamp'})

In [266]:
XselectedIPfeats = testdata.loc[testdata['src_ip'] == '147.101.94.182']

In [267]:
Xtest_export = pd.concat([XselectedIP['bps_twoway_advSamp'],XselectedIP['pps_advSamp'],XselectedIP['bps_oneway_advSamp'],XselectedIPfeats],axis=1)

In [268]:
Xaux = Xtest1.copy()

In [269]:
Xaux = Xaux.rename(columns={'bps_twoway':'bps_twoway_origScaled','pps':'pps_origScaled','bps_oneway':'bps_oneway_origScaled'})

In [270]:
Xaux1 = pd.concat([Xaux,testdata['src_ip']],axis=1)

In [271]:
Xaux2 = Xaux1.loc[Xaux1['src_ip'] == '147.101.94.182']

In [272]:
Xaux3 = pd.concat([Xaux2['bps_twoway_origScaled'],Xaux2['pps_origScaled'],Xaux2['bps_oneway_origScaled'],Xtest_export],axis=1)

In [273]:
Xaux3.to_csv('task2_export.csv',index=False)

Generate Identified Botnet IP addresses

In [274]:
traindata_num_filt = traindata_num.drop(['duration','bps_oneway','bpp_oneway'],1)
testdata_num_filt = testdata_num.drop(['duration','bps_oneway','bpp_oneway'],1)

In [275]:
#group by src_ip
traingroup = traindata_num_filt.groupby(['src_ip']).mean()
traingroup['labelvalues'] = traingroup['labelvalues'].round()
traingroup['labelvalues'].value_counts()

0.0    70
1.0    48
Name: labelvalues, dtype: int64

In [276]:
#group by src_ip
testgroup = testdata_num_filt.groupby(['src_ip']).mean()
testgroup['labelvalues'] = testgroup['labelvalues'].round()
testgroup['labelvalues'].value_counts()

0.0    29
1.0    28
Name: labelvalues, dtype: int64

In [277]:
traingroup['src_ip'] = traingroup.index
traingroup = traingroup.reset_index(drop = True)
testgroup['src_ip'] = testgroup.index
testgroup = testgroup.reset_index(drop = True)

In [278]:
Xtrain2 = traingroup.drop(['labelvalues','src_ip'],1)
ytrain2 = traingroup[['labelvalues']]
Xtest2 = testgroup.drop(['labelvalues','src_ip'],1)
ytest2 = testgroup[['labelvalues']]
sc = StandardScaler()
sc.fit(Xtrain2)
Xtrain2 = pd.DataFrame(sc.transform(Xtrain2),columns=Xtrain2.columns,index=Xtrain2.index)
Xtest2 = pd.DataFrame(sc.transform(Xtest2),columns=Xtest2.columns,index=Xtest2.index)
c = LogisticRegression(random_state = 0, solver = 'liblinear').fit(Xtrain2, ytrain2)
ypred2 = c.predict(Xtest2)
#accuracy went up by 10 points
print(metrics.classification_report(ytest2, ypred2, digits=4))

              precision    recall  f1-score   support

         0.0     1.0000    0.7931    0.8846        29
         1.0     0.8235    1.0000    0.9032        28

    accuracy                         0.8947        57
   macro avg     0.9118    0.8966    0.8939        57
weighted avg     0.9133    0.8947    0.8938        57



  y = column_or_1d(y, warn=True)


Confirm that 147.101.94.182 is attack after group by

In [279]:
#search for IP to attack
predictedlabels = pd.DataFrame(ypred2,columns=['predictedlabels'])
comparison = pd.concat([testgroup[['src_ip','labelvalues']].copy(),predictedlabels],1)
comparison

Unnamed: 0,src_ip,labelvalues,predictedlabels
0,122.226.12.150,0.0,1.0
1,147.101.94.182,1.0,1.0
2,147.113.94.198,1.0,1.0
3,147.114.98.192,1.0,1.0
4,147.122.97.192,1.0,1.0
5,147.130.96.195,1.0,1.0
6,147.143.92.183,1.0,1.0
7,147.148.92.184,1.0,1.0
8,147.153.100.188,1.0,1.0
9,147.153.92.185,1.0,1.0


In [280]:
comparison.to_csv('task2_botnetIPs_export.csv',index=False)