In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
from sklearn.preprocessing import LabelEncoder

In [2]:
# Column names and dtypes shared by the three headerless input files.
# The validation file has one additional trailing `label` column.
flow_columns = [
    'timestamp', 'duration', 'protocol', 'src_ip', 'src_port', 'direction',
    'dst_ip', 'dst_port', 'state', 'src_type_service', 'dst_type_service',
    'number_total_packets', 'bytes_both_directions', 'bytes_src_to_dst',
]
flow_dtypes = {
    'timestamp': object,
    'duration': float,
    'protocol': object,
    'src_ip': object,
    'src_port': object,
    'direction': object,
    'dst_ip': object,
    'dst_port': object,
    'state': object,
    'src_type_service': object,
    'dst_type_service': object,
    'number_total_packets': int,
    'bytes_both_directions': int,
    'bytes_src_to_dst': int,
}
read_options = dict(encoding="utf8", low_memory=False)

training_df = pd.read_csv("training_data", names=flow_columns, dtype=flow_dtypes, **read_options)
test_df = pd.read_csv("test_data", names=flow_columns, dtype=flow_dtypes, **read_options)
valid_df = pd.read_csv(
    "valid_data_with_labels",
    names=flow_columns + ['label'],
    dtype={**flow_dtypes, 'label': object},
    **read_options,
)

In [3]:
# Work on independent deep copies so the frames as loaded stay untouched.
traindata = training_df.copy(deep=True)
testdata = test_df.copy(deep=True)
validdata = valid_df.copy(deep=True)

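Cleaning training data: fill missing port, state and service values with the string 'None' and strip whitespace from `direction`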

In [4]:
# Replace missing values in the categorical columns with the literal string
# 'None'. The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selection is a chained in-place update,
# which pandas warns about and which leaves the frame unchanged when
# Copy-on-Write is enabled.
for col in ('src_port', 'dst_port', 'state', 'src_type_service', 'dst_type_service'):
    traindata[col] = traindata[col].fillna('None')

# Remove leading/trailing whitespace from the direction field.
traindata['direction'] = traindata['direction'].str.strip()

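Cleaning test data: fill missing port, state and service values with the string 'None' and strip whitespace from `direction`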

In [5]:
# Replace missing values in the categorical columns with the literal string
# 'None'. The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selection is a chained in-place update,
# which pandas warns about and which leaves the frame unchanged when
# Copy-on-Write is enabled.
for col in ('src_port', 'dst_port', 'state', 'src_type_service', 'dst_type_service'):
    testdata[col] = testdata[col].fillna('None')

# Remove leading/trailing whitespace from the direction field.
testdata['direction'] = testdata['direction'].str.strip()

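Cleaning validation data: fill missing port, state and service values with the string 'None' and strip whitespace from `direction`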

In [6]:
# Replace missing values in the categorical columns with the literal string
# 'None'. The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selection is a chained in-place update,
# which pandas warns about and which leaves the frame unchanged when
# Copy-on-Write is enabled.
for col in ('src_port', 'dst_port', 'state', 'src_type_service', 'dst_type_service'):
    validdata[col] = validdata[col].fillna('None')

# Remove leading/trailing whitespace from the direction field.
validdata['direction'] = validdata['direction'].str.strip()

In [7]:
# Derive a binary target from the text `label` column: 1 when the label
# contains "Botnet" (case-insensitive), 0 otherwise; missing labels map to 0.
truelabels = validdata.loc[:, ['label']].copy()
botnet_mask = truelabels['label'].str.contains('Botnet', case=False, na=False)
truelabels['labelvalues'] = np.where(botnet_mask, 1, 0)

# Keep only the numeric target and attach it to the validation frame by index.
labelvalues = truelabels.loc[:, ['labelvalues']].copy()
validationdata = validdata.join(labelvalues)

In [8]:
# Flow identifier "src_ip/src_port/direction/dst_ip/dst_port", built with a
# vectorised string concatenation rather than a Python-level join per row.
# NOTE: a row with a missing component gets a missing flowID here, whereas
# the row-wise '/'.join raised TypeError on such a row.
testdata['flowID'] = testdata['src_ip'].str.cat(
    testdata[['src_port', 'direction', 'dst_ip', 'dst_port']], sep='/'
)

In [9]:
# Packet rate: total packets divided by flow duration. Zero durations are
# replaced by +inf first, so the quotient is 0 instead of inf/NaN.
testdata['pps'] = testdata['number_total_packets'].div(
    testdata['duration'].replace({0: np.inf})
)

In [10]:
# Source-to-destination byte rate: bytes_src_to_dst divided by duration,
# with zero durations replaced by +inf so the quotient is 0.
testdata['bps_oneway'] = testdata['bytes_src_to_dst'].div(
    testdata['duration'].replace({0: np.inf})
)

In [11]:
# Source-to-destination bytes per packet: bytes_src_to_dst divided by the
# total packet count, with zero counts replaced by +inf so the quotient is 0.
testdata['bpp_oneway'] = testdata['bytes_src_to_dst'].div(
    testdata['number_total_packets'].replace({0: np.inf})
)

In [12]:
# Bidirectional byte rate: bytes_both_directions divided by duration,
# with zero durations replaced by +inf so the quotient is 0.
testdata['bps_twoway'] = testdata['bytes_both_directions'].div(
    testdata['duration'].replace({0: np.inf})
)

In [14]:
# Bidirectional bytes per packet: bytes_both_directions divided by the
# total packet count, with zero counts replaced by +inf so the quotient is 0.
testdata['bpp_twoway'] = testdata['bytes_both_directions'].div(
    testdata['number_total_packets'].replace({0: np.inf})
)

In [31]:
# Write the test frame with its generated feature columns; the index is not written.
testdata.to_csv(path_or_buf='testFeatGen.csv', index=False)

In [15]:
# Flow identifier "src_ip/src_port/direction/dst_ip/dst_port", built with a
# vectorised string concatenation rather than a Python-level join per row.
# NOTE: a row with a missing component gets a missing flowID here, whereas
# the row-wise '/'.join raised TypeError on such a row.
validationdata['flowID'] = validationdata['src_ip'].str.cat(
    validationdata[['src_port', 'direction', 'dst_ip', 'dst_port']], sep='/'
)

In [16]:
# Packet rate: total packets divided by flow duration. Zero durations are
# replaced by +inf first, so the quotient is 0 instead of inf/NaN.
validationdata['pps'] = validationdata['number_total_packets'].div(
    validationdata['duration'].replace({0: np.inf})
)

In [17]:
# Source-to-destination byte rate: bytes_src_to_dst divided by duration,
# with zero durations replaced by +inf so the quotient is 0.
validationdata['bps_oneway'] = validationdata['bytes_src_to_dst'].div(
    validationdata['duration'].replace({0: np.inf})
)

In [18]:
# Source-to-destination bytes per packet: bytes_src_to_dst divided by the
# total packet count, with zero counts replaced by +inf so the quotient is 0.
validationdata['bpp_oneway'] = validationdata['bytes_src_to_dst'].div(
    validationdata['number_total_packets'].replace({0: np.inf})
)

In [19]:
# Bidirectional byte rate: bytes_both_directions divided by duration,
# with zero durations replaced by +inf so the quotient is 0.
validationdata['bps_twoway'] = validationdata['bytes_both_directions'].div(
    validationdata['duration'].replace({0: np.inf})
)

In [20]:
# Bidirectional bytes per packet: bytes_both_directions divided by the
# total packet count, with zero counts replaced by +inf so the quotient is 0.
validationdata['bpp_twoway'] = validationdata['bytes_both_directions'].div(
    validationdata['number_total_packets'].replace({0: np.inf})
)

In [32]:
# Write the validation frame with its generated feature columns; the index is not written.
validationdata.to_csv(path_or_buf='validationFeatGen.csv', index=False)

In [15]:
# Flow identifier "src_ip/src_port/direction/dst_ip/dst_port", built with a
# vectorised string concatenation rather than a Python-level join per row.
# NOTE: a row with a missing component gets a missing flowID here, whereas
# the row-wise '/'.join raised TypeError on such a row.
traindata['flowID'] = traindata['src_ip'].str.cat(
    traindata[['src_port', 'direction', 'dst_ip', 'dst_port']], sep='/'
)

In [16]:
# Packet rate: total packets divided by flow duration. Zero durations are
# replaced by +inf first, so the quotient is 0 instead of inf/NaN.
traindata['pps'] = traindata['number_total_packets'].div(
    traindata['duration'].replace({0: np.inf})
)

In [17]:
# Source-to-destination byte rate: bytes_src_to_dst divided by duration,
# with zero durations replaced by +inf so the quotient is 0.
traindata['bps_oneway'] = traindata['bytes_src_to_dst'].div(
    traindata['duration'].replace({0: np.inf})
)

In [18]:
# Source-to-destination bytes per packet: bytes_src_to_dst divided by the
# total packet count, with zero counts replaced by +inf so the quotient is 0.
traindata['bpp_oneway'] = traindata['bytes_src_to_dst'].div(
    traindata['number_total_packets'].replace({0: np.inf})
)

In [19]:
# Bidirectional byte rate: bytes_both_directions divided by duration,
# with zero durations replaced by +inf so the quotient is 0.
traindata['bps_twoway'] = traindata['bytes_both_directions'].div(
    traindata['duration'].replace({0: np.inf})
)

In [20]:
# Bidirectional bytes per packet: bytes_both_directions divided by the
# total packet count, with zero counts replaced by +inf so the quotient is 0.
traindata['bpp_twoway'] = traindata['bytes_both_directions'].div(
    traindata['number_total_packets'].replace({0: np.inf})
)

In [32]:
# Write the training frame with its generated feature columns; the index is not written.
traindata.to_csv(path_or_buf='trainFeatGen.csv', index=False)