In [1]:
pip install --upgrade tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install altair

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install pydot

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [6]:
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
from collections import Counter
from imblearn.over_sampling import SMOTE

In [7]:
def feature_target_split(data, target_col='attack_cat'):
  """Split a frame into its feature columns and its target column.

  The target column is removed from ``data`` in place via ``data.pop``, so
  pass a copy when the original frame must stay intact.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame that contains ``target_col``.
  target_col : str, default 'attack_cat'
      Name of the column to split off as the target.

  Returns
  -------
  tuple
      ``(data, label)``: the same ``data`` object, now without
      ``target_col``, and the popped column.

  Raises
  ------
  KeyError
      If ``target_col`` is not a column of ``data``.
  """
  label = data.pop(target_col)
  return data, label
def group_by_dtype(data):
    """Group the column names of ``data`` by the kind of dtype they hold.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    dict
        Mapping with the keys ``'int'``, ``'float'``, ``'object'`` and
        ``'labels'``. Each value is a list of column names in column order:
        float columns of any width go to ``'float'``, integer columns of any
        width go to ``'int'`` and every other column goes to ``'object'``.
        ``'labels'`` is never filled here and is always an empty list.
    """
    result = {'int': [], 'float': [], 'object': [], 'labels': []}
    # ``dtypes.items()`` yields one (name, dtype) pair per column, so the dtype
    # lookup stays a scalar even when column names are duplicated.
    for col, dtype in data.dtypes.items():
        if pd.api.types.is_float_dtype(dtype):
            result['float'].append(col)
        elif pd.api.types.is_integer_dtype(dtype):
            result['int'].append(col)
        else:
            result['object'].append(col)
    return result

def balance_dataset(X, y, random_state=42):
  """Resample ``X`` and ``y`` with SMOTE.

  Parameters
  ----------
  X : features passed unchanged to ``SMOTE.fit_resample``.
  y : targets passed unchanged to ``SMOTE.fit_resample``.
  random_state : default 42
      Seed forwarded to ``SMOTE`` so repeated runs use the same seed.

  Returns
  -------
  tuple
      ``(X_res, y_res)`` as returned by ``SMOTE.fit_resample``.
  """
  sm = SMOTE(random_state=random_state)
  X_res, y_res = sm.fit_resample(X, y)
  return X_res, y_res

In [8]:
# Load the two CSV partitions from the local ./data folder
imbalanced_train_val = pd.read_csv('./data/UNSW_NB15_training-set.csv')
test = pd.read_csv('./data/UNSW_NB15_testing-set.csv')
# Drop unwanted columns
imbalanced_train_val = imbalanced_train_val.drop(columns=['id', 'label'])
test = test.drop(columns=['id', 'label'])
# ignore_index=True gives the combined frame one continuous index; without it
# the row labels of both partitions are kept as-is and therefore repeat.
combined_df = pd.concat([imbalanced_train_val, test], ignore_index=True)
print(combined_df.shape, 'combined examples')
print(imbalanced_train_val.shape, 'imbalanced_train_val examples')
print(test.shape, 'test examples')

(257673, 43) combined examples
(82332, 43) imbalanced_train_val examples
(175341, 43) test examples


In [9]:
# Step 1: display the first 5 rows of the combined dataset
combined_df.head(n=5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat
0,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,1,2,0,0,0,1,2,0,Normal
1,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,1,2,0,0,0,1,2,0,Normal
2,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,1,3,0,0,0,1,3,0,Normal
3,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,2,1,3,0,0,0,2,3,0,Normal
4,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,2,1,3,0,0,0,2,3,0,Normal


In [10]:
# Step 2: count the missing values over every column of the combined set
empty_cell_total = combined_df.isna().sum().sum()
print('number of empty cell in combined set is: {}'.format(empty_cell_total))

number of empty cell in combined set is: 0


In [11]:
# Step 3: separate the features from the target, working on a copy of the combined set
_combined_df = combined_df.copy()
_features, _target = feature_target_split(_combined_df)
print('Feature shape: {} rows and {} columns'.format(*_features.shape))
print('Target shape: {} rows and 1 columns'.format(len(_target)))


Feature shape: 257673 rows and 42 columns
Target shape: 257673 rows and 1 columns


In [12]:
# Step 4: group the feature columns by dtype and chart how many fall in each group
features_col = group_by_dtype(_features)
# One (group name, number of columns) row per dtype group
features_table = pd.DataFrame(
    [(group, len(features_col[group])) for group in ('int', 'float', 'object')],
    columns=['Name', 'Count'],
)
# Bar chart that references the dataframe column names
alt.Chart(
    features_table,
    title="Categories of dataframe columns by type",
).mark_bar().encode(x='Count', y='Name', color='Name')

In [13]:
# Imbalanced data
# Step 5: visualise how often each target category occurs in the combined dataset
combined = combined_df.copy()
_features, _target = feature_target_split(combined)
# One row per category with its number of occurrences
target_table = (
    _target.value_counts()
    .rename_axis('Name')
    .reset_index(name='Count')
)
alt.Chart(
    target_table,
    title="Categories of target column in imbalanced dataset (attack type)",
).mark_bar().encode(x='Count', y='Name', color='Name')

In [14]:
# Split the imbalanced train/val set into features and target.
# feature_target_split pops the target column from the frame it receives,
# so a copy is passed to keep `imbalanced_train_val` intact.
train_val_df=imbalanced_train_val.copy()
_features,_target=feature_target_split(train_val_df)

In [22]:
# Create a balanced dataset from the imbalanced train/val set.
# The inputs are rebuilt here from `imbalanced_train_val` instead of being read
# from the shared `_features`/`_target` globals, which several other cells
# reassign; re-running this cell therefore always starts from the imbalanced data.
train_val_features, train_val_target = feature_target_split(imbalanced_train_val.copy())

# Replace every string column by integer codes before resampling and remember
# each column's code -> string lookup so the codes can be decoded afterwards.
# NOTE(review): the codes are handed to SMOTE like any other numeric column; if
# that is not intended, imblearn's SMOTENC is presumably the better fit -- confirm.
string_uniques = {}
string_features = group_by_dtype(train_val_features)['object']
# Explicit copy: the columns of this slice are overwritten in the loop below.
string_df = train_val_features[string_features].copy()
encoded_features = train_val_features.copy()
for n in string_features:
    codes, uniques = pd.factorize(string_df[n])
    string_uniques[n] = uniques
    string_df[n] = codes
encoded_features[string_features] = string_df

# Resample and report the class counts before and after
print('Original dataset shape %s' % Counter(train_val_target))
_features_res, _target_res = balance_dataset(encoded_features, train_val_target)
print('Resampled dataset shape %s' % Counter(_target_res))

# Decode the integer codes back to the original strings
features_res = _features_res.copy()
for n in string_features:
    uniques = string_uniques[n]
    # One array lookup per column instead of one Python call per row
    features_res[n] = uniques.to_numpy()[features_res[n].to_numpy()]
resampled_train_val = pd.concat([features_res, _target_res], axis=1)  # train val dataset resampled

Original dataset shape Counter({'Normal': 37000, 'Reconnaissance': 37000, 'Backdoor': 37000, 'DoS': 37000, 'Exploits': 37000, 'Analysis': 37000, 'Fuzzers': 37000, 'Worms': 37000, 'Shellcode': 37000, 'Generic': 37000})
Resampled dataset shape Counter({'Normal': 37000, 'Reconnaissance': 37000, 'Backdoor': 37000, 'DoS': 37000, 'Exploits': 37000, 'Analysis': 37000, 'Fuzzers': 37000, 'Worms': 37000, 'Shellcode': 37000, 'Generic': 37000})


In [23]:
# Display the resampled train/val frame (features plus the attack_cat target)
resampled_train_val

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat
0,0.000011,udp,-,INT,2,0,496,0,90909.090200,254,...,1,1,2,0,0,0,1,2,0,Normal
1,0.000008,udp,-,INT,2,0,1762,0,125000.000300,254,...,1,1,2,0,0,0,1,2,0,Normal
2,0.000005,udp,-,INT,2,0,1068,0,200000.005100,254,...,1,1,3,0,0,0,1,3,0,Normal
3,0.000006,udp,-,INT,2,0,900,0,166666.660800,254,...,2,1,3,0,0,0,2,3,0,Normal
4,0.000010,udp,-,INT,2,0,2126,0,100000.002500,254,...,2,1,3,0,0,0,2,3,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369995,1.424299,tcp,http,FIN,15,45,1432,50553,33.565271,254,...,1,1,1,0,0,0,1,1,0,Worms
369996,0.467546,tcp,http,FIN,10,6,1298,268,46.072385,254,...,1,1,1,0,0,1,1,1,0,Worms
369997,3.518716,tcp,http,FIN,41,220,2353,280081,54.888184,254,...,2,1,1,0,0,1,2,1,0,Worms
369998,0.525146,tcp,http,FIN,10,6,1267,268,30.279165,254,...,1,1,1,0,0,1,1,1,0,Worms


In [24]:
# Visualise how often each target category occurs in the resampled train/val set
resampled = resampled_train_val.copy()
_features, _target = feature_target_split(resampled)
# One row per category with its number of occurrences
target_table = (
    _target.value_counts()
    .rename_axis('Name')
    .reset_index(name='Count')
)
alt.Chart(
    target_table,
    title="Categories of target column in train val (attack type)",
).mark_bar().encode(x='Count', y='Name', color='Name')

In [25]:
# Bind the three frames to upper-case names and persist them with IPython's
# %store magic, so they can be restored elsewhere (e.g. with `%store -r`).
IMBALANCED_TRAIN_VAL =imbalanced_train_val
RESAMPLED_TRAIN_VAL =resampled_train_val
TEST=test
%store IMBALANCED_TRAIN_VAL
%store RESAMPLED_TRAIN_VAL
%store TEST

Stored 'IMBALANCED_TRAIN_VAL' (DataFrame)
Stored 'RESAMPLED_TRAIN_VAL' (DataFrame)
Stored 'TEST' (DataFrame)


In [26]:
# Display the resampled train/val frame once more, after it has been stored
resampled_train_val

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat
0,0.000011,udp,-,INT,2,0,496,0,90909.090200,254,...,1,1,2,0,0,0,1,2,0,Normal
1,0.000008,udp,-,INT,2,0,1762,0,125000.000300,254,...,1,1,2,0,0,0,1,2,0,Normal
2,0.000005,udp,-,INT,2,0,1068,0,200000.005100,254,...,1,1,3,0,0,0,1,3,0,Normal
3,0.000006,udp,-,INT,2,0,900,0,166666.660800,254,...,2,1,3,0,0,0,2,3,0,Normal
4,0.000010,udp,-,INT,2,0,2126,0,100000.002500,254,...,2,1,3,0,0,0,2,3,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369995,1.424299,tcp,http,FIN,15,45,1432,50553,33.565271,254,...,1,1,1,0,0,0,1,1,0,Worms
369996,0.467546,tcp,http,FIN,10,6,1298,268,46.072385,254,...,1,1,1,0,0,1,1,1,0,Worms
369997,3.518716,tcp,http,FIN,41,220,2353,280081,54.888184,254,...,2,1,1,0,0,1,2,1,0,Worms
369998,0.525146,tcp,http,FIN,10,6,1267,268,30.279165,254,...,1,1,1,0,0,1,1,1,0,Worms
