In [1]:
# Step 2 of the pipeline (the feature columns used below come from step 1).
# Silence every Python warning for the rest of the session so library notices
# do not clutter the cell outputs.
# NOTE(review): a blanket "ignore" also hides warnings that point at real
# problems (e.g. pandas chained-assignment warnings) — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import QuantileTransformer
import time

In [None]:
# Read the dataset
df = pd.read_csv('./UNSW_final_baseline_undersampled.csv')

# Start the stopwatch once the file is loaded; the elapsed time is printed in
# the last cell of the notebook.
start_time = time.time()

# Here, use the columns derived from step 1 (positional indices into the CSV).
columns_to_extract = [8, 6, 13, 10, 2, 19, 34, 7, 36, 37, 3, 1, 27, 5, 17, 4, 0, 35, 26, 25]

# Append the "label" column, i.e. the last column of the file.
columns_to_extract.append(df.shape[1] - 1)

# Take an explicit copy so the assignment below writes to an independent frame
# instead of a selection of the raw one (avoids SettingWithCopyWarning).
df = df.iloc[:, columns_to_extract].copy()

# Encode the class labels as consecutive integers. Assigning by column name
# replaces the whole column, so it ends up with an integer dtype; an
# `iloc[:, -1] = ...` assignment may instead keep the original (object) dtype.
labelencoder = LabelEncoder()
df[df.columns[-1]] = labelencoder.fit_transform(df.iloc[:, -1])


In [4]:
# Preview the selected feature columns together with the encoded label
df

Unnamed: 0,sbytes,state,sloss,dttl,sport,swin,tcprtt,dur,is_sm_ips_ports,ct_state_ttl,...,srcip,sjit,proto,spkts,dsport,flow_id,ackdat,trans_depth,res_bdy_len,label
0,0.000000,1.265130,0.000000,0.000000,0.989555,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.815262,0.000000,0.000000,0.000000,0.958216,-1.041790,0.000000,0.000000,0.000000,6
1,0.000000,1.265130,0.000000,0.000000,0.989555,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.815262,0.000000,0.000000,0.000000,0.958216,-1.044133,0.000000,0.000000,0.000000,6
2,0.000000,1.265130,0.000000,0.000000,0.989555,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.815262,0.000000,0.000000,0.000000,0.958216,-1.043920,0.000000,0.000000,0.000000,6
3,0.000000,1.265130,0.000000,0.000000,0.989555,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.815262,0.000000,0.000000,0.000000,0.958216,-1.044098,0.000000,0.000000,0.000000,6
4,0.000000,1.265130,0.000000,0.000000,0.989555,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.815262,0.000000,0.000000,0.000000,0.958216,-1.042749,0.000000,0.000000,0.000000,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19567,-0.090612,-0.895813,-0.149201,4.875937,0.844210,0.604714,5.268379,-0.052035,-0.197784,0.964813,...,-0.615498,-0.066170,-0.305177,-0.468069,-1.218803,0.058036,5.960295,-0.050143,-0.014219,4
19568,-0.088286,-0.895813,-0.179381,4.875937,0.845453,0.604714,3.872846,-0.031605,-0.197784,0.964813,...,-0.615498,0.176210,-0.305177,-0.425371,-1.218803,0.058107,2.756131,-0.050143,-0.014219,4
19569,-0.087598,-0.895813,-0.149201,4.875937,0.845453,0.604714,3.872846,-0.036011,-0.197784,0.964813,...,-0.615498,30.515189,-0.305177,-0.425371,-1.218803,0.058107,2.756131,-0.050143,-0.014219,4
19570,-0.083777,-0.895813,-0.058661,4.875937,0.848160,0.604714,3.480929,-0.036551,-0.197784,0.964813,...,-0.615498,0.024663,-0.305177,-0.339976,-1.708583,0.058178,4.679006,-0.050143,-0.014219,3


In [5]:
# Transform all features into the scale of [0,1]

# Every non-object column except the target counts as a feature; leaving
# 'label' out here keeps the class ids from being rescaled.
numeric_features = (
    df.drop(columns=['label'])
    .dtypes
    .loc[lambda dtypes: dtypes != 'object']
    .index
)

# Fixed seed: QuantileTransformer estimates the quantiles from a random
# subsample when the data has more rows than its `subsample` setting, so an
# unseeded transformer can produce different values from run to run.
scaler = QuantileTransformer(random_state=42)
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Make sure the target is a plain integer column for the later steps.
df['label'] = df['label'].astype('int')

In [6]:
# Stretch the scaled feature values from [0,1] onto [0,255].
# NOTE: not idempotent — running this cell a second time multiplies by 255 again.
df[numeric_features] = df[numeric_features] * 255

In [7]:
# Separate the target vector from the feature matrix
y = df['label']
X = df.drop(columns=['label'])

In [8]:
# Class distribution before oversampling: number of rows per encoded label
df['label'].value_counts()

7    6342
5    5612
6    4048
3    1484
4    1058
2     424
1     290
0     259
8      50
9       5
Name: label, dtype: int64

In [9]:
# Persist the scaled frame as it stands before oversampling.
# NOTE(review): the same path is written again later in this notebook with the
# SMOTE-resampled frame, so this copy gets overwritten — confirm whether this
# write is still needed or should go to a differently named file.
df.to_csv('UNSW_final_baseline_smote.csv', index=False)

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample the seven smallest classes to 1800 rows each; classes that are
# not listed in `sampling_strategy` are left at their original size.
# k_neighbors=1 is presumably required because the rarest class has only a
# handful of samples — TODO confirm before raising it.
# random_state pins the synthetic samples so the output file is reproducible.
# The deprecated `n_jobs` argument is intentionally not passed; it only
# controlled parallelism of the neighbour search, not the result.
smote = SMOTE(
    sampling_strategy={9: 1800, 8: 1800, 0: 1800, 2: 1800, 1: 1800, 3: 1800, 4: 1800},
    k_neighbors=1,
    random_state=42,
)

X_resampled, y_resampled = smote.fit_resample(X, y)

In [12]:
# Stitch the resampled features and labels back into one frame, label last
df_resampled = pd.concat(
    (
        pd.DataFrame(X_resampled, columns=X.columns),
        pd.DataFrame(y_resampled, columns=['label']),
    ),
    axis='columns',
)

In [13]:
# Preview the frame after SMOTE oversampling
df_resampled

Unnamed: 0,sbytes,state,sloss,dttl,sport,swin,tcprtt,dur,is_sm_ips_ports,ct_state_ttl,...,srcip,sjit,proto,spkts,dsport,flow_id,ackdat,trans_depth,res_bdy_len,label
0,203.566066,255.00000,198.971471,198.843844,255.000000,133.881381,199.481982,211.861862,225.007508,105.037538,...,255.000000,199.737237,126.989489,182.379880,255.000000,31.904248,199.737237,226.156156,227.687688,6
1,203.566066,255.00000,198.971471,198.843844,255.000000,133.881381,199.481982,211.861862,225.007508,105.037538,...,255.000000,199.737237,126.989489,182.379880,255.000000,31.833235,199.737237,226.156156,227.687688,6
2,203.566066,255.00000,198.971471,198.843844,255.000000,133.881381,199.481982,211.861862,225.007508,105.037538,...,255.000000,199.737237,126.989489,182.379880,255.000000,31.839691,199.737237,226.156156,227.687688,6
3,203.566066,255.00000,198.971471,198.843844,255.000000,133.881381,199.481982,211.861862,225.007508,105.037538,...,255.000000,199.737237,126.989489,182.379880,255.000000,31.834311,199.737237,226.156156,227.687688,6
4,203.566066,255.00000,198.971471,198.843844,255.000000,133.881381,199.481982,211.861862,225.007508,105.037538,...,255.000000,199.737237,126.989489,182.379880,255.000000,31.875197,199.737237,226.156156,227.687688,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28597,128.167032,50.03003,108.483483,240.450450,109.531138,160.810811,250.708424,241.982577,0.000000,145.750751,...,77.085522,237.830365,12.379880,161.058561,196.291291,137.346311,251.513740,0.000000,0.000000,9
28598,164.823107,50.03003,108.483483,240.450450,48.516009,160.810811,243.590521,237.783549,0.000000,145.750751,...,32.608355,233.280489,12.379880,232.837677,196.291291,83.225537,249.008026,252.702703,254.776597,9
28599,133.466416,50.03003,108.483483,240.450450,138.511845,160.810811,244.974839,241.567284,0.000000,145.750751,...,95.105517,235.130558,12.379880,189.300981,196.291291,154.625109,246.161106,0.000000,0.000000,9
28600,137.892740,50.03003,108.483483,240.450450,162.718050,160.810811,240.185849,241.220409,0.000000,145.750751,...,110.156761,232.875537,12.379880,212.890530,196.291291,169.057266,241.690305,0.000000,0.000000,9


In [15]:
# Write the SMOTE-resampled frame to disk, omitting the index column
df_resampled.to_csv(path_or_buf='UNSW_final_baseline_smote.csv', index=False)

In [16]:
# Report the wall-clock time elapsed since start_time was recorded
end_time = time.time()
elapsed_seconds = end_time - start_time

print(f"Elapsed time: {elapsed_seconds} seconds.")

Elapsed time: 50.18856072425842 seconds.
