In [142]:
import pandas as pd
from sklearn.decomposition import PCA
from time import time
import numpy as np
import matplotlib.pyplot as plt; plt.style.use('dark_background')
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import warnings; warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectPercentile, f_classif, RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [143]:
col_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", 
             "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", 
             "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", 
             "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", 
             "srv_diff_host_rate", "dst_host_count","dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", 
             "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", 
             "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]

num_features = ["duration", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", 
                "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells",
                "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", 
                "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
                "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", 
                "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
                "dst_host_rerror_rate", "dst_host_srv_rerror_rate"]

pd_df = pd.read_csv('../pump_sensor_data/sensor.csv')
nd_df = pd.read_csv('../KDD99/corrected.csv', names=col_names)

In [144]:
# Clean Physical Data
pd_df.drop(['sensor_50','sensor_15', 'timestamp'], axis=1, inplace=True)
pd_df = pd_df.dropna()

In [145]:
# Split into X and y
pd_x_features = pd_df.columns.drop('machine_status').tolist()
pd_x = pd_df[pd_x_features]
pd_y = pd_df['machine_status']

nd_x = nd_df[num_features]
nd_y = nd_df['label']

In [176]:
nd_y.value_counts()

label
smurf.              164091
normal.              60593
neptune.             58001
snmpgetattack.        7741
mailbomb.             5000
guess_passwd.         4367
snmpguess.            2406
satan.                1633
warezmaster.          1602
back.                 1098
mscan.                1053
apache2.               794
processtable.          759
saint.                 736
portsweep.             354
ipsweep.               306
httptunnel.            158
pod.                    87
nmap.                   84
buffer_overflow.        22
multihop.               18
named.                  17
sendmail.               17
ps.                     16
xterm.                  13
rootkit.                13
teardrop.               12
xlock.                   9
land.                    9
xsnoop.                  4
ftp_write.               3
perl.                    2
phf.                     2
udpstorm.                2
worm.                    2
loadmodule.              2
sqlattack.            

In [146]:
# Scale the data
scaler = MinMaxScaler()

# Apply Min-Max scaling to each column separately
pd_x_scaled = pd_x.copy()
for column in pd_x.columns:
    column_data = pd_x[column].values.reshape(-1, 1)
    pd_x[column] = scaler.fit_transform(column_data)

pd_x_scaled.columns = pd_x.columns

nd_x_scaled = nd_x.copy()
for column in nd_x.columns:
    column_data = nd_x[column].values.reshape(-1, 1)
    nd_x_scaled[column] = scaler.fit_transform(column_data)

nd_x_scaled.columns = nd_x.columns

In [147]:
def identify_anomaly(row, y, clf):
    pred = clf.predict(row)
    anom_acc = accuracy_score(y, pred)
    return pred, anom_acc
    
def get_anomaly_scores(df_original, df_restored):
    loss = np.sum((np.array(df_original) - np.array(df_restored)) ** 2, axis=1)
    return loss

In [152]:
# Train PCA models
pca_pd = PCA(n_components=10, random_state=0)
pca_pd_x = pd.DataFrame(pca_pd.fit_transform(pd_x_scaled))

X_train_pd_pca, X_test_pd_pca, y_train_pd_pca, y_test_pd_pca = train_test_split(pca_pd_x, pd_y, test_size=0.2, random_state=42)
X_train_pd_org, X_test_pd_org, y_train_pd_org, y_test_pd_org = train_test_split(pd_x_scaled, pd_y, test_size=0.2, random_state=42)

X_train_pd_pca.reset_index(drop=True, inplace=True)
X_test_pd_pca.reset_index(drop=True, inplace=True)
X_train_pd_org.reset_index(drop=True, inplace=True)
X_test_pd_org.reset_index(drop=True, inplace=True)

clf_pd = RandomForestClassifier(random_state = 0)
t0 = time()
clf_pd.fit(X_train_pd_org, y_train_pd_org)
tt = time() - t0
print ("Trained in {} seconds".format(round(tt,3)))

Trained in 59.477 seconds


In [153]:
list_of_anomaly_preds_pd = []
list_of_anomaly_pd = []
list_of_norm_preds_pd = []
list_of_loss_pd = []
list_of_anomaly_loss_pd = []
data_and_preds_pd = []

for index, row in X_test_pd_pca.iterrows():
    inverse_row = pca_pd.inverse_transform(row.values.reshape(1, -1))
    org_row = X_test_pd_org.iloc[index].values.reshape(1, -1)  
    org_y = [y_test_pd_org.iloc[index]] 
    loss = get_anomaly_scores(org_row, inverse_row)
    list_of_loss_pd.append(loss)
    if loss > 0.02:
        list_of_anomaly_loss_pd.append(loss)
        anomaly_pred, anomaly_acc = identify_anomaly(org_row, org_y, clf_pd)
        list_of_anomaly_preds_pd.append(anomaly_acc)
        list_of_anomaly_pd.append(org_y) 
        data_and_preds_pd.append({'Data': org_row.flatten(), 'Prediction': anomaly_pred})     
    else:
        norm_pred = clf_pd.predict(org_row)
        norm_acc = accuracy_score(org_y, norm_pred)
        list_of_norm_preds_pd.append(norm_acc)
        data_and_preds_pd.append({'Data': org_row.flatten(), 'Prediction': norm_pred})

In [154]:
pca_nd = PCA(n_components=10, random_state=0)
pca_nd_x = pd.DataFrame(pca_nd.fit_transform(nd_x_scaled))

X_train_nd_pca, X_test_nd_pca, y_train_nd_pca, y_test_nd_pca = train_test_split(pca_nd_x, nd_y, test_size=0.2, random_state=42)
X_train_nd_org, X_test_nd_org, y_train_nd_org, y_test_nd_org = train_test_split(nd_x_scaled, nd_y, test_size=0.2, random_state=42)

X_train_nd_pca.reset_index(drop=True, inplace=True)
X_test_nd_pca.reset_index(drop=True, inplace=True)
X_train_nd_org.reset_index(drop=True, inplace=True)
X_test_nd_org.reset_index(drop=True, inplace=True)

clf_nd = RandomForestClassifier(random_state = 0)
t0 = time()
clf_nd.fit(X_train_nd_org, y_train_nd_org)
tt = time() - t0
print ("Trained in {} seconds".format(round(tt,3)))

Trained in 12.011 seconds


In [155]:
list_of_anomaly_preds_nd = []
list_of_anomaly_nd = []
list_of_norm_preds_nd = []
list_of_loss_nd = []
list_of_anomaly_loss_nd = []
data_and_preds_nd = []


for index, row in X_test_nd_pca.iterrows():
    inverse_row = pca_nd.inverse_transform(row.values.reshape(1, -1))
    org_row = X_test_nd_org.iloc[index].values.reshape(1, -1) 
    org_y = [y_test_nd_org.iloc[index]] 
    loss = get_anomaly_scores(org_row, inverse_row)
    list_of_loss_nd.append(loss)
    if loss > 0.02:
        list_of_anomaly_loss_nd.append(loss)
        anomaly_pred, anomaly_acc = identify_anomaly(org_row, org_y, clf_nd)
        list_of_anomaly_preds_nd.append(anomaly_acc)
        list_of_anomaly_nd.append(org_y) 
        data_and_preds_nd.append({'Data': org_row.flatten(), 'Prediction': anomaly_pred})
    else:
        norm_pred = clf_nd.predict(org_row)
        norm_acc = accuracy_score(org_y, norm_pred) 
        list_of_norm_preds_nd.append(norm_acc)
        data_and_preds_nd.append({'Data': org_row.flatten(), 'Prediction': norm_pred})

In [159]:
df_data_predictions_pd = pd.DataFrame(data_and_preds_pd)
df_data_predictions_nd = pd.DataFrame(data_and_preds_nd)

In [187]:
def logic_layer(network_data, physical_data):
    if network_data and physical_data:
        return 'Probably FDI'
    elif network_data and not physical_data:
        return 'Probably Network Attack'
    elif not network_data and physical_data:
        return 'Probably Physical Fault'
    else:
        return 'Normal Operation'

In [178]:
data_length = 32000

In [180]:
df_data_predictions_pd = df_data_predictions_pd.iloc[:data_length]
df_data_predictions_pd['Attack'] = df_data_predictions_pd['Prediction'].apply(lambda x: False if x == 'NORMAL' else True)
df_data_predictions_pd

Unnamed: 0,Data,Prediction,Attack
0,"[212193.0, 2.406366, 50.824649810790994, 52.12...",[NORMAL],False
1,"[51945.0, 2.455556, 48.00347, 54.03646, 46.050...",[NORMAL],False
2,"[142521.0, 2.39456, 45.44271, 52.73437, 47.612...",[NORMAL],False
3,"[66200.0, 2.537211, 48.17708, 53.2118, 43.4027...",[NORMAL],False
4,"[84708.0, 2.460474, 50.824649810790994, 54.687...",[NORMAL],False
...,...,...,...
31995,"[189237.0, 2.463426, 48.9149284362793, 51.1718...",[NORMAL],False
31996,"[200192.0, 2.406366, 50.434024810790994, 50.52...",[NORMAL],False
31997,"[174580.0, 2.469329, 47.7864570617676, 50.6076...",[NORMAL],False
31998,"[42958.0, 2.453588, 45.6597213745117, 50.34722...",[NORMAL],False


In [182]:
df_data_predictions_nd = df_data_predictions_nd.iloc[:data_length]
df_data_predictions_nd['Attack'] = df_data_predictions_nd['Prediction'].apply(lambda x: False if x == 'normal.' else True)
df_data_predictions_nd

Unnamed: 0,Data,Prediction,Attack
0,"[0.0, 1.642641234675367e-05, 0.0, 0.0, 0.0, 0....",[smurf.],True
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[neptune.],True
2,"[0.0, 1.642641234675367e-05, 0.0, 0.0, 0.0, 0....",[smurf.],True
3,"[0.0, 6.685167815539284e-07, 8.07198829792325e...",[normal.],False
4,"[0.0, 1.642641234675367e-05, 0.0, 0.0, 0.0, 0....",[smurf.],True
...,...,...,...
31995,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[neptune.],True
31996,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[apache2.],True
31997,"[0.0, 1.642641234675367e-05, 0.0, 0.0, 0.0, 0....",[smurf.],True
31998,"[0.0, 1.642641234675367e-05, 0.0, 0.0, 0.0, 0....",[smurf.],True


In [188]:
logic = []

for i in range(data_length):
    logic_layer_response = logic_layer(df_data_predictions_nd['Attack'][i], df_data_predictions_pd['Attack'][i])
    logic.append(logic_layer_response)

In [189]:
logic_df = pd.DataFrame(logic)
logic_df

Unnamed: 0,0
0,Probably Network Attack
1,Probably Network Attack
2,Probably Network Attack
3,Normal Operation
4,Probably Network Attack
...,...
31995,Probably Network Attack
31996,Probably Network Attack
31997,Probably Network Attack
31998,Probably Network Attack


In [166]:
overall_accuracy_nd = (sum(list_of_norm_preds_nd) + sum(list_of_anomaly_preds_nd)) / (len(list_of_norm_preds_nd) + len(list_of_anomaly_preds_nd))
print(overall_accuracy_nd)

0.9810146931164196


In [167]:
overall_accuracy_pd = (sum(list_of_norm_preds_pd) + sum(list_of_anomaly_preds_pd)) / (len(list_of_norm_preds_pd) + len(list_of_anomaly_preds_pd))
print(overall_accuracy_pd)

0.9998723284733039
