# **Isolation Forest**

In [23]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.metrics import mean_absolute_error, classification_report

In [24]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
PATH="C:\\Users\\c.manganaro\\Desktop\\TESI\\sample\\datasets\\SWaT\\normalized"
PLOT=True;

**read the normalized dataset**

In [25]:
df = pd.read_csv(PATH+'\\normalized.csv',parse_dates=['Timestamp'])

In [26]:
float64_cols = df.select_dtypes(include=['float64']).columns
df[float64_cols] = df[float64_cols].astype('float32')
int64_cols = df.select_dtypes(include=['int64']).columns
df[int64_cols] = df[int64_cols].astype('int32')
print(df.dtypes)

Timestamp         datetime64[ns]
Machine_Status            object
FIT101                   float32
LIT101                   float32
AIT201                   float32
AIT202                   float32
AIT203                   float32
FIT201                   float32
DPIT301                  float32
FIT301                   float32
LIT301                   float32
AIT401                   float32
AIT402                   float32
FIT401                   float32
LIT401                   float32
AIT501                   float32
AIT502                   float32
AIT503                   float32
AIT504                   float32
FIT501                   float32
FIT502                   float32
FIT503                   float32
FIT504                   float32
PIT501                   float32
PIT502                   float32
PIT503                   float32
FIT601                   float32
MV101                      int32
P101                       int32
P102                       int32
MV201     

**Dropping the timestamp**

In [27]:
df_copy = df
df_copy = df_copy.drop(['Timestamp'], axis=1)

**Handling the return values**

In Isolation Forest, the return values are -1, 0, and 1.

- The value -1 indicates that the data instance is highly anomalous compared to the rest of the dataset.
- The value 1 indicates that the data instance is not anomalous compared to the rest of the dataset.
- The value 0 indicates that the data instance is close to the anomaly threshold and could be considered anomalous or non-anomalous depending on the chosen threshold.

In general, data instances with an anomaly value less than zero are considered anomalous, while those with an anomaly value greater than zero are considered non-anomalous.

In [28]:
df_copy.head()

Unnamed: 0,Machine_Status,FIT101,LIT101,AIT201,AIT202,AIT203,FIT201,DPIT301,FIT301,LIT301,AIT401,AIT402,FIT401,LIT401,AIT501,AIT502,AIT503,AIT504,FIT501,FIT502,FIT503,FIT504,PIT501,PIT502,PIT503,FIT601,MV101,P101,P102,MV201,P201,P203,P204,P205,P206,MV301,MV302,MV303,MV304,P301,P302,P402,P403,UV401,P501,P602
0,Normal,0.879322,0.500023,0.899417,0.801947,0.153462,0.865044,0.438853,0.928726,0.770793,0.999677,0.079038,0.98035,0.929175,0.521646,0.106829,0.375534,0.010781,0.980677,0.939528,0.962783,0.970903,0.946126,0.449782,0.944116,7.1e-05,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
1,Normal,0.886285,0.500072,0.899417,0.801947,0.153462,0.865044,0.438853,0.929319,0.770793,0.999677,0.079038,0.981743,0.929131,0.521646,0.106829,0.375534,0.010781,0.980677,0.952695,0.962783,0.970903,0.946126,0.449782,0.944521,7.1e-05,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
2,Normal,0.901833,0.500023,0.899417,0.801304,0.153462,0.863956,0.437572,0.92948,0.771093,0.999677,0.079038,0.981743,0.928955,0.521646,0.106829,0.375534,0.010781,0.980458,0.950061,0.962783,0.97353,0.946188,0.449782,0.944521,7.1e-05,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
3,Normal,0.918195,0.50017,0.899417,0.801304,0.153462,0.863956,0.437572,0.92948,0.771393,0.999677,0.079038,0.98101,0.928867,0.521646,0.106112,0.375534,0.010781,0.980458,0.940656,0.962783,0.97353,0.946188,0.449782,0.944197,7.1e-05,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
4,Normal,0.930842,0.500804,0.899417,0.801304,0.153462,0.864228,0.437572,0.92948,0.771655,0.999677,0.079038,0.98101,0.929351,0.521646,0.105216,0.375534,0.010781,0.980458,0.940656,0.962783,0.97353,0.946188,0.449782,0.943629,7.1e-05,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1


in this code was selected only the -1 values and 1 values in order to have 

- -1 means that a there is an anomaly/attack value 
- 1 otherwise 

In [30]:
df_copy['isAttack'] = np.where(df_copy['Machine_Status'] == 'Attack', -1, 1)
df_copy = df_copy.drop(['Machine_Status'], axis=1)

In [31]:
anomaly_df = df_copy[df_copy['isAttack'] == -1]
not_anomaly_df = df_copy[df_copy['isAttack'] == 1]

In [36]:
anomaly_df['isAttack'].value_counts()

-1    54621
Name: isAttack, dtype: int64

In [37]:
not_anomaly_df['isAttack'].value_counts()

1    890298
Name: isAttack, dtype: int64

**Training the isolation forest**

In [38]:
X = df_copy.drop(['isAttack'], axis=1)
y = df_copy['isAttack']

**splitting into train and test**

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [39]:
contamination=0.03

In [40]:
model = IsolationForest(random_state=42, 
                      n_jobs=-1,
                      contamination=contamination)

In [41]:
X

Unnamed: 0,FIT101,LIT101,AIT201,AIT202,AIT203,FIT201,DPIT301,FIT301,LIT301,AIT401,AIT402,FIT401,LIT401,AIT501,AIT502,AIT503,AIT504,FIT501,FIT502,FIT503,FIT504,PIT501,PIT502,PIT503,FIT601,MV101,P101,P102,MV201,P201,P203,P204,P205,P206,MV301,MV302,MV303,MV304,P301,P302,P402,P403,UV401,P501,P602
0,0.879322,0.500023,0.899417,0.801947,0.153462,0.865044,0.438853,0.928726,0.770793,0.999677,0.079038,0.980350,0.929175,0.521646,0.106829,0.375534,0.010781,0.980677,0.939528,0.962783,0.970903,0.946126,0.449782,0.944116,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
1,0.886285,0.500072,0.899417,0.801947,0.153462,0.865044,0.438853,0.929319,0.770793,0.999677,0.079038,0.981743,0.929131,0.521646,0.106829,0.375534,0.010781,0.980677,0.952695,0.962783,0.970903,0.946126,0.449782,0.944521,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
2,0.901833,0.500023,0.899417,0.801304,0.153462,0.863956,0.437572,0.929480,0.771093,0.999677,0.079038,0.981743,0.928955,0.521646,0.106829,0.375534,0.010781,0.980458,0.950061,0.962783,0.973530,0.946188,0.449782,0.944521,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
3,0.918195,0.500170,0.899417,0.801304,0.153462,0.863956,0.437572,0.929480,0.771393,0.999677,0.079038,0.981010,0.928867,0.521646,0.106112,0.375534,0.010781,0.980458,0.940656,0.962783,0.973530,0.946188,0.449782,0.944197,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
4,0.930842,0.500804,0.899417,0.801304,0.153462,0.864228,0.437572,0.929480,0.771655,0.999677,0.079038,0.981010,0.929351,0.521646,0.105216,0.375534,0.010781,0.980458,0.940656,0.962783,0.973530,0.946188,0.449782,0.943629,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944914,0.891390,0.500267,0.899417,0.801947,0.153007,0.863956,0.438853,0.930936,0.770455,0.999677,0.079038,0.981450,0.928911,0.521646,0.107904,0.375534,0.010781,0.982135,0.948933,0.962783,0.973530,0.945937,0.484716,0.945414,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
944915,0.887213,0.500170,0.899417,0.801947,0.153007,0.863956,0.438853,0.930073,0.770530,0.999677,0.079038,0.981450,0.929087,0.521646,0.107904,0.375534,0.010781,0.981333,0.940656,0.962783,0.973530,0.945937,0.484716,0.943629,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
944916,0.882107,0.500072,0.899417,0.801947,0.153462,0.864863,0.438853,0.929103,0.770605,0.999677,0.079038,0.981450,0.929307,0.521646,0.107904,0.375534,0.010781,0.980677,0.934355,0.962783,0.973530,0.945937,0.484716,0.943629,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1
944917,0.879786,0.500121,0.899417,0.801947,0.153462,0.865044,0.438853,0.928726,0.770830,0.999677,0.079038,0.980350,0.929043,0.521646,0.106829,0.375534,0.010781,0.980677,0.934355,0.962783,0.973530,0.945937,0.449782,0.943629,0.000071,2,2,1,2,1,2,1,2,1,1,2,1,1,1,2,2,1,2,2,1


In [42]:
model.fit(X_train)

  "X does not have valid feature names, but"


IsolationForest(contamination=0.03, n_jobs=-1, random_state=42)

**predict the value**

In [43]:
y_predict = model.predict(X_test)

**Evaluation**

In [46]:
y_groudtruth = y_test.to_numpy()
target_names = ['normal', 'attack']
report = classification_report(y_groudtruth, y_predict, target_names=target_names, zero_division = 1)

In [47]:
print(report)

              precision    recall  f1-score   support

      normal       0.85      0.44      0.58     16458
      attack       0.97      1.00      0.98    267018

    accuracy                           0.96    283476
   macro avg       0.91      0.72      0.78    283476
weighted avg       0.96      0.96      0.96    283476



|           | precision | recall | f1-score | support  |
|-----------|-----------|--------|----------|----------|
| normal    | 0.85      | 0.44   | 0.58     | 16458    |
| attack    | 0.97      | 1.00   | 0.98     | 267018   |
| accuracy  |           |        | 0.96     | 283476   |
| macro avg | 0.91      | 0.72   | 0.78     | 283476   |
| weighted avg | 0.96   | 0.96   | 0.96     | 283476   |

comparison with the paper  -> <bR>



*On the Performance of Isolation Forest and Multi Layer Perceptron for Anomaly Detection in Industrial Control Systems Networks*


| Algorithm | Precision (%) | F-1 (%) |
|-----------|----------------|---------|
| iForest   | 75             | 90      |

