In [None]:
import pandas as pd

# Load the CSV used throughout this notebook. The path is written with
# forward slashes: the previous backslash-separated literal relied on invalid
# escape sequences (e.g. "\d") and could only be resolved on Windows, whereas
# forward slashes are accepted on Windows, Linux and macOS alike.
df = pd.read_csv('../data/data_arsenic/scada_data_conta_22_node_22.csv')

   timestep  node  chlorine_concentration  arsenic_concentration
0         0    22                     0.0                    0.0
1         1    22                     0.0                    0.0
2         2    22                     0.0                    0.0
3         3    22                     0.0                    0.0
4         4    22                     0.0                    0.0


In [18]:
# Model input: the chlorine reading only. Selecting with a list keeps X
# two-dimensional (a one-column DataFrame), which is what the estimator
# fitted further down receives.
X = df.loc[:, ['chlorine_concentration']]
print(X.head(n=5))


   chlorine_concentration
0                     0.0
1                     0.0
2                     0.0
3                     0.0
4                     0.0


In [None]:
from sklearn.ensemble import IsolationForest

# Unsupervised outlier detector; the fixed seed keeps the run reproducible.
model = IsolationForest(
    random_state=42,
    contamination='auto',
)

# Fit on the chlorine feature and store the verdict for each row next to
# the data (the output below shows the two values used: 1 and -1).
df['anomaly'] = model.fit_predict(X)


     timestep  node  chlorine_concentration  arsenic_concentration  anomaly
8           8    22                0.362457               0.000000       -1
9           9    22                0.507978               0.000000       -1
10         10    22                0.545589               0.000000       -1
11         11    22                0.626842               0.000000       -1
12         12    22                0.624299               0.000000       -1
..        ...   ...                     ...                    ...      ...
977       977    22                0.001220               0.000131       -1
980       980    22                0.350980               0.030649       -1
991       991    22                0.030090               0.003125       -1
994       994    22                0.000946               0.000095       -1
998       998    22                0.326189               0.024927       -1

[397 rows x 5 columns]
anomaly
 1    612
-1    397
Name: count, dtype: int64


In [44]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

# Ground truth expressed with the same -1 / 1 coding as the predictions:
# -1 where the arsenic concentration is strictly positive, 1 otherwise.
y_true = np.where(df['arsenic_concentration'].to_numpy() > 0, -1, 1)
y_pred = df['anomaly'].to_numpy()

# How many rows received each prediction.
print(df['anomaly'].value_counts())

print(f"Accuracy: {accuracy_score(y_true, y_pred):.2%}")
print("Matrice de confusion :")
# labels=[1, -1] fixes the row/column order of the matrix: 1 first, then -1.
print(confusion_matrix(y_true, y_pred, labels=[1, -1]))


anomaly
 1    612
-1    397
Name: count, dtype: int64
Accuracy: 48.17%
Matrice de confusion :
[[135  46]
 [477 351]]


In [None]:

def create_features(df, feature_col, window_size=10):
    """Create features for anomaly detection using a trailing sliding window.

    For every index ``i`` from ``window_size`` to the end of the column, one
    row ``[value[i], mean, std, min, max]`` is produced. The four statistics
    are computed over the ``window_size`` values immediately before ``i``
    (``value[i - window_size:i]``), so the current value is not part of its
    own window. The standard deviation is NumPy's default (``ddof=0``).

    Parameters:
    - df: a pandas DataFrame containing the data
    - feature_col: the name of the column to use as feature
    - window_size: the size of the sliding window (must be >= 1)

    Returns:
    - a numpy array of shape ``(max(len(df) - window_size, 0), 5)``; the
      first ``window_size`` time steps have no full window and are skipped

    Raises:
    - ValueError: if ``window_size`` is smaller than 1
    """
    # A window needs at least one value; reject bad sizes up front instead of
    # failing inside a reduction or silently slicing the wrong range.
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    feature = df[feature_col].values

    rows = []
    for i in range(window_size, len(feature)):
        window = feature[i - window_size:i]
        rows.append([
            feature[i],
            window.mean(),
            window.std(),
            window.min(),
            window.max(),
        ])

    # Keep the result two-dimensional even when the column is too short to
    # yield a single row, so callers can always rely on five columns.
    if not rows:
        return np.empty((0, 5))

    return np.array(rows)

def calculate_labels(df, feature_col, window_size=10):
    """Calculate labels for anomaly detection.

    The first ``window_size`` values of the column are skipped, which gives
    the result the same number of rows as ``create_features`` called with
    the same ``window_size``.

    Parameters:
    - df: a pandas DataFrame containing the data
    - feature_col: the name of the column the labels are derived from
    - window_size: the number of leading time steps to skip (must be >= 0)

    Returns:
    - a numpy integer array with one label per remaining time step:
      -1 where the value is strictly positive (anomaly), 1 otherwise

    Raises:
    - ValueError: if ``window_size`` is negative
    """
    # A negative offset would slice from the end of the column instead of
    # skipping leading rows, so it is rejected explicitly.
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    feature = df[feature_col].values

    # Vectorised form of the per-element test; NaN compares False and is
    # therefore labelled 1, exactly as an explicit `> 0` check would do.
    return np.where(feature[window_size:] > 0, -1, 1)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Build the windowed chlorine features and the arsenic-based labels with the
# same window size so that both arrays cover the same time steps.
X = create_features(df, feature_col='chlorine_concentration', window_size=10)
y_true = calculate_labels(df, feature_col='arsenic_concentration', window_size=10)

# Fresh detector with the same settings as before, fitted on the new features.
model = IsolationForest(random_state=42, contamination='auto')
y_pred = model.fit_predict(X)

print(f"Accuracy: {accuracy_score(y_true, y_pred):.2%}")
print("Matrice de confusion :")
# labels=[1, -1] fixes the row/column order of the matrix: 1 first, then -1.
print(confusion_matrix(y_true, y_pred, labels=[1, -1]))

[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 3.18831362e-01
 2.18041646e-01 1.72007986e-07]
Accuracy: 32.93%
Matrice de confusion :
[[108  63]
 [607 221]]
