In [1]:
pip install pandas unicode matplotlib seaborn scikit-learn plotly scipy 

Note: you may need to restart the kernel to use updated packages.




In [1]:
import pandas as pd
pd.set_option("mode.copy_on_write", True)

# 1. Calculation Average Silhouette Score by Cluster using ISOLATION FOREST

The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

In [2]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler



features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features= ['Dia_Sin', 'Dia_Cos','Hora_Sin','Hora_Cos','Mes_Sin','Mes_Cos']
final_features= features + time_features
# Load the data
isoforest_df = pd.read_csv('../data/output/consumo_datamart_gold_isoforest.csv', usecols= final_features + ['Anomaly_Cluster'])

scaler = StandardScaler()

# Scale the features for IsoForest dataset
isoforest_df[features] = scaler.fit_transform(isoforest_df[features])
labels_iso = isoforest_df['Anomaly_Cluster']


# Compute the silhouette scores
silhouette_score_iso = silhouette_score(isoforest_df, labels_iso)
print(f"Silhouette Score Isolation Forest {silhouette_score_iso}")




Silhouette Score Isolation Forest 0.24949489471168887


# 2. Calculation Average Silhouette Score by Cluster using LOCAL OUTLIER FACTOR

The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

In [3]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features= ['Dia_Sin', 'Dia_Cos','Hora_Sin','Hora_Cos','Mes_Sin','Mes_Cos']
final_features= features + time_features
# Load the data
lof_df = pd.read_csv('../data/output/consumo_datamart_gold_lof.csv',usecols= final_features + ['Anomaly_Cluster'])

scaler = StandardScaler()

# Scale the features for LOF dataset
lof_df[features] = scaler.fit_transform(lof_df[features])
labels_lof = lof_df['Anomaly_Cluster']

# Compute the silhouette scores
silhouette_score_lof = silhouette_score(lof_df, labels_lof)
print(f"Silhouette Score  LOF {silhouette_score_lof}")




Silhouette Score  LOF 0.12332711273302875


# 3. Calculation Average Silhouette Score by Cluster using OneClassSVM


The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

In [6]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


# Define the feature columns
features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features= ['Dia_Sin', 'Dia_Cos','Hora_Sin','Hora_Cos','Mes_Sin','Mes_Cos']
final_features= features + time_features
# Load the data
ocsvm_df = pd.read_csv('../data/output/consumo_datamart_gold_ocsvm.csv',usecols= final_features + ['Anomaly_Cluster'])



# Initialize the scaler
scaler = StandardScaler()

# Scale the features for LOF dataset
ocsvm_df[features] = scaler.fit_transform(ocsvm_df[features])
labels_ocsvm = ocsvm_df['Anomaly_Cluster']

# Compute the silhouette scores
silhouette_score_ocsvm = silhouette_score(ocsvm_df, labels_ocsvm)
print(f"Silhouette Score  OneClassSVM {silhouette_score_ocsvm}")




Silhouette Score  OneClassSVM 0.22593096248235311


In [5]:
silhouette_score_lof

0.22593096248235311

# 4. Calculation Average Silhouette Score by Cluster using DBSCAN


The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

In [2]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


# Define the feature columns
features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features= ['Dia_Sin', 'Dia_Cos','Hora_Sin','Hora_Cos','Mes_Sin','Mes_Cos']
final_features= features + time_features
# Load the data
dbscan_df = pd.read_csv('../data/output/consumo_datamart_gold_dbscan.csv', usecols= final_features + ['Anomaly_Cluster'])



# Initialize the scaler
scaler = StandardScaler()

# Scale the features for dbscan_df dataset
dbscan_df[features] = scaler.fit_transform(dbscan_df[features])
labels_dbscan = dbscan_df['Anomaly_Cluster']


# Compute the silhouette scores
silhouette_score_dbscan = silhouette_score(dbscan_df, labels_dbscan)
print(f"Silhouette Score DBScan {silhouette_score_dbscan}")




Silhouette Score DBScan 0.10099289782890346


# 5 Selection of anomaly detection model


<table>
<tr> 
  <th>Model</th>
  <th> Silhouette Score</th>
</tr>
<tr style="background-color:blue;">
  <td>Isolation Forest</td>
  <td>0.2494</td>
</tr>
<tr>
  <td>OneClassSVM</td>
  <td>0.2259</td>
</tr>
<tr>
  <td>Local Oultier Factor</td>
  <td>0.1233</td>
</tr>
<tr>
  <td>HDBSCAN</td>
  <td>0.10099</td>
</tr>
</table>

The model with the best sillhoutte escore  is selected and hence, we will use Isolation Forest. We know the score of 0.24 is not very significant but in this unsupervised scenario, at least we count on a metric.