In [16]:
pip install pandas unicode matplotlib seaborn scikit-learn plotly scipy dask[complete]

Note: you may need to restart the kernel to use updated packages.




In [8]:
import pandas as pd
pd.set_option("mode.copy_on_write", True)

In [7]:
import os

# Set LOKY_MAX_CPU_COUNT environment variable to the number of cores 
os.environ['LOKY_MAX_CPU_COUNT'] = '4'  

# 1. Analysis of  Silhouette Score using ISOLATION FOREST

The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

## 1.1 Sillhoute Score by Global Model

In [17]:
import dask.dataframe as dd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Load the data with Dask
isoforest_df = dd.read_csv('../data/output/consumo_datamart_gold_isoforest.csv', usecols= final_features + ['Anomaly_Global'])

# Convert to Dask DataFrame
isoforest_ddf = isoforest_df.compute()

scaler = StandardScaler()

# Scale the features for IsoForest dataset
isoforest_ddf[features] = scaler.fit_transform(isoforest_ddf[features])
labels_iso = isoforest_ddf['Anomaly_Global']

# Compute the silhouette scores
silhouette_score_iso = silhouette_score(isoforest_ddf[features], labels_iso)
print(f"Silhouette Score Isolation Forest {silhouette_score_iso}")


Silhouette Score Isolation Forest 0.43179600271244606


## 1.2 Sillhouete Score by Cluster

In [13]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler



features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features= ['Dia_Sin', 'Dia_Cos','Hora_Sin','Hora_Cos','Mes_Sin','Mes_Cos']
final_features= features + time_features
# Load the data
df = pd.read_csv('../data/output/consumo_datamart_gold_isoforest.csv')

cluster_silhouette_score_iso=[]
for cluster in range(0,4):
    isoforest_df = df[df['Cluster']==cluster]
    scaler = StandardScaler()
    # Scale the features for IsoForest dataset
    isoforest_df[features] = scaler.fit_transform(isoforest_df[features])
    labels_iso = isoforest_df['Anomaly_Cluster']
    # Compute the silhouette scores
    silhouette_score_iso = silhouette_score(isoforest_df[final_features], labels_iso)
    print(f"Silhouette Score Isolation Forest {silhouette_score_iso} Cluster {cluster}")
    cluster_silhouette_score_iso.append(silhouette_score_iso)
print(f'Mean of Sillhoutte Index {sum(cluster_silhouette_score_iso)/len(cluster_silhouette_score_iso)} ')
   
    




Silhouette Score Isolation Forest 0.3253967071414611 Cluster 0
Silhouette Score Isolation Forest 0.28394063526151 Cluster 1
Silhouette Score Isolation Forest 0.4274799830516023 Cluster 2
Silhouette Score Isolation Forest 0.2661928477662486 Cluster 3
Mean of Sillhoutte Index 0.3257525433052055 


## 1.3 Sillhouete Score by Sector 

In [29]:
pip install pyarrow --upgrade

Note: you may need to restart the kernel to use updated packages.




In [1]:
import dask.dataframe as dd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define your features
features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features = ['Dia_Sin', 'Dia_Cos', 'Hora_Sin', 'Hora_Cos', 'Mes_Sin', 'Mes_Cos']
final_features = features + time_features

# Load the data with Dask
ddf = dd.read_csv('../data/output/consumo_datamart_gold_isoforest.csv', usecols=final_features + ['Anomaly_Sector', 'Sector_Economico'])

# Get unique sectors (requires computation)
sectors = ddf['Sector_Economico'].unique().compute().tolist()

sector_silhouette_score_iso = []

for sector in sectors:
    isoforest_ddf = ddf[ddf['Sector_Economico'] == sector].compute()
    
    scaler = StandardScaler()
    
    # Scale the features
    isoforest_ddf[features] = scaler.fit_transform(isoforest_ddf[features])
    labels_iso = isoforest_ddf['Anomaly_Sector']
    
    # Compute the silhouette scores
    silhouette_score_iso = silhouette_score(isoforest_ddf[final_features], labels_iso)
    print(f"Silhouette Score Isolation Forest {silhouette_score_iso} Sector {sector}")
    sector_silhouette_score_iso.append(silhouette_score_iso)

mean_silhouette_score = sum(sector_silhouette_score_iso) / len(sector_silhouette_score_iso)
print(f'Mean of Silhouette Index {mean_silhouette_score}')


Silhouette Score Isolation Forest 0.4009953665386885 Sector elaboracion_cacao_chocolate_productos_confiteria
Silhouette Score Isolation Forest 0.2782406464546849 Sector cultivo_frutos_nueces_arboles_arbustos
Silhouette Score Isolation Forest 0.2668525232590551 Sector cultivo_arboles_frutales_nueces
Silhouette Score Isolation Forest 0.2683451478491312 Sector cultivo_hortalizas_melones_raices_tuberculos
Silhouette Score Isolation Forest 0.22823935817319818 Sector captacion_tratamiento_distribucion_agua
Silhouette Score Isolation Forest 0.25844327932450545 Sector cultivo_hortalizas
Silhouette Score Isolation Forest 0.21927728254699647 Sector venta_mayor_metales_minerales_metaliferos
Mean of Silhouette Index 0.2743419434494657


# 2. Silhouette Analysis  using LOCAL OUTLIER FACTOR

The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

## 2.1 Silhoute Score by Global Model

In [3]:
import dask.dataframe as dd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Load the data with Dask
lof_df = dd.read_csv('../data/output/consumo_datamart_gold_lof.csv', usecols= final_features + ['Anomaly_Global'])

# Convert to Dask DataFrame
lof_ddf = lof_df.compute()

scaler = StandardScaler()

# Scale the features
lof_ddf[features] = scaler.fit_transform(lof_ddf[features])
labels_iso = lof_ddf['Anomaly_Global']

# Compute the silhouette scores
silhouette_score_lof = silhouette_score(lof_ddf[features], labels_iso)
print(f"Silhouette Score LOF {silhouette_score_lof}")


Silhouette Score LOF -0.021587352920776137


## 2.2 Sillhoute Score by Sector

In [6]:
import dask.dataframe as dd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define your features
features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features = ['Dia_Sin', 'Dia_Cos', 'Hora_Sin', 'Hora_Cos', 'Mes_Sin', 'Mes_Cos']
final_features = features + time_features

# Load the data with Dask
ddf = dd.read_csv('../data/output/consumo_datamart_gold_lof.csv', usecols=final_features + ['Anomaly_Sector', 'Sector_Economico'])

# Get unique sectors (requires computation)
sectors = ddf['Sector_Economico'].unique().compute().tolist()

sector_silhouette_score_lof = []

for sector in sectors:
    lof_ddf = ddf[ddf['Sector_Economico'] == sector].compute()
    
    scaler = StandardScaler()
    
    # Scale the features 
    lof_ddf[features] = scaler.fit_transform(lof_ddf[features])
    labels_lof= lof_ddf['Anomaly_Sector']
    
    # Compute the silhouette scores
    silhouette_score_lof = silhouette_score(lof_ddf[final_features], labels_lof)
    print(f"Silhouette Score LOF {silhouette_score_lof} Sector {sector}")
    sector_silhouette_score_lof.append(silhouette_score_lof)

mean_silhouette_score = sum(sector_silhouette_score_lof) / len(sector_silhouette_score_lof)
print(f'Mean of Silhouette Index {mean_silhouette_score}')


Silhouette Score LOF 0.01660629892893317 Sector elaboracion_cacao_chocolate_productos_confiteria
Silhouette Score LOF 0.00985094324804361 Sector cultivo_frutos_nueces_arboles_arbustos
Silhouette Score LOF 0.0067805564135244994 Sector cultivo_arboles_frutales_nueces
Silhouette Score LOF 0.1386379142759033 Sector cultivo_hortalizas_melones_raices_tuberculos
Silhouette Score LOF 0.05721223434210363 Sector captacion_tratamiento_distribucion_agua
Silhouette Score LOF 0.1591449417275712 Sector cultivo_hortalizas
Silhouette Score LOF 0.05794124569561515 Sector venta_mayor_metales_minerales_metaliferos
Mean of Silhouette Index 0.06373916209024208


## 2.3 Sillhoute Score by Cluster

In [9]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler



features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features= ['Dia_Sin', 'Dia_Cos','Hora_Sin','Hora_Cos','Mes_Sin','Mes_Cos']
final_features= features + time_features
# Load the data
df = pd.read_csv('../data/output/consumo_datamart_gold_lof.csv')

cluster_silhouette_score_lof=[]
for cluster in range(0,4):
    lof_df = df[df['Cluster']==cluster]
    scaler = StandardScaler()
    # Scale the features 
    lof_df[features] = scaler.fit_transform(lof_df[features])
    labels_lof = lof_df['Anomaly_Cluster']
    # Compute the silhouette scores
    silhouette_score_lof = silhouette_score(lof_df[final_features], labels_lof)
    print(f"Silhouette Score LOF {silhouette_score_lof} Cluster {cluster}")
    cluster_silhouette_score_lof.append(silhouette_score_lof)
print(f'Mean of Sillhoutte Index {sum(cluster_silhouette_score_lof)/len(cluster_silhouette_score_lof)} ')
   
    




Silhouette Score LOF 0.05088958722180204 Cluster 0
Silhouette Score LOF 0.01250304587755965 Cluster 1
Silhouette Score LOF 0.012898316619281504 Cluster 2
Silhouette Score LOF 0.013035491158995811 Cluster 3
Mean of Sillhoutte Index 0.022331610219409753 


# 3. Silhouette Analysis  using OneClassSVM


The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

## 3.1 Silhouette Score of Global Model

In [4]:
import dask.dataframe as dd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Load the data with Dask
ocsvm_df = dd.read_csv('../data/output/consumo_datamart_gold_ocsvm.csv', usecols= final_features + ['Anomaly_Global'])

# Convert to Dask DataFrame
ocsvm_ddf = ocsvm_df.compute()

scaler = StandardScaler()

# Scale the features 
ocsvm_ddf[features] = scaler.fit_transform(ocsvm_ddf[features])
labels_ocsvm = ocsvm_ddf['Anomaly_Global']

# Compute the silhouette scores
silhouette_score_ocsvm = silhouette_score(ocsvm_ddf[features], labels_ocsvm)
print(f"Silhouette Score OCSVM {silhouette_score_ocsvm}")


Silhouette Score OCSVM 0.3926740717288932


## 3.2 Silhouette Score  by Sector

In [11]:
import dask.dataframe as dd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features = ['Dia_Sin', 'Dia_Cos', 'Hora_Sin', 'Hora_Cos', 'Mes_Sin', 'Mes_Cos']
final_features = features + time_features

# Load the data with Dask
ddf = dd.read_csv('../data/output/consumo_datamart_gold_ocsvm.csv', usecols=final_features + ['Anomaly_Sector', 'Sector_Economico'])

# Get unique sectors (requires computation)
sectors = ddf['Sector_Economico'].unique().compute().tolist()

sector_silhouette_score_ocsvm = []

for sector in sectors:
    ocsvm_ddf = ddf[ddf['Sector_Economico'] == sector].compute()
    
    scaler = StandardScaler()
    
    # Scale the features 
    ocsvm_ddf[features] = scaler.fit_transform(ocsvm_ddf[features])
    labels_ocsvm= ocsvm_ddf['Anomaly_Sector']
    
    # Compute the silhouette scores
    silhouette_score_ocsvm = silhouette_score(ocsvm_ddf[final_features], labels_ocsvm)
    print(f"Silhouette Score LOF {silhouette_score_ocsvm} Sector {sector}")
    sector_silhouette_score_ocsvm.append(silhouette_score_ocsvm)

mean_silhouette_score = sum(sector_silhouette_score_ocsvm) / len(sector_silhouette_score_ocsvm)
print(f'Mean of Silhouette Index {mean_silhouette_score}')


Silhouette Score LOF 0.3191913838716725 Sector elaboracion_cacao_chocolate_productos_confiteria
Silhouette Score LOF 0.29168873230704456 Sector cultivo_frutos_nueces_arboles_arbustos
Silhouette Score LOF 0.24405273573976133 Sector cultivo_arboles_frutales_nueces
Silhouette Score LOF 0.2427527381876423 Sector cultivo_hortalizas_melones_raices_tuberculos
Silhouette Score LOF 0.23991634045456334 Sector captacion_tratamiento_distribucion_agua
Silhouette Score LOF 0.2301577369143608 Sector cultivo_hortalizas
Silhouette Score LOF 0.2017139464711203 Sector venta_mayor_metales_minerales_metaliferos
Mean of Silhouette Index 0.25278194484945216


## 3.3 Silhouette Score by Cluster

In [13]:
import dask.dataframe as dd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define your features
features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features = ['Dia_Sin', 'Dia_Cos', 'Hora_Sin', 'Hora_Cos', 'Mes_Sin', 'Mes_Cos']
final_features = features + time_features

# Load the data with Dask
ddf = dd.read_csv('../data/output/consumo_datamart_gold_ocsvm.csv')

cluster_silhouette_score_ocsvm = []

for cluster in range(0, 4):
    ocsvm_ddf = ddf[ddf['Cluster'] == cluster].compute()
    
    scaler = StandardScaler()
    
    # Scale the features 
    ocsvm_ddf[features] = scaler.fit_transform(ocsvm_ddf[features])
    labels_ocsvm = ocsvm_ddf['Anomaly_Cluster']
    
    # Compute the silhouette scores
    silhouette_score_ocsvm = silhouette_score(ocsvm_ddf[final_features], labels_ocsvm)
    print(f"Silhouette Score OCSVM {silhouette_score_ocsvm} Cluster {cluster}")
    cluster_silhouette_score_ocsvm.append(silhouette_score_ocsvm)

mean_silhouette_score = sum(cluster_silhouette_score_ocsvm) / len(cluster_silhouette_score_ocsvm)
print(f'Mean of Silhouette Index {mean_silhouette_score}')


Silhouette Score OCSVM 0.321736158797977 Cluster 0
Silhouette Score OCSVM 0.25675689668647456 Cluster 1
Silhouette Score OCSVM 0.3468984809005893 Cluster 2
Silhouette Score OCSVM 0.24380929791265726 Cluster 3
Mean of Silhouette Index 0.2923002085744245


# 4. Calculation Average Silhouette Score by Cluster using DBSCAN


The objective is to measure How well the inliers were separated from the outliers. In this analyses, we consider 2 clusters (Inlier and Outlier) and hence, the Silhoutte score comes to play an important role.

In [2]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


# Define the feature columns
features = ['Active_energy', 'Reactive_energy', 'Voltaje_FA', 'Voltaje_FC', 'Factor_Potencia']
time_features= ['Dia_Sin', 'Dia_Cos','Hora_Sin','Hora_Cos','Mes_Sin','Mes_Cos']
final_features= features + time_features
# Load the data
dbscan_df = pd.read_csv('../data/output/consumo_datamart_gold_dbscan.csv', usecols= final_features + ['Anomaly_Cluster'])



# Initialize the scaler
scaler = StandardScaler()

# Scale the features for dbscan_df dataset
dbscan_df[features] = scaler.fit_transform(dbscan_df[features])
labels_dbscan = dbscan_df['Anomaly_Cluster']


# Compute the silhouette scores
silhouette_score_dbscan = silhouette_score(dbscan_df, labels_dbscan)
print(f"Silhouette Score DBScan {silhouette_score_dbscan}")




Silhouette Score DBScan 0.10099289782890346


# 5 Selection of anomaly detection model


<table>
<tr> 
  <th>Model</th>
  <th> Silhouette Score</th>
</tr>
<tr style="background-color:blue;">
  <td>Isolation Forest / Unique Global Model</td>
  <td>0.4317</td>
</tr>
<tr>
  <td>Isolation Forest / Models by Sector</td>
  <td>0.2743</td>
</tr>
<tr>
  <td>Isolation Forest / Models by Cluster</td>
  <td>0.3257</td>
</tr>
<tr>
  <td>Local Outlier Factor / Unique Global Model</td>
  <td>-0.021</td>
</tr>
<tr>
  <td>Local Outlier Factor / Models by Sector</td>
  <td>0.063</td>
</tr>
<tr>
  <td>Local Outlier Factor / Models by Cluster</td>
  <td>0.023</td>
</tr>

<tr>
  <td>OneClassSVM / Unique Global Model</td>
  <td>0.3926</td>
</tr>
<tr>
  <td>OneClassSVM / Models by Sector</td>
  <td>0.2527</td>
</tr>
<tr>
  <td>OneClassSVM / Models by Cluster</td>
  <td>0.2923</td>
</tr>

<tr>
  <td>HDBSCAN</td>
  <td>0.10099</td>
</tr>
</table>
The model with the best silhouette score is chosen, which in this case is the Isolation Forest. Although the score of 0.4317 is only moderately significant, in this unsupervised context, it provides us with a valuable metric to rely on.