## Anomaly Detection using PyCaret

PyCaret’s Anomaly Detection Module is an unsupervised machine learning module used for identifying rare items, events or observations that raise suspicion by differing significantly from the majority of the data. Typically, the anomalous items translate to some kind of problem, such as bank fraud, a structural defect, medical problems or errors. This module provides several pre-processing features that prepare the data for modeling through the setup function. It has over 12 ready-to-use algorithms and several plots to analyze the results of trained models.

In [1]:
# Anomaly-detection API. The star import supplies setup, create_model,
# plot_model and predict_model, which the cells below call.
from pycaret.anomaly import *
# Sample-dataset loader.
from pycaret.datasets import get_data

# Fixed seed so that Restart & Run All reproduces the same models and labels.
# 4456 is the session_id reported in the setup summary of the recorded run;
# without it, setup picks a new random seed on every execution.
SESSION_ID = 4456

# Load PyCaret's "anomaly" sample dataset.
anomaly = get_data("anomaly")

# Initialise the pre-processing that prepares the data for modeling.
setup_anomaly_data = setup(anomaly, session_id=SESSION_ID)


Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,4456
1,Original Data,"(1000, 10)"
2,Missing Values,False
3,Numeric Features,10
4,Categorical Features,0
5,Ordinal Features,False
6,High Cardinality Features,False
7,Transformed Data,"(1000, 10)"
8,Numeric Imputer,mean
9,Categorical Imputer,constant


In [2]:
# Display the loaded dataset to inspect its feature columns (Col1 ... Col10).
anomaly

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10
0,0.263995,0.764929,0.138424,0.935242,0.605867,0.518790,0.912225,0.608234,0.723782,0.733591
1,0.546092,0.653975,0.065575,0.227772,0.845269,0.837066,0.272379,0.331679,0.429297,0.367422
2,0.336714,0.538842,0.192801,0.553563,0.074515,0.332993,0.365792,0.861309,0.899017,0.088600
3,0.092108,0.995017,0.014465,0.176371,0.241530,0.514724,0.562208,0.158963,0.073715,0.208463
4,0.325261,0.805968,0.957033,0.331665,0.307923,0.355315,0.501899,0.558449,0.885169,0.182754
...,...,...,...,...,...,...,...,...,...,...
995,0.305055,0.656837,0.331665,0.822525,0.907127,0.882276,0.855732,0.584786,0.808640,0.242762
996,0.812627,0.864258,0.616604,0.167966,0.811223,0.938071,0.418462,0.472306,0.348347,0.671129
997,0.250967,0.138627,0.919703,0.461234,0.886555,0.869888,0.800908,0.530324,0.779433,0.234952
998,0.502436,0.936820,0.580062,0.540773,0.151995,0.059452,0.225220,0.242755,0.279385,0.538755


## Creating Anomaly Detection Models

### Isolation Forest

In [3]:
# Train an Isolation Forest detector; "iforest" is the model ID given to create_model.
iforest = create_model("iforest")

# Visualise the trained Isolation Forest model using plot_model's default plot type.
plot_model(iforest)

The yellow points are the anomalies identified by the Isolation Forest anomaly detection technique.

In [4]:
# Generate predictions using the trained Isolation Forest model.
# The returned frame carries the input columns plus Label and Score.
iforest_predictions = predict_model(iforest, data = anomaly)
iforest_predictions

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Label,Score
0,0.263995,0.764929,0.138424,0.935242,0.605867,0.518790,0.912225,0.608234,0.723782,0.733591,0,-0.029775
1,0.546092,0.653975,0.065575,0.227772,0.845269,0.837066,0.272379,0.331679,0.429297,0.367422,0,-0.085167
2,0.336714,0.538842,0.192801,0.553563,0.074515,0.332993,0.365792,0.861309,0.899017,0.088600,1,0.016764
3,0.092108,0.995017,0.014465,0.176371,0.241530,0.514724,0.562208,0.158963,0.073715,0.208463,1,0.051090
4,0.325261,0.805968,0.957033,0.331665,0.307923,0.355315,0.501899,0.558449,0.885169,0.182754,0,-0.014326
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.305055,0.656837,0.331665,0.822525,0.907127,0.882276,0.855732,0.584786,0.808640,0.242762,0,-0.087898
996,0.812627,0.864258,0.616604,0.167966,0.811223,0.938071,0.418462,0.472306,0.348347,0.671129,0,-0.077894
997,0.250967,0.138627,0.919703,0.461234,0.886555,0.869888,0.800908,0.530324,0.779433,0.234952,0,-0.066641
998,0.502436,0.936820,0.580062,0.540773,0.151995,0.059452,0.225220,0.242755,0.279385,0.538755,0,-0.087503


In [5]:
# Keep only the rows Isolation Forest flagged as anomalies (Label == 1).
iforest_anomaly_rows = iforest_predictions[iforest_predictions["Label"] == 1]

# End the cell with the bare expression so the frame gets the notebook's rich
# table rendering; print() produces wrapped plain text that splits the columns
# across several chunks and is harder to read.
iforest_anomaly_rows

         Col1      Col2      Col3      Col4      Col5      Col6      Col7  \
2    0.336714  0.538842  0.192801  0.553563  0.074515  0.332993  0.365792   
3    0.092108  0.995017  0.014465  0.176371  0.241530  0.514724  0.562208   
7    0.869237  0.277979  0.423076  0.112472  0.183727  0.034960  0.111114   
8    0.197078  0.843918  0.243396  0.281278  0.329148  0.734582  0.191947   
11   0.796623  0.230543  0.993018  0.077075  0.094068  0.718628  0.977611   
14   0.950745  0.005154  0.084187  0.478148  0.212836  0.714347  0.664819   
15   0.336346  0.252265  0.212287  0.020201  0.203514  0.534468  0.476263   
17   0.741544  0.398253  0.766472  0.635670  0.261735  0.837371  0.001797   
18   0.184433  0.174112  0.301976  0.895893  0.285614  0.615993  0.197073   
20   0.162825  0.674069  0.705447  0.774799  0.894267  0.443057  0.399779   
23   0.057884  0.227162  0.022494  0.167069  0.631315  0.610103  0.277753   
24   0.445535  0.640881  0.689264  0.937542  0.037016  0.316847  0.972312   

In [6]:
# Count the rows flagged by Isolation Forest; .shape is (rows, columns).
iforest_anomaly_rows.shape       # 50 rows in the recorded run

(50, 12)

### K Nearest Neighbor

In [7]:
# Train a K-Nearest-Neighbors detector; "knn" is the model ID given to create_model.
knn = create_model("knn")

# Visualise the trained KNN model using plot_model's default plot type.
plot_model(knn)

In [8]:
# Score the dataset with the trained KNN detector.
# Rows whose Label equals 1 are the ones treated as anomalies.
knn_predictions = predict_model(knn, data=anomaly)
knn_predictions

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Label,Score
0,0.263995,0.764929,0.138424,0.935242,0.605867,0.518790,0.912225,0.608234,0.723782,0.733591,0,0.558927
1,0.546092,0.653975,0.065575,0.227772,0.845269,0.837066,0.272379,0.331679,0.429297,0.367422,0,0.477482
2,0.336714,0.538842,0.192801,0.553563,0.074515,0.332993,0.365792,0.861309,0.899017,0.088600,0,0.676207
3,0.092108,0.995017,0.014465,0.176371,0.241530,0.514724,0.562208,0.158963,0.073715,0.208463,1,0.804769
4,0.325261,0.805968,0.957033,0.331665,0.307923,0.355315,0.501899,0.558449,0.885169,0.182754,0,0.630836
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.305055,0.656837,0.331665,0.822525,0.907127,0.882276,0.855732,0.584786,0.808640,0.242762,0,0.266822
996,0.812627,0.864258,0.616604,0.167966,0.811223,0.938071,0.418462,0.472306,0.348347,0.671129,0,0.403480
997,0.250967,0.138627,0.919703,0.461234,0.886555,0.869888,0.800908,0.530324,0.779433,0.234952,0,0.337727
998,0.502436,0.936820,0.580062,0.540773,0.151995,0.059452,0.225220,0.242755,0.279385,0.538755,0,0.300265


In [9]:
# Select the rows the KNN detector flagged as anomalous (Label equal to 1).
knn_anomaly_rows = knn_predictions.loc[knn_predictions["Label"].eq(1)]
knn_anomaly_rows

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Label,Score
3,0.092108,0.995017,0.014465,0.176371,0.24153,0.514724,0.562208,0.158963,0.073715,0.208463,1,0.804769
8,0.197078,0.843918,0.243396,0.281278,0.329148,0.734582,0.191947,0.927804,0.713269,0.891548,1,0.844047
9,0.292985,0.703432,0.439621,0.107868,0.922947,0.253458,0.295652,0.355287,0.980911,0.308864,1,0.732952
11,0.796623,0.230543,0.993018,0.077075,0.094068,0.718628,0.977611,0.333386,0.634843,0.028729,1,0.830877
13,0.61366,0.709061,0.726767,0.862584,0.593116,0.474138,0.941633,0.788438,0.323293,0.931074,1,0.681433
14,0.950745,0.005154,0.084187,0.478148,0.212836,0.714347,0.664819,0.803558,0.465466,0.305692,1,0.821167
15,0.336346,0.252265,0.212287,0.020201,0.203514,0.534468,0.476263,0.845669,0.155377,0.442781,1,0.718586
17,0.741544,0.398253,0.766472,0.63567,0.261735,0.837371,0.001797,0.568841,0.03736,0.292769,1,0.791093
18,0.184433,0.174112,0.301976,0.895893,0.285614,0.615993,0.197073,0.738881,0.831371,0.265761,1,0.775046
20,0.162825,0.674069,0.705447,0.774799,0.894267,0.443057,0.399779,0.009136,0.941851,0.982711,1,0.856567


In [10]:
# Count the rows flagged by KNN; .shape is (rows, columns).
knn_anomaly_rows.shape   # 46 rows in the recorded run

(46, 12)

### Clustering

In [11]:
# Train the detector registered under the "cluster" model ID.
# NOTE(review): presumably PyCaret's Clustering-Based Local Outlier (CBLOF)
# estimator — confirm against the PyCaret model list.
cluster = create_model("cluster")

# Visualise the trained cluster-based model using plot_model's default plot type.
plot_model(cluster)

In [12]:
# Score the dataset with the trained cluster-based detector.
# Rows whose Label equals 1 are the ones treated as anomalies.
cluster_predictions = predict_model(cluster, data=anomaly)
cluster_predictions

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Label,Score
0,0.263995,0.764929,0.138424,0.935242,0.605867,0.518790,0.912225,0.608234,0.723782,0.733591,0,0.694721
1,0.546092,0.653975,0.065575,0.227772,0.845269,0.837066,0.272379,0.331679,0.429297,0.367422,0,0.585941
2,0.336714,0.538842,0.192801,0.553563,0.074515,0.332993,0.365792,0.861309,0.899017,0.088600,1,0.875902
3,0.092108,0.995017,0.014465,0.176371,0.241530,0.514724,0.562208,0.158963,0.073715,0.208463,1,1.011624
4,0.325261,0.805968,0.957033,0.331665,0.307923,0.355315,0.501899,0.558449,0.885169,0.182754,0,0.655363
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.305055,0.656837,0.331665,0.822525,0.907127,0.882276,0.855732,0.584786,0.808640,0.242762,0,0.348868
996,0.812627,0.864258,0.616604,0.167966,0.811223,0.938071,0.418462,0.472306,0.348347,0.671129,0,0.613922
997,0.250967,0.138627,0.919703,0.461234,0.886555,0.869888,0.800908,0.530324,0.779433,0.234952,0,0.532668
998,0.502436,0.936820,0.580062,0.540773,0.151995,0.059452,0.225220,0.242755,0.279385,0.538755,0,0.476143


In [13]:
# Select the rows the cluster-based detector flagged as anomalous (Label equal to 1).
cluster_anomaly_rows = cluster_predictions.loc[cluster_predictions["Label"].eq(1)]
cluster_anomaly_rows

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Label,Score
2,0.336714,0.538842,0.192801,0.553563,0.074515,0.332993,0.365792,0.861309,0.899017,0.0886,1,0.875902
3,0.092108,0.995017,0.014465,0.176371,0.24153,0.514724,0.562208,0.158963,0.073715,0.208463,1,1.011624
8,0.197078,0.843918,0.243396,0.281278,0.329148,0.734582,0.191947,0.927804,0.713269,0.891548,1,0.944945
9,0.292985,0.703432,0.439621,0.107868,0.922947,0.253458,0.295652,0.355287,0.980911,0.308864,1,0.883354
11,0.796623,0.230543,0.993018,0.077075,0.094068,0.718628,0.977611,0.333386,0.634843,0.028729,1,0.87589
14,0.950745,0.005154,0.084187,0.478148,0.212836,0.714347,0.664819,0.803558,0.465466,0.305692,1,1.015701
15,0.336346,0.252265,0.212287,0.020201,0.203514,0.534468,0.476263,0.845669,0.155377,0.442781,1,0.840946
17,0.741544,0.398253,0.766472,0.63567,0.261735,0.837371,0.001797,0.568841,0.03736,0.292769,1,0.874732
18,0.184433,0.174112,0.301976,0.895893,0.285614,0.615993,0.197073,0.738881,0.831371,0.265761,1,1.006507
20,0.162825,0.674069,0.705447,0.774799,0.894267,0.443057,0.399779,0.009136,0.941851,0.982711,1,1.029388


In [14]:
# Count the rows flagged by the cluster-based model; .shape is (rows, columns).
cluster_anomaly_rows.shape   # 50 rows in the recorded run

(50, 12)