<a href="https://colab.research.google.com/github/davidofitaly/08_decetion_anomaly_projects/blob/main/02_anomaly_decetion_using_isolation_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Table of contents:
1. [Import of libraries](#0)
2. [Data generation](#1)
3. [Visualization of the generated data](#2)
4. [Isolation Forest](#3)


### <a name='0'> </a> Import of libraries


In [5]:
# Import the necessary libraries
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme with an unscaled font size
sns.set_theme(font_scale=1.0)

# Record the library versions this notebook was executed with
print(f'Pandas: {pd.__version__}')
print(f'Numpy: {np.__version__}')
print(f'Seaborn: {sns.__version__}')

Pandas: 2.2.2
Numpy: 1.26.4
Seaborn: 0.13.2


### <a name='1'> </a> Data generation


In [39]:
# make_blobs generates synthetic, blob-shaped point clouds
from sklearn.datasets import make_blobs

# make_blobs returns a tuple; only its first element (the feature array)
# is kept here. random_state pins the generated sample.
raw_data = make_blobs(
    n_samples=50,
    cluster_std=1.2,
    center_box=(-7, 7),
    random_state=42,
)[0]

# Preview the first five generated points
raw_data[:5]

array([[-2.47848627,  8.53273411],
       [-1.50580202,  3.95839614],
       [ 1.4736888 ,  0.51740573],
       [-7.95943316, -3.82979371],
       [-2.84606723,  4.61523585]])

In [40]:
# Wrap the generated array in a DataFrame with named feature columns
df_blobs = pd.DataFrame(data=raw_data, columns=['x1', 'x2'])

# Preview the first rows
df_blobs.head()

Unnamed: 0,x1,x2
0,-2.478486,8.532734
1,-1.505802,3.958396
2,1.473689,0.517406
3,-7.959433,-3.829794
4,-2.846067,4.615236


In [41]:
# Scatter plot of the two generated features, one marker per sample
px.scatter(
    data_frame=df_blobs,
    x='x1',
    y='x2',
    width=1000,
    height=500,
    title='Visualization of the generated data',
    template='ggplot2',
)

### <a name='3'> </a> Isolation Forest


In [47]:
# Import the IsolationForest class from the sklearn.ensemble module.
from sklearn.ensemble import IsolationForest

# Initialize the Isolation Forest with 100 trees and an assumed
# contamination level of 10% (0.1).
# random_state is fixed so the fitted model, and therefore the set of
# flagged points, is the same on every run.
isolation = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

# Fit on the two feature columns only. A later cell stores the predictions
# in df_blobs['outliers']; selecting the features explicitly keeps those
# labels from being fed back in as a third feature if this cell is re-run.
# fit_predict labels each sample 1 (inlier) or -1 (outlier).
outliers = isolation.fit_predict(df_blobs[['x1', 'x2']])


In [48]:
# Store the model's prediction for each sample next to its coordinates
df_blobs.loc[:, 'outliers'] = outliers

# Preview the labelled frame
df_blobs.head()

Unnamed: 0,x1,x2,outliers
0,-2.478486,8.532734,-1
1,-1.505802,3.958396,1
2,1.473689,0.517406,1
3,-7.959433,-3.829794,-1
4,-2.846067,4.615236,1


In [50]:
# Plot the samples colored by the Isolation Forest prediction stored in 'outliers'.
# The title names the Isolation Forest, which is the model that produced these labels.
px.scatter(df_blobs, x='x1', y='x2', color='outliers', width=1000, title='Isolation Forest', template='ggplot2')