<a href="https://colab.research.google.com/github/davidofitaly/05_ml_clustering_projects/blob/main/03_dbscan_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Table of contents:
1. [Import of libraries](#0)
2. [Data generation](#1)
3. [Visualization of the created data](#2)
4. [DBSCAN](#3)


In [66]:
# Import the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

# Apply seaborn's theme with slightly enlarged fonts.
# `sns.set` is only an alias of `sns.set_theme`, which is the preferred name.
sns.set_theme(font_scale=1.1)

# Print the versions of the imported libraries for reference
for lib_name, lib in (('Pandas', pd), ('Seaborn', sns), ('Numpy', np)):
    print(f'{lib_name}: {lib.__version__}')

Pandas: 2.2.2
Seaborn: 0.13.2
Numpy: 1.26.4


### <a name='1'> </a> Data generation

In [67]:
# Generate 1000 synthetic points around 4 centers (make_blobs is imported in the
# top import cell). Only the first returned element -- the sample coordinates --
# is kept; random_state pins the draw so the notebook is reproducible.
data, _ = make_blobs(
    n_samples=1000,
    centers=4,
    cluster_std=1.3,
    center_box=(-8.0, 8.0),
    random_state=42,
)

# Create DataFrame with one named column per feature
df = pd.DataFrame(data, columns=['x1', 'x2'])
df

Unnamed: 0,x1,x2
0,-6.702388,5.522242
1,-4.538978,-5.115007
2,-4.777877,-5.405281
3,3.114734,0.473738
4,3.374434,-0.375550
...,...,...
995,-2.823747,-3.933212
996,-1.196002,8.028278
997,2.477685,5.000632
998,-6.296068,-5.747150


### <a name='2'> </a> Visualization of the created data

In [68]:
# Scatter plot of the generated points (x1 against x2), before any labels exist
px.scatter(
    data_frame=df,
    x='x1',
    y='x2',
    title='Clusterization',
    template='ggplot2',
    width=1000,
    height=500,
)

### <a name='3'> </a> DBSCAN

#### first model

- eps = 0.5, min_samples = 5

In [69]:
# First configuration: eps (neighbourhood radius) 0.5, min_samples 5
cluster = DBSCAN(min_samples=5, eps=0.5)
# Fit on the raw coordinate array; the fitted estimator is the cell's output
cluster.fit(X=data)

In [70]:
# Peek at the labels of the first 20 samples; DBSCAN labels noise points as -1
cluster.labels_[:20]

array([ 0,  1,  1,  2,  2,  1,  2,  1,  1,  2,  2, -1,  0,  9,  1,  1,  0,
        0,  0,  2])

In [71]:
# Attach each sample's DBSCAN label to the frame as the 'cluster' column
df = df.assign(cluster=cluster.labels_)

# Show the first five rows to confirm the new column
df.head(5)

Unnamed: 0,x1,x2,cluster
0,-6.702388,5.522242,0
1,-4.538978,-5.115007,1
2,-4.777877,-5.405281,1
3,3.114734,0.473738,2
4,3.374434,-0.37555,2


- visualization

In [72]:
# Colour the points by their DBSCAN label (-1 = noise). The title is built from
# the fitted estimator's own parameters so it cannot drift from the model in use.
px.scatter(
    data_frame=df,
    x='x1',
    y='x2',
    color='cluster',
    width=1000,
    height=500,
    title=f'DBSCAN(eps={cluster.eps}, min_samples={cluster.min_samples})',
    color_continuous_midpoint=0,
    template='plotly_dark',
)

#### second model

- eps = 0.6, min_samples = 8

In [89]:
# Second configuration: eps (neighbourhood radius) 0.6, min_samples 8
cluster = DBSCAN(min_samples=8, eps=0.6)
# Fit on the raw coordinate array; the fitted estimator is the cell's output
cluster.fit(X=data)

In [90]:
# Peek at the labels of the first 20 samples; DBSCAN labels noise points as -1
cluster.labels_[:20]

array([ 0,  1,  1,  2,  2,  1,  2,  1,  1,  2,  2,  0,  0, -1,  1,  1,  0,
        0,  0,  2])

In [91]:
# Overwrite the 'cluster' column with the labels from the second model
df = df.assign(cluster=cluster.labels_)

# Show the first five rows to confirm the updated column
df.head(5)

Unnamed: 0,x1,x2,cluster
0,-6.702388,5.522242,0
1,-4.538978,-5.115007,1
2,-4.777877,-5.405281,1
3,3.114734,0.473738,2
4,3.374434,-0.37555,2


- visualization

In [92]:
# Colour the points by their DBSCAN label (-1 = noise). The title is built from
# the fitted estimator's own parameters so it cannot drift from the model in use.
px.scatter(
    data_frame=df,
    x='x1',
    y='x2',
    color='cluster',
    width=1000,
    height=500,
    title=f'DBSCAN(eps={cluster.eps}, min_samples={cluster.min_samples})',
    color_continuous_midpoint=0,
    template='plotly_dark',
)

#### third model

- eps = 0.7, min_samples = 5

In [102]:
# Third configuration: eps (neighbourhood radius) 0.7, min_samples 5
cluster = DBSCAN(min_samples=5, eps=0.7)
# Fit on the raw coordinate array; the fitted estimator is the cell's output
cluster.fit(X=data)

In [103]:
# Peek at the labels of the first 20 samples; DBSCAN labels noise points as -1
cluster.labels_[:20]

array([0, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 0, 0, 1, 1, 1, 0, 0, 0, 2])

In [104]:
# Overwrite the 'cluster' column with the labels from the third model
df = df.assign(cluster=cluster.labels_)

# Show the first five rows to confirm the updated column
df.head(5)

Unnamed: 0,x1,x2,cluster
0,-6.702388,5.522242,0
1,-4.538978,-5.115007,1
2,-4.777877,-5.405281,1
3,3.114734,0.473738,2
4,3.374434,-0.37555,2


In [106]:
# Colour the points by their DBSCAN label (-1 = noise). The title is built from
# the fitted estimator's own parameters so it cannot drift from the model in use.
px.scatter(
    data_frame=df,
    x='x1',
    y='x2',
    color='cluster',
    width=1000,
    height=500,
    title=f'DBSCAN(eps={cluster.eps}, min_samples={cluster.min_samples})',
    color_continuous_midpoint=0,
    template='plotly_dark',
)