Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
gbroques committed Apr 21, 2018
0 parents commit 953d3b3
Show file tree
Hide file tree
Showing 13 changed files with 265 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.idea
.pytest_cache
.coverage
__pycache__
venv
htmlcov
.ipynb_checkpoints
7 changes: 7 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
language: python
python:
- '3.6.3'
script:
- py.test --cov=dbscan test/
after_success:
- coveralls
23 changes: 23 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

The MIT License (MIT)

Copyright (c) 2018 G Roques

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# DBSCAN

Density-Based Spatial Clustering of Applications with Noise (DBSCAN) implementation in Python.

API inspired by Scikit-learn.

**Reference:** *Introduction to Data Mining* (1st Edition) by Pang-Ning Tan, Michael Steinbach, and Vipin Kumar,
Section 8.4, Page 526
2 changes: 2 additions & 0 deletions dataviz/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .dataviz import generate_clusters
from .dataviz import plot_clusters
156 changes: 156 additions & 0 deletions dataviz/dataviz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from math import cos
from math import pi
from math import sin
from random import Random
from typing import List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_clusters(clusters: List[List], labels: List[int], centroids: List[List], seed=0) -> None:
    """Plot cluster data together with the cluster centroids.

    Builds a single DataFrame from the points and centroids, draws it as a
    scatter plot colored by label, and displays the figure with ``plt.show()``.

    Args:
        clusters: Cluster data to plot, one ``[x, y]`` pair per point.
        labels: Label of each point, parallel to ``clusters``.
        centroids: Center point of each cluster, as ``[x, y]`` pairs.
        seed: Seed for random number generator.
            Used to sample colors.

    Returns:
        None
    """
    columns = ['x', 'y']
    x_column, y_column = columns
    num_clusters = len(set(labels))
    data = get_data(clusters, labels, centroids, columns)
    # Both lists carry one extra trailing entry reserved for the centroid rows.
    markers = get_markers(num_clusters)
    palette = get_palette(num_clusters, seed)
    # Pass x/y by keyword: recent seaborn releases no longer accept them
    # positionally, while the keyword form works on old and new versions alike.
    sns.lmplot(x=x_column,
               y=y_column,
               data=data,
               markers=markers,
               palette=palette,
               fit_reg=False,  # Scatter only, no regression line
               legend=False,
               hue='labels',
               scatter_kws={'linewidth': 1, 'edgecolor': 'w'})
    plt.show()


def get_data(clusters, labels, centroids, columns) -> pd.DataFrame:
    """Construct a DataFrame object to plot.

    Args:
        clusters: The cluster data.
        labels: Which cluster each point belongs to.
        centroids: The center point of each cluster.
        columns: Labels for each column of data.

    Returns:
        A DataFrame with one row per point followed by one row per centroid.
        It has the given ``columns`` plus a ``labels`` column holding the
        point's label, or the string ``'centroid'`` for centroid rows.
    """
    df = pd.DataFrame(clusters, columns=columns)
    df['labels'] = pd.Series(labels, index=df.index)  # Add labels as a column for coloring
    centroids_df = pd.DataFrame(centroids, columns=columns)
    centroids_df['labels'] = ['centroid'] * len(centroids)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and is also available in older pandas releases.
    return pd.concat([df, centroids_df], ignore_index=True)


def get_markers(num_clusters) -> List[str]:
    """Build the list of marker symbols used by the plot.

    Every cluster is drawn with a circle 'o'; one trailing cross 'x'
    is added for the centroids.

    Args:
        num_clusters: The number of clusters.

    Returns:
        A list of ``num_clusters`` circle markers followed by a single 'x'.
    """
    point_markers = ['o'] * num_clusters
    centroid_marker = 'x'  # Reserve 'x' for centroids
    return point_markers + [centroid_marker]


def get_palette(num_clusters, seed=0) -> List[str]:
    """Generates a color palette for the plot.

    Uses random colors for different clusters,
    and reserves red for centroids.

    Args:
        num_clusters: The number of clusters.
        seed: Seed for random number generator.

    Returns:
        A list of ``num_clusters`` cluster colors followed by ``'red'``.
        Colors are distinct for up to five clusters; beyond that the base
        colors are reused in freshly shuffled rounds.

    Raises:
        ValueError: If ``num_clusters`` is negative.
    """
    random = Random(seed)
    all_colors = ['b', 'g', 'c', 'm', 'orange']
    if num_clusters <= len(all_colors):
        # Also covers negative counts, which random.sample rejects with ValueError.
        palette = random.sample(all_colors, num_clusters)
    else:
        # More clusters than base colors: sampling once would raise, so draw
        # the colors in rounds, each round being a random subset or permutation.
        palette = []
        while len(palette) < num_clusters:
            round_size = min(num_clusters - len(palette), len(all_colors))
            palette.extend(random.sample(all_colors, round_size))
    palette.append('red')  # Reserve red color for centroids
    return palette


def generate_clusters(num_clusters: int,
                      num_points: int,
                      spread: float,
                      bound_for_x: Tuple[float, float],
                      bound_for_y: Tuple[float, float],
                      seed=None) -> List[List]:
    """Generate random data for clustering.

    A center is drawn uniformly inside the given bounds for every cluster,
    and an equal share of the points is then generated around each center.

    Source:
    https://stackoverflow.com/questions/44356063/how-to-generate-a-set-of-random-points-within-a-given-x-y-coordinates-in-an-x

    Args:
        num_clusters: The number of clusters to generate.
        num_points: The total number of points to generate. Each cluster
            receives ``int(num_points / num_clusters)`` points, so any
            remainder is dropped.
        spread: The spread of each cluster. Decrease for tighter clusters.
        bound_for_x: The (min, max) bounds for the X value of a center.
        bound_for_y: The (min, max) bounds for the Y value of a center.
        seed: Seed for the random number generator.

    Returns:
        A flat list of ``[x, y]`` points, cluster after cluster.
    """
    rng = Random(seed)
    x_low, x_high = bound_for_x
    y_low, y_high = bound_for_y
    points_per_cluster = int(num_points / num_clusters)
    points = []
    for _ in range(num_clusters):
        # Draw X before Y so a given seed always yields the same centers.
        center_x = x_low + (x_high - x_low) * rng.random()
        center_y = y_low + (y_high - y_low) * rng.random()
        center = (center_x, center_y)
        points += generate_cluster(points_per_cluster, center, spread, seed)
    return points


def generate_cluster(num_points: int, center: Tuple[float, float], spread: float, seed=None) -> List[List]:
    """Generates a cluster of random points.

    Each point is placed at a random angle and a random distance of at most
    ``spread`` from the center.

    Source:
    https://stackoverflow.com/questions/44356063/how-to-generate-a-set-of-random-points-within-a-given-x-y-coordinates-in-an-x

    Args:
        num_points: The number of points for the cluster.
        center: The (x, y) center of the cluster.
        spread: How tightly to cluster the data.
        seed: Seed for the random number generator.
            ``None`` (the default) gives a non-reproducible cluster.

    Returns:
        A random cluster consisting of ``num_points`` ``[x, y]`` points.
    """
    x, y = center
    if seed is not None:
        # Mix the center into the seed to generate different looking clusters
        # if called from generate_clusters with one shared seed. Skipped for
        # None, which cannot take part in arithmetic and needs no mixing.
        seed = (seed + y) * x
    random = Random(seed)
    points = []
    for _ in range(num_points):
        theta = 2 * pi * random.random()  # Random direction
        s = spread * random.random()  # Random distance from the center
        points.append([x + s * cos(theta), y + s * sin(theta)])
    return points
1 change: 1 addition & 0 deletions dbscan/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .dbscan import DBSCAN
3 changes: 3 additions & 0 deletions dbscan/dbscan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class DBSCAN:
    """Density-Based Spatial Clustering of Applications with Noise.

    Placeholder class: no clustering behavior is implemented yet.
    """
19 changes: 19 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import List

from dataviz import generate_clusters


def main():
    """Generate a seeded sample data set of four clusters.

    The generated data is not used yet; the result is discarded.
    """
    generate_data(4, seed=1)


def generate_data(num_clusters: int, seed=None) -> List[List]:
    """Generate the sample data set used by the demo.

    Args:
        num_clusters: The number of clusters to generate.
        seed: Seed for the random number generator.

    Returns:
        The points produced by ``generate_clusters`` for 20 points in total,
        a spread of 7, and X and Y both bounded to (1, 100).
    """
    shared_bounds = (1, 100)  # Same bounds for X and Y
    return generate_clusters(
        num_clusters,
        20,  # Total number of points
        7,  # Spread of each cluster
        shared_bounds,
        shared_bounds,
        seed,
    )


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
25 changes: 25 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
attrs==17.4.0
certifi==2018.1.18
chardet==3.0.4
coverage==4.5.1
coveralls==1.3.0
cycler==0.10.0
docopt==0.6.2
idna==2.6
kiwisolver==1.0.1
matplotlib==2.2.2
more-itertools==4.1.0
numpy==1.14.2
pandas==0.22.0
pluggy==0.6.0
py==1.5.3
pyparsing==2.2.0
pytest==3.5.0
pytest-cov==2.5.1
python-dateutil==2.7.2
pytz==2018.4
requests==2.18.4
scipy==1.0.1
seaborn==0.8.1
six==1.11.0
urllib3==1.22
Binary file added specification.pdf
Binary file not shown.
Empty file added test/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions test/dbscan_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import unittest

from dbscan import DBSCAN


class DBSCANTest(unittest.TestCase):
    """Unit tests for the DBSCAN class."""

    def test_fit(self):
        """Smoke test: DBSCAN can be constructed without arguments."""
        DBSCAN()


# Allow running this test module directly, in addition to via a test runner.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 953d3b3

Please sign in to comment.