Initial commit

gbroques · Apr 21, 2018 · 953d3b3 · 953d3b3
commit 953d3b3
Show file tree

Hide file tree

Showing 13 changed files with 265 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.idea
+.pytest_cache
+.coverage
+__pycache__
+venv
+htmlcov
+.ipynb_checkpoints
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,7 @@
+language: python
+python:
+  - '3.6.3'
+script:
+  - py.test --cov=dbscan test/
+after_success:
+  - coveralls
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,23 @@
+
+The MIT License (MIT)
+
+Copyright (c) 2018 G Roques
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
diff --git a/README.md b/README.md
@@ -0,0 +1,8 @@
+# DBSCAN
+
+Density-Based Spatial Clustering of Applications with Noise (DBSCAN) implementation in Python.
+
+API inspired by Scikit-learn.
+
+**Reference:** *Introduction to Data Mining* (1st Edition) by Pang-Ning Tan, Michael Steinbach, and Vipin Kumar,
+Section 8.4, page 526.
diff --git a/dataviz/__init__.py b/dataviz/__init__.py
@@ -0,0 +1,2 @@
+from .dataviz import generate_clusters
+from .dataviz import plot_clusters
diff --git a/dataviz/dataviz.py b/dataviz/dataviz.py
@@ -0,0 +1,156 @@
+from math import cos
+from math import pi
+from math import sin
+from random import Random
+from typing import List, Tuple
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+
+def plot_clusters(clusters: List[List], labels: List[int], centroids: List[List], seed=0) -> None:
+    """Show a scatter plot of clustered points together with their centroids.
+
+    Points are drawn as circles colored by label, and centroids are drawn
+    as red crosses. The figure is displayed with ``plt.show()``.
+
+    Args:
+        clusters: Points to plot, one ``[x, y]`` pair per point.
+        labels: Label of each point, in the same order as ``clusters``.
+        centroids: Center point of each cluster, as ``[x, y]`` pairs.
+        seed: Seed for random number generator.
+              Used to sample colors.
+
+    Returns:
+        None
+    """
+    columns = ['x', 'y']
+    num_clusters = len(set(labels))
+    data = get_data(clusters, labels, centroids, columns)
+    markers = get_markers(num_clusters)
+    palette = get_palette(num_clusters, seed)
+    # Pass x and y by keyword: newer seaborn releases reject positional
+    # data arguments, while keywords work on old and new versions alike.
+    sns.lmplot(x=columns[0],
+               y=columns[1],
+               data=data,
+               markers=markers,
+               palette=palette,
+               fit_reg=False,
+               legend=False,
+               hue='labels',
+               scatter_kws={'linewidth': 1, 'edgecolor': 'w'})
+    plt.show()
+
+
+def get_data(clusters, labels, centroids, columns) -> pd.DataFrame:
+    """Construct a DataFrame object to plot.
+
+    Args:
+        clusters: The cluster data.
+        labels: Which cluster each point belongs to.
+        centroids: The center point of each cluster.
+        columns: Labels for each column of data.
+
+    Returns:
+        A DataFrame with one row per point followed by one row per centroid.
+        It has the given ``columns`` plus a ``labels`` column that holds the
+        point's label, or the string ``'centroid'`` for centroid rows.
+    """
+    points_df = pd.DataFrame(clusters, columns=columns)
+    points_df['labels'] = pd.Series(labels, index=points_df.index)  # Add labels as a column for coloring
+    centroids_df = pd.DataFrame(centroids, columns=columns)
+    centroids_df['labels'] = ['centroid'] * len(centroids)
+    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
+    # that works across pandas versions.
+    return pd.concat([points_df, centroids_df], ignore_index=True)
+
+
+def get_markers(num_clusters) -> List[str]:
+    """Build the marker list for the plot.
+
+    Every cluster is drawn with a circle 'o';
+    the final entry is a cross 'x' reserved for centroids.
+
+    Args:
+        num_clusters: The number of clusters.
+
+    Returns:
+        ``num_clusters`` circle markers followed by a single cross marker.
+    """
+    cluster_markers = ['o'] * num_clusters
+    centroid_marker = ['x']  # Reserve 'x' for centroids
+    return cluster_markers + centroid_marker
+
+
+def get_palette(num_clusters, seed=0) -> List[str]:
+    """Generates a color palette for the plot.
+
+    Uses random colors for different clusters,
+    and reserves red for centroids.
+
+    Args:
+        num_clusters: The number of clusters.
+        seed: Seed for random number generator.
+
+    Returns:
+        A list of ``num_clusters`` color names followed by ``'red'``.
+        Colors are distinct while ``num_clusters`` does not exceed the
+        number of available colors; beyond that they repeat.
+    """
+    random = Random(seed)
+    all_colors = ['b', 'g', 'c', 'm', 'orange']
+    if num_clusters <= len(all_colors):
+        palette = random.sample(all_colors, num_clusters)
+    else:
+        # random.sample raises ValueError when asked for more items than the
+        # population holds, so cycle through a shuffled copy instead.
+        shuffled = random.sample(all_colors, len(all_colors))
+        palette = [shuffled[i % len(shuffled)] for i in range(num_clusters)]
+    palette.append('red')  # Reserve red color for centroids
+    return palette
+
+
+def generate_clusters(num_clusters: int,
+                      num_points: int,
+                      spread: float,
+                      bound_for_x: Tuple[float, float],
+                      bound_for_y: Tuple[float, float],
+                      seed=None) -> List[List]:
+    """Build a flat list of random 2-D points grouped around random centers.
+
+    Each cluster center is drawn from the given bounds, and the points of
+    that cluster are produced by ``generate_cluster``.
+
+    Source:
+    https://stackoverflow.com/questions/44356063/how-to-generate-a-set-of-random-points-within-a-given-x-y-coordinates-in-an-x
+
+    Args:
+        num_clusters: The number of clusters to generate.
+        num_points: The total number of points requested. Each cluster gets
+            ``int(num_points / num_clusters)`` points, so a remainder is dropped.
+        spread: The spread of each cluster. Decrease for tighter clusters.
+        bound_for_x: ``(min, max)`` range for the X coordinate of a cluster center.
+        bound_for_y: ``(min, max)`` range for the Y coordinate of a cluster center.
+        seed: Seed for the random number generator; also forwarded to
+            ``generate_cluster``.
+
+    Returns:
+        The points of all clusters concatenated into a single list.
+    """
+    rng = Random(seed)
+    low_x, high_x = bound_for_x
+    low_y, high_y = bound_for_y
+    points_per_cluster = int(num_points / num_clusters)
+    all_points = []
+    for _ in range(num_clusters):
+        # Draw X before Y so the random stream is consumed in a fixed order.
+        center_x = rng.uniform(low_x, high_x)
+        center_y = rng.uniform(low_y, high_y)
+        all_points += generate_cluster(points_per_cluster, (center_x, center_y), spread, seed)
+    return all_points
+
+
+def generate_cluster(num_points: int, center: Tuple[float, float], spread: float, seed=None) -> List[List]:
+    """Generates a cluster of random points.
+
+    Every point is placed at a random angle around ``center`` and at a
+    random distance of at most ``spread`` from it.
+
+    Source:
+    https://stackoverflow.com/questions/44356063/how-to-generate-a-set-of-random-points-within-a-given-x-y-coordinates-in-an-x
+
+    Args:
+        num_points: The number of points for the cluster.
+        center: The center of the cluster.
+        spread: How tightly to cluster the data.
+        seed: Seed for the random number generator. When given, it is mixed
+            with the center so the result is reproducible per center. When
+            ``None``, the generator is left unseeded and results vary
+            between calls.
+
+    Returns:
+        A random cluster consisting of ``num_points`` points, each an
+        ``[x, y]`` list.
+    """
+    x, y = center
+    if seed is not None:
+        # Generate different looking clusters if called from generate_clusters.
+        # Skipped for None, which cannot take part in the arithmetic.
+        seed = (seed + y) * x
+    random = Random(seed)
+    points = []
+    for _ in range(num_points):
+        theta = 2 * pi * random.random()
+        s = spread * random.random()
+        points.append([x + s * cos(theta), y + s * sin(theta)])
+    return points
diff --git a/dbscan/__init__.py b/dbscan/__init__.py
@@ -0,0 +1 @@
+from .dbscan import DBSCAN
diff --git a/dbscan/dbscan.py b/dbscan/dbscan.py
@@ -0,0 +1,3 @@
+class DBSCAN:
+    """Density-Based Spatial Clustering of Applications with Noise.
+
+    Placeholder class: no attributes or methods are defined yet.
+    """
diff --git a/main.py b/main.py
@@ -0,0 +1,19 @@
+from typing import List
+
+from dataviz import generate_clusters
+
+
+def main():
+    """Generate the demo data set: four clusters built with a fixed seed."""
+    clusters = generate_data(num_clusters=4, seed=1)
+
+
+def generate_data(num_clusters: int, seed=None) -> List[List]:
+    """Generate demo points for the requested number of clusters.
+
+    Asks ``generate_clusters`` for 20 points with a spread of 7, with
+    cluster centers bounded to 1..100 on both axes.
+
+    Args:
+        num_clusters: The number of clusters to generate.
+        seed: Seed forwarded to ``generate_clusters``.
+
+    Returns:
+        The list of points returned by ``generate_clusters``.
+    """
+    bounds = (1, 100)
+    return generate_clusters(num_clusters=num_clusters,
+                             num_points=20,
+                             spread=7,
+                             bound_for_x=bounds,
+                             bound_for_y=bounds,
+                             seed=seed)
+
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,25 @@
+attrs==17.4.0
+certifi==2018.1.18
+chardet==3.0.4
+coverage==4.5.1
+coveralls==1.3.0
+cycler==0.10.0
+docopt==0.6.2
+idna==2.6
+kiwisolver==1.0.1
+matplotlib==2.2.2
+more-itertools==4.1.0
+numpy==1.14.2
+pandas==0.22.0
+pluggy==0.6.0
+py==1.5.3
+pyparsing==2.2.0
+pytest==3.5.0
+pytest-cov==2.5.1
+python-dateutil==2.7.2
+pytz==2018.4
+requests==2.18.4
+scipy==1.0.1
+seaborn==0.8.1
+six==1.11.0
+urllib3==1.22
diff --git a/specification.pdf b/specification.pdf
diff --git a/test/__init__.py b/test/__init__.py
diff --git a/test/dbscan_test.py b/test/dbscan_test.py
@@ -0,0 +1,14 @@
+import unittest
+
+from dbscan import DBSCAN
+
+
+class DBSCANTest(unittest.TestCase):
+    """Unit tests for the DBSCAN class."""
+
+    def test_fit(self):
+        """Placeholder: only checks that DBSCAN can be constructed."""
+        model = DBSCAN()
+
+
+# Allow running this test module directly through the unittest runner.
+if __name__ == '__main__':
+    unittest.main()