In [None]:
'''
DBSCAN references:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN
https://towardsdatascience.com/how-to-use-dbscan-effectively-ed212c02e62
https://medium.com/@mohantysandip/a-step-by-step-approach-to-solve-dbscan-algorithms-by-tuning-its-hyper-parameters-93e693a91289
'''

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rnd

#Bokeh is used for plotting some figures
from bokeh.plotting import figure, show
from bokeh.models import Range1d
from bokeh.io import output_notebook
output_notebook()

# Lift the pandas display limits so whole DataFrames are shown
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# The input file is looked up in the current working directory
PATH = os.getcwd()
FILE = 'Customer_Locations.csv'
FULL_PATH = os.path.join(PATH, FILE)

# The file is parsed as tab-separated, UTF-8 text
data = pd.read_csv(FULL_PATH, sep='\t', encoding='utf-8')

# Demographic subset: keep age and coordinates only
data_3 = data[['Age', 'Lat', 'Lon']]
# Target demographic: ages strictly between 19 and 41
tgt_dem = data_3[data_3['Age'].gt(19) & data_3['Age'].lt(41)]
# Ordered by latitude, then longitude, for visualization
tgt_dem_sorted = tgt_dem.sort_values(['Lat', 'Lon'])

###--- Global Functions ---###
def find_neighbors(dataframe, n=50):
    """Plot and return the sorted n-th nearest-neighbor distance of every row.

    A NearestNeighbors model is fitted on `dataframe` and queried with the
    same data. For each row the last of the `n` returned distances
    (column n-1) is kept; those values are sorted ascending, drawn as a
    knee/elbow curve and returned.

    dataframe: data to fit and query (the caller passes Lat/Lon columns)
    n:         number of neighbors requested per row

    Returns a 1D sorted array, suitable as input for kneed.KneeLocator.
    """
    from sklearn.neighbors import NearestNeighbors

    knn_model = NearestNeighbors(n_neighbors=n).fit(dataframe)
    neighbor_dists, _ = knn_model.kneighbors(dataframe) #Neighbor indices are not needed
    kth_dists = np.sort(neighbor_dists[:, n - 1], axis=0)

    #Knee/elbow graph of the sorted distances
    plt.figure(figsize=(5, 5))
    plt.plot(kth_dists)
    plt.grid(which='both', axis='both')
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.show()

    return kth_dists

def find_knee(array):
    """Locate the knee/elbow of a sorted distance curve and return its value.

    array: 1D array of distances (e.g. the output of find_neighbors()).

    Returns the element of `array` at the knee position, intended as a
    starting value for the DBSCAN `eps` hyperparameter.

    Raises ValueError when KneeLocator does not find a knee. Without this
    check `array[None]` would silently return a reshaped copy of the whole
    array instead of a single distance.
    """
    from kneed import KneeLocator

    #KneeLocator needs an x axis; the array going in is 1D, so use positions
    positions = np.arange(len(array))
    knee = KneeLocator(positions, array, S=1, curve='convex', direction='increasing', interp_method='polynomial')

    #Plot knee/elbow graph with line to show best estimate for knee/elbow location
    #NOTE(review): plot_knee() may open its own figure, leaving this one blank - confirm
    plt.figure(figsize=(5, 5))
    knee.plot_knee()
    plt.xlabel("Points")
    plt.ylabel("Distance")

    if knee.knee is None:
        raise ValueError("No knee/elbow found in the distance curve; choose eps manually")

    return array[knee.knee] #Output knee value for use as eps value in DBSCAN operation

def cluster_DBSCAN(dataframe, eps=0.5, min_samples=10):
    """Cluster the Lat/Lon columns of `dataframe` with DBSCAN.

    dataframe:   DataFrame containing 'Lat' and 'Lon' columns
    eps:         passed to DBSCAN as `eps`
    min_samples: passed to DBSCAN as `min_samples`

    Prints the cluster count, the noise count and the share of rows
    labeled as noise, then returns a list of DataFrames (columns Lat, Lon,
    Cluster), one per distinct label. The group labeled -1 (noise) is part
    of that list when noise points exist.
    """
    from sklearn.cluster import DBSCAN

    #Work on a copy so the caller's DataFrame is not modified
    coords = dataframe.loc[:, ['Lat', 'Lon']].copy()

    model = DBSCAN(eps=eps, min_samples=min_samples)
    model.fit(coords)
    labels = model.labels_

    unique_labels = set(labels)
    noise_count = list(labels).count(-1) #Rows labeled -1 are outliers
    cluster_count = len(unique_labels) - (1 if noise_count else 0) #Noise is not a cluster

    coords['Cluster'] = labels

    #One DataFrame per label, in the iteration order of the label set
    all_clusters = []
    for lbl in list(unique_labels):
        all_clusters.append(coords[coords['Cluster'] == lbl])

    #Cluster information for debugging/model validation purposes
    print("Estimated number of clusters: %d" % cluster_count)
    print("Estimated number of noise points: %d" % noise_count)
    print(f'Noise-to-Cluster Ratio: {round(noise_count/len(dataframe),3)}')

    return all_clusters

def plot_clusters(cluster_list, together=True, figsize=(800,600), alpha=0.2, **kwargs): #Plots all clusters together by default
    """Scatter-plot clusters with Bokeh, on one figure or one figure per cluster.

    cluster_list: iterable of DataFrames, one per cluster. The first two
                  columns are used as x and y; a 'Cluster' column holds the label.
    together:     True draws every cluster on a single figure, False draws
                  one figure per cluster (all with identical axis ranges).
    figsize:      (width, height) passed to bokeh's figure().
    alpha:        opacity of the cluster markers.
    kwargs:
        centers:  DataFrame whose first two columns are the x/y of center
                  points, drawn as large black markers.
        color:    one color for all clusters, or a list of colors indexed
                  by cluster label. Random colors are used when omitted.

    Groups whose label is -1 (DBSCAN noise) are not drawn, but they still
    count toward the axis limits. Nothing is drawn for an empty cluster_list.
    """
    cluster_list = list(cluster_list) #Iterated more than once below
    if not cluster_list:
        return #No data means no axis limits and nothing to draw

    #Axis limits come from the clusters passed in, not from any notebook global
    x_low = min(cluster.iloc[:, 0].min() for cluster in cluster_list)
    x_high = max(cluster.iloc[:, 0].max() for cluster in cluster_list)
    y_low = min(cluster.iloc[:, 1].min() for cluster in cluster_list)
    y_high = max(cluster.iloc[:, 1].max() for cluster in cluster_list)

    def new_figure(): #Every figure shares the same size and axis ranges
        plot = figure(width=figsize[0], height=figsize[1])
        plot.y_range = Range1d(y_low, y_high)
        plot.x_range = Range1d(x_low, x_high)
        return plot

    def draw_centers(plot): #Plots centers only if they were passed in kwargs
        if 'centers' in kwargs:
            centers = kwargs['centers']
            plot.circle(centers.iloc[:,0], centers.iloc[:,1], size=10, color='black', alpha=0.8)

    def pick_color(cluster):
        if 'color' not in kwargs:
            #Random hexadecimal color so clusters can be told apart
            return '#'+"%06x" % rnd.randint(0, 0xFFFFFF)
        color = kwargs['color']
        if isinstance(color, list):
            return color[cluster.Cluster.max()] #List of colors is indexed by cluster label
        return color #Single color shared by all clusters

    def draw_cluster(plot, cluster):
        plot.circle(cluster.iloc[:,0], cluster.iloc[:,1], size=3, color=pick_color(cluster), alpha=alpha)

    #Exclude outliers (label -1) from drawing
    drawable = [cluster for cluster in cluster_list if cluster.Cluster.max() != -1]

    if together:
        cluster_plot = new_figure()
        draw_centers(cluster_plot)
        for cluster in drawable:
            draw_cluster(cluster_plot, cluster)
        show(cluster_plot)
    else: #Plot each individual cluster as its own chart
        for cluster in drawable:
            cluster_plot = new_figure()
            draw_cluster(cluster_plot, cluster)
            draw_centers(cluster_plot)
            show(cluster_plot)

#Segment data by year for visualization
data_4 = data.loc[:, ['Customer_Since_Year','Lat', 'Lon']]
d4_years = list(set(data_4.Customer_Since_Year)) #Set of years to use as labels during segmentation
cust_by_year = [data_4.query('Customer_Since_Year == @year') for year in d4_years] #Segment DataFrames by year and place into list

#Shared chart limits, computed once because they are identical for every year.
#x is latitude, y is longitude. default=None keeps an empty segment list from raising.
#NOTE: the y names are kept as originally bound: y_max holds the lower
#bound and y_min the upper bound.
x_min = min((segment['Lat'].min() for segment in cust_by_year), default=None)
x_max = max((segment['Lat'].max() for segment in cust_by_year), default=None)
y_max = min((segment['Lon'].min() for segment in cust_by_year), default=None)
y_min = max((segment['Lon'].max() for segment in cust_by_year), default=None)

#Plot new subscribers by year, one chart per year
for year_df in cust_by_year:
    yoy_plot = figure(width=800, height=600) #Set dimensions of graphic

    #Standardize chart range for x and y so the yearly charts are comparable
    yoy_plot.y_range = Range1d(y_max, y_min)
    yoy_plot.x_range = Range1d(x_min, x_max)

    #Plot individual coordinates as dots
    yoy_plot.circle(year_df.Lat, year_df.Lon, size=3, color='#633636', alpha=0.6)
    show(yoy_plot)

In [None]:
#Plot knee/elbow graph two different ways to find knee/elbow
#First: distances between points, plotted as a sorted curve
my_distances = find_neighbors(tgt_dem_sorted[['Lat', 'Lon']], n=50)
#Second: knee/elbow of that curve, used as a starting point for the DBSCAN eps hyperparameter
knee_elbow = find_knee(my_distances)

#Clustering with DBSCAN can use knee/elbow value as eps (or not).
#You can then only vary min_samples (though technically, you could carry forward the n
#from the find_neighbors() function). Either way, DBSCAN will take some tuning.
all_clusters = cluster_DBSCAN(dataframe=tgt_dem, eps=knee_elbow, min_samples=65) #Cluster points
#Takes list of cluster DataFrames and plots all together using randomized colors
plot_clusters(cluster_list=all_clusters, alpha=0.6)

#plot_clusters(all_clusters, together=False) #Plot all individual clusters!

#Color randomizer from a set list
#all_colors = [rnd.choice(['black','blue','red','gray','green','orange']) for color in range(0,len(all_clusters))]

In [None]:
#See how k-means works (or doesn't) with the dataset
def k_cluster(dataframe, n_clusters=10, random_state=0):
    """Cluster `dataframe` with k-means and split the rows by cluster label.

    dataframe:    DataFrame of features to cluster (the caller passes Lat/Lon)
    n_clusters:   number of clusters passed to KMeans
    random_state: seed passed to KMeans; the default 0 matches the value
                  that used to be hard-coded

    Returns a tuple of (list of DataFrames, cluster centers). The list
    holds one DataFrame per label 0..n_clusters-1; each is a copy of the
    input rows plus a 'Cluster' column. The centers are kmeans.cluster_centers_.
    """
    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init="auto").fit(dataframe) #Instantiate and fit KMeans model

    #Label a copy so the caller's DataFrame is left untouched
    labeled = dataframe.copy()
    labeled['Cluster'] = kmeans.labels_

    clusters = [labeled.query('Cluster == @lbl') for lbl in range(n_clusters)]
    return clusters, kmeans.cluster_centers_
    
#Run k-means with 7 clusters on the sorted target-demographic coordinates
k_clusters, k_centers = k_cluster(tgt_dem_sorted[['Lat', 'Lon']], n_clusters=7)
df_k_centers = pd.DataFrame(data=k_centers) #Centers as a DataFrame so plot_clusters can index them by position

#Plot clusters with centers
plot_clusters(cluster_list=k_clusters, alpha=0.4, centers=df_k_centers)

In [None]:
#Plot only clusters with 200 or more rows
#Filter directly on row count. The earlier version marked small clusters
#with -1, which is also the DBSCAN noise label, so the noise group always
#passed the filter regardless of its size. Noise is excluded explicitly here.
big_clusters = [
    cluster for cluster in all_clusters
    if len(cluster) >= 200 and cluster.Cluster.max() != -1
]

#Labels of the clusters that were kept
larger_200_idx = [cluster.Cluster.max() for cluster in big_clusters]

plot_clusters(big_clusters, together=True, alpha=0.4)