### K-means clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors as knn
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import ConvexHull

%matplotlib inline

# Load the overdue streets-and-alleys backlog and keep only the geocoded
# "Dirty Alley" service requests.
file=pd.read_excel(r"C:\Users\sara.brumfield\Documents\DPW Model\Backlog\Overdue Streets and Alleys.xlsx", sheet_name="Sheet1")
#file["Location"]=file[['Longitude', 'Latitude']].apply(tuple, axis=1)

# Both coordinates are fed to the clustering cells below, so a row missing
# either one cannot be used.
no_null = file.dropna(subset=["Latitude", "Longitude"], axis=0)

# na=False treats a blank "SR Type" as "not a match" instead of producing an
# NA entry in the boolean mask. The .copy() makes df an independent frame so
# that adding the "Days Overdue" column below does not write into a slice of
# `file`.
is_dirty_alley = no_null["SR Type"].str.contains("Dirty Alley", na=False)
df = no_null.loc[is_dirty_alley].copy()

# Age of each request in whole days, measured from its "Created Date" to now.
today=datetime.datetime.now()
df["Days Overdue"]=abs((df["Created Date"]-today).dt.days).astype(int)
df

In [None]:
# Working frame for clustering: request id plus its coordinates. A later cell
# adds a "cluster_label" column to it, so take an independent copy of df's data.
x = df.loc[:, ['Service Request Number', 'Latitude', 'Longitude']].copy()

# Elbow analysis: fit one KMeans model per candidate cluster count and record
# its score on the data it was fitted to.
K_clusters = range(5,20)
kmeans = [KMeans(n_clusters=i) for i in K_clusters]
Y_axis = df[['Latitude']]
X_axis = df[['Longitude']]

# Score on BOTH coordinates. The final model in the next cell clusters on
# (Latitude, Longitude), so an elbow curve computed from Latitude alone would
# describe a different problem.
elbow_features = df[['Latitude', 'Longitude']]
score = [model.fit(elbow_features).score(elbow_features) for model in kmeans]

# Visualize
plt.plot(K_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
# Fit the final 9-cluster model on the request coordinates and attach each
# request's cluster id to the working frame.
kmeans = KMeans(n_clusters = 9, init ='k-means++')
features = x[['Latitude', 'Longitude']]

# fit_predict fits the model once and returns the label of every row; a
# separate fit() beforehand would only be discarded by the refit.
x['cluster_label'] = kmeans.fit_predict(features)
centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
labels = kmeans.labels_ # Labels of each point, as assigned by the fit above.
x.head(5)

In [None]:
def encircle(x,y, ax=None, **kw):
    """Draw the convex hull of a set of 2-D points as a polygon patch.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points, one entry per point.
    ax : optional
        Axes to add the patch to; defaults to the current Axes (``plt.gca()``).
    **kw
        Forwarded unchanged to ``plt.Polygon`` (for example ``fc``, ``alpha``).

    Notes
    -----
    Groups with fewer than three points have no polygonal hull, so nothing is
    drawn for them. Three or more points that are degenerate (for example all
    on one line) are still passed to ``ConvexHull``, which raises for them.
    """
    p = np.c_[x,y]
    if len(p) < 3:
        return
    # Compare against None explicitly rather than relying on the truthiness
    # of whatever Axes-like object the caller passed in.
    if ax is None:
        ax = plt.gca()
    hull = ConvexHull(p)
    # hull.vertices indexes the input points that form the hull outline.
    poly = plt.Polygon(p[hull.vertices,:], **kw)
    ax.add_patch(poly)
    
# Scatter every request, coloured by cluster and sized by how long it has been
# open, then shade the convex hull of each cluster.
fig, ax = plt.subplots(figsize=(15,15))
x.plot.scatter(x='Latitude', y='Longitude', c=labels, s=df['Days Overdue']*3, cmap='Spectral', ax=ax)
#ax=plt.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.75)

# One hull per cluster. Iterating over the "cluster_label" column itself would
# visit every row, redrawing each cluster's translucent patch once per member
# point.
for _, members in x.groupby("cluster_label"):
    encircle(members["Latitude"], members["Longitude"], ax=ax, fc='gray', alpha=0.1)

In [None]:
# Join the cluster assignments in x back onto the full request records, matching
# rows by their service request number, and show the result ordered by cluster.
request_key = 'Service Request Number'
clustered_data = df.merge(x, on=request_key)
clustered_data.sort_values(by="cluster_label")

### DBScan Sandbox

In [None]:
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

def get_centermost_point(cluster):
    """Return the member of *cluster* that lies closest to the cluster centroid.

    The centroid is taken from a shapely ``MultiPoint`` built over the cluster,
    and closeness is the geopy ``great_circle`` distance in metres. The winning
    point is returned as a tuple.

    NOTE(review): geopy reads coordinate pairs as (latitude, longitude);
    confirm that callers pass points in that order.
    """
    center = MultiPoint(cluster).centroid
    centroid = (center.x, center.y)

    def distance_to_centroid(point):
        return great_circle(point, centroid).m

    closest = min(cluster, key=distance_to_centroid)
    return tuple(closest)

# Cluster the requests by great-circle distance with DBSCAN and reduce every
# cluster to its single centermost request.
#
# NOTE(review): this cell originally read its coordinates from `test`, which is
# not defined anywhere in the visible notebook. `df` is used instead because the
# rest of the cell looks the representative points up in `df` and plots `df` as
# the full data set. Confirm that this is the intended input.
#
# scikit-learn's haversine metric expects each row as [latitude, longitude] in
# radians, so latitude comes first. A plain array (not a DataFrame) is used so
# that each cluster is a sequence of coordinate pairs.
coords = df[['Latitude', 'Longitude']].to_numpy()

kms_per_radian = 6371.
epsilon_km = .5
db = DBSCAN(eps=epsilon_km/kms_per_radian, min_samples=15, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_

# DBSCAN labels noise points -1. They do not form a cluster, so they are left
# out of both the count and the per-cluster point sets.
cluster_ids = sorted(set(cluster_labels) - {-1})
num_clusters = len(cluster_ids)
clusters = [coords[cluster_labels == n] for n in cluster_ids]
print('Number of clusters: {}'.format(num_clusters))

# Each centermost point is a (latitude, longitude) tuple taken from coords.
centermost_points = [get_centermost_point(cluster) for cluster in clusters]
lats = [point[0] for point in centermost_points]
lons = [point[1] for point in centermost_points]
rep_points = pd.DataFrame({'lon':lons, 'lat':lats})

# Recover the full request record behind every representative point. The
# coordinates were copied from df unchanged, so exact equality is a valid match.
if rep_points.empty:
    rs = df.iloc[0:0]
else:
    rs = rep_points.apply(lambda row: df[(df['Latitude']==row['lat']) & (df['Longitude']==row['lon'])].iloc[0], axis=1)

fig, ax = plt.subplots(figsize=[10, 6])
rs_scatter = ax.scatter(rs['Longitude'], rs['Latitude'], c='#99cc99', edgecolor='None', alpha=0.7, s=120)
df_scatter = ax.scatter(df['Longitude'], df['Latitude'], c='k', alpha=0.9, s=3)
ax.set_title('Full data set vs DBSCAN reduced set')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend([df_scatter, rs_scatter], ['Full set', 'Reduced set'], loc='upper right')
plt.show()