In [None]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn import svm

%matplotlib inline

In [None]:
# Load a random sample of 10,000,000 lines from the source data file.
# `shuf -n` writes that many randomly chosen lines to a temporary file, pandas
# parses it (space-separated, no header row) into `data`, and the temporary
# file is then deleted.
# NOTE(review): shuf is not seeded, so every run draws a different sample, and
# both paths are absolute paths on one machine — confirm before re-running
# elsewhere.
!shuf -n 10000000 /home/divgrad/data/4-Vadeboncoeur/davis-bay.txt > /home/divgrad/bcdata-project-temp/sample-temp.txt
data = pd.read_csv('/home/divgrad/bcdata-project-temp/sample-temp.txt',sep=" ", header=None)
!rm /home/divgrad/bcdata-project-temp/sample-temp.txt

In [None]:
# Label the nine columns of the sample: position (lon, lat, z), colour
# (r, g, b), then three trailing columns that all share the placeholder
# label '?'.
# NOTE(review): the meaning of the last three columns is not established
# anywhere in this notebook — confirm against the data source.
named_cols = ['lon', 'lat', 'z', 'r', 'g', 'b']
data.columns = named_cols + ['?'] * 3

In [None]:
# Pull plain NumPy arrays out of the frame: the colour channels on their own,
# the colour channels preceded by height, and the two position coordinates.
colour_cols = ['r', 'g', 'b']
rgb = data[colour_cols].values
zrgb = data[['z'] + colour_cols].values
lon = data['lon'].values
lat = data['lat'].values

# Mini-Batch K-Means Clustering in z-RGB Space

In [None]:
# Cluster every loaded point in (z, r, g, b) feature space with mini-batch
# k-means.
batch_size = 50000
num_clusters = 20

# random_state pins the k-means++ initialisation and the mini-batch draws, so
# a given cluster id refers to the same cluster on every run; the cells below
# select clusters by id.
# NOTE(review): z is clustered unscaled next to the three colour channels —
# confirm that is intended (StandardScaler is imported at the top of the
# notebook but not applied here).
mbk_zrgb = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters, batch_size=batch_size,
                           n_init=10, max_no_improvement=10, verbose=0, random_state=0)

mbk_zrgb.fit(zrgb)

In [None]:
# Cluster assignments produced by the z-RGB fit. The sampling cell below
# indexes this array with the same indices it uses for lon, lat and rgb, so it
# is treated as one label per loaded point, in load order.
# The name mbk_labels is rebound later by the RGB-only section.
mbk_labels = mbk_zrgb.labels_

In [None]:
# Draw a random subset of the loaded points for plotting.
# A dedicated, seeded generator makes the subset identical on every run
# without touching NumPy's global random state. The subset size is capped at
# the number of points available, because sampling without replacement
# raises when asked for more items than exist.
n_smpl = min(150000, len(lon))
smpl_idx = np.random.RandomState(0).choice(len(lon), n_smpl, replace=False)
smpl_lon, smpl_lat, smpl_rgb  = lon[smpl_idx], lat[smpl_idx], rgb[smpl_idx]
smpl_labels = mbk_labels[smpl_idx]

In [None]:
# Colours divided by 255 for use as Matplotlib colour values, plus the raw and
# rescaled colours of the plotting subset.
rgb_01 = rgb / 255
smpl_rgb, smpl_rgb_01 = rgb[smpl_idx], rgb_01[smpl_idx]

In [None]:
# Map view of the sampled points, each drawn in its own colour.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(smpl_lon, smpl_lat, c=smpl_rgb_01, s=3, lw=0)
ax.axis('scaled')
plt.show()

In [None]:
# Map view of the sampled points, coloured by cluster label.
plt.figure(figsize=(15,15))
# alpha is an opacity and must lie in [0, 1]; 0.75 draws markers 75 % opaque.
plt.scatter(smpl_lon,smpl_lat,c=smpl_labels,s=1,alpha=0.75)
plt.axis('scaled')
plt.show()

In [None]:
# One map per cluster for clusters 0-4: only the sampled points assigned to
# that cluster are drawn, each in its own colour. num_clusters is larger than
# five in this section, so only the first five clusters are shown.
for curr_label in range(5):
    # Membership mask, computed once and reused for position and colour.
    in_cluster = smpl_labels == curr_label

    plt.figure(figsize=(15,15))
    plt.scatter(smpl_lon[in_cluster], smpl_lat[in_cluster], c=smpl_rgb_01[in_cluster, :], s=3, lw=0)
    plt.axis('scaled')
    # Render each figure explicitly so the cell does not echo the axis limits.
    plt.show()

In [None]:
# Per-cluster colour histograms of the sampled points: one row of panels per
# cluster, one column per channel (red, green, blue), each normalised to unit
# area.
plt.figure(figsize=(15,9))

for curr_label in range(num_clusters):
    cluster_rgb = smpl_rgb[smpl_labels==curr_label]
    for channel, colour in enumerate('rgb'):
        # Panels are numbered row by row, starting at 1.
        plt.subplot(num_clusters, 3, 3*curr_label + channel + 1)
        # density=True replaces the removed normed=True keyword.
        plt.hist(cluster_rgb[:, channel], bins=255, color=colour, lw=0, density=True)
        plt.xlim((0,255))

plt.show()

# Mini-Batch K-Means Clustering in RGB Space

In [None]:
# Cluster every loaded point in (r, g, b) colour space only, with mini-batch
# k-means. This rebinds batch_size and num_clusters for the rest of the
# notebook.
batch_size = 50000
num_clusters = 5

# random_state pins the k-means++ initialisation and the mini-batch draws, so
# a given cluster id refers to the same cluster on every run; the cells below
# select clusters by id.
mbk_rgb = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters, batch_size=batch_size,
                          n_init=10, max_no_improvement=10, verbose=0, random_state=0)

mbk_rgb.fit(rgb)

In [None]:
# Cluster assignments produced by the RGB-only fit. This rebinds mbk_labels,
# replacing the z-RGB assignments stored under the same name earlier, so the
# cells below this point refer to the RGB clustering.
mbk_labels = mbk_rgb.labels_

In [None]:
# Draw a random subset of the loaded points for plotting the RGB clustering.
# A dedicated, seeded generator makes the subset identical on every run
# without touching NumPy's global random state. The subset size is capped at
# the number of points available, because sampling without replacement
# raises when asked for more items than exist.
n_smpl = min(150000, len(lon))
smpl_idx = np.random.RandomState(0).choice(len(lon), n_smpl, replace=False)
smpl_lon, smpl_lat, smpl_rgb  = lon[smpl_idx], lat[smpl_idx], rgb[smpl_idx]
smpl_labels = mbk_labels[smpl_idx]

In [None]:
# Colours divided by 255 for use as Matplotlib colour values, then the raw and
# rescaled colours restricted to the current plotting subset.
rgb_01 = rgb / 255
smpl_rgb, smpl_rgb_01 = rgb[smpl_idx], rgb_01[smpl_idx]

In [None]:
# Reference map: the sampled points drawn in their own colours.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(smpl_lon, smpl_lat, c=smpl_rgb_01, s=3, lw=0)
ax.axis('scaled')
plt.show()

In [None]:
# Map view of the sampled points, coloured by RGB-space cluster label.
plt.figure(figsize=(15,15))
# alpha is an opacity and must lie in [0, 1]; 0.75 draws markers 75 % opaque.
plt.scatter(smpl_lon,smpl_lat,c=smpl_labels,s=1,alpha=0.75)
plt.axis('scaled')
plt.show()

In [None]:
# One map per cluster for clusters 0-4 (all five clusters of the RGB-space
# fit): only the sampled points assigned to that cluster are drawn, each in
# its own colour.
for curr_label in range(5):
    # Membership mask, computed once and reused for position and colour.
    in_cluster = smpl_labels == curr_label

    plt.figure(figsize=(15,15))
    plt.scatter(smpl_lon[in_cluster], smpl_lat[in_cluster], c=smpl_rgb_01[in_cluster, :], s=3, lw=0)
    plt.axis('scaled')
    # Render each figure explicitly so the cell does not echo the axis limits.
    plt.show()

In [None]:
# Per-cluster colour histograms of the sampled points: one row of panels per
# cluster, one column per channel (red, green, blue), each normalised to unit
# area.
plt.figure(figsize=(15,9))

for curr_label in range(num_clusters):
    cluster_rgb = smpl_rgb[smpl_labels==curr_label]
    for channel, colour in enumerate('rgb'):
        # Panels are numbered row by row, starting at 1.
        plt.subplot(num_clusters, 3, 3*curr_label + channel + 1)
        # density=True replaces the removed normed=True keyword.
        plt.hist(cluster_rgb[:, channel], bins=255, color=colour, lw=0, density=True)
        plt.xlim((0,255))

plt.show()

In [None]:
def hist3d(arr, **kwargs):

    """
    hist3d(arr, **kwargs) plots a 3D histogram of point cloud data.

    The points are counted on a regular grid spanning the range of the data.
    Every bin whose count reaches the threshold is drawn as one marker at the
    bin centre, coloured by its count. A new pyplot figure is created for
    the plot; nothing is returned.

    Input
    arr : the input array of which a histogram will be plotted. arr
          should be an N-by-3 array representing values in 3-space.
    nbins : the number of bins along each axis (default: 50)
    th, threshold : the threshold below which a bin will not be represented
         in the final histogram. Either keyword is accepted; `threshold`
         takes precedence if both are given. (default: .01)
    figsize : the size of the output figure (default: (10, 8))
    elev : the elevation angle of the view (default : 45)
    azim : the azimuthal angle of the view (default: 30)
    cmap : the colour map used to colour markers by bin count
           (default: viridis)
    s : the size of the points in the histogram (default: 3)

    Raises
    ValueError : if arr is not a two-dimensional array with three columns.
    """

    # Imported for its side effect: on Matplotlib versions that do not do so
    # automatically, this registers the '3d' projection used below.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    nbins = kwargs.get('nbins', 50)
    # 'threshold' is the key this function has always read; 'th' is the name
    # its documentation has always advertised. Honour both.
    th = kwargs.get('threshold', kwargs.get('th', .01))
    figsize = kwargs.get('figsize', (10,8))
    elev = kwargs.get('elev', 45)
    azim = kwargs.get('azim', 30)
    cmap = kwargs.get('cmap', 'viridis')
    s = kwargs.get('s', 3)

    points = np.asarray(arr)
    if points.ndim != 2 or points.shape[1] != 3:
        raise ValueError('arr must be an N-by-3 array, got shape %s' % (points.shape,))

    H, edges = np.histogramdd(points, bins=nbins)
    # Bin centres along each axis: midpoints of consecutive bin edges.
    centres = [.5 * (e[1:] + e[:-1]) for e in edges]

    # Indices of the bins to draw. np.nonzero walks the grid in the same
    # row-major order as three nested loops over (j, k, l).
    j, k, l = np.nonzero(H >= th)
    x = centres[0][j]
    y = centres[1][k]
    z = centres[2][l]
    c = H[j, k, l]

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev, azim)
    # One marker per drawn bin: c has exactly one count per (x, y, z) triple,
    # which is what scatter requires of its colour argument.
    ax.scatter(x, y, z, zdir='z', c=c, cmap=cmap, s=s, lw=0)
    return


In [None]:
# 3-D colour histogram of the sampled points assigned to cluster 2.
curr_label = 2
cluster_rgb = smpl_rgb[smpl_labels == curr_label]
hist3d(cluster_rgb)