# Distance and clustering - Oury - 7 Mar 2016 

### Load required libraries

In [67]:
import pandas as pd
import numpy  as np
import sklearn.neighbors as sn

### Load the saved dataframe

In [2]:
save_load_path = '/Users/David/Desktop'
mss_df = pd.read_pickle(save_load_path+'/mss_df.pkl')
type(mss_df)

pandas.core.frame.DataFrame

### Save numeric variables of interest in `mss_num_df`

In [3]:
num_rows = 10000
mss_num_df = pd.concat([mss_df.loc[:num_rows, 'sp_0':'sp_35'],
                        mss_df.loc[:num_rows, 'st_0':'st_35'],
                        mss_df.loc[:num_rows, 'k_0' :'k_11'],
                        mss_df.loc[:num_rows, 'ts_0':'ts_7']
                       ],
                       axis=1
                       )

### All variabls are normalized except `st_0` through `st_35`

In [4]:
mss_num_df.loc[0:3,'st_0':'st_35'].head()

Unnamed: 0,st_0,st_1,st_10,st_11,st_12,st_13,st_14,st_15,st_16,st_17,...,st_27,st_28,st_29,st_3,st_30,st_31,st_32,st_33,st_34,st_35
0,0.0,171.13,-10.64,-7.228,19.991,-143.504,-118.249,-142.909,-18.528,-4.209,...,-89.765,29.646,-45.432,-28.48,15.733,29.094,-6.805,9.46,-15.33,-21.079
1,0.0,171.13,-10.64,-7.228,35.141,-30.807,35.192,-75.606,-0.584,195.091,...,59.208,-17.624,28.703,-28.48,14.13,-0.71,34.62,-23.91,23.453,-5.048
2,0.0,171.124,-10.643,-7.226,23.09,-10.712,114.286,-390.247,15.592,46.796,...,-17.441,-47.459,-19.073,-28.489,3.268,9.741,16.689,-12.663,11.562,4.562
3,24.937,37.465,-37.411,-14.199,43.154,-43.433,73.812,-5.868,-52.972,57.5,...,-17.005,-37.423,47.573,-216.443,-0.734,25.383,-10.965,-44.947,10.023,-40.109


### Normalize all columns 

This is easier than specifying columns.

In [5]:
from sklearn import preprocessing
mss_num_df = mss_num_df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

# Distances

See [sklearn.neighbors.DistanceMetric](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html)

### The `unit_square` matrix has row coordinates that are corners of the unit square

We will use this matrix to demonstrate distance metrics. 

In [8]:
unit_cube = np.array([[0,0],[0,1],[1,0],[1,1]])
unit_cube

array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]])

Think of the rows of this matrix as a dataframe. We will determine the distances between rows using three difference distance metrics.

### Get `get_metric` function returns a distance metric/function

The Minkowski metric is determined by a parameter `p`. 

- `p=1` specifies the _Manhattan_ metric
- `p=2` specifies the Euclidean metric

See [Wikipedia](https://en.wikipedia.org/wiki/Minkowski_distance) 
for details on Minkowski distance.

### Print the _manhattan_ distances between rows of the `unit_cube`

In [9]:
dm_m1 = sn.DistanceMetric.get_metric('minkowski',p=1)
dm_m1.pairwise(unit_cube)

array([[ 0.,  1.,  1.,  2.],
       [ 1.,  0.,  2.,  1.],
       [ 1.,  2.,  0.,  1.],
       [ 2.,  1.,  1.,  0.]])

### Print the _Euclidean_ distances between rows of the `unit_cube`

In [10]:
dm_m2 = sn.DistanceMetric.get_metric('minkowski',p=2)
dm_m2.pairwise(unit_cube)

array([[ 0.        ,  1.        ,  1.        ,  1.41421356],
       [ 1.        ,  0.        ,  1.41421356,  1.        ],
       [ 1.        ,  1.41421356,  0.        ,  1.        ],
       [ 1.41421356,  1.        ,  1.        ,  0.        ]])

### Print the distances between rows of `unit_cube` for `p=1000`

In [50]:
dm_m = sn.DistanceMetric.get_metric('minkowski',p=10)
dm_m.pairwise(unit_cube)

array([[ 0.        ,  1.        ,  1.        ,  1.07177346],
       [ 1.        ,  0.        ,  1.07177346,  1.        ],
       [ 1.        ,  1.07177346,  0.        ,  1.        ],
       [ 1.07177346,  1.        ,  1.        ,  0.        ]])

Notice that the distance between `[0, 0]` and `[1, 1]` is close to `1`.

This might be useful because of the large number of columns `mss_num_df`.

### Get the Euclidean distance matrix between (each pair of) rows of `mss_num_df`

In [12]:
mss_num_dm = dm_m2.pairwise(mss_num_df[:])
mss_num_dm.shape

(10000, 10000)

In [52]:
mss_num_dm[0:3,0:9]

array([[ 0.        ,  2.89504329,  3.28170421,  3.08457942,  3.24504701,
         3.53971753,  3.17792097,  3.61201771,  3.45673843],
       [ 2.89504329,  0.        ,  3.44391134,  2.82890932,  3.15307406,
         3.7695063 ,  3.41919624,  3.4025642 ,  3.34139045],
       [ 3.28170421,  3.44391134,  0.        ,  3.21658984,  3.26525752,
         3.34217103,  2.85254281,  3.32292189,  3.38084161]])

In [54]:
mss_num_df[0:3]

Unnamed: 0,sp_0,sp_1,sp_10,sp_11,sp_12,sp_13,sp_14,sp_15,sp_16,sp_17,...,k_9,k_10,k_11,ts_0,ts_1,ts_3,ts_4,ts_5,ts_7,cluster
0,0.945946,0.683684,1.0,0.742,0.01,0.054,0.015,0.021,0.067,0.17,...,0,0,0,0,0,0,1,0,0,-1
1,1.0,1.0,1.0,1.0,0.018,0.07,0.04,0.044,0.217,0.074,...,0,0,0,0,0,0,1,0,0,0
2,1.0,0.910911,0.095095,0.147,0.489,1.0,0.561,0.258,0.153,0.096,...,0,0,0,0,1,0,0,0,0,-1


### Get the Euclidean distance from row `2` to every row of `mss_num_df`

In [336]:
row_num = 2
dm_m2.pairwise(mss_num_df[:], 
               mss_num_df[row_num:(row_num+1)])

array([[ 3.28170421],
       [ 3.44391134],
       [ 0.        ],
       ..., 
       [ 2.99274151],
       [ 3.59795052],
       [ 2.97094947]])

# Clustering

Below are simple examples for clustering with `KMeans`, `DBSCAN` and `Birch`.

### K-means

See

- [sklearn.cluster.KMeans](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)
- [K-means assumptions, with graphs](http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#example-cluster-plot-kmeans-assumptions-py)

In [55]:
from sklearn.cluster import KMeans

In [58]:
km = KMeans(n_clusters=10)
km.fit(mss_num_df)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

### Cluster the datapoints of `mss_num_df`

In [60]:
km.labels_

array([0, 0, 0, ..., 7, 7, 8], dtype=int32)

In [61]:
km.cluster_centers_.shape, mss_num_df.shape

((10, 79), (10000, 79))

### `AgglomerativeClustering.fit` doesn't finish within 15 minutes

so it is not worth using.

In [62]:
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=2)
# ac.fit(mss_num_df)

### DBSCAN 

See [sklearn.cluster.DBSCAN](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html)

- `eps`
- `min_samples`
- `metric` - string or callable, "precomputed" implies that `X` 
    (input to `fit` command) is a distance matrix

In [63]:
from sklearn.cluster import DBSCAN

In [64]:
db = DBSCAN(eps=1.4, min_samples=4)
db

DBSCAN(algorithm='auto', eps=1.4, leaf_size=30, metric='euclidean',
    min_samples=4, p=None, random_state=None)

### Create DBSCAN clusters

The following code is timed and takes approximately 60 seconds to complete.

In [28]:
db.fit(mss_num_df[:])

DBSCAN(algorithm='auto', eps=1.4, leaf_size=30, metric='euclidean',
    min_samples=4, p=None, random_state=None)

In [31]:
len(set(db.labels_))

133

Cluster `-1` is noise, and so is not a real cluster.

In [33]:
unique, counts = np.unique(db.labels_, 
                           return_counts=True)
print(np.asarray((unique, counts)).T)

[[  -1 7783]
 [   0    5]
 [   1  203]
 [   2    4]
 [   3  195]
 [   4   13]
 [   5    7]
 [   6   16]
 [   7   18]
 [   8  209]
 [   9   18]
 [  10  176]
 [  11   28]
 [  12   14]
 [  13   20]
 [  14    6]
 [  15   27]
 [  16  103]
 [  17    8]
 [  18  125]
 [  19   13]
 [  20    6]
 [  21   32]
 [  22    4]
 [  23    4]
 [  24   54]
 [  25    4]
 [  26   24]
 [  27   10]
 [  28    8]
 [  29   29]
 [  30   43]
 [  31    5]
 [  32   10]
 [  33    4]
 [  34    9]
 [  35   31]
 [  36    6]
 [  37   20]
 [  38   58]
 [  39   37]
 [  40   11]
 [  41    5]
 [  42    8]
 [  43    4]
 [  44   15]
 [  45    4]
 [  46   27]
 [  47    4]
 [  48    4]
 [  49    4]
 [  50    5]
 [  51    6]
 [  52    4]
 [  53    9]
 [  54   19]
 [  55    8]
 [  56   14]
 [  57    6]
 [  58    5]
 [  59   15]
 [  60   29]
 [  61    9]
 [  62   16]
 [  63   13]
 [  64    3]
 [  65    4]
 [  66    4]
 [  67    9]
 [  68    4]
 [  69    5]
 [  70    4]
 [  71    5]
 [  72    6]
 [  73    4]
 [  74   11]
 [  75    6]

### Record the cluster designations in variable `cluster` of `mss_num_df`

In [34]:
mss_num_df['cluster'] = db.labels_

In [35]:
mss_num_df.loc[0:4,'cluster']

0   -1
1    0
2   -1
3    1
4   -1
Name: cluster, dtype: int64

In [36]:
cluster_number = 17
mss_num_df.loc[mss_num_df['cluster']==cluster_number]

Unnamed: 0,sp_0,sp_1,sp_10,sp_11,sp_12,sp_13,sp_14,sp_15,sp_16,sp_17,...,k_9,k_10,k_11,ts_0,ts_1,ts_3,ts_4,ts_5,ts_7,cluster
268,0.75976,0.86987,0.610611,0.815,0.957,1.0,0.83,0.764,0.687,0.815,...,0,0,0,0,0,0,1,0,0,17
397,0.961962,1.0,0.273273,0.556,1.0,0.766,0.732,0.732,0.85,0.85,...,0,0,0,0,0,0,1,0,0,17
435,0.436436,0.64965,1.0,0.713,0.502,0.502,0.598,0.726,0.974,0.529,...,0,0,0,0,0,0,1,0,0,17
603,0.718719,1.0,0.997998,0.909,0.981,0.733,0.838,1.0,0.765,0.641,...,0,0,0,0,0,0,1,0,0,17
819,0.563564,0.598599,1.0,0.514,0.706,0.757,0.622,0.75,0.69,0.712,...,0,0,0,0,0,0,1,0,0,17
2864,1.0,1.0,0.552553,0.681,1.0,0.981,0.99,0.793,0.805,0.788,...,0,0,0,0,0,0,1,0,0,17
8974,1.0,0.997998,0.192192,0.455,1.0,0.644,0.461,0.516,0.445,0.54,...,0,0,0,0,0,0,1,0,0,17
9774,0.983984,1.0,0.660661,0.204,0.568,0.281,0.417,0.375,0.599,1.0,...,0,0,0,0,0,0,1,0,0,17


In [37]:
print('artist:',mss_df.ix[268,'artist_name'])
print('release:',mss_df.ix[268,'release'])
print('title:',mss_df.ix[268,'title'])

artist: b'The Nightraver & The Magican'
release: b'Ultimate Happy Hardcore 3'
title: b'Drop the Bass'


### Birch

See 

- [Wikipedia](https://en.wikipedia.org/wiki/BIRCH)
- [sklearn.cluster.Birch](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html)

In [39]:
from sklearn.cluster import Birch

### Parameters

- `threshold` - _The radius of the subcluster obtained by merging a new sample and the closest subcluster should be lesser than the threshold. Otherwise a new subcluster is started._ Smaller values increase the time required to create clusters.
- `branching_factor` - _Maximum number of CF subclusters in each node._

In [68]:
bi = Birch(n_clusters=20, branching_factor=50, threshold=1.5)
bi

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=20,
   threshold=1.5)

### Create Birch the clusters

The function`bi.fit` takes approximately

- 120 seconds with `threshold=1.0`
- 12 seconds with `threshold=1.5`

In [69]:
import time
t0 = time.time() 
bi.fit(mss_num_df[:])
print('time:',time.time() - t0)
bi

time: 15.956559896469116


Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=20,
   threshold=1.5)

In [47]:
unique, counts = np.unique(bi.labels_, 
                           return_counts=True)
print(np.asarray((unique, counts)).T)

[[   0   66]
 [   1 1081]
 [   2  708]
 [   3 3533]
 [   4  150]
 [   5 1223]
 [   6  642]
 [   7   57]
 [   8  169]
 [   9  560]
 [  10  102]
 [  11   73]
 [  12   67]
 [  13   35]
 [  14 1098]
 [  15  152]
 [  16  145]
 [  17   19]
 [  18   47]
 [  19   73]]


# Nearest neighbors

See

- [Nearest Neighbors Classification](http://scikit-learn.org/stable/modules/neighbors.html#classification)
- [sklearn.neighbors](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors)
- [sklearn.neighbors.KNeighborsClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier)
- [sklearn.neighbors.RadiusNeighborsClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html#sklearn.neighbors.RadiusNeighborsClassifier)

In [66]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
kn = KNeighborsClassifier(n_neighbors=3)

### To make predictions we need a categorical variable. 

### EOF