In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from datascience import *

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# K-means and NBA Data, Part II

This dataset contains player statistics for the 2016 NBA season. Try k-means clustering on *subsets* of the columns in the dataset. Start with trying to answer the same questions as above. If you'd like, see if you can interpret the clusters when you increase the number of clusters.

**Warning**: Don't use scatter_matrix with more than ~5 columns -- it requires a lot of memory to plot.

The columns of this dataset are:

```
Rk        Player-id
Player    Player Name
Pos       Position
Age       Player Age
Tm        Team
G         Number of Games played
GS        Number of Games started
MP        Minutes played
FG        Field Goals
FGA       Field Goals Attempted
3P        Three-point shots made
3PA       Three-point shots attempted
2P        Two-point shots made
2PA       Two-point shots attempted
FT        Free Throws made
FTA       Free Throws attempted
ORB       Offensive rebounds
DRB       Defensive rebounds
TRB       Total rebounds
AST       Number of assists
STL       Number of steals
BLK       Number of blocks
TOV       Number of turnovers
PF        Number of personal fouls
PTS       Total number of points
```

In [None]:
# Load the 2016 NBA player statistics CSV into a datascience Table.
nba = Table.read_table('data/nba2016.csv')
# Leaving `nba` as the last expression renders the table as the cell output.
nba

## Naive clustering

Try clustering without any scaling.

### A data-driven approach to answering "how many clusters are there?"
* Try kmeans clustering with 1,2,3,...,9 clusters.
* For each clustering attempt, measure how well it's clustered.
* Use the smallest number of clusters beyond which the clustering quality stops improving much.

In [None]:
def cluster_scores(data_arr, max_clusters=9, random_state=None):
    '''
    Score k-means clusterings of `data_arr` for k = 1, 2, ..., max_clusters.

    Parameters
    ----------
    data_arr : numpy ndarray
        The data to cluster; passed unchanged to KMeans.fit and KMeans.score.
    max_clusters : int, optional
        Largest number of clusters to try. Defaults to 9, which reproduces
        the original behaviour of trying 1,2,..,9 clusters.
    random_state : int, RandomState or None, optional
        Forwarded to KMeans. The default None leaves KMeans randomly
        initialised (results vary between runs); pass an int for
        reproducible scores.

    Returns
    -------
    list
        One score per cluster count, in increasing order of k. Each score is
        KMeans.score evaluated on the same data the model was fit on, i.e.
        the negated k-means objective, so values closer to zero indicate
        tighter clusters.

    Raises
    ------
    ValueError
        If max_clusters is smaller than 1.
    '''
    if max_clusters < 1:
        raise ValueError('max_clusters must be at least 1')

    scores = []
    for k in range(1, max_clusters + 1):
        model = KMeans(n_clusters=k, random_state=random_state)
        # Fit and score on the same data: this measures within-sample fit only.
        scores.append(model.fit(data_arr).score(data_arr))
    return scores

In [None]:
# Drop the identifier / categorical columns (player id, name, position, team)
# so that only the statistic columns are used for clustering.
clustering_features = nba.drop('Rk', 'Player', 'Pos', 'Tm')

# Score k-means on the raw, unscaled features for each cluster count that
# cluster_scores tries.
scores = cluster_scores(clustering_features.values)

### Plot the quality of the different clustering attempts
* Look for the "elbow" in the plot, where increasing the number of clusters no longer gives you noticeably better clusters.

In [None]:
# Elbow plot: one score per cluster count, in the order cluster_scores tried
# them. The x-axis is derived from len(scores) so it cannot drift out of sync
# with the number of scores actually computed.
plt.plot(np.arange(1, len(scores) + 1), scores)
plt.xlabel('Number of clusters')
plt.ylabel('k-means score (higher is better)')
plt.title('Clustering quality on unscaled features');

First, let's try using 2 clusters:

In [None]:
# the attribute .values accesses the underlying numpy
# array of a Table, which sklearn requires.
train = nba.drop('Rk', 'Player', 'Pos', 'Tm').values

# k-means starts from random centroids; fixing random_state makes the cluster
# assignments (and therefore the interpretation in the following cells)
# reproducible across kernel restarts.
kmeans2 = KMeans(n_clusters=2, random_state=0)
labels2 = kmeans2.fit_predict(train)
out2 = nba.with_column('label', labels2) # add labels to our input table

What do the clusters look like?
* Group by label and look at the sizes of the clusters
* Group by label and look at the average values of the clusters

In [None]:
# Size of each cluster: number of players per label.
out2.group('label')

In [None]:
# Per-cluster averages: np.mean is applied to each column within each label group.
out2.group('label', np.mean)

The main differentiators seem to come from minutes played:
* Most other statistics are correlated with minutes played! So MP explains the clusters well by itself!

In [None]:
# Histogram of minutes played, split by cluster label; `unit` names the
# unit of the plotted column for the axis labelling.
out2.hist('MP', group='label', unit='min')

Now, let's try 3 clusters!

In [None]:
# Same feature matrix as for the 2-cluster run: every column except the
# identifier / categorical ones, as a numpy array for sklearn.
train = nba.drop('Rk', 'Player', 'Pos', 'Tm').values

# Fix random_state so the three clusters discussed in the following cells are
# the same every time the notebook is re-run.
kmeans3 = KMeans(n_clusters=3, random_state=0)
labels3 = kmeans3.fit_predict(train)
out3 = nba.with_column('label', labels3) # add labels to our input table

What do the clusters represent?

In [None]:
# Size of each of the three clusters: number of players per label.
out3.group('label')

In [None]:
# Per-cluster averages: np.mean is applied to each column within each label group.
out3.group('label', np.mean)

The main differentiator still seems to be minutes played (`MP`), which isn't a surprise.
* The three groups consist of "small/medium/large" number of minutes played.
* There's a better description, though. The three clusters can be described as:
    - Starters
    - Second Unit
    - Players with inconsistent playing time

In [None]:
# Histogram of minutes played (MP), split by cluster label.
out3.hist('MP', group='label')

In [None]:
# Histogram of games played (G), split by cluster label.
out3.hist('G', group='label')

In [None]:
# Histogram of games started (GS), split by cluster label.
out3.hist('GS', group='label')

### Can we try to recover player positions from clusters?

In [None]:
# Number of players listed at each position, most common position first.
nba.group('Pos').sort('count', descending=True)

### Strategy for clustering players into positions played:
* Make the dataset reflect the differences in how positions play

To Start:
* De-correlate player statistics from minutes played.
* Drop `G`, `GS`, and `MP` -- since these don't have to do with your position!
* Also try: scale the dataset appropriately (`StandardScaler`?).

In [None]:
# Columns to express per minute played: everything except the identifier /
# categorical columns, Age, and the playing-time columns G, GS and MP.
to_normalize = nba.drop('Rk','Player','Pos','Age','Tm','G','GS','MP')
min_played = nba.column('MP')

# Replace each remaining column with its value divided by minutes played,
# one column at a time (with_column on an existing label overwrites it).
# NOTE(review): a player with MP == 0 would produce inf/NaN here, which
# KMeans cannot handle -- confirm the dataset contains no such rows.
features = to_normalize
for label in to_normalize.labels:
    features = features.with_column(label, nba.column(label) / min_played)

In [None]:
# Display the per-minute feature table built in the previous cell.
features

In [None]:
# Re-run the elbow analysis on the per-minute features.
scores = cluster_scores(features.values)

# The x-axis is derived from len(scores) so it always matches the number of
# cluster counts that were actually scored.
plt.plot(np.arange(1, len(scores) + 1), scores)
plt.xlabel('Number of clusters')
plt.ylabel('k-means score (higher is better)')
plt.title('Clustering quality on per-minute features');

Let's try 4 clusters:

In [None]:
n_clusters = 4

# Cluster on the per-minute features (numpy array view of the Table).
train = features.values

# Fix random_state so the position/label cross-tabulation in the following
# cells is reproducible across kernel restarts.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
labels = kmeans.fit_predict(train)
# Attach the labels to the original table so positions and raw statistics
# can be compared per cluster.
out = nba.with_column('label', labels)

In [None]:
# Cross-tabulate position against cluster label: count players per
# (Pos, label) pair, then pivot so positions become columns and labels
# become rows, summing the counts in each cell.
out.group(['Pos', 'label']).pivot('Pos', 'label', 'count', sum).show()

In [None]:
# Size of each cluster: number of players per label.
out.group('label')

In [None]:
# Per-cluster averages of the original (not per-minute) columns, since `out`
# was built from `nba`.
out.group('label', np.mean)

Plot histograms/scatterplots for labels to assess the clusters

In [None]:
# Scatter per-minute free-throw attempts against per-minute defensive
# rebounds, colored by cluster label. The labels are attached to the
# per-minute `features` table here, not to `nba`.
(
    features
    .with_column('label', labels)
    .where('FTA', are.below(0.4))  # filter out james harden
    .scatter('FTA', 'DRB', colors='label')
)

In [None]:
# Histogram of three-point shots made (raw totals from `nba`), split by cluster label.
out.hist('3P', group='label')

Try normalizing

In [None]:
# Standardize the per-minute features column by column with sklearn's
# StandardScaler before clustering again.
ss = StandardScaler()
# fit_transform learns the scaling from the data and applies it in one step;
# the result is what the following cells cluster on.
ssfeatures = ss.fit_transform(features.values)

In [None]:
# Re-run the elbow analysis on the standardized per-minute features.
scores = cluster_scores(ssfeatures)

# The x-axis is derived from len(scores) so it always matches the number of
# cluster counts that were actually scored.
plt.plot(np.arange(1, len(scores) + 1), scores)
plt.xlabel('Number of clusters')
plt.ylabel('k-means score (higher is better)')
plt.title('Clustering quality on standardized per-minute features');

In [None]:
n_clusters = 3

# Cluster on the standardized per-minute features.
train = ssfeatures

# Fix random_state so the clusters interpreted in the following cells are
# the same every time the notebook is re-run.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
labels = kmeans.fit_predict(train)
# Attach the labels to the original table so positions and raw statistics
# can be compared per cluster.
out = nba.with_column('label', labels)

Getting better:
   * Centers are isolated to one cluster (along with most Power Forwards)
   * Small forwards and guards are split between the other two

In [None]:
# Cross-tabulate position against cluster label: count players per
# (Pos, label) pair, then pivot so positions become columns and labels
# become rows, summing the counts in each cell.
out.group(['Pos', 'label']).pivot('Pos', 'label', 'count', sum).show()

What differentiates the other two?
* Plot histograms and scatter-plots!
* Look at who the outliers are.
    - which players are assigned clusters that don't match their positions?
    - For example, LeBron James may have stats that look like both a Forward's and a Guard's.

In [None]:
# Per-cluster averages of the original (unscaled) columns, since `out` was
# built from `nba`.
out.group('label', np.mean)

In [None]:
# Code

In [None]:
# Code

### Next steps:
* Remove extraneous columns
* Add columns for efficiency (e.g. FG/FGA)
* Add assist-turnover ratio (i.e. AST/TOV)
* Forget trying to capture positions:
    - Use only certain subsets and see what type of clusters you get!

In [None]:
# Code

In [None]:
# Code

# TSNE

A related technique is "t-distributed stochastic neighbor embedding", or TSNE for short. Rather than assigning cluster labels itself, it embeds high-dimensional datasets into low dimensions while trying to preserve regions of high density, which makes clusters easier to see in a plot.

See a nice explanation here: https://distill.pub/2016/misread-tsne/

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Embed the standardized features in the plane (2 dimensions). t-SNE is
# stochastic, so random_state is fixed to make the embedding reproducible
# across kernel restarts.
tsne = TSNE(n_components=2, random_state=0)
tsne_out = tsne.fit_transform(ssfeatures)

In [None]:
# Plot the 2-D t-SNE embedding, one point per player, colored by the k-means
# labels currently held in `labels`.
(
    Table(['1', '2'])
    # The rows must come from the embedding (`tsne_out`, two coordinates per
    # player); the labelled NBA table `out` does not fit a two-column table.
    .with_rows(tsne_out)
    .with_columns('label', labels)
    .scatter('1', '2', colors='label')
)

In [None]:
# Code

# Extra: clustering SDPD data

A cleaned version of the SDPD data is found at `data/sdpd_clean.csv`. Try clustering these and understanding the clusters.

As clustering algorithms require only numeric input, categorical data has been cleaned in a standard way. Yes/No fields have been changed to 1/0. 

The ethnicities are encoded with integers according to the following map:

```
 'W': 0,
 'H': 1,
 'B': 2,
 'O': 3,
 'A': 4,
 'F': 5,
 'V': 6,
 'C': 7,
 'I': 8,
 'X': 9,
 'K': 10,
 NAN: 11,
 'P': 12,
 'J': 13,
 'Z': 14,
 'L': 15,
 'D': 16,
 'S': 17,
 'G': 18,
 'U': 19
```

What are the implications of such a transformation of features? (hint: think about the distance function).

Have fun!

In [None]:
# Code