### Previously on
* Supervised: regression and classification
* Unsupervised: clustering and dimensionality reduction
* Clustering is a key way to explore unknown data, and it's a very commonly used machine learning technique

### US senator data

In [None]:
# Determine which senators are more or less in the mainstream of their party

import matplotlib.pyplot as plt
import pandas as pd
# Load the Senate voting records into a DataFrame (one row per senator).
votes = pd.read_csv(filepath_or_buffer='114_congress.csv')

In [None]:
# Exploring: how many senators belong to each party.
print(votes.party.value_counts())

# Mean of every vote column. If the mean for a column is less than .5, more
# senators voted against the bill, and vice versa if it's over .5.
# numeric_only=True restricts the mean to numeric columns; without it the text
# columns (e.g. `party`) make DataFrame.mean() raise a TypeError on pandas >= 2.0.
votes.mean(numeric_only=True)


In [None]:
# Euclidean distance between senators using only vote columns
from sklearn.metrics.pairwise import euclidean_distances

# euclidean_distances expects 2-D inputs of shape (n_samples, n_features).
# Selecting rows with a list (`[[i]]`) keeps each senator as a one-row
# DataFrame, so no manual reshape is needed.
# Columns from position 3 onward are used as the vote columns.
print(euclidean_distances(votes.iloc[[0], 3:], votes.iloc[[1], 3:]))

# Distance between the first and third senators, kept as a 1x1 array.
# (The original passed 1-D Series here, which scikit-learn rejects.)
distance = euclidean_distances(votes.iloc[[0], 3:], votes.iloc[[2], 3:])

### Notes
* Because we aren't predicting anything, there's no risk of overfitting, so we'll train our model on the whole dataset.
* The crosstab() method takes in two vectors or Pandas Series and computes how many times each unique value in the second vector occurs for each unique value in the first vector.


### K-Means clustering

In [None]:
import pandas as pd
from sklearn.cluster import KMeans

# Two clusters, with a fixed seed so the run is reproducible.
kmeans_model = KMeans(random_state=1, n_clusters=2)

# Fit on the vote columns (position 3 onward) and keep the transformed output:
# one row per senator, one column per cluster, holding the distance to that
# cluster's center.
senator_distances = kmeans_model.fit_transform(X=votes.iloc[:, 3:])

In [None]:
# Cross-tabulate cluster assignment against party: each cell counts how many
# senators fall into that (cluster label, party) combination.
labels = kmeans_model.labels_
pd.crosstab(index=labels, columns=votes['party'])

In [None]:
# Democrats that K-means placed in cluster 1.
# NOTE(review): which cluster id corresponds to which party is not fixed by the
# code here -- confirm against the crosstab output before reading these rows as
# "outliers".
democratic_outliers = votes.loc[(labels == 1) & votes["party"].eq("D")]
print(democratic_outliers)

In [None]:
# Visualize the clusters in K-means' cluster-distance space: column k of
# `senator_distances` (the fit_transform output) is each senator's distance to
# the center of cluster k. Points are colored by assigned cluster label.
fig, ax = plt.subplots()
ax.scatter(
    senator_distances[:, 0],
    senator_distances[:, 1],
    c=labels,
    linewidths=0,  # no marker outline (same as the `lw=0` shorthand)
)
# Title and axis labels so the figure can be read on its own.
ax.set_title("Senators in cluster-distance space")
ax.set_xlabel("Distance to cluster 0 center")
ax.set_ylabel("Distance to cluster 1 center")
plt.show()

In [None]:
import numpy as np
# Extremism score: cube each distance before summing across clusters, so that
# large distances to the cluster centers dominate the score.
extremism = (senator_distances ** 3).sum(axis=1)

# NOTE: this appends a column to `votes`, so `votes.iloc[:, 3:]` now includes
# `extremism` as well. Restart and run from the top before refitting the model.
votes['extremism'] = extremism

# Rank on a sorted copy instead of sorting `votes` in place. `labels` and
# `senator_distances` are positional NumPy arrays, so reordering `votes` would
# silently misalign them with its rows.
most_extreme = votes.sort_values('extremism', ascending=False)
print(most_extreme.head(10))

### NBA K-Means clustering >> recreate the K-Means clustering process
3 steps:

1. Assign each point to a cluster (evaluate its distance to the centers, which are chosen randomly at the start)
2. Re-evaluate the points in each cluster to find the new centers
3. Repeat steps 1 and 2 until the cluster assignments stop changing

In [None]:
import pandas as pd
import numpy as np
# Load the NBA player statistics and preview the first three rows.
nba = pd.read_csv(filepath_or_buffer="nba_2013.csv")
nba.head(n=3)


In [None]:
# Keep only the point guards. .copy() gives an independent frame so the new
# columns below are not written into a slice of `nba`.
point_guards = nba[nba['pos'] == 'PG'].copy()

# New feature: points per game.
point_guards['ppg'] = point_guards['pts'] / point_guards['g']

# Sanity check, make sure ppg = pts/g. Printed explicitly: a bare expression in
# the middle of a cell is evaluated but never displayed.
print(point_guards[['pts', 'g', 'ppg']].head(5))

# Assist-turnover ratio. Drop players with zero turnovers first to avoid
# dividing by zero, and .copy() the filtered frame so the column assignment
# does not trigger SettingWithCopyWarning.
point_guards = point_guards[point_guards['tov'] != 0].copy()
point_guards['atr'] = point_guards['ast'] / point_guards['tov']

In [None]:
# Scatter of the two engineered features for every point guard, drawn through
# the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.scatter(point_guards['ppg'], point_guards['atr'], c='y')
ax.set_title("Point Guards")
ax.set_xlabel('Points Per Game', fontsize=13)
ax.set_ylabel('Assist Turnover Ratio', fontsize=13)
plt.show()