### Attaining cluster outputs as features
#### Notebook outline:
1) Read in train and test dataset.
2) Create clusters using SVD and KMeans (Chris' method)
3) Create clusters using KMeans (Andreea's method)
4) Append cluster values to X datasets
5) Save new X datasets as CSVs

##### **Note that all scaling, clustering, and dimensionality reduction was learned from the training set. Test sets were not involved until final "prediction."

In [46]:
import pandas as pd
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [4]:
base_path = '../data/train_test_split/'

X_train = pd.read_csv(base_path+'x_train.csv')
X_train = X_train.drop(columns = 'Unnamed: 0')

X_test = pd.read_csv(base_path+'x_test.csv')
X_test = X_test.drop(columns = 'Unnamed: 0')

y_train = pd.read_csv(base_path+'y_train.csv')
y_test = pd.read_csv(base_path+'y_test.csv')

### Cluster 1: t-sne, kmeans (chrismca)
* Feature selection
* Standard Scaler
* Dimensionality reduction (t-sne, perplexity = 150)
* kmeans (k=4): fit to training set
* kmeans (k=4): assign cluster to test set

In [3]:
# Feature Selection
X_train_sel = X_train.drop(columns = ['position', 'state_province', 'committed_to',
                                       'committed_to', 'year', 'conference',
                                       'side_of_ball', 'position_group', 'stars',
                                       'hometown_city', 'athlete_id', 'name', 'hometown_country', # Andreea added on 10/1
                                       'post_season_wins_rolling_2year']) # not many distinct values

In [4]:
from sklearn.preprocessing import StandardScaler

# Standard Scaler
scaler = StandardScaler()

X_train_scal = scaler.fit_transform(X_train_sel)

In [5]:
# SVD of training set
from sklearn.decomposition import TruncatedSVD

# Train svd object on training set
svd = TruncatedSVD(n_components=2, random_state=42)
svd = svd.fit(X_train_scal) 

# Apply fitted object to the training set
X_train_SVD = svd.transform(X_train_scal) 

In [6]:
# Train clustering object and predict on training set
from sklearn.cluster import KMeans

k = 4
kmeans = KMeans(n_clusters=k, random_state=42, n_init = 'auto')
kmeans_svd = kmeans.fit(X_train_SVD)

train_SVD_kmeans_cluster = kmeans_svd.predict(X_train_SVD)

In [7]:
# Feature selection for test set
X_test_sel = X_test.drop(columns = ['position', 'state_province', 'committed_to',
                                       'committed_to', 'year', 'conference',
                                       'side_of_ball', 'position_group', 'stars',
                                       'hometown_city', 'athlete_id', 'name', 'hometown_country', # Andreea added on 10/1
                                       'post_season_wins_rolling_2year']) # not many distinct values

# scale test set (based off fitted scaler from training set)
X_test_scal = scaler.transform(X_test_sel)

# Apply SVD to test set (based off fitted svd object)
X_test_SVD = svd.transform(X_test_scal)

# use training set clusters to predict test set
test_SVD_kmeans_cluster = kmeans_svd.predict(X_test_SVD)

## Straight up KMeans (Andreea)
* Feature Selection
* Scale the data
* Run kmeans with k = 4 on training set
* Run kmeans "predcition" on test set

In [8]:
# Feature Selection

numerical_features =  ['height', 'weight', 'distance_miles', 'stars','win_pct_rolling_2year','rating']
X_train_sel = X_train[numerical_features]

In [9]:
# Standard Scaler

scaler = StandardScaler()
X_train_scal = scaler.fit_transform(X_train_sel)

In [10]:
k = 4

# train kmeans on training set
kmeans = KMeans(n_clusters=k, random_state=42, n_init = 'auto')
kmeans = kmeans.fit(X_train_scal)

train_kmeans_cluster = kmeans.predict(X_train_scal)

In [11]:
# Feature Selection
X_test_sel = X_test[numerical_features]

# Standard Scaler (using trained scaler from above)
X_test_scal = scaler.transform(X_test_sel)

# predict kmeans on test set
test_kmeans_cluster = kmeans.predict(X_test_scal)

### Append clusters to train and test data sets

In [12]:
X_train['SVD_KMeans_Cluster'] = train_SVD_kmeans_cluster
X_test['SVD_KMeans_Cluster'] = test_SVD_kmeans_cluster

In [13]:
X_train['KMeans_Cluster'] = train_kmeans_cluster
X_test['KMeans_Cluster'] = test_kmeans_cluster

### Save new X datasets as CSVs

In [14]:
base_path = '../data/train_test_split/'

X_train.to_csv(base_path + 'x_train_cluster.csv', index = False)
X_test.to_csv(base_path + 'x_test_cluster.csv', index = False)

## DBSCAN with PCA and TSNE (Ryan)

### Preprocess

In [69]:
# Define numerical and categorical values
numerical_features = ['rating', 'ranking', 'height', 'weight', 'distance_miles', 'stars', 'wins_rolling_2year', 'games_played_rolling_2year', 'post_season_wins_rolling_2year', 'point_diff_rolling_2year', 'win_pct_rolling_2year']
categorical_features = ['conference', 'side_of_ball', 'position_group', 'position', 'year']

In [70]:
X_train_sel = X_train[numerical_features+categorical_features]
X_test_sel = X_test[numerical_features+categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

X_train_scal = preprocessor.fit_transform(X_train_sel)
X_test_scal = preprocessor.fit_transform(X_test_sel)

### PCA

In [71]:
pca = PCA(n_components=2, random_state=42)
dim_reduced_x_train_pca = pca.fit_transform(X_train_scal)
dim_reduced_x_test_pca = pca.fit_transform(X_test_scal)

In [72]:
dbscan = DBSCAN(eps=.45, min_samples=250)
clusters_pca_train = dbscan.fit_predict(dim_reduced_x_train_pca)

labels = dbscan.labels_
core_mask = dbscan.core_sample_indices_
core_points = dim_reduced_x_train_pca[core_mask]
core_labels = labels[core_mask]

nn = NearestNeighbors(n_neighbors=1).fit(core_points)
distances, indices = nn.kneighbors(dim_reduced_x_test_pca)

clusters_pca_test = core_labels[indices.flatten()]

In [73]:
print(f'training clusters: {np.unique(clusters_pca_train)}')
print(f'testing clusters: {np.unique(clusters_pca_test)}')

training clusters: [-1  0  1  2]
testing clusters: [0 1 2]


### TSNE

In [74]:
tsne = TSNE(n_components=2, init='random', random_state=42)
dim_reduced_x_train_tsne = tsne.fit_transform(X_train_scal)
dim_reduced_x_test_tsne = tsne.fit_transform(X_test_scal)

In [75]:
dbscan = DBSCAN(eps=6, min_samples=100)
clusters_tsne_train = dbscan.fit_predict(dim_reduced_x_train_tsne)

labels = dbscan.labels_
core_mask = dbscan.core_sample_indices_
core_points = dim_reduced_x_train_tsne[core_mask]
core_labels = labels[core_mask]

nn = NearestNeighbors(n_neighbors=1).fit(core_points)
distances, indices = nn.kneighbors(dim_reduced_x_test_tsne)

clusters_tsne_test = core_labels[indices.flatten()]

In [76]:
print(f'training clusters: {np.unique(clusters_tsne_train)}')
print(f'testing clusters: {np.unique(clusters_tsne_test)}')

training clusters: [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
testing clusters: [ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 16 17 18 19 20 21 22]


### Append to Datasets

In [None]:
X_train['DBSCAN_Cluster_PCA'] = clusters_pca_train
X_test['DBSCAN_Cluster_PCA'] = clusters_pca_test

In [None]:
X_train['DBSCAN_Cluster_TSNE'] = clusters_tsne_train
X_test['DBSCAN_Cluster_TSNE'] = clusters_tsne_test

In [None]:
base_path = '../data/train_test_split/'

X_train.to_csv(base_path + 'x_train_cluster.csv', index = False)
X_test.to_csv(base_path + 'x_test_cluster.csv', index = False)

In [36]:
dbscan.core_sample_indices_

array([ 248, 1250, 1658, 2678, 2843, 3017, 3308, 3465, 4004, 4374, 4855,
       5195, 5242, 5285])