### Attaining cluster outputs as features
#### Notebook outline:
1) Read in train and test dataset.
2) Create clusters using SVD and KMeans (Chris' method)
3) Create clusters using KMeans (Andreea's method)
4) Append cluster values to X datasets
5) Save new X datasets as CSVs

##### **Note that all scaling, clustering, and dimensionality reduction were learned from the training set. The test sets were not involved until the final "prediction" step.**

In [1]:
import pandas as pd
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [2]:
# Directory holding the pre-split train/test CSV files
base_path = '../data/train_test_split/'

# Feature matrices: read each CSV and discard its 'Unnamed: 0' column
# (presumably the row index written when the splits were saved -- confirm)
X_train = pd.read_csv(base_path + 'x_train.csv').drop(columns='Unnamed: 0')
X_test = pd.read_csv(base_path + 'x_test.csv').drop(columns='Unnamed: 0')

# Targets are loaded exactly as stored on disk (no columns removed)
y_train = pd.read_csv(base_path + 'y_train.csv')
y_test = pd.read_csv(base_path + 'y_test.csv')

### Cluster 1: SVD, KMeans (chrismca)
* Feature selection
* Standard Scaler
* Dimensionality reduction (truncated SVD, 2 components)
* kmeans (k=4): fit to training set
* kmeans (k=4): assign cluster to test set

In [3]:
# Feature Selection
# Columns removed before scaling / SVD / KMeans; every remaining column of
# X_train becomes an input feature. Each label appears exactly once.
svd_excluded_columns = [
    'position', 'state_province', 'committed_to',
    'year', 'conference',
    'side_of_ball', 'position_group', 'stars',
    'hometown_city', 'athlete_id', 'name', 'hometown_country',  # Andreea added on 10/1
    'post_season_wins_rolling_2year',  # not many distinct values
]

# drop() returns a new frame, so X_train itself is left untouched
X_train_sel = X_train.drop(columns=svd_excluded_columns)

In [4]:
from sklearn.preprocessing import StandardScaler

# Standard Scaler
# Learn the scaling statistics from the selected training features only,
# then apply that fitted scaler to the same training features.
scaler = StandardScaler().fit(X_train_sel)
X_train_scal = scaler.transform(X_train_sel)

In [5]:
# SVD of training set
from sklearn.decomposition import TruncatedSVD

# Fit a 2-component truncated SVD on the scaled training features
# (fit() returns the estimator, so `svd` is the fitted object)
svd = TruncatedSVD(random_state=42, n_components=2).fit(X_train_scal)

# Project the training set onto the fitted components
X_train_SVD = svd.transform(X_train_scal)

In [6]:
# Train clustering object and predict on training set
from sklearn.cluster import KMeans

# Number of clusters
k = 4
kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42)

# fit() returns the estimator itself, so at this point `kmeans_svd` and
# `kmeans` refer to the same fitted object
kmeans_svd = kmeans.fit(X_train_SVD)

# Cluster label for every training row, in SVD space
train_SVD_kmeans_cluster = kmeans_svd.predict(X_train_SVD)

In [7]:
# Feature selection for test set
# Must stay identical to the list used to build the training features, so the
# fitted scaler and SVD receive the same columns. Each label appears exactly once.
svd_excluded_columns = [
    'position', 'state_province', 'committed_to',
    'year', 'conference',
    'side_of_ball', 'position_group', 'stars',
    'hometown_city', 'athlete_id', 'name', 'hometown_country',  # Andreea added on 10/1
    'post_season_wins_rolling_2year',  # not many distinct values
]
X_test_sel = X_test.drop(columns=svd_excluded_columns)

# scale test set (based off fitted scaler from training set)
X_test_scal = scaler.transform(X_test_sel)

# Apply SVD to test set (based off fitted svd object)
X_test_SVD = svd.transform(X_test_scal)

# use training set clusters to predict test set
test_SVD_kmeans_cluster = kmeans_svd.predict(X_test_SVD)

## Straight up KMeans (Andreea)
* Feature Selection
* Scale the data
* Run kmeans with k = 4 on training set
* Run kmeans "prediction" on test set

In [8]:
# Feature Selection

# Columns fed to the plain KMeans clustering
numerical_features = [
    'height',
    'weight',
    'distance_miles',
    'stars',
    'win_pct_rolling_2year',
    'rating',
]

# Note: this rebinds X_train_sel, replacing the SVD-pipeline selection
X_train_sel = X_train.loc[:, numerical_features]

In [9]:
# Standard Scaler

# A fresh scaler fitted on the plain-KMeans training features only.
# Note: `scaler` and `X_train_scal` are rebound here, replacing the objects
# used by the SVD pipeline earlier in the notebook.
scaler = StandardScaler().fit(X_train_sel)
X_train_scal = scaler.transform(X_train_sel)

In [10]:
# Number of clusters
k = 4

# train kmeans on training set
# (`kmeans` is rebound to a new estimator; `kmeans_svd` still refers to the
# estimator fitted on the SVD projection)
kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X_train_scal)

# Cluster label for every training row
train_kmeans_cluster = kmeans.predict(X_train_scal)

In [11]:
# Feature Selection
# Same column subset as the training features
X_test_sel = X_test.loc[:, numerical_features]

# Standard Scaler (using trained scaler from above)
# transform() only: nothing is re-fitted on the test data
X_test_scal = scaler.transform(X_test_sel)

# predict kmeans on test set
# Assign each test row to a cluster learned from the training set
test_kmeans_cluster = kmeans.predict(X_test_scal)

### Append clusters to train and test data sets

In [12]:
# Attach the SVD + KMeans labels to the train and test feature frames.
# This modifies X_train / X_test in place by adding (or overwriting) the column.
for features, labels in ((X_train, train_SVD_kmeans_cluster),
                         (X_test, test_SVD_kmeans_cluster)):
    features['SVD_KMeans_Cluster'] = labels

In [13]:
# Attach the plain KMeans labels to the train and test feature frames.
# This modifies X_train / X_test in place by adding (or overwriting) the column.
for features, labels in ((X_train, train_kmeans_cluster),
                         (X_test, test_kmeans_cluster)):
    features['KMeans_Cluster'] = labels

### Save new X datasets as CSVs

In [14]:
# Output directory (the same path string the splits were read from)
base_path = '../data/train_test_split/'

# Write the augmented feature frames; index=False keeps the row index out of the files
for features, filename in ((X_train, 'x_train_cluster.csv'),
                           (X_test, 'x_test_cluster.csv')):
    features.to_csv(base_path + filename, index=False)