In [6]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

# Read the imputed dataset from disk.
df = pd.read_csv('../../data/imputed_dataset.csv')

# Add conference group feature: invert a tier -> conferences listing into the
# per-conference lookup that Series.map expects.
conference_group = {
    conference: tier
    for tier, conferences in {
        'power_5': ('SEC', 'Big Ten', 'ACC', 'Big 12', 'Pac-12'),
        'ind': ('FBS Independents',),
        'group_5': ('Mountain West', 'Mid-American', 'American Athletic',
                    'Sun Belt', 'Conference USA'),
    }.items()
    for conference in conferences
}

# Conferences that are not keys of the lookup come out of .map() as NaN.
df["conf_group"] = df["conference"].map(conference_group)

In [7]:
# Drop categorical columns for dimensionality reduction / clustering.
# Each column is listed exactly once so the list reads as the definitive set
# of excluded fields.
df_quant = df.drop(columns=[
    'position', 'state_province', 'committed_to', 'year', 'conference',
    'side_of_ball', 'position_group', 'conf_group', 'stars',
    # Andreea added on 10/1
    'hometown_city', 'athlete_id', 'name', 'hometown_country',
    # not many distinct values
    'post_season_wins_rolling_2year',
])

In [8]:
# The target is the 'is_drafted' column; the features are every other column
# remaining in df_quant.
y = df_quant['is_drafted']
X = df_quant.drop(columns='is_drafted')

In [9]:
# Standardize every feature column with StandardScaler (imported in the first cell).
scaler = StandardScaler()

# fit_transform returns a bare ndarray, so re-attach both the column names and
# the row index of X; keeping the index means later index-aligned assignments
# line up with the rows of the source frame.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaled.head()

Unnamed: 0,rating,ranking,height,weight,latitude,longitude,latitude_school,longitude_school,wins_rolling_2year,games_played_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year,distance_miles
0,2.967899,-1.493761,1.609536,-0.003574,0.447271,-0.127447,0.421016,-0.040218,0.719918,0.362495,0.820877,0.773952,-0.568584
1,2.964356,-1.492798,0.414951,1.434095,-0.305947,-0.38216,-1.340881,0.522908,0.897529,1.119574,1.06453,0.831514,0.600467
2,2.936013,-1.491834,1.211341,1.212916,0.072797,0.698095,-1.504312,0.660086,0.187083,0.362495,0.370488,0.191132,-0.088782
3,2.928927,-1.49087,0.016757,-0.224754,0.704181,-1.997663,2.227862,-2.142148,0.009471,0.362495,-0.766559,-0.003142,0.33176
4,2.927156,-1.489906,0.414951,0.328196,1.021295,0.960498,0.641996,0.613013,0.719918,0.362495,1.053455,0.773952,-0.216651


In [10]:
# I also tried higher perplexity values (`perp`), but they gave no better separation and took far longer to run.
from sklearn.manifold import TSNE

def t_sne_reduction(scaled_dataset, perp=150, random_state=0):
    """Embed a scaled feature table into two dimensions with t-SNE.

    Parameters
    ----------
    scaled_dataset : pandas.DataFrame or array-like
        Numeric features to embed, one row per sample. This argument (not any
        notebook-level variable) is what gets embedded.
    perp : int or float, default 150
        Value passed to TSNE as ``perplexity``.
    random_state : int or None, default 0
        Seed passed to TSNE so repeated runs give the same embedding.
        Pass ``None`` for an unseeded run.

    Returns
    -------
    pandas.DataFrame
        Columns ``'component 1'`` and ``'component 2'``, one row per input
        row. When the input is a DataFrame its index is carried over;
        otherwise the result has a default integer index.
    """
    # t-SNE dim reduction
    embedding = TSNE(n_components=2, learning_rate='auto',
                     init='random', perplexity=perp,
                     random_state=random_state).fit_transform(scaled_dataset)

    # Keep the caller's row labels when there are any, so the embedding can be
    # joined back to the source rows by index.
    row_index = scaled_dataset.index if isinstance(scaled_dataset, pd.DataFrame) else None
    embedding_df = pd.DataFrame(embedding, columns=['component 1', 'component 2'],
                                index=row_index)
    return embedding_df


X_emb_df = t_sne_reduction(X_scaled)

In [11]:
# See the documentation notebook for the elbow plot that motivates k = 4.

from sklearn.cluster import KMeans
def kmeans_cluster(X, k=4, random_state=0):
    """Label each row of ``X`` with a K-Means cluster id.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to cluster, one row per sample. The model is fitted on
        this argument, not on any notebook-level variable.
    k : int, default 4
        Number of clusters.
    random_state : int or None, default 0
        Seed passed to KMeans.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with an integer ``'cluster'`` column holding the
        label of each row. The caller's frame is not modified, so calling
        the function again on the same input gives the same fit.
    """
    # A 'cluster' column left over from an earlier call must never feed back
    # into the fit; errors='ignore' makes this a no-op when it is absent.
    features = X.drop(columns='cluster', errors='ignore')

    kmeans = KMeans(n_clusters=k, random_state=random_state, n_init='auto')

    # Work on a copy so the input frame keeps only its feature columns.
    clustered = X.copy()
    clustered['cluster'] = kmeans.fit_predict(features)
    return clustered


df_kmeans = kmeans_cluster(X_emb_df, 4)

In [13]:
# Attach the cluster label of each row to the full dataset. The labels are
# matched to df by row index, exactly as a plain column assignment would do.
df = df.assign(kmean_tsne_cluster=df_kmeans['cluster'])
df.head()

Unnamed: 0,name,rating,ranking,year,position,height,weight,latitude,longitude,hometown_city,state_province,hometown_country,stars,committed_to,athlete_id,is_drafted,conference,latitude_school,longitude_school,wins_rolling_2year,games_played_rolling_2year,post_season_wins_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year,distance_miles,side_of_ball,position_group,conf_group,kmean_tsne_cluster
0,Dorial Green-Beckham,0.9997,1.0,2012,WR,78.0,220.0,37.216678,-93.292037,Springfield,MO,USA,5,Missouri,531380.0,1.0,SEC,38.935849,-92.333201,18.0,26.0,1.0,300.0,0.692308,129.726375,offense,pass_catcher,power_5,2
1,Mario Edwards,0.9995,2.0,2012,DT,75.0,285.0,33.215039,-97.133052,Denton,TX,USA,5,Florida State,530290.0,1.0,ACC,30.438169,-84.304403,19.0,27.0,2.0,366.0,0.703704,776.590432,defense,d_line,power_5,1
2,D.J. Humphries,0.9979,3.0,2012,OT,77.0,275.0,35.2272,-80.843083,Charlotte,NC,USA,5,Florida,-1009881.0,1.0,SEC,29.649936,-82.348579,15.0,26.0,2.0,178.0,0.576923,395.212334,offense,o_line,power_5,1
3,Shaq Thompson,0.9975,4.0,2012,S,74.0,210.0,38.581572,-121.4944,Sacramento,CA,USA,5,Washington,535329.0,1.0,Big Ten,47.650323,-122.301575,14.0,26.0,1.0,-130.0,0.538462,627.908306,defense,d_backfield,power_5,3
4,Noah Spence,0.9974,5.0,2012,WDE,75.0,235.0,40.266311,-76.886112,Harrisburg,PA,USA,5,Ohio State,-1027150.0,1.0,Big Ten,40.001645,-83.019727,18.0,26.0,1.0,363.0,0.692308,324.459618,defense,d_line,power_5,2


In [14]:
# Write the dataset, including the new cluster column, to CSV without the row index.
df.to_csv(path_or_buf='../../data/tsne_kmeans_cm.csv', index=False)