In [None]:
import numpy
import pandas
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
import warnings
from sklearn import preprocessing

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from pandas / seaborn /
# scikit-learn, which is how version-breaking calls go unnoticed.
warnings.filterwarnings('ignore')

# Let wide / long frames render in full when displayed.
pandas.set_option('display.max_columns', 500)
pandas.set_option('display.max_rows', 500)

# Raw table as read from disk; kept untouched so dropped columns can be
# re-attached after clustering.
original_data = pandas.read_csv('sf_data.csv')

# Columns excluded from the clustering features.
drops = ['abb','abbreviation','name','conference','assists',
        'away_wins','blocks','conference_wins','defensive_rebounds',
        'field_goal_attempts','field_goals','free_throws',
        'games_played','home_wins','minutes_played','losses',
        'offensive_rebounds','opp_assists','opp_blocks',
        'opp_defensive_rebounds','opp_field_goal_attempts',
        'opp_field_goals','opp_free_throw_attempts','opp_free_throws',
        'opp_offensive_rebounds','opp_personal_fouls','opp_steals',
        'opp_three_point_field_goal_attempts','opp_three_point_field_goals',
        'opp_total_rebounds','opp_turnovers','opp_two_point_field_goal_attempts',
        'opp_two_point_field_goals','personal_fouls','steals','three_point_field_goal_attempts',
        'three_point_field_goals','total_rebounds','turnovers',
        'two_point_field_goal_attempts','two_point_field_goals','wins']

# Use the `columns=` keyword: passing the axis positionally (`drop(drops, 1)`)
# is rejected by pandas >= 2.0.
data = original_data.drop(columns=drops)

# Keep only rows whose `major` flag is non-zero. Swap in the commented filter
# to cluster the `major == 0` rows instead.
# NOTE(review): `major` itself stays in `data` and is therefore one of the
# standardized features - confirm that is intended.
# data = data[data['major']==0]
data = data[data['major']!=0]

# Column-wise standardization; returns a NumPy array with rows in the same
# order as `data` (the filtered frame keeps its original row labels).
data_standardized = preprocessing.scale(data)

In [None]:
# Elbow method: fit k-means for k = 1..10 and record each fit's inertia
# (stored as WCSS) to help choose a starting number of clusters.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(data_standardized)
    wcss.append(kmeans.inertia_)

# One line plot of WCSS against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(cluster_counts, wcss)
ax.set_title('The Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Fit the final 4-cluster model on the standardized features.
kmeans = KMeans(n_clusters = 4, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(data_standardized)

# Shift the 0-based labels to 1-based cluster numbers (1..4).
y_kmeans1 = y_kmeans + 1
cluster = pandas.DataFrame({'cluster': y_kmeans1})

# Re-attach the columns that were dropped before clustering. The filtered
# `data` frame still carries the row labels of `original_data`, so align on
# the index instead of natural-merging on every shared (float) column: a value
# merge multiplies rows whenever two rows share identical stats, which would
# silently shift the positional cluster join below. Column order is unchanged
# (feature columns first, then the remaining original columns).
data_rejoin = (
    data
    .join(original_data.drop(columns=data.columns))
    .reset_index(drop=True)
)

# `data_rejoin` and `cluster` both have a 0..n-1 index in fit order, so this
# index merge pairs each row with its own label.
joined = pandas.merge(data_rejoin,cluster,left_index=True,right_index=True)

# Per-cluster means of the headline metrics. The columns are selected with a
# list: indexing a GroupBy with a bare tuple of names raises in pandas >= 2.0.
summary_columns = ['win_percentage','net_rating','simple_rating_system','offensive_rating',
                   'opp_offensive_rating',
                   'true_shooting_percentage','field_goal_percentage','effective_field_goal_percentage',
                   'total_rebound_percentage','two_point_field_goal_percentage']
joined.groupby('cluster')[summary_columns].mean()

In [None]:
# Scatter of the first two standardized feature columns, coloured by k-means
# cluster, with the fitted centroids drawn as black '+' markers.
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. Both layers are drawn on the same Axes explicitly.
ax = sns.scatterplot(x=kmeans.cluster_centers_[:,0], y=kmeans.cluster_centers_[:,1],
                     marker='+',
                     color='black',
                     s=200)
sns.scatterplot(x=data_standardized[:,0], y=data_standardized[:,1], hue = joined['cluster'],
                palette=sns.color_palette("Set1", n_colors=4), ax=ax)

# Array inputs carry no names, so label the axes with the feature columns the
# two plotted dimensions come from.
ax.set(xlabel='{} (standardized)'.format(data.columns[0]),
       ylabel='{} (standardized)'.format(data.columns[1]));

In [None]:
# Position of every row in the first two standardized feature columns.
axis_join = pandas.DataFrame({'x': data_standardized[:, 0],
                              'y': data_standardized[:, 1]})

# Centroid coordinates in those same two dimensions: one row per axis
# ('x_cen', 'y_cen'), one column per cluster.
km_cen_axis_join = pandas.DataFrame(
    kmeans.cluster_centers_[:, :2].T,
    index=['x_cen', 'y_cen'],
    columns=['cluster1', 'cluster2', 'cluster3', 'cluster4'],
)

# Single-row frames holding the x (resp. y) coordinate of each centroid, with
# the axis appended to the column names.
just_x = km_cen_axis_join.iloc[[0]].add_suffix('_x')
just_y = km_cen_axis_join.iloc[[1]].add_suffix('_y')

# Broadcast every centroid coordinate onto each row as a constant column:
# x, y, cluster1_x..cluster4_x, cluster1_y..cluster4_y.
km_cen_axis_final = axis_join.assign(**just_x.iloc[0], **just_y.iloc[0])

def numpy_triu1(df):
    """Return the absolute difference between every unordered pair of columns.

    For each pair of column positions (i, j) with i < j, the result contains a
    column named ``"<col_i>_<col_j>"`` holding ``|df[col_i] - df[col_j]|`` row
    by row. Pairs are ordered as produced by ``numpy.triu_indices`` with an
    offset of 1. The result uses a fresh default index, and the column labels
    of ``df`` must support concatenation with ``"_"``.
    """
    values = df.values
    left, right = numpy.triu_indices(values.shape[1], k=1)
    labels = df.columns

    pair_names = []
    for i, j in zip(left, right):
        pair_names.append(labels[i] + "_" + labels[j])

    pairwise_gap = numpy.abs(values[:, left] - values[:, right])
    return pandas.DataFrame(pairwise_gap, columns=pair_names)

# Per-row absolute differences between every pair of columns of
# `km_cen_axis_final`; only the point-vs-centroid pairs are used below.
diffs = numpy_triu1(km_cen_axis_final)

cluster_ids = [1, 2, 3, 4]
x_diff_cols = ['x_cluster{}_x'.format(k) for k in cluster_ids]   # |x - centroid_k x|
y_diff_cols = ['y_cluster{}_y'.format(k) for k in cluster_ids]   # |y - centroid_k y|

# Select the needed columns by name rather than by position, and take an
# explicit copy so the column assignments below write to an independent frame
# instead of a slice of `diffs` (the SettingWithCopy pattern).
diffs_table = diffs[x_diff_cols + y_diff_cols].copy()

# "Distance" of each row to each centroid: the mean of the absolute x and y
# differences.
# NOTE(review): this uses only the first two standardized features, whereas
# k-means was fitted on all of them - confirm that comparison is intended.
dist_cols = []
for k, x_col, y_col in zip(cluster_ids, x_diff_cols, y_diff_cols):
    dist_col = 'cluster{}_dist'.format(k)
    diffs_table[dist_col] = (diffs_table[x_col] + diffs_table[y_col])/2
    dist_cols.append(dist_col)
diffs_table['min_val'] = diffs_table[dist_cols].min(axis=1)

# Label each row with the first cluster whose distance equals the row minimum;
# rows matching none of clusters 1-3 fall through to cluster 4.
conditions = [diffs_table['min_val'] == diffs_table[col] for col in dist_cols[:-1]]
choices = cluster_ids[:-1]
diffs_table['math_cluster'] = numpy.select(conditions, choices, default=cluster_ids[-1])
math_clusters = diffs_table['math_cluster']

# Attach the distance-based label to the clustered table (index-aligned) and
# flag the rows where it agrees with the k-means label.
joined2 = pandas.merge(joined,math_clusters,left_index=True,right_index=True)
joined2['k_vs_math_match'] = joined2['cluster'] == joined2['math_cluster']

In [None]:
# Persist the clustered table (row index included). Point `output_csv` at the
# commented file name when running the alternative `major` filter.
output_csv = 'college_hoops_team_clusters.csv'
# output_csv = 'college_hoops_team_clusters_lowmajor.csv'
joined2.to_csv(output_csv)

In [None]:
# Display the final table: original columns, k-means cluster, distance-based
# cluster and the match flag (up to the 500 rows / columns configured above).
joined2