In [13]:
import pandas as pd
import numpy as np

In [14]:
def load_and_process():
    result = pd.read_csv('./data/nba-player-2014.csv', na_values='-')
    result.fillna(0, inplace=True)
    return result


def filter_columns(dataframe, excluded):
    return dataframe[list(set(dataframe.columns) - set(excluded))]


def normalize(dataframe):
    return (dataframe - dataframe.mean()) / dataframe.std()


def calculate_distance(s1, s2):
    return np.sqrt(np.power(s1 - s2, 2).sum(axis=1))


def kmeans(stat, k, iterations):
    centers = stat.sample(n=k)
    for i in range(iterations):
        centers.reset_index(drop=True, inplace=True)
        if i > 0 and i % 10 == 0:
            print('iteration round', i)
        distance_matrix = centers.apply(lambda c: calculate_distance(stat, c), axis=1)
        belongings = distance_matrix.idxmin()
        centers = stat.groupby(belongings).mean()
    return belongings



In [15]:
distance_excluded_columns = ['fullname']
players = load_and_process()
normalized_stat = normalize(filter_columns(players, distance_excluded_columns))

labels = kmeans(normalized_stat, 5, 100)

iteration round 10
iteration round 20
iteration round 30
iteration round 40
iteration round 50
iteration round 60
iteration round 70
iteration round 80
iteration round 90


In [16]:
players.groupby(labels).count().max(axis=1)

0    28
1    38
2    67
3    88
4    46
dtype: int64

In [17]:
players['Group'] = labels

james = players[players.fullname == 'James, LeBron']
james_group = players.Group == james.Group.iloc[0]

distances = calculate_distance(normalized_stat[james_group], normalized_stat.loc[james.index].iloc[0])
distances.name = 'Distance'
pd.concat([players[james_group], distances], axis=1).sort_values(by='Distance')

Unnamed: 0,FTA/G,MPG,FT%,PPG,GP,FGM/G,3FGA/G,3FG%,HIGH,PPS,3FGM/G,FTM/G,FG%,PTS/48,GS,fullname,PTS,FGA/G,Group,Distance
2,7.7,36.1,0.71,25.3,69,9.0,4.9,0.354,42,1.36,1.7,5.4,0.488,33.6,69,"James, LeBron",1743,18.5,0,0.0
11,5.8,35.4,0.858,21.1,68,7.5,3.2,0.359,40,1.28,1.2,5.0,0.455,28.5,67,"Gay, Rudy",1432,16.4,0,2.928869
15,6.1,34.4,0.812,19.3,76,6.4,4.3,0.364,33,1.35,1.6,4.9,0.445,26.8,76,"Hayward, Gordon",1463,14.3,0,3.423804
10,6.0,31.8,0.768,21.5,62,8.2,1.6,0.284,42,1.23,0.5,4.6,0.47,32.4,62,"Wade, Dwyane",1331,17.5,0,3.485929
8,4.9,36.4,0.863,21.7,75,7.7,5.0,0.415,57,1.32,2.1,4.2,0.468,28.6,75,"Irving, Kyrie",1628,16.5,0,3.555443
14,7.1,38.7,0.834,20.0,65,6.5,3.0,0.378,35,1.43,1.1,5.9,0.462,24.8,65,"Butler, Jimmy",1301,14.0,0,3.649827
6,5.1,35.4,0.845,23.4,71,9.3,1.5,0.352,39,1.17,0.5,4.3,0.466,31.7,71,"Aldridge, LaMarcus",1661,19.9,0,3.667809
7,6.4,35.2,0.728,21.9,67,8.6,0.4,0.4,45,1.28,0.1,4.6,0.502,29.9,67,"Griffin, Blake",1469,17.1,0,3.685176
0,9.8,34.4,0.835,28.1,67,9.4,4.3,0.299,54,1.28,1.3,8.1,0.426,39.3,67,"Westbrook, Russell",1886,22.0,0,3.722021
12,4.9,35.7,0.864,21.0,82,7.2,7.0,0.343,43,1.26,2.4,4.2,0.434,28.2,82,"Lillard, Damian",1720,16.6,0,3.967624


### 以下是分布执行讲解

In [9]:
k = 5
centers = stat.sample(n=k)
centers.reset_index(drop=True, inplace=True)
centers

Unnamed: 0,PTS/48,FTA/G,FTM/G,MPG,FGA/G,FGM/G,PTS,GS,FG%,GP,HIGH,FT%,3FGM/G,PPG,PPS,3FG%,3FGA/G
0,0.140711,0.103359,-0.026436,-0.825379,-0.560919,-0.602876,-0.406933,-1.180173,-0.160045,0.284093,-0.507897,-0.510498,-0.153614,-0.465705,0.48868,0.238151,-0.094572
1,-0.026597,-0.697441,-0.55207,-0.963047,-0.612147,-0.492511,-0.439084,-0.969681,0.550083,0.973664,-0.748551,1.020793,-0.695067,-0.585632,-0.047005,-0.076633,-0.561217
2,-1.104803,-0.327841,-0.401889,-0.963047,-1.354955,-1.099521,-1.202666,-0.689025,2.201129,-1.646706,-0.989205,-0.664534,-0.965793,-1.065336,2.631419,0.197092,-1.079712
3,-1.216342,-0.759041,-0.777342,-1.268976,-1.0732,-1.154704,-1.237496,-0.759189,-0.781406,-0.681306,-1.59084,-0.62829,-1.101156,-1.20525,-1.577533,-1.910592,-1.183411
4,1.683662,1.520159,1.775737,1.713834,1.974873,2.045894,2.301776,1.240482,0.212772,0.422007,3.703547,1.111402,1.741469,2.212644,0.871312,0.929307,1.409063


In [10]:
distance_matrix = centers.apply(lambda c: calculate_distance(stat, c), axis=1)
distance_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,257,258,259,260,261,262,263,264,265,266
0,11.810445,12.378807,9.327481,9.225911,9.584407,10.156033,8.34943,8.027007,8.654543,8.91728,...,3.897074,4.75756,3.886522,3.462988,5.226107,4.567826,4.531824,4.797437,4.842631,6.164737
1,12.650827,13.257922,10.268428,9.654727,10.391494,10.711301,8.701704,8.674197,9.258088,9.435505,...,3.735407,4.750047,4.768493,3.09674,5.457005,4.872139,4.650709,5.213894,5.108138,6.668267
2,14.118786,14.648485,11.475401,10.709081,11.188161,12.548523,10.695886,9.808814,11.01796,11.53388,...,4.987771,4.681027,4.559898,5.15965,3.325118,5.262882,2.802413,4.978339,5.413398,5.184707
3,14.578849,15.623567,12.401961,11.57964,11.944293,13.506404,10.844188,10.696947,11.785072,12.06489,...,2.092471,3.258761,4.453746,3.530732,4.116214,2.01737,5.590456,3.506902,1.656738,4.954299
4,5.548965,5.782448,3.555443,5.408165,6.345913,3.376582,4.070384,4.455685,0.0,2.566423,...,11.992489,12.149571,11.555714,11.06844,12.629147,12.255728,11.669225,11.978965,12.569615,13.658564


In [11]:
belongings = distance_matrix.idxmin()
belongings

0      4
1      4
2      4
3      4
4      4
5      4
6      4
7      4
8      4
9      4
10     4
11     4
12     4
13     4
14     4
15     4
16     4
17     4
18     4
19     4
20     4
21     4
22     4
23     4
24     4
25     4
26     0
27     4
28     4
29     4
      ..
237    0
238    3
239    0
240    1
241    2
242    3
243    3
244    0
245    3
246    3
247    1
248    1
249    0
250    2
251    3
252    3
253    3
254    3
255    2
256    1
257    3
258    3
259    0
260    1
261    2
262    3
263    2
264    3
265    3
266    3
Length: 267, dtype: int64

In [12]:
centers = stat.groupby(belongings).mean()
centers

Unnamed: 0,PTS/48,FTA/G,FTM/G,MPG,FGA/G,FGM/G,PTS,GS,FG%,GP,HIGH,FT%,3FGM/G,PPG,PPS,3FG%,3FGA/G
0,-0.013683,-0.144922,-0.165153,0.032157,-0.009141,-0.05821,-0.076119,-0.084866,-0.236343,-0.055955,0.001885,-0.060704,0.19771,-0.057255,-0.141641,0.276911,0.206234
1,-0.365658,-0.554154,-0.462288,-0.568997,-0.55201,-0.498509,-0.432852,-0.282532,0.108183,0.616886,-0.494818,0.547263,-0.439054,-0.549567,-0.130186,-0.172734,-0.455264
2,-0.814338,-0.096841,-0.392503,-0.577767,-1.058792,-0.689099,-0.763941,-0.279005,2.188924,-0.414098,-0.883919,-1.361656,-0.999633,-0.753028,2.02399,-1.298987,-1.092675
3,-1.108123,-0.809642,-0.844387,-1.000196,-0.998188,-1.030543,-1.1398,-0.64392,-0.280513,-0.755189,-1.09664,-0.916944,-0.806258,-1.087465,-1.025449,-1.001433,-0.785281
4,1.362383,1.493377,1.583113,1.287196,1.553911,1.531255,1.609135,1.013212,-0.025738,0.146179,1.464419,0.657372,0.714475,1.636477,0.453744,0.445528,0.726003
