In [None]:
import numpy as np
import enum
import matplotlib.pyplot as plt
import os
import csv
import random
import importlib
import pickle
import itertools

import clusterpoints
import game
import visualize

# Select the Tk GUI backend for matplotlib, so the figures created by the cells
# below are shown in separate windows rather than inline in the notebook.
%matplotlib tk

In [None]:
# Load all the games into memory.
# n_games is forwarded to the loader as its `n` argument; presumably it caps how
# many processed games are read -- TODO confirm against game.load_processed_data.
n_games = 1304
# `games` is iterated later with .items(), yielding (game code, game) pairs.
games = game.load_processed_data('processed_data', n=n_games)

In [None]:
# Collect all the shot qualities for each player, and the player that passed to them.
#
# players_records is a list of (player, player_record) tuples, one per distinct
# player name, in order of first appearance. Each player_record holds:
#   'shooting' - quality of the last shot of every possession in which the
#                player is the last entry of the possession thread
#   'passing'  - quality of the last shot of every possession in which the
#                player is the second-to-last entry of the possession thread
#   'efg'      - copied from the player the first time that name is seen
players_records = []
# Name -> player_record index, so finding an existing record is a dict lookup
# instead of a linear scan of players_records for every possession.
records_by_name = {}
# (distance from the end of the thread, record key that receives the shot quality)
THREAD_ROLES = ((1, 'shooting'), (2, 'passing'))

for game_code, g in games.items():
    for quarter_num, quarter in g.quarters.items():
        if quarter.possessions is None:
            continue
        for possession in quarter.possessions:
            # Possessions without a shot contribute nothing
            if len(possession.shots) == 0:
                continue
            for steps_back, role in THREAD_ROLES:
                # The thread may be too short to have an entry this far back
                if len(possession.thread) < steps_back:
                    continue
                player = possession.thread[-steps_back].player
                if player is None:
                    continue
                player_record = records_by_name.get(player['name'])
                if player_record is None:
                    # First time this name is seen: create and register its record
                    player_record = {'shooting': [], 'passing': [], 'efg': player['efg']}
                    records_by_name[player['name']] = player_record
                    players_records.append((player, player_record))
                player_record[role].append(possession.shots[-1].quality)

# Compute the average shot qualities, and other stats.
# Only players with more than MIN_ENTRIES entries in BOTH lists get the derived
# keys below; every other record keeps just 'shooting', 'passing' and 'efg'.
MIN_ENTRIES = 5
for player, player_record in players_records:
    n_shooting = len(player_record['shooting'])
    n_passing = len(player_record['passing'])
    if n_passing > MIN_ENTRIES and n_shooting > MIN_ENTRIES:
        player_record['average passing'] = sum(player_record['passing'])/n_passing
        player_record['average shooting'] = sum(player_record['shooting'])/n_shooting
        # log10 of the shooting/passing entry-count ratio: 0 when the counts are equal,
        # positive when the player has more shooting entries than passing entries
        player_record['shoot pass ratio'] = np.log10(n_shooting*1.0/n_passing)
        player_record['number of entries'] = n_shooting + n_passing
        player_record['efg plus'] = 100*player_record['efg'] - player_record['average shooting']

In [8]:
# Get the number of games played from the players file.
# The file is read inside a `with` block so the handle is closed as soon as the
# rows are in memory; newline='' is what the csv module asks for on file objects.
with open('players.csv', 'r', newline='') as players_file:
    csvreader = csv.DictReader(players_file)
    players_list = [dict(row) for row in csvreader]

# Index the raw 'games played' values by name once instead of rescanning the
# rows for every player. setdefault keeps the first row when a name repeats.
games_played_by_name = {}
for row in players_list:
    games_played_by_name.setdefault(row['name'], row['games played'])

for player, player_record in players_records:
    # A KeyError here names a recorded player that is missing from players.csv.
    # The value is converted to int only for players that are actually looked up.
    player_record['games played'] = int(games_played_by_name[player['name']])

In [None]:
# Plot the distribution, and highlight the MVP players
# Reload first so edits made to visualize.py are picked up without restarting the kernel
importlib.reload(visualize)

mvp_players = [
    'LeBron James',
    'Russell Westbrook',
    'Kevin Durant',
    'Kawhi Leonard',
    'Stephen Curry',
    'James Harden',
]
# Two [49, 55] pairs handed to the plot as its `bounds` argument
plot_bounds = [[49,55], [49,55]]

visualize.plot_pass_shoot(players_records,
                          players_to_highlight=mvp_players,
                          bounds=plot_bounds)

In [None]:
# Perform k-means clustering for different values of k
# NOTE(security): pickle.load can execute arbitrary code -- only load a
# lineups.pickle that was produced by a trusted source.
with open('lineups.pickle','rb') as lineups_file:
    starting_lineups = pickle.load(lineups_file)

# Name -> (average passing, average shooting), built once for all values of k.
# Records without the averages (players with too few entries) are left out, so
# they are skipped below instead of raising KeyError. The first record for a
# name wins if a name ever repeats.
average_point_by_name = {}
for player, player_record in players_records:
    if 'average passing' in player_record and player['name'] not in average_point_by_name:
        average_point_by_name[player['name']] = (player_record['average passing'],
                                                 player_record['average shooting'])


def lineup_points(lineup):
    """Return the (average passing, average shooting) points of a lineup.

    The points follow the order of lineup['players']; names that have no
    averaged record are skipped.
    """
    return [average_point_by_name[name]
            for name in lineup['players']
            if name in average_point_by_name]


# NOTE(review): if clusterpoints.cluster_points is randomized, seed its random
# generator before this loop so the clusterings are reproducible -- confirm
# which generator it uses.
clusterings = []
for k in range(1,10):
    groups = clusterpoints.cluster_points(
        starting_lineups,
        k,
        points_func = lineup_points,
        iterations_after_none = 500
    )
    clusterings.append((k,groups))

# Save the clusterings, so they can be plotted later
with open('Next clusterings.pickle','wb') as clusterings_file:
    pickle.dump(clusterings, clusterings_file)

In [None]:
# Plot the convergence of the clustering performance for different values of k
# Each entry of clusterings is (k, groups); the y value for a k is the mean of
# the 'score average' of its groups.
cluster_counts = [entry[0] for entry in clusterings]
mean_scores = [np.mean([grp['score average'] for grp in entry[1]])
               for entry in clusterings]

fig, ax = plt.subplots()
ax.plot(cluster_counts, mean_scores)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Average distance from player to cluster center')
ax.set_ylim([0, 7])

# Enlarge every piece of text attached to the axes
text_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
text_items += ax.get_xticklabels()
text_items += ax.get_yticklabels()
for text_item in text_items:
    text_item.set_fontsize('large')

In [None]:
# For each group of the k=4 clustering, plot its clusters with the distribution
# of all players in the background
importlib.reload(visualize)
groups = next(x[1] for x in clusterings if x[0] == 4)

# One color per cluster; zip() below stops at the shorter sequence, so a group
# with more clusters than colors has its extra clusters left unhighlighted.
cluster_colors = ['red', 'yellow', 'limegreen', 'royalblue', 'm']

# Name -> player index, built once instead of scanning players_records for
# every cluster member. The first record for a name wins if a name repeats.
player_by_name = {}
for player, _record in players_records:
    player_by_name.setdefault(player['name'], player)

for group in groups:
    print(', '.join(starting_lineup['team name'] for starting_lineup in group['nearest']))

    # Each cluster member is indexed as member[0] (its point) and member[1] (the player name)
    cluster_points = [[member[0] for member in cluster] for cluster in group['clusters']]
    # Lists rather than bare zip iterators, so the plotting code can iterate them more than once
    clusters = list(zip(cluster_colors, cluster_points))

    # A KeyError here names a cluster member that has no entry in players_records
    positions = [[player_by_name[member[1]]['position'] for member in cluster]
                 for cluster in group['clusters']]

    # Label every cluster with its most common position and the share of members having it.
    # dict.fromkeys de-duplicates in first-seen order, so a tie for the most common position
    # always resolves to the position seen first (a set would resolve it differently per run).
    position_labels = []
    for cluster_positions in positions:
        position_mode = max(dict.fromkeys(cluster_positions), key=cluster_positions.count)
        position_frac = cluster_positions.count(position_mode)*100.0/len(cluster_positions)
        position_labels.append("{} {:.0f}%".format(position_mode, position_frac))

    # The label is placed at the mean of the cluster's points
    position_coords = [np.mean(points, axis=0) for points in cluster_points]
    cluster_labels = list(zip(cluster_colors, position_labels, position_coords))

    visualize.plot_pass_shoot(players_records,
                              points_to_label=cluster_labels,
                              clusters_to_highlight=clusters,
                              bounds=[[49,55],[49,55]],
                              alpha=0.1
                             )