In [None]:
import csv
import enum
import importlib
import math
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

import clusterpoints
import game
import visualize

%matplotlib tk

In [None]:
# Load all the games into memory
# `games` is the object returned by game.load_processed_data for the 'processed_data' path.
# NOTE(review): `n` is presumably the number of games to load — confirm against
# game.load_processed_data before changing n_games.
n_games = 1304
games = game.load_processed_data('processed_data', n=n_games)

In [None]:
# Count each player's shots, dribbles and passes
# Result: players_counts is a list of (player, counts) tuples, one per distinct player name,
# in order of first appearance. `player` is the first h.player record seen for that name and
# `counts` is a dict with the keys 'pass', 'shot' and 'dribble'.
players_counts = []
# Name -> (player, counts) index, so each event is a dict lookup instead of a scan over
# every player collected so far. The tuples are shared with players_counts.
counts_by_name = {}
for game_code, g in games.items():
    for quarter_num, quarter in g.quarters.items():
        # Quarters without possession data contribute nothing.
        if quarter.possessions is None:
            continue
        for p in quarter.possessions:
            for h in p.thread:
                # Thread entries with no player attached cannot be attributed to anyone.
                if h.player is None:
                    continue
                player_name = h.player['name']
                entry = counts_by_name.get(player_name)
                if entry is None:
                    entry = (h.player, {'pass': 0, 'shot': 0, 'dribble': 0})
                    counts_by_name[player_name] = entry
                    players_counts.append(entry)
                player, counts = entry
                # Only end types that have a counter are tallied; anything else is ignored.
                if h.end_type in counts:
                    counts[h.end_type] += 1

In [None]:
# For every player with more than 15 counted actions, compute coordinates on a triangle plot.
# Each entry of players_coords is (player, counts, [x, y]) where
#   x = pass fraction + 0.500 * shot fraction
#   y = 0.866 * shot fraction
players_coords = []
for player, counts in players_counts:
    total = sum(counts.values())
    # Players with 15 or fewer actions are left out.
    if total <= 15:
        continue
    stats = {}
    for action, count in counts.items():
        stats[action] = count*1.0/total
    x_coord = stats['pass'] + 0.500*stats['shot']
    y_coord = 0.866*stats['shot']
    players_coords.append((player, counts, [x_coord, y_coord]))

In [None]:
# Get the number of games played from the players file
# The file is opened in a `with` block so the handle is closed once the rows are read;
# newline='' is the mode the csv module asks for when it is given a file object.
with open('players.csv', 'r', newline='') as players_file:
    csvreader = csv.DictReader(players_file)
    players_list = [dict(row) for row in csvreader]

# Index the raw 'games played' value by player name. setdefault keeps the first row when a
# name appears more than once. The value is converted to int only on lookup, so rows for
# players that are never looked up are not parsed.
games_played_by_name = {}
for p in players_list:
    games_played_by_name.setdefault(p['name'], p['games played'])

# Store the count on each player dict (in place); players missing from the file get 0.
for player, counts, coords in players_coords:
    player['games played'] = int(games_played_by_name.get(player['name'], 0))

In [None]:
# Plot the full triangle plot: every player, both axes spanning 0 to 1.0
importlib.reload(visualize)  # re-import the visualize module before plotting
full_plot_options = dict(
    tick_size=[.1, .1],
    bounds=[[0, 1.0], [0, 1.0]],
    alpha=0.2,
)
visualize.triangle_scatter_players(players_coords, **full_plot_options)

In [None]:
# Plot the distribution on a narrower window of the triangle and highlight the MVP players
mvp_names = [
    'LeBron James',
    'Russell Westbrook',
    'Kevin Durant',
    'Kawhi Leonard',
    'Stephen Curry',
    'James Harden',
    'Kevin Love',
    'DeAndre Jordan',
]
# Horizontal range 0.4-0.9, vertical range 0-0.15.
zoom_bounds = [[0.4, 0.9], [0, 0.15]]
visualize.triangle_scatter_players(
    players_coords,
    players_to_highlight=mvp_names,
    tick_size=[.1, .03],
    bounds=zoom_bounds,
    alpha=0.4,
)

In [None]:
# Plot the correlation between EFG and passing propensity
# Passing propensity is a player's pass count divided by the total of all counted actions.
# Players whose 'efg' is NaN are dropped from both arrays so x and y stay aligned.
rated_players = [(player, counts) for player, counts, coords in players_coords
                 if not math.isnan(player['efg'])]
x = np.array([counts['pass']*1.0/sum(counts.values()) for player, counts in rated_players])
y = np.array([player['efg'] for player, counts in rated_players])

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.plot(x,y,'.')
ax.set_xlabel('Passing propensity')
ax.set_ylabel('Effective Field Goal Percentage')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize('large')

# Do a linear regression, plot the line of best fit and label it
slope, intercept, rvalue, pvalue, stderr = scipy.stats.linregress(x,y)
ax.plot(x,x*slope+intercept,linewidth=3)
# The label text is passed positionally: the keyword for it was renamed from `s` to `text`
# in newer Matplotlib releases, and the positional form works with both.
# xytext starts equal to xy, so the label sits on the line until it is dragged away.
fit_anchor = (np.mean(x), np.mean(x)*slope+intercept)
ax.annotate(
    "efg={:.2f}*pass+{:.2f}\nr={:.2f}, p={:.1e}".format(slope, intercept, rvalue, pvalue),
    xy=fit_anchor,
    xytext=fit_anchor,
    bbox=dict(boxstyle='round,pad=0.5', fc='orange'),
    arrowprops=dict(arrowstyle="->", color='orange', lw=4),
    zorder=5).draggable()

# Highlight the MVPs
names = ['LeBron James',
         'Russell Westbrook',
         'Kevin Durant',
         'Kawhi Leonard',
         'Stephen Curry',
         'James Harden',
         'Kevin Love',
         'DeAndre Jordan']
for name in names:
    found = next(((player, counts) for player, counts, coords in players_coords
                  if player['name'] == name), None)
    if found is None:
        # A highlighted player may be absent from players_coords; skip the label instead of
        # letting next() raise StopIteration and abort the whole cell.
        continue
    player, counts = found
    point = (counts['pass']*1.0/sum(counts.values()), player['efg'])
    ax.annotate(
        name,
        xy=point,
        xytext=point,
        bbox=dict(boxstyle='round,pad=0.5', fc='limegreen'),
        arrowprops=dict(arrowstyle="->", color='limegreen', lw=4),
        zorder=4).draggable()

In [None]:
# Perform k-means clustering for different values of k
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# lineups.pickle when it comes from a trusted source.
with open('lineups.pickle', 'rb') as lineups_file:
    starting_lineups = pickle.load(lineups_file)

# Player name -> triangle-plot coordinates, built once so each lineup lookup is a dict access
# instead of a scan over players_coords. setdefault keeps the first entry for a name.
coords_by_name = {}
for player, counts, coords in players_coords:
    coords_by_name.setdefault(player['name'], coords)


def lineup_points(lineup):
    """Return the coordinates of the players named in lineup['players'].

    Players that have no entry in players_coords are left out, so the returned list can be
    shorter than the lineup.
    """
    return [coords_by_name[name] for name in lineup['players'] if name in coords_by_name]


# NOTE(review): no random seed is set in this cell. If clusterpoints.cluster_points uses
# random initialisation the groups will differ between runs — confirm and seed if so.
# clusterings collects one (k, groups) tuple for each k from 1 to 9.
clusterings = []
for k in range(1,10):
    groups = clusterpoints.cluster_points(
        starting_lineups,
        k,
        points_func = lineup_points,
        iterations_after_none = 500
    )
    clusterings.append((k,groups))

In [None]:
# Plot the clustering score against the number of clusters k
# For each k the plotted value is the mean of 'score average' over that k's groups.
cluster_counts = [k for k, k_groups in clusterings]
mean_scores = [np.mean([g['score average'] for g in k_groups])
               for k, k_groups in clusterings]

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.plot(cluster_counts, mean_scores)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Average distance from player to cluster center')
ax.set_ylim([0, 0.01])

# Enlarge the title, both axis labels and every tick label.
text_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
text_items = text_items + ax.get_xticklabels() + ax.get_yticklabels()
for text_item in text_items:
    text_item.set_fontsize('large')

In [None]:
# For each cluster, plot the player groups with the distribution in the background
importlib.reload(visualize)

# Loop-invariant inputs, built once instead of once per group.
cluster_colors = ['red', 'yellow', 'limegreen', 'royalblue', 'm']
# Name -> player record (first record wins), replacing a scan of players_counts for every
# cluster member. 'position' is read at lookup time, so only clustered players need it.
player_by_name = {}
for player, counts in players_counts:
    player_by_name.setdefault(player['name'], player)

# Use the clustering that was computed with k == 4.
groups = next(x[1] for x in clusterings if x[0] == 4)
for group in groups:
    print(', '.join(starting_lineup['team name'] for starting_lineup in group['nearest']))

    # Each cluster member is indexed as x[0] (the point that is plotted and averaged) and
    # x[1] (the player's name).
    # zip() is materialised into a list: in Python 3 a bare zip object can be iterated only
    # once, which would silently yield nothing if the plotting code walks it a second time.
    clusters = list(zip(cluster_colors,
                        [[x[0] for x in cluster] for cluster in group['clusters']]))
    positions = [[player_by_name[x[1]]['position'] for x in cluster]
                 for cluster in group['clusters']]
    # Most common position per cluster. dict.fromkeys de-duplicates while keeping first-seen
    # order, so a tie goes to the position that appears first rather than depending on set
    # iteration order, which can change between interpreter runs.
    position_modes = [max(dict.fromkeys(x), key=x.count) for x in positions]
    # Percentage of the cluster's members that have the most common position.
    position_fracs = [members.count(mode)*100.0/len(members)
                      for members, mode in zip(positions, position_modes)]
    position_labels = ["{} {:.0f}%".format(mode, frac)
                       for mode, frac in zip(position_modes, position_fracs)]
    # Each label is placed at the mean of its cluster's points.
    position_coords = [np.mean([x[0] for x in cluster],axis=0) for cluster in group['clusters']]
    cluster_labels = list(zip(cluster_colors, position_labels, position_coords))

    visualize.triangle_scatter_players(players_coords,
                                   points_to_label=cluster_labels,
                                   clusters_to_highlight=clusters,
                                   tick_size=[.1,.03],
                                   bounds=[[0.4,0.9],[0,0.15]],
                                   alpha=0.1,
                                   label=False)