In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image
from sklearn.svm import SVC

In [None]:
## plot coordinates are percentages (0-100); these factors stretch them onto the resized court image
x_scale = 5
y_scale = 3

## Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same resampling filter under its current name
full_court_img = Image.open('../../data/wnba/ml-bg-court.png').resize((x_scale * 100, y_scale * 100), Image.LANCZOS)
full_court_img.size

In [None]:
## per-game box score rows (used further down to validate the parsed 3PT totals)
df_boxscores = pd.read_csv(filepath_or_buffer='../../data/wnba/wnba_boxscores.csv')
df_boxscores.head(n=5)

In [None]:
## raw shot chart: the cells below read its description, location, made and game_id columns
df = pd.read_csv(filepath_or_buffer='../../data/wnba/wnba_shot_chart.csv')
df.head(n=5)

In [None]:
def who_shot(row):
    """Return the shooter's name parsed from a description string, or '' if none is found.

    Two phrasings are recognised:
      * '<name> makes ...' / '<name> misses ...' -> the leading name
      * '... blocks <name> 's ...'               -> the name between 'blocks' and "'s"
    """
    shooter = re.search(r'^([a-z\']+[\s-]+)+(?=makes|misses)', row, flags=re.IGNORECASE)
    if shooter is not None:
        return shooter.group().strip()

    ## no makes/misses phrasing: fall back to the blocked-shot phrasing
    blocked = re.search(r'blocks\s+([a-z\']+[\s-]+)+?(?=\'s)', row, flags=re.IGNORECASE)
    if blocked is not None:
        ## the match still starts with the word 'blocks'; drop it and keep the name
        return re.sub(r'^blocks\s+', '', blocked.group()).strip()

    return ''

def who_blocked(row):
    """Return the name that precedes the word 'blocks' at the start of a description, or ''."""
    blocker = re.search(r'^([a-z\']+[\s-]+)+(?=blocks)', row, flags=re.IGNORECASE)
    return '' if blocker is None else blocker.group().strip()

def distance(row):
    """Return the digits of an 'N-foot' / 'N foot' phrase as a string, or '' when there is none."""
    feet = re.search(r'(\d+)[\s-]+foot', row, flags=re.IGNORECASE)
    return '' if feet is None else feet.group(1).strip()

def distance_category(distance):
    """Bucket a distance string (as produced by `distance`) into a labelled range.

    '' stays ''; otherwise the integer value maps to:
      0-5 -> '0-5 footer', 6-16 -> '<= 16 footer', 17-22 -> '<= 22 footer', above 22 -> '+22 footer'.
    """
    if distance == '':
        return ''

    feet = int(distance)
    ## inclusive upper bounds, checked from the shortest range up
    buckets = (
        (5, '0-5 footer'),
        (16, '<= 16 footer'),
        (22, '<= 22 footer'),
    )
    for upper_bound, label in buckets:
        if feet <= upper_bound:
            return label

    return '+22 footer'

def shot(row):
    """Classify the shot type named in a description (case-sensitive).

    Patterns are tried in priority order and the first hit wins; the captured
    keyword is returned. When nothing matches the shot is labelled 'set'.
    """
    patterns = (
        r'(pullup|floating|step\s+back)\s+jump',
        r'(layup)',
        r'(hook|jumper)',
        r'(free\s+throw)',
    )
    for pattern in patterns:
        found = re.search(pattern, row)
        if found is not None:
            return found.group(1).strip()

    return 'set'

def assist(row):
    """Return the name inside '(<name> assists' in a description, or '' when there is no assist."""
    helper = re.search(r'\(((?:[a-z\']+[\s-])+)(?=assists)', row, flags=re.IGNORECASE)
    return '' if helper is None else helper.group(1).strip()

def is_3pt(row):
    """Decide whether a shot row is a three-point attempt.

    Parameters
    ----------
    row : mapping with
        'description'   - the play description text
        'shot_distance' - the parsed distance as a string, '' when no distance was found

    Returns
    -------
    bool
        True when the description explicitly says 'three point', or when a parsed
        distance is greater than 22; False otherwise.
    """
    ## an explicit 'three point' label is authoritative: previously a parsed distance overrode it,
    ## so a shot described as e.g. '22-foot three point jumper' was labelled as a two
    if re.search(r'three\s+point', row['description']) is not None:
        return True

    ## no explicit label: fall back to the parsed distance, if there is one
    shot_distance = row['shot_distance']
    if len(shot_distance) > 0:
        return int(shot_distance) > 22

    return False

## columns parsed straight from the description text
df['shot_by'] = df['description'].map(who_shot)
df['shot_distance'] = df['description'].map(distance)
## bucketed from the parsed distance, so it has to follow shot_distance
df['shot_distance_cat'] = df['shot_distance'].map(distance_category)
df['blocked_by'] = df['description'].map(who_blocked)
df['shot_type'] = df['description'].map(shot)
df['assist_by'] = df['description'].map(assist)

## row-wise: is_3pt reads both the description and the parsed shot_distance
df['is_3pt'] = df.apply(is_3pt, axis=1)

In [None]:
def parse(loc):
  """Pull the two `calc(<number>%` percentages out of a sequence of strings.

  The strings are joined with spaces and scanned in order. Exactly two
  percentages must be found; they are returned rounded to 2 decimals.
  Any other count yields the sentinel [-1, -1].
  """
  percentages = [float(value) for value in re.findall(r'calc\(([\d.]+)%', ' '.join(loc))]
  if len(percentages) == 2:
    ## author's note: the css offset is "bottom" ... bottom 6% would mean a lower shot near the line,
    ##                bottom 96% would mean an upper shot near the line
    return [round(value, 2) for value in percentages]

  return [-1, -1]

def is_valid(loc):
  """True when no coordinate in `loc` is below 0 or above 100 (the [-1, -1] sentinel fails)."""
  return all(not (coordinate > 100 or coordinate < 0) for coordinate in loc)

## NOTE(review): eval() executes whatever text is stored in the CSV's location column, so only run this
##               on trusted data. If the cells are plain Python literals (presumably lists of css
##               strings, given how parse() joins them), ast.literal_eval would be a safer parser.
## NOTE(review): this overwrites df.location with parsed lists, so the cell is not re-runnable:
##               a second run would hand lists (not strings) to eval().
df.location = df.location.map(eval).map(parse)
## shots whose location could not be parsed carry the [-1, -1] sentinel and are flagged invalid here
df['is_valid'] = df.location.map(is_valid)

df.head(n=10)

In [None]:
## from here on, work on an independent copy holding only the rows flagged is_valid
df_valid = df.loc[df['is_valid']].copy()

In [None]:
def side(loc):
  """Label a location 'right' when its second coordinate is at least 50, otherwise 'left'."""
  if loc[1] >= 50:
    return 'right'

  return 'left'

## reuse the side() helper defined just above instead of repeating its rule in an inline lambda
df_valid['side_of_court'] = df_valid.location.map(side)

def flip(row):
  """Map a shot row onto a single half of the court.

  Reads 'location', 'shot_distance' and 'side_of_court' from `row`:
    * a parsed distance of 46 or more is pinned to [50, 50]
      (presumably the centre of the court - TODO confirm)
    * 'right'-side shots have both coordinates mirrored (100 - value, rounded to 1 decimal)
    * everything else returns the original location object unchanged
  """
  spot = row['location']
  feet = row['shot_distance']

  if feet != '' and int(feet) >= 46:
    return [50, 50]

  if row['side_of_court'] != 'right':
    return spot

  return [round(100 - spot[0], 1), round(100 - spot[1], 1)]

## fold every shot onto one half of the court
df_valid['single_location'] = df_valid.apply(flip, axis=1)

## TODO (author's note): this split "doesnt make sense yet..."
df_valid['side_of_the_half_court'] = df_valid['single_location'].map(
    lambda spot: 'left' if not spot[0] > 50 else 'right'
)

df_valid.head(5)

In [None]:
def get_makes_and_misses_full_court(df):
    """Split unblocked shots into makes and misses using the original `location` column.

    Rows with a non-empty `blocked_by` are dropped first. Returns four Series:
    (x_make, y_make, x_missed, y_missed), where x is location[1] and y is location[0].
    """
    unblocked = df[df.blocked_by.map(lambda bb: len(bb) == 0)]
    made = unblocked[unblocked.made]
    missed = unblocked[~unblocked.made]

    def axis(frame, position):
        ## one coordinate of every location in the frame
        return frame.location.map(lambda point: point[position])

    return axis(made, 1), axis(made, 0), axis(missed, 1), axis(missed, 0)

In [None]:
def get_makes_and_misses_half_court(df):
    """Split unblocked shots into makes and misses using the folded `single_location` column.

    Rows with a non-empty `blocked_by` are dropped first. Returns four Series:
    (x_make, y_make, x_missed, y_missed), where x is single_location[1] and y is single_location[0].
    """
    unblocked = df[df.blocked_by.map(lambda bb: len(bb) == 0)]
    made = unblocked[unblocked.made]
    missed = unblocked[~unblocked.made]

    def axis(frame, position):
        ## one coordinate of every folded location in the frame
        return frame.single_location.map(lambda point: point[position])

    return axis(made, 1), axis(made, 0), axis(missed, 1), axis(missed, 0)

### Can we impute 3pt from location?

In [None]:
fig, ax = plt.subplots(2)

## def. 3pt: parsed distance above 22 AND flagged is_3pt (a missing distance becomes -1, so it never passes)
beyond_22 = df_valid.shot_distance.map(lambda a: -1 if a == '' else int(a)) > 22
df_3pt = df_valid[np.logical_and(beyond_22, df_valid.is_3pt)]

## def. 2pt: parsed distance below 22 AND not flagged is_3pt (a missing distance becomes 50, so it never passes)
inside_22 = df_valid.shot_distance.map(lambda a: 50 if a == '' else int(a)) < 22
df_2pt = df_valid[np.logical_and(inside_22, ~df_valid.is_3pt)]

## top panel: definite threes, bottom panel: definite twos
for panel, labelled in zip(ax, (df_3pt, df_2pt)):
    x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(labelled)

    panel.scatter(x_missed * x_scale, (100 - y_missed) * y_scale, marker='x', c='red', alpha=.5)
    panel.scatter(x_make * x_scale, (100 - y_make) * y_scale, marker='o', c='blue', alpha=.5)
    panel.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
    panel.imshow(full_court_img)

plt.show()

In [None]:
fig, ax = plt.subplots()

## shots with no parsed distance that are not (yet) flagged as threes
no_distance = df_valid['shot_distance'] == ''
df_unlabeled = df_valid[np.logical_and(no_distance, ~df_valid['is_3pt'])].copy()

x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_unlabeled)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

plt.show()

In [None]:
## labelled examples: definite threes plus definite twos, shuffled
## the shuffle is seeded so that Restart & Run All reproduces the same training order and fit
df_training = pd.concat([
    df_3pt[['single_location', 'is_3pt']],
    df_2pt[['single_location', 'is_3pt']]
]).sample(frac=1, random_state=42)

## features are the folded [y, x] locations, the target is 1 for a three and 0 for a two
X = df_training.single_location.tolist()
y = df_training.is_3pt.astype(int).tolist()

model = SVC(random_state=42)
model.fit(X, y)

## NOTE: this is accuracy on the training data itself, not a held-out estimate
model.score(X, y)

In [None]:
## label the shots that had neither a parsed distance nor a three-point flag
unlabeled_locations = df_unlabeled['single_location'].tolist()
df_unlabeled['predicted_3pt'] = model.predict(unlabeled_locations)
df_unlabeled['is_3pt'] = df_unlabeled['predicted_3pt'].astype(bool)

## how many of them the model calls threes
print(len(df_unlabeled.loc[df_unlabeled['is_3pt']]))

In [None]:
fig, ax = plt.subplots()

## only the previously unlabeled shots that the model predicted as threes
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(
    df_unlabeled.loc[df_unlabeled['is_3pt']]
)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
## impute is_3pt for shots with no parsed distance that are not already flagged as threes.
## One masked, batched predict replaces the per-row iterrows loop, which compared the whole
## index and called predict once for every such row.
needs_imputation = np.logical_and(df_valid['shot_distance'] == '', ~df_valid['is_3pt'])
if needs_imputation.any():  ## predict() cannot be called with zero samples
    df_valid.loc[needs_imputation, 'is_3pt'] = model.predict(
        df_valid.loc[needs_imputation, 'single_location'].tolist()
    ).astype(bool)

fig, ax = plt.subplots()

## everything still labelled as a two after imputation
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_valid[~df_valid['is_3pt']])

ax.scatter(x_missed * x_scale, (100 - y_missed) * y_scale, marker='x', c='red', alpha=.5)
ax.scatter(x_make * x_scale, (100 - y_make) * y_scale, marker='o', c='blue', alpha=.5)
ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
fig, ax = plt.subplots()

## every valid shot currently flagged as a three
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_valid.loc[df_valid['is_3pt']])

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

### View Player Shot Charts

In [None]:
## player whose charts are drawn below (another option: 'Sylvia Fowles')
player = 'Kelsey Plum'
df_player = df_valid.loc[df_valid['shot_by'] == player]

df_player.head(n=5)

## Full Shot Chart

In [None]:
def get_makes_and_misses_full_court(df):
    """Split unblocked shots into makes and misses using the original `location` column.

    Rows with a non-empty `blocked_by` are dropped first. Returns four Series:
    (x_make, y_make, x_missed, y_missed), where x is location[1] and y is location[0].
    """
    unblocked = df[df.blocked_by.map(lambda bb: len(bb) == 0)]
    made = unblocked[unblocked.made]
    missed = unblocked[~unblocked.made]

    def axis(frame, position):
        ## one coordinate of every location in the frame
        return frame.location.map(lambda point: point[position])

    return axis(made, 1), axis(made, 0), axis(missed, 1), axis(missed, 0)

In [None]:
print(f'{player} - shot chart')

fig, ax = plt.subplots()

x_make, y_make, x_missed, y_missed = get_makes_and_misses_full_court(df_player)

## the y-axis flips once the image is drawn, hence the (100 - y) on the data
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

plt.show()

## Half Shot Chart

In [None]:
def get_makes_and_misses_half_court(df):
    """Split unblocked shots into makes and misses using the folded `single_location` column.

    Rows with a non-empty `blocked_by` are dropped first. Returns four Series:
    (x_make, y_make, x_missed, y_missed), where x is single_location[1] and y is single_location[0].
    """
    unblocked = df[df.blocked_by.map(lambda bb: len(bb) == 0)]
    made = unblocked[unblocked.made]
    missed = unblocked[~unblocked.made]

    def axis(frame, position):
        ## one coordinate of every folded location in the frame
        return frame.single_location.map(lambda point: point[position])

    return axis(made, 1), axis(made, 0), axis(missed, 1), axis(missed, 0)

In [None]:
print(f'{player} - shot chart')

fig, ax = plt.subplots()

## all of the player's valid shots, folded onto one half of the court
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_player)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

## How well did we pick up 3pt?

In [None]:
print(f'{player} - shot chart')

fig, ax = plt.subplots()

## only the player's shots flagged as threes
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_player.loc[df_player['is_3pt']])

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
## number of the player's threes per (made, side of the half court) combination
df_player.loc[df_player['is_3pt']].groupby(['made', 'side_of_the_half_court'])[['game_id']].count()

In [None]:
## per-game counts of the player's made / missed threes
## NOTE(review): this reads df (all rows, description-derived is_3pt), not df_valid with the
##               imputed labels - confirm that is the intended source for the validation below
df_agg_makes = df[np.logical_and(np.logical_and(df.shot_by == player, df.is_3pt), df.made)].groupby(['game_id']).count()[['made']]
df_agg_misses = df[np.logical_and(np.logical_and(df.shot_by == player, df.is_3pt), ~df.made)].groupby(['game_id']).count()[['made']]
df_agg_misses.columns = ['misses']

## outer join so games with misses but no makes are kept (the default left join dropped them);
## a side with no rows for a game means a count of 0, and the cast back to int keeps the
## formatted value as 'made-attempted' integers rather than floats or NaN
df_totals = df_agg_makes.join(df_agg_misses, how='outer').fillna(0).astype(int)
def format_3pt(row):
    """Format a totals row as '<made>-<made + misses>'."""
    made, total = row['made'], row['made'] + row['misses']
    return f'{made}-{total}'

## one formatted string per game (row-wise apply of format_3pt); a later cell compares it with the box-score '3PT' value
df_totals['3PT'] = df_totals.apply(format_3pt, axis=1)

In [None]:
## box-score rows for a single id, joined with the totals derived from the shot chart
## NOTE(review): 3065570 is hardcoded and not tied to `player` - presumably it is that player's id; confirm
boxscore_3pt = df_boxscores.loc[df_boxscores.id == 3065570, ['game_id', '3PT']]

validation = pd.merge(boxscore_3pt, df_totals, how="left", on='game_id')
## after the merge, 3PT_x is the box-score value and 3PT_y is the value parsed from the shot chart
validation = validation[['game_id', '3PT_x', '3PT_y']]

validation['is_correct'] = validation['3PT_x'] == validation['3PT_y']

validation

### Impute Shot Categories

In [None]:
## one frame per distance bucket produced by distance_category
df_shot_cat_1 = df_valid[df_valid['shot_distance_cat'] == '0-5 footer']
df_shot_cat_2 = df_valid[df_valid['shot_distance_cat'] == '<= 16 footer']
df_shot_cat_3 = df_valid[df_valid['shot_distance_cat'] == '<= 22 footer']
df_shot_cat_4 = df_valid[df_valid['shot_distance_cat'] == '+22 footer']

## shots with no parsed distance have an empty category
df_shot_cat_unlabeled = df_valid[df_valid['shot_distance_cat'] == '']

In [None]:
fig, ax = plt.subplots()

## '0-5 footer' bucket
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_shot_cat_1)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
fig, ax = plt.subplots()

## '<= 16 footer' bucket
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_shot_cat_2)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
fig, ax = plt.subplots()

## '<= 22 footer' bucket
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_shot_cat_3)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
fig, ax = plt.subplots()

## '+22 footer' bucket
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_shot_cat_4)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
fig, ax = plt.subplots()

## shots whose distance category is empty (no parsed distance)
x_make, y_make, x_missed, y_missed = get_makes_and_misses_half_court(df_shot_cat_unlabeled)

## misses are drawn first, then makes
for xs, ys, marker, colour in ((x_missed, y_missed, 'x', 'red'), (x_make, y_make, 'o', 'blue')):
    ax.scatter(xs * x_scale, (100 - ys) * y_scale, marker=marker, c=colour, alpha=.5)

ax.axhline(50 * y_scale, c='k', alpha=.5, linestyle= '--')
ax.imshow(full_court_img)

ax.set_xticks([])
ax.set_yticks([])
plt.show()

### Parsed Data

In [None]:
## distinct shot types produced by shot()
df['shot_type'].unique()

In [None]:
## distinct shot distance categories produced by distance_category()
df['shot_distance_cat'].unique()

In [None]:
## shots: the five longest parsed shooter names (a quick check for over-greedy matches)
sorted(df['shot_by'].unique(), key=len, reverse=True)[0:5]

In [None]:
## blocks: the five longest parsed blocker names
sorted(df['blocked_by'].unique(), key=len, reverse=True)[0:5]

In [None]:
## assists: the five longest parsed assister names
sorted(df['assist_by'].unique(), key=len, reverse=True)[0:5]

In [None]:
## shot distances: the five smallest distinct values

## 91.86 ft ..., -1 == 'Not Given'
sorted(-1 if a == '' else int(a) for a in df['shot_distance'].unique())[:5]