In this notebook, we investigate errors in the shooting position data from the Euroleague API.

To run the notebook, the data must first be fetched using the notebook `collection-season-shot-data.ipynb` and stored at `../data/`.

In [None]:
from glob import glob
import numpy as np
import pandas as pd

# Load data from all seasons

In [None]:
# Collect every per-season CSV export. The pattern requires a real ".csv"
# extension, and sorting makes the load order (and therefore the row order of
# the combined frame) deterministic, since glob() returns paths in arbitrary order.
files = sorted(glob("../data/*.csv"))

In [None]:
# Read every season file and stack them into a single frame.
# Fail early with an actionable message: pd.concat([]) would otherwise raise an
# opaque "No objects to concatenate" error.
if not files:
    raise FileNotFoundError(
        "No CSV files found in ../data/ - run collection-season-shot-data.ipynb first"
    )
df_list = [pd.read_csv(file) for file in files]
# ignore_index gives every row a unique label instead of restarting at 0 for each file.
df = pd.concat(df_list, ignore_index=True)

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

# Explore the dataset

In [None]:
# Sorted distinct values found in the ACTION column.
np.unique(df["ACTION"].to_numpy())

In [None]:
# Sorted distinct values found in the ZONE column.
np.unique(df["ZONE"].to_numpy())

In [None]:
# Number of rows (non-null Gamecode values) for each ACTION / ID_ACTION pair.
(
    df.groupby(["ACTION", "ID_ACTION"])["Gamecode"]
    .count()
    .to_frame(name="n_rows")
)

In [None]:
# Report the extent of the recorded shot coordinates along each axis.
coord_labels = {
    "Min and max values of the x-coord:": "COORD_X",
    "Min and max values of the y-coord:": "COORD_Y",
}
for label, column in coord_labels.items():
    values = df[column]
    print(label, values.min(), values.max())

# Explore 2PT vs 3PT FGs

In [None]:
import sys
from matplotlib import pyplot as plt

# Make the helper modules in ../utils/ importable; this must run before the
# two imports below.
sys.path.append("../utils/")
from draw_court import draw_court, Arc
from shot_chart_plots import plot_scatter, joint_plot
# Enable matplotlib integration for this session (no backend is named here, so
# IPython chooses its default one).
%matplotlib

# Close every figure that is still open, e.g. from a previous run of the notebook.
plt.close("all")

Split the dataset into the 2PT FGs (including layups and dunks) and 3PT FGs

In [None]:
# ACTION labels that count as two-point attempts (layups and dunks included).
two_pointer_terms = ["Two Pointer", "Layup", "Dunk"]
two_pointer_pattern = "|".join(two_pointer_terms)

# Row selectors: seasons from 2010 onwards, and shot type matched on the ACTION text.
season_mask = df["Season"] >= 2010
twopt_mask = df["ACTION"].str.contains(two_pointer_pattern)
threept_mask = df["ACTION"].str.contains("Three Pointer")

# One frame per shot type, both restricted to the selected seasons.
three_pointers_df = df[threept_mask & season_mask]
two_pointers_df = df[twopt_mask & season_mask]

In [None]:
# The 3PT line arc, in graph coordinates: centred on the origin, with the same
# width and height (twice 675) and spanning the angles 12 to 167.5.
arc_diameter = 2 * 675
three_arc = Arc(
    (0, 0),
    arc_diameter,
    arc_diameter,
    theta1=12,
    theta2=167.5,
    linewidth=1,
    color=None,
)

## 3PT FG attempts

In [None]:
# Scatter every three-point attempt on top of the court drawing.
plt.figure()
draw_court()
# The series needs a non-empty label: matplotlib skips empty labels, so the
# legend call below would otherwise have no entry for these points.
plt.plot(three_pointers_df['COORD_X'], three_pointers_df['COORD_Y'], 'o',
         label='3PT FG attempts')
plt.legend()
# Fixed viewing window, in the same units as COORD_X / COORD_Y.
plt.xlim([-800, 800])
plt.ylim([-200, 1300])
plt.title("Three-point FG attempted since season 2010")
plt.show()

## 2PT FG attempts

In [None]:
# Scatter every two-point attempt on top of the court drawing.
plt.figure()
draw_court()
# The series needs a non-empty label: matplotlib skips empty labels, so the
# legend call below would otherwise have no entry for these points.
plt.plot(two_pointers_df['COORD_X'], two_pointers_df['COORD_Y'], 'o',
         label='2PT FG attempts')
plt.legend()
# Fixed viewing window, in the same units as COORD_X / COORD_Y.
plt.xlim([-800, 800])
plt.ylim([-200, 1300])
plt.title("Two-point FG attempted since season 2010")
plt.show()

Investigate the "long" 2PT shot from the chart above.

In [None]:
# Two-point attempts recorded with a y-coordinate above 1000.
is_long_shot = two_pointers_df["COORD_Y"].gt(1000)
two_pointers_df.loc[is_long_shot]

# Counts of falsely identified shots over the years

Focus on areas well inside the arc and areas well outside the arc to avoid edge cases very near the arc.

In [None]:
# 3PT attempts whose |x| and |y| are both at most 400, i.e. inside a square
# around the coordinate origin.
within_x = three_pointers_df["COORD_X"].abs().le(400)
within_y = three_pointers_df["COORD_Y"].abs().le(400)
mask_false_3pt = within_x & within_y

# Per-season count of those rows (non-null Gamecode values).
threept_dist = (
    three_pointers_df.loc[mask_false_3pt]
    .groupby("Season")["Gamecode"]
    .count()
)

In [None]:
# 2PT attempts whose |y| is at least 700.
mask_false_2pt = two_pointers_df["COORD_Y"].abs().ge(700)

# Per-season count of those rows (non-null Gamecode values).
twopt_dist = (
    two_pointers_df.loc[mask_false_2pt]
    .groupby("Season")["Gamecode"]
    .count()
)

In [None]:
# Put the two per-season counts side by side, one column per shot type.
# A season present in only one of the series would otherwise appear as NaN
# (turning the counts into floats) and could be appended out of chronological
# order, so missing counts become 0, the values are cast back to int, and the
# rows are sorted by season.
dist_df = (
    pd.concat([threept_dist, twopt_dist], axis=1, keys=["3PT", "2PT"])
    .fillna(0)
    .astype(int)
    .sort_index()
)

In [None]:
# Stacked bars: one bar per season, split into the 3PT and 2PT counts.
dist_df.plot(
    kind="bar",
    stacked=True,
    title="Falsely registered shots according to their location on the court",
)