In [None]:
import re

import pandas as pd

# Similar to data_analysis.py but with chunk loading and smaller data for testing

# switch back to full data when done testing
# df = pd.read_csv("play_by_play_sample.csv")

# csv_path = "archive/csv/play_by_play.csv"
# chunk_size = 500_000
# chunks = []

# for chunk in pd.read_csv(csv_path, chunksize=chunk_size, low_memory=False):
#     chunks.append(chunk)

# df = pd.concat(chunks, ignore_index=True)
# print("Loaded data with shape:", df.shape)


# Temporary, for testing: load a small slice of the data so it does not take hours to load.
# Reads the CSV in chunks and limits the result to 50,000 rows; switch back to the full load when plugged in.
# Loader settings. `max_rows` caps how many rows are parsed while testing;
# set it to None to load the whole file (still streamed in `chunk_size` pieces).
csv_path = "archive/csv/play_by_play.csv"
chunk_size = 500_000
max_rows = 50_000

chunks = []
rows_loaded = 0

# Passing `nrows` makes the reader itself stop at the cap, so nothing beyond
# `max_rows` is parsed, and the `with` block closes the file handle as soon as
# the last chunk has been consumed.
with pd.read_csv(
    csv_path, chunksize=chunk_size, nrows=max_rows, low_memory=False
) as reader:
    for chunk in reader:
        chunks.append(chunk)
        rows_loaded += len(chunk)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)
print("Loaded data with shape:", df.shape)

def classify_event(row):
    """Translate a row's ``eventmsgtype`` code into a readable event label.

    Parameters
    ----------
    row : object
        Anything exposing an ``eventmsgtype`` attribute (e.g. a DataFrame row).

    Returns
    -------
    str
        The label paired with the code in the table below, or ``"other"``
        when the code is not listed.
    """
    # (code, label) pairs, checked in this order with ``==``.
    code_labels = (
        (12, "period_start"),
        (10, "jump_ball"),
        (1, "shot_make"),
        (2, "shot_miss"),
        (3, "free_throw"),
        (4, "rebound"),
        (5, "turnover"),
        (6, "foul"),
        (8, "substitution"),
    )

    event_code = row.eventmsgtype
    for code, label in code_labels:
        if event_code == code:
            return label
    return "other"

# Label every row with a readable event class. Iterating a one-column frame with
# itertuples hands classify_event an object with the `.eventmsgtype` attribute it
# reads, without building a full Series per row as `df.apply(..., axis=1)` does.
df["event_class"] = [
    classify_event(event) for event in df[["eventmsgtype"]].itertuples(index=False)
]

# Parse possessions: walk the events in row order and stamp each row with the id
# of the possession that is in progress when the event happens.
current = 0                  # id of the possession in progress
last_team_in_poss = None     # bookkeeping only: written below, never read in this cell
shot_attempt_active = False  # True between a missed shot and the rebound that follows it
team_that_shot = None        # team of the latest shooter (or of the rebounder after a miss)

possession_ids = []

for cls, team, player3_team in zip(
    df["event_class"], df["player1_team_id"], df["player3_team_id"]
):
    # The row belongs to the possession that was live when it occurred; any
    # increment below only affects the rows that come after it.
    possession_ids.append(current)

    if cls == "jump_ball":
        last_team_in_poss = player3_team
        shot_attempt_active = False
        team_that_shot = None

    elif cls == "shot_make":
        # A made shot ends the possession.
        shot_attempt_active = False
        team_that_shot = team
        current += 1
        last_team_in_poss = None

    elif cls == "shot_miss":
        # Possession is undecided until the rebound.
        shot_attempt_active = True
        team_that_shot = team

    elif cls == "rebound":
        rebound_team = None if pd.isna(team) else team
        if shot_attempt_active:
            # A rebound by a different team than the shooter's starts a new
            # possession; a rebound by the same team continues the current one.
            # NOTE(review): a rebound with a missing player1_team_id never
            # equals team_that_shot, so it always starts a new possession —
            # confirm that is intended.
            if rebound_team != team_that_shot:
                current += 1
            shot_attempt_active = False
            team_that_shot = rebound_team
        else:
            last_team_in_poss = rebound_team

    elif cls == "turnover":
        shot_attempt_active = False
        current += 1

    # Every other event class (free throws, fouls, substitutions, ...) leaves
    # the possession state untouched.

# Write the ids in one assignment instead of one `.at` write per row.
df["possession_id"] = pd.Series(possession_ids, index=df.index, dtype="int64")

# extracting shots
# Digits immediately followed by an apostrophe, e.g. the "26'" in "Curry 26' 3PT Jump Shot".
_SHOT_DISTANCE_PATTERN = re.compile(r"(\d+)'")


def extract_shot_distance(desc):
    """Pull the shot distance out of a play description.

    The distance is taken to be the first run of digits that is directly
    followed by an apostrophe. Anchoring on that pair (rather than on the first
    apostrophe in the text) keeps apostrophes inside names such as "O'Neal" and
    unrelated digits earlier in the text from corrupting the result.

    Parameters
    ----------
    desc : object
        Play description. Anything that is not a ``str`` (``None``, NaN, ...)
        yields ``None``.

    Returns
    -------
    int or None
        The distance as an integer, or ``None`` when the text has no
        ``<digits>'`` marker.
    """
    if not isinstance(desc, str):
        return None

    match = _SHOT_DISTANCE_PATTERN.search(desc)
    if match is None:
        return None
    return int(match.group(1))

# Only made/missed shots carry a shot distance or a shooter.
is_shot = df["event_class"].isin(["shot_make", "shot_miss"])

# For each shot, read the distance from the home description and fall back to
# the visitor description when the home text has none (missing, or a string
# without a distance marker). Non-shot rows are left as NaN.
shot_distances = []
for home_desc, visitor_desc in zip(
    df.loc[is_shot, "homedescription"], df.loc[is_shot, "visitordescription"]
):
    distance = extract_shot_distance(home_desc)
    if distance is None:
        distance = extract_shot_distance(visitor_desc)
    shot_distances.append(distance)

# Aligning on the shot rows' index fills every other row with NaN.
df["shot_distance"] = pd.Series(
    shot_distances, index=df.index[is_shot], dtype="float64"
)

# Shooter name for shot rows, None everywhere else.
df["shooter"] = [
    name if shot else None for name, shot in zip(df["player1_name"], is_shot)
]

shot_chart = df.loc[
    is_shot,
    ["shooter", "event_class", "shot_distance", "homedescription", "visitordescription", "possession_id"],
].dropna(subset=["shooter"])

print(shot_chart.head())


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy.sparse import hstack, csr_matrix

# ---- Configuration ----------------------------------------------------------
cap_rows = 100_000  # upper bound on the number of rows used for modelling
SEED = 42  # fixed seed so the row subsample and the train/test split are repeatable

# Score-margin buckets and the weight each one contributes to the
# time x margin interaction feature.
marginLables = ["Behind (<= -6)", "Close (-5 to 5)", "Ahead (>= 6)"]
marginWeights = {
    "Behind (<= -6)": 1.2,
    "Close (-5 to 5)": 1.0,
    "Ahead (>= 6)": 0.8
}

catCols = ["MarginBucket", "period"]
numCols = ["seconds_left", "time_margin_interaction", "shooter_3pt_rate"]

# ---- Shot-level features ----------------------------------------------------
# Event codes 1 and 2 are the made / missed shots (see classify_event).
shots = df.loc[df["eventmsgtype"].isin([1, 2])].copy()

# Target: 1 when either description mentions "3PT" (case-insensitive).
desc = (shots["homedescription"].fillna("") + " " + shots["visitordescription"].fillna("")).str.upper()
shots["is_three"] = desc.str.contains("3PT", na=False).astype(int)

# Normalise shooter names but keep missing names missing. A plain astype(str)
# would turn NaN into the string "nan", which dropna() below cannot remove.
raw_shooter = shots["player1_name"]
shots["Shooter"] = (
    raw_shooter.astype(str).str.strip().str.lower().where(raw_shooter.notna())
)

# NOTE(review): assumes a tied score may be stored as the literal string "TIE";
# it is mapped to 0 so it is not coerced to NaN and dropped. This is a no-op
# when the column is already numeric — confirm against the raw CSV.
raw_margin = shots["scoremargin"]
shots["scoremargin"] = pd.to_numeric(
    raw_margin.mask(raw_margin.eq("TIE"), 0), errors="coerce"
)
shots["period"] = pd.to_numeric(shots["period"], errors="coerce")

# Seconds left, parsed from an "MM:SS" clock string.
time_str = shots["pctimestring"].astype(str)
# reindex guarantees both columns exist even when no value contains ":".
mmss = time_str.str.split(":", expand=True).reindex(columns=[0, 1])

shots["seconds_left"] = (
    pd.to_numeric(mmss[0], errors="coerce") * 60 +
    pd.to_numeric(mmss[1], errors="coerce")
)

# Bucket the score margin; rows with a missing margin keep pd.NA.
# NOTE(review): shots with no recorded scoremargin are dropped below. If the
# margin is only populated on some event types, whole classes of shots may
# disappear from the model — verify the retained row count printed below.
m = shots["scoremargin"]
shots["MarginBucket"] = pd.NA
shots.loc[m.le(-6), "MarginBucket"] = "Behind (<= -6)"
shots.loc[m.between(-5, 5), "MarginBucket"] = "Close (-5 to 5)"
shots.loc[m.ge(6), "MarginBucket"] = "Ahead (>= 6)"

model_data = shots.dropna(subset=["Shooter", "MarginBucket", "seconds_left", "period"]).copy()

print("Rows after cleaning:", len(model_data))
print("3PT attempt rate:", model_data["is_three"].mean() if len(model_data) else None)

if len(model_data) > cap_rows:
    model_data = model_data.sample(cap_rows, random_state=SEED)

model_data["time_margin_interaction"] = model_data["seconds_left"] * model_data["MarginBucket"].map(marginWeights)

y = model_data["is_three"].to_numpy()

# ---- Train / test split -----------------------------------------------------
# Split row positions first so label-derived features can be fitted on the
# training rows only.
train_idx, test_idx = train_test_split(
    np.arange(len(model_data)), test_size=0.2, stratify=y, random_state=SEED
)

# shooter_3pt_rate is derived from the target. Computing it from training rows
# only keeps test labels out of the features; shooters that appear only in the
# test rows fall back to the overall training rate.
train_rows = model_data.iloc[train_idx]
train_shooter_rates = train_rows.groupby("Shooter")["is_three"].mean()
train_overall_rate = float(train_rows["is_three"].mean())
model_data["shooter_3pt_rate"] = (
    model_data["Shooter"].map(train_shooter_rates).fillna(train_overall_rate)
)

# ---- Design matrix ----------------------------------------------------------
X_num = csr_matrix(model_data[numCols].to_numpy())
enc = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
X_cat = enc.fit_transform(model_data[catCols].astype({"period": int}))
# CSR format so the matrix can be row-indexed with the split positions.
X = hstack([X_num, X_cat], format="csr")

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# ---- Fit and evaluate -------------------------------------------------------
clf = LogisticRegression(max_iter=5000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print("\n3-Point Attempt Prediction")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))


# Plot: predicted 3PT-attempt probability against time left, one curve per margin bucket.
# Period and shooter rate are held fixed at the most common period and the mean rate.
default_period = int(model_data["period"].mode().iloc[0])
default_shooter_rate = float(model_data["shooter_3pt_rate"].mean())
sec_range = np.linspace(0, 720, 200)

fig, ax = plt.subplots(figsize=(9, 5))

for bucket in marginLables:
    # Synthetic rows: sweep the clock while everything else stays constant.
    grid = pd.DataFrame({"seconds_left": sec_range}).assign(
        MarginBucket=bucket,
        period=default_period,
        shooter_3pt_rate=default_shooter_rate,
    )
    # Every row shares the same bucket, so its weight is a single scalar.
    grid["time_margin_interaction"] = grid["seconds_left"] * marginWeights[bucket]

    # Same layout as the training matrix: numeric block, then one-hot block.
    features = hstack([
        csr_matrix(grid[numCols].to_numpy()),
        enc.transform(grid[catCols].astype({"period": int})),
    ])

    ax.plot(sec_range, clf.predict_proba(features)[:, 1], label=bucket)

# Reverse the x-axis so the clock counts down from left to right.
ax.invert_xaxis()
ax.set_xlabel("Seconds left in period")
ax.set_ylabel("Predicted probability of 3PT attempt")
ax.set_title(f"3PT Attempt Probability vs Time (Period {default_period})")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()