In [1]:
import pickle as pkl
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

np.warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=RuntimeWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

In [2]:
all_players = pd.read_csv("data/all_players.csv")
all_teams = pd.read_csv("data/all_teams.csv")
merged = pd.read_csv("data/merged.csv")
with open("data/matches.pkl", "rb") as file:
    matches = pkl.load(file)

## Calculating PER

The Player Efficiency Rating (PER) is a per-minute rating developed by ESPN.com columnist John Hollinger. In John's words, "The PER sums up all a player's positive accomplishments, subtracts the negative accomplishments, and returns a per-minute rating of a player's performance." It appears from his books that John's database only goes back to the 1988-89 season. I decided to expand on John's work and calculate PER for all players since minutes played were first recorded (1951-52).

---

All calculations begin with what I am calling unadjusted PER (uPER). The formula is:



![alt text](notebook_asset/uper.svg "Player Efficiency Rating")

OR

    uPER = (1 / MP) *
     [ 3P
     + (2/3) * AST
     + (2 - factor * (team_AST / team_FG)) * FG
     + (FT *0.5 * (1 + (1 - (team_AST / team_FG)) + (2/3) * (team_AST / team_FG)))
     - VOP * TOV
     - VOP * DRB% * (FGA - FG)
     - VOP * 0.44 * (0.44 + (0.56 * DRB%)) * (FTA - FT)
     + VOP * (1 - DRB%) * (TRB - ORB)
     + VOP * DRB% * ORB
     + VOP * STL
     + VOP * DRB% * BLK
     - PF * ((lg_FT / lg_PF) - 0.44 * (lg_FTA / lg_PF) * VOP) ]

---

Most of the terms in the formula above should be clear, but let me define the less obvious ones:

![alt text](notebook_asset/factor.svg "Player Efficiency Rating")

OR

    factor = (2 / 3) - (0.5 * (lg_AST / lg_FG)) / (2 * (lg_FG / lg_FT))

---

![alt text](notebook_asset/vop.svg "Player Efficiency Rating")

OR

     VOP = lg_PTS / (lg_FGA - lg_ORB + lg_TOV + 0.44 * lg_FTA)

---

![alt text](notebook_asset/drbp.svg "Player Efficiency Rating")

OR

    DRB% = (lg_TRB - lg_ORB) / lg_TRB

---

The calcuation of uPER obviously depends on these statistics, so here are my solutions for years when the data are missing:

* Zero out three-point field goals, turnovers, blocked shots, and steals.
* Set the league value of possession (VOP) equal to 1.
* Set the defensive rebound percentage (DRB%) equal to 0.7.
* Set player offensive rebounds (ORB) equal to 0.3 * TRB.

Some of these solutions may not be elegant, but I think they are reasonable. After uPER is calculated, an adjustment must be made for the team's pace. The pace adjustment is:

    pace adjustment = lg_Pace / team_Pace

League and team pace factors cannot be computed for seasons prior to 1973-74, so I estimate the above using:

    estimated pace adjustment = 2 * lg_PPG / (team_PPG + opp_PPG)

Now the pace adjustment is made to uPER (I will call this aPER):

    aPER = (pace adjustment) * uPER

The final step is to standardize aPER. First, calculate league average aPER (lg_aPER) using player minutes played as the weights. Then, do the following:

    PER = aPER * (15 / lg_aPER)

The step above sets the league average to 15 for all seasons.



**source**: https://www.basketball-reference.com/about/per.html

#### Filtering Dataframe for Current Season Only

In [5]:
merged = merged[merged["SEASON_ID"] == "2021-22"]
team_stats = merged.groupby("TEAM")[
    [
        "FGM",
        "FGA",
        "FG3M",
        "FG3A",
        "FTM",
        "FTA",
        "OREB",
        "DREB",
        "REB",
        "AST",
        "STL",
        "BLK",
        "TOV",
        "PF",
        "PTS",
    ]
].sum()
team_stats.head()

Unnamed: 0_level_0,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
TEAM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Atlanta Hawks,1420,3047,430.0,1131.0,637,779,344.0,1173.0,1517.0,869,216.0,144.0,403.0,527,3907
Boston Celtics,1615,3567,486.0,1417.0,731,897,395.0,1373.0,1768.0,948,310.0,241.0,545.0,727,4447
Brooklyn Nets,1386,2919,372.0,1058.0,688,811,228.0,1118.0,1346.0,926,236.0,163.0,453.0,544,3832
Charlotte Hornets,1773,3827,617.0,1646.0,618,835,417.0,1396.0,1813.0,1106,367.0,198.0,500.0,764,4781
Chicago Bulls,1562,3280,445.0,1133.0,639,776,272.0,1184.0,1456.0,934,285.0,167.0,476.0,638,4208


#### Calculating Factor

In [4]:
lg_AST = merged["AST"].sum()
lg_FG = merged["FGM"].sum()
lg_FT = merged["FTM"].sum()
merged["factor"] = (2 / 3) * (0.5 * (lg_AST) / lg_FG) / (2 * (lg_FG / lg_FT))

In [5]:
merged["factor"].unique()[0]

0.0411127363473995

#### Calculating VOP

In [6]:
lg_PTS = merged["PTS"].sum()
lg_FGA = merged["FGA"].sum()
lg_ORB = merged["OREB"].sum()
lg_TOV = merged["TOV"].sum()
lg_FTA = merged["FTA"].sum()
merged["vop"] = lg_PTS / (lg_FGA - lg_ORB + lg_TOV + 0.44 * lg_FTA)

In [7]:
merged["vop"].unique()[0]

1.0826488323115735

#### Calculating DRB%

In [8]:
lg_TRB = merged["REB"].sum()
lg_ORB = merged["OREB"].sum()
merged["drbp"] = (lg_TRB - lg_ORB) / lg_TRB

In [9]:
merged["drbp"].unique()[0]

0.7820528455284553

#### Calculating uPER

In [10]:
def uPER(dataframe, team_stat_df, team):
    MP = merged["MIN"]
    FG3M = merged["FG3M"]
    AST = merged["AST"]
    factor = merged["factor"]
    FG = merged["FGM"]
    FT = merged["FTM"]
    VOP = merged["vop"]
    TOV = merged["TOV"]
    DRBP = merged["drbp"]
    FGA = merged["FGA"]
    FTA = merged["FTA"]
    TRB = merged["REB"]
    ORB = merged["OREB"]
    STL = merged["STL"]
    BLK = merged["BLK"]
    PF = merged["PF"]

    lg_FT = dataframe["FTM"].sum()
    lg_PF = dataframe["PF"].sum()
    lg_FTA = dataframe["FTA"].sum()

    mask = team_stat_df[team_stat_df.index == team]
    team_AST = mask["AST"].values[0]
    team_FG = mask["FGM"].values[0]

    uPER = (1 / MP) * (
        FG3M
        + (2 / 3) * AST
        + (2 - factor * (team_AST / team_FG)) * FG
        + (FT * 0.5 * (1 + (1 - (team_AST / team_FG)) + (2 / 3) * (team_AST / team_FG)))
        - VOP * TOV
        - VOP * DRBP * (FGA - FG)
        - VOP * 0.44 * (0.44 + (0.56 * DRBP)) * (FTA - FT)
        + VOP * (1 - DRBP) * (TRB - ORB)
        + VOP * DRBP * ORB
        + VOP * STL
        + VOP * DRBP * BLK
        - PF * ((lg_FT / lg_PF) - 0.44 * (lg_FTA / lg_PF) * VOP)
    )

    dataframe.loc[dataframe.TEAM == team, "uPER"] = uPER

    return dataframe

In [11]:
for team in merged.TEAM.unique():
    uPER(merged, team_stats, team)

In [12]:
merged.uPER.head(3)

5    0.455
9    0.611
12   0.397
Name: uPER, dtype: float64

#### Calculating Team and League Paces

Team pace against an opponent:

    0.5 * ((team_FGA + 0.4 * team_FTA - 1.07 * (team_ORB / (team_ORB + opp_DRB)) 
    * (team_FGA - team_FG) + team_TOV) + (opp_FGA + 0.4 * opp_FTA - 1.07 * (opp_ORB / (opp_ORB + team_DRB)) 
    * (opp_FGA - opp_FG) + opp_TOV))

In [13]:
def generate_played(team, season):
    df = matches[team]
    df = df[df["SEASON_ID"] == season]
    df["OPPONENT"] = df["MATCHUP"].apply(lambda x: x[-3:])
    df = df.merge(all_teams, left_on="OPPONENT", right_on="abbreviation")
    played = [(team, opp) for opp in df["full_name"].values]
    return played

In [14]:
def get_team_paces(team, opponent, team_stats_df):

    # TEAM
    team = team_stats_df[team_stats_df.index == team]
    team_AST = team["AST"].values[0]
    team_FG = team["FGM"].values[0]
    team_FGA = team["FGA"].values[0]
    team_FTA = team["FTA"].values[0]
    team_ORB = team["OREB"].values[0]
    team_TOV = team["TOV"].values[0]
    team_DRB = team["DREB"].values[0]

    # OPPONENT
    opponent = team_stats_df[team_stats_df.index == opponent]
    opp_AST = opponent["AST"].values[0]
    opp_FG = opponent["FGM"].values[0]
    opp_FGA = opponent["FGA"].values[0]
    opp_FTA = opponent["FTA"].values[0]
    opp_ORB = opponent["OREB"].values[0]
    opp_TOV = opponent["TOV"].values[0]
    opp_DRB = opponent["DREB"].values[0]

    team_pace = 0.5 * (
        (
            team_FGA
            + 0.4 * team_FTA
            - 1.07 * (team_ORB / (team_ORB + opp_DRB)) * (team_FGA - team_FG)
            + team_TOV
        )
        + (opp_FGA + 0.4 * opp_FTA - 1.07 * (opp_ORB / opp_ORB + team_DRB))
        * (opp_FGA - opp_FG)
        + opp_TOV
    )

    return team_pace

In [15]:
def add_paces(dataframe, team, matches_dct, team_stats_df, season):
    dataframe.loc[dataframe.TEAM == team, "T_PACE"] = sum(
        [
            get_team_paces(matchup[0], matchup[1], team_stats_df)
            for matchup in generate_played(team, season)
        ]
    ) / len(generate_played(team, season))

    total = 0
    count = 0
    for key in matches_dct.keys():
        try:
            played = generate_played(key, season)
            count += len(played)
            for game in played:
                team_pace = get_team_paces(game[0], game[1], team_stats_df)
                total += team_pace
        except:
            continue

    lg_pace = total / count
    dataframe.loc[dataframe.SEASON_ID == season, "L_PACE"] = lg_pace
    return dataframe

In [16]:
teams = merged.TEAM.unique()
for team in tqdm(teams):
    merged = add_paces(merged, team, matches, team_stats, "2021-22")

100%|██████████| 30/30 [00:44<00:00,  1.48s/it]


#### Calculating PER

In [17]:
def per(dataframe):
    dataframe["adjustment"] = dataframe["L_PACE"] / dataframe["T_PACE"]
    dataframe["aPER"] = dataframe["uPER"] * dataframe["adjustment"]
    dataframe["PER"] = dataframe["aPER"] * (15 / dataframe["aPER"].mean())
    return dataframe

In [18]:
per(merged)[["FIRST_NAME", "LAST_NAME", "TEAM", "PER"]].sort_values(
    by="PER", ascending=False
).head(3)

Unnamed: 0,FIRST_NAME,LAST_NAME,TEAM,PER
789,Giannis,Antetokounmpo,Milwaukee Bucks,32.466
1355,Nikola,Jokic,Denver Nuggets,29.738
2017,Joel,Embiid,Philadelphia 76ers,28.723


#### ***Better results on OOP version.***