# PER Analysis

In [1]:
import pickle as pkl
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
from scipy import spatial
from scipy.stats import weightedtau

# Silence all warnings for the notebook. `np.warnings` was only an accidental
# re-export of the standard-library `warnings` module and is no longer
# available in newer NumPy releases (1.24+), so call the stdlib module directly.
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=RuntimeWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

# Show every column and print floats with three decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

In [2]:
# Load the prepared input tables from the local `data/` directory.
all_players = pd.read_csv("data/all_players.csv")
# `all_teams` is read later for its `full_name` and `abbreviation` columns.
all_teams = pd.read_csv("data/all_teams.csv")
merged = pd.read_csv("data/merged.csv")
# NOTE(review): unpickling can execute arbitrary code; only load a
# `matches.pkl` that was produced by a trusted source.
with open("data/matches.pkl", "rb") as file:
    matches = pkl.load(file)

#### Metrics function

In [16]:
def metrics(year):

    season = str(year - 1) + "-" + str(year)[-2:]

    def compare_similarity(num, season):
        real = pd.read_csv(f"real/realper{season}.csv").iloc[:num, :].PER.to_list()
        est = pd.read_csv(f"estimations/{season}.csv").iloc[:num, :].PER.to_list()
        result = 1 - spatial.distance.cosine(real, est)
        tau, pvalue = weightedtau(real, est)
        return np.round(result, 5), np.round(tau, 5)

    metrics = pd.DataFrame(columns=["Cosine", "Tau"])
    for i in [10, 25, 50, 100, 200]:
        result, tau = compare_similarity(i, season)
        temp_df = pd.DataFrame(
            {"Cosine": str(result), "Tau": str(tau)}, index=[f"First {i}"]
        )
        metrics = metrics.append([temp_df])

    return metrics.T

In [23]:
# Print the similarity table for every season ending in 2005 through 2022,
# each followed by a 53-character dashed rule.
for season_end in range(2005, 2023):
    print(metrics(season_end), 53 * "-", sep="\n")

       First 10 First 25 First 50 First 100 First 200
Cosine  0.99981  0.99986   0.9999   0.99991   0.99992
Tau         1.0      1.0      1.0   0.99956   0.99964
-----------------------------------------------------
       First 10 First 25 First 50 First 100 First 200
Cosine   0.9997   0.9997   0.9998   0.99981   0.99985
Tau         1.0      1.0  0.99954   0.99963   0.99969
-----------------------------------------------------
       First 10 First 25 First 50 First 100 First 200
Cosine  0.99983   0.9999  0.99989   0.99992   0.99993
Tau         1.0      1.0  0.99989   0.99977   0.99971
-----------------------------------------------------
       First 10 First 25 First 50 First 100 First 200
Cosine  0.99984  0.99979  0.99978   0.99985   0.99988
Tau         1.0  0.99856   0.9988   0.99923   0.99953
-----------------------------------------------------
       First 10 First 25 First 50 First 100 First 200
Cosine  0.99885  0.99935  0.99951   0.99968   0.99978
Tau         1.0      1.0  0.

## 2021-22 Season

#### Live PER ratings from http://insider.espn.com/nba/hollinger/statistics for the 2021-22 season

In [None]:
# Download the Hollinger statistics pages for the current season. Page 1 has
# no page suffix; pages 2-8 are addressed by number. (The original request
# used a plain string containing "{page_num}", so the page number was never
# substituted into the URL.)
base_url = "http://insider.espn.com/nba/hollinger/statistics"
page_urls = [base_url] + [f"{base_url}/_/page/{page}" for page in range(2, 9)]

# From each page keep the first parsed table, minus its first two rows and
# first column (presumably title/header rows and the rank column - TODO confirm).
live_ratings = pd.concat(
    [pd.read_html(url)[0].iloc[2:, 1:].reset_index(drop=True) for url in page_urls],
    axis=0,
    ignore_index=True,
)

# Fix the format
live_ratings.columns = [
    "PLAYER",
    "GP",
    "MPG",
    "TS%",
    "AST",
    "TO",
    "USG",
    "ORR",
    "DRR",
    "REBR",
    "PER",
    "VA",
    "EWA",
]
# Blank out rows whose PLAYER cell is the literal "PLAYER" (repeated header
# rows), then drop every row that contains a missing value.
live_ratings["PLAYER"] = live_ratings["PLAYER"].where(live_ratings["PLAYER"] != "PLAYER")
live_ratings.dropna(inplace=True)

# PLAYER is "<name>, <team code>"; translate the team codes that differ from
# the abbreviations used in `all_teams`.
espn_to_nba_abbrev = {
    "UTAH": "UTA",
    "GS": "GSW",
    "NO": "NOP",
    "WSH": "WAS",
    "SA": "SAS",
    "NY": "NYK",
}
live_ratings["abbrev"] = live_ratings.PLAYER.apply(
    lambda player: player.split(",")[1].strip()
).replace(espn_to_nba_abbrev)
live_ratings["name"] = live_ratings.PLAYER.apply(lambda player: player.split(",")[0])

# Merge: attach the full team name for each abbreviation.
live_ratings = live_ratings.merge(
    all_teams[["full_name", "abbreviation"]],
    left_on="abbrev",
    right_on="abbreviation",
    how="left",
)
live_ratings.rename(columns={"name": "FULLNAME", "full_name": "TEAM"}, inplace=True)

# Dump
live_ratings[["FULLNAME", "TEAM", "PER"]].to_csv(
    "real/realper2021-22.csv", index=False
)

#### Standings from https://www.espn.com/nba/standings/_/group/league for 2021-22 Season

In [None]:
def cut_name(row):
    """Strip the leading team abbreviation from a raw standings label.

    The label is an upper-case abbreviation glued to the team name, e.g.
    ``"BOSBoston Celtics"``. The name is taken to start one character before
    the first lower-case letter.

    Parameters
    ----------
    row : str
        Raw label from the standings table.

    Returns
    -------
    str
        The team name. ``"LACLA Clippers"`` is special-cased to
        ``"Los Angeles Clippers"`` because the generic rule would cut it down
        to ``"Clippers"``. A label with no lower-case letter is returned
        unchanged.
    """
    # Loop-invariant special case, checked once up front.
    if row == "LACLA Clippers":
        return "Los Angeles " + row.split()[1]
    for i, val in enumerate(row):
        if val.islower():
            # max() stops a lower-case first character from turning the slice
            # into row[-1:] (the last character only).
            return row[max(i - 1, 0) :]
    return row


# Estimations: total estimated PER per team, highest first.
final = pd.read_csv("estimations/2021-22.csv")
agg_final = (
    final.groupby("TEAM")
    .PER.sum()
    .to_frame()
    .reset_index()
    .sort_values(by="PER", ascending=False)
)

# Real Standings: download and parse the page once so the team table and the
# statistics table come from the same snapshot.
standings_tables = pd.read_html("https://www.espn.com/nba/standings/_/group/league")
standings_teams = standings_tables[0]
# The first team label ends up as the column header of the parsed table
# (presumably because read_html treats the first row as a header), so it is
# put back at the front of the team list.
first_team = standings_teams.columns[0]
team_labels = [first_team, *standings_teams[first_team].to_list()]
standings_teams = pd.DataFrame({"Team": team_labels})
standings_stats = standings_tables[1]

# Merge
standings = pd.concat([standings_teams, standings_stats], axis=1, ignore_index=True)
standings.columns = [
    "Team",
    "W",
    "L",
    "PCT",
    "GB",
    "HOME",
    "AWAY",
    "DIV",
    "CONF",
    "PPG",
    "OPP PPG",
    "DIFF",
    "STRK",
    "L10",
]
# Turn raw labels such as "BOSBoston Celtics" into plain team names so they
# can be matched against the TEAM column of the estimations.
standings.Team = standings.Team.apply(cut_name)
real_standings = standings.merge(agg_final, left_on="Team", right_on="TEAM", how="left")
real_standings.drop("TEAM", axis=1, inplace=True)

# Return
real_standings[["Team", "PER"]].to_csv("standings/standings2021-22.csv", index=False)

## Other Seasons

In [None]:
def get_live_ratings(year):
    """Download the Hollinger statistics for one season and store the PER table.

    Fetches pages 1-8 of
    ``http://insider.espn.com/nba/hollinger/statistics`` for ``year``, maps
    each player's team code to the full team name via the module-level
    ``all_teams`` frame, and writes the columns ``FULLNAME``, ``TEAM`` and
    ``PER`` to ``real/realper{season}.csv`` (``season`` is e.g. ``"2004-05"``
    for ``year=2005``).

    Parameters
    ----------
    year : int
        Ending year of the season, as used in the ESPN URL.

    Returns
    -------
    None
        The result is written to disk.
    """
    base_url = "http://insider.espn.com/nba/hollinger/statistics/_"
    # Page 1 has no page segment in its URL; pages 2-8 do.
    page_urls = [f"{base_url}/year/{year}"] + [
        f"{base_url}/page/{page}/year/{year}" for page in range(2, 9)
    ]

    # From each page keep the first parsed table, minus its first two rows
    # and first column (presumably title/header rows and the rank column -
    # TODO confirm). Concatenate once instead of growing a frame per page.
    live_ratings = pd.concat(
        [
            pd.read_html(url)[0].iloc[2:, 1:].reset_index(drop=True)
            for url in page_urls
        ],
        axis=0,
        ignore_index=True,
    )

    # Fix the format
    live_ratings.columns = [
        "PLAYER",
        "GP",
        "MPG",
        "TS%",
        "AST",
        "TO",
        "USG",
        "ORR",
        "DRR",
        "REBR",
        "PER",
        "VA",
        "EWA",
    ]
    # Blank out rows whose PLAYER cell is the literal "PLAYER" (repeated
    # header rows), then drop every row that contains a missing value.
    live_ratings["PLAYER"] = live_ratings["PLAYER"].where(
        live_ratings["PLAYER"] != "PLAYER"
    )
    live_ratings.dropna(inplace=True)

    # PLAYER is "<name>, <team code>"; translate the team codes that differ
    # from the abbreviations used in `all_teams`. The two combined codes are
    # mapped to the second of the listed teams.
    espn_to_nba_abbrev = {
        "UTAH": "UTA",
        "GS": "GSW",
        "NO": "NOP",
        "WSH": "WAS",
        "SA": "SAS",
        "NY": "NYK",
        "HOU/BKN": "BKN",
        "ORL/CHI": "CHI",
    }
    live_ratings["abbrev"] = live_ratings.PLAYER.apply(
        lambda player: player.split(",")[1].strip()
    ).replace(espn_to_nba_abbrev)
    live_ratings["name"] = live_ratings.PLAYER.apply(
        lambda player: player.split(",")[0]
    )

    # Merge: attach the full team name for each abbreviation.
    live_ratings = live_ratings.merge(
        all_teams[["full_name", "abbreviation"]],
        left_on="abbrev",
        right_on="abbreviation",
        how="left",
    )
    live_ratings.rename(columns={"name": "FULLNAME", "full_name": "TEAM"}, inplace=True)
    season = str(year - 1) + "-" + str(year)[-2:]

    # Dump
    live_ratings[["FULLNAME", "TEAM", "PER"]].to_csv(
        f"real/realper{season}.csv", index=False
    )


# Store the real PER table for every season ending in 2005 through 2021.
for season_end in range(2005, 2022):
    get_live_ratings(season_end)

In [None]:
def get_real_standings(year):
    """Pair the real league standings of one season with estimated team PER.

    Sums the ``PER`` column of ``estimations/{season}.csv`` per ``TEAM``,
    downloads the league standings for ``year`` from ESPN, joins the two on
    the team name and writes the columns ``Team`` and ``PER`` (in standings
    order) to ``standings/standings{season}.csv``. ``season`` is e.g.
    ``"2004-05"`` for ``year=2005``.

    Parameters
    ----------
    year : int
        Ending year of the season, as used in the ESPN URL.

    Returns
    -------
    None
        The result is written to disk.
    """
    season = str(year - 1) + "-" + str(year)[-2:]

    def cut_name(row):
        """Reduce a raw standings label to the plain team name."""
        # Drop the first three characters of the label (presumably a marker
        # in front of the abbreviation on these pages - TODO confirm).
        row = row[3:]
        # The generic rule below would reduce this label to "Clippers". The
        # space after "Los Angeles" is required for the name to match the
        # TEAM column in the merge.
        if row == "LACLA Clippers":
            return "Los Angeles " + row.split()[1]
        # The name is taken to start one character before the first
        # lower-case letter.
        for i, val in enumerate(row):
            if val.islower():
                # max() stops a lower-case first character from turning the
                # slice into row[-1:] (the last character only).
                return row[max(i - 1, 0) :]
        return row

    # Estimations: total estimated PER per team, highest first.
    final = pd.read_csv(f"estimations/{season}.csv")
    agg_final = (
        final.groupby("TEAM")
        .PER.sum()
        .to_frame()
        .reset_index()
        .sort_values(by="PER", ascending=False)
    )

    # Real Standings: download and parse the page once so the team table and
    # the statistics table come from the same snapshot.
    standings_tables = pd.read_html(
        f"https://www.espn.com/nba/standings/_/season/{year}/group/league"
    )
    standings_teams = standings_tables[0]
    # The first team label ends up as the column header of the parsed table
    # (presumably because read_html treats the first row as a header), so it
    # is put back at the front of the team list.
    first_team = standings_teams.columns[0]
    team_labels = [first_team, *standings_teams[first_team].to_list()]
    standings_teams = pd.DataFrame({"Team": team_labels})
    standings_stats = standings_tables[1]

    # Merge
    standings = pd.concat([standings_teams, standings_stats], axis=1, ignore_index=True)
    standings.columns = [
        "Team",
        "W",
        "L",
        "PCT",
        "GB",
        "HOME",
        "AWAY",
        "DIV",
        "CONF",
        "PPG",
        "OPP PPG",
        "DIFF",
        "STRK",
        "L10",
    ]
    standings.Team = standings.Team.apply(cut_name)
    real_standings = standings.merge(
        agg_final, left_on="Team", right_on="TEAM", how="left"
    )
    real_standings.drop("TEAM", axis=1, inplace=True)

    # Return
    real_standings[["Team", "PER"]].to_csv(
        f"standings/standings{season}.csv", index=False
    )


# Build the standings file for every season ending in 2005 through 2021.
for season_end in range(2005, 2022):
    get_real_standings(season_end)