## Processing, parsing and cleaning playoff data

In [10]:
from bs4 import BeautifulSoup
import pandas as pd

In [11]:
# Seasons to process: 1996 through 2022 inclusive (range() excludes its stop value)
seasons = [*range(1996, 2023)]

In [12]:
# keep_unique_hrefs keeps only the first occurrence of each link, preserving the original order
def keep_unique_hrefs(hrefs):
    """Return the links in ``hrefs`` with repeated entries removed.

    Only the first occurrence of each link is kept, and the surviving links
    stay in the order in which they first appear.

    Parameters
    ----------
    hrefs : iterable
        Links to de-duplicate. Each item must be hashable.

    Returns
    -------
    list
        A new list holding each distinct link exactly once.
    """
    # A dict keeps its keys in insertion order and ignores keys it has already
    # seen, which is exactly "first occurrence wins".
    return list(dict.fromkeys(hrefs))

In [13]:
# assign_score(teams) assigns a playoff score to each team that made the playoffs, based on the round in which
# it was eliminated or whether it won the championship

def assign_score(teams):
    """Attach a playoff score to every team, based on its position in ``teams``.

    The list is modified in place: each entry ``team`` is replaced by the
    two-element list ``[team, score]``. Nothing is returned.

    Scores by position:
        0     -> 10  (team that won the championship)
        1     -> 7   (team that lost in the finals)
        2-3   -> 4   (teams that lost in the conference finals)
        4-7   -> 2   (teams that lost in the conference semifinals)
        8+    -> 1   (teams that lost in the first round)

    Parameters
    ----------
    teams : list
        Playoff teams ordered from the champion downwards.
    """
    # (exclusive upper bound on position, score) pairs, checked in order
    tiers = ((1, 10), (2, 7), (4, 4), (8, 2))
    first_round_score = 1

    for position, team in enumerate(teams):
        # the first tier whose bound exceeds the position decides the score
        score = next(
            (points for cutoff, points in tiers if position < cutoff),
            first_round_score,
        )
        teams[position] = [team, score]

In [14]:
all_dfs = []

# Build one DataFrame per season holding each playoff team's abbreviation, its
# playoff score (to indicate where it was eliminated or whether it won the
# championship) and the season year, then combine them into playoff_score.

for season in seasons:
    file = "PLAYOFFS/{}_playoffs.html".format(season)

    with open(file) as f:
        content = f.read()

    soup = BeautifulSoup(content, "html.parser")

    # get only the <a> tags that carry an href attribute; indexing a["href"] on
    # an anchor without one would raise KeyError
    a_tags = soup.find_all("a", href=True)

    # get all the links in those a_tags
    hrefs = [a["href"] for a in a_tags]

    # keep only the team links in the format e.g. "/teams/MIL/2021.html"
    hrefs = [href for href in hrefs if href.startswith("/teams/") and href.endswith(".html")]

    # unique is a list containing only the first instance of each team link
    unique = keep_unique_hrefs(hrefs)

    # teams is a list of just the team abbreviations from unique
    # ("/teams/MIL/2021.html".split("/") -> ["", "teams", "MIL", "2021.html"])
    teams = [href.split("/")[2] for href in unique]

    # fail with a clear message instead of a pandas column-length mismatch
    # when a page yields no team links at all
    if not teams:
        raise ValueError("No team links found in {}".format(file))

    # assigns a score to each team based on its position in the list; this
    # modifies teams in place so every entry becomes [team, score]
    # NOTE(review): assumes the page links teams in finishing order, champion
    # first -- confirm against the saved HTML
    assign_score(teams)

    # one row per team, with the columns named up front
    df = pd.DataFrame(teams, columns=["Team", "Score"])

    # keeping track of the season
    df["Year"] = season

    # adding the dataframe to a list of all the dataframes so far
    all_dfs.append(df)

# concatenating all the per-season dataframes; ignore_index is not set, so each
# season keeps its own 0-based index and index labels repeat across seasons
playoff_score = pd.concat(all_dfs)
    

**Replacing Historical Team Names with the Current Ones**

In [15]:
# Map each historical team abbreviation to the one currently in use
replace_dict = {
    "NJN": "BRK",
    "CHA": "CHO",
    "CHH": "CHO",
    "VAN": "MEM",
    "NOH": "NOP",
    "NOK": "NOP",
    "SEA": "OKC",
    "WSB": "WAS",
}
# abbreviations that are not keys of replace_dict are left untouched
playoff_score["Team"] = playoff_score["Team"].replace(to_replace=replace_dict)

In [16]:
# Display the combined playoff-score table (pandas truncates long frames when rendering)
playoff_score

Unnamed: 0,Team,Score,Year
0,CHI,10,1996
1,OKC,7,1996
2,ORL,4,1996
3,UTA,4,1996
4,NYK,2,1996
...,...,...,...
11,TOR,1,2022
12,UTA,1,2022
13,DEN,1,2022
14,MIN,1,2022


In [17]:
# Save the playoff scores to CSV; the DataFrame index is written as well because index=False is not passed
playoff_score.to_csv("CSV/playoff_score.csv")