In [96]:
import pandas as pd
import numpy as np

import warnings

# SettingWithCopyWarning is exposed from pandas.errors in current pandas; only
# older releases still offer it under pandas.core.common. Try both locations so
# this cell runs on a fresh kernel regardless of the installed version, and
# skip the filter entirely if the class is not available at all.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Show every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_rows', None)

In [97]:
# Download the playoff series index and take the first table on the page.
playoffs = pd.read_html('https://www.basketball-reference.com/playoffs/series.html')[0]

# The parsed header has (at least) two levels; keep the second level as the
# flat column name, then discard the placeholder "Unnamed:" columns and any
# fully empty rows. No inplace=True, so re-running the cell is predictable.
playoffs.columns = [col[1] for col in playoffs.columns]
playoffs = playoffs.drop(columns=[col for col in playoffs.columns if "Unnamed:" in col])
playoffs = playoffs.dropna(how='all')

# Keep only rows whose year cell is purely numeric. `.eq(True)` turns a
# missing result (NaN year cell) into False so the mask is always boolean.
# `.copy()` makes the filtered frame independent, so the dtype conversion
# below is a plain column assignment rather than a write into a slice.
is_year_row = playoffs['Yr'].str.isnumeric().eq(True)
playoffs = playoffs[is_year_row].copy()
playoffs['Yr'] = playoffs['Yr'].astype('int64')

# Restrict to the 1980 through 2021 playoffs (range end is exclusive).
playoffs = playoffs[playoffs['Yr'].isin(range(1980, 2022))].copy()

In [98]:
# Keep the year, the round name and the team columns. Assigning four names to
# a three-label selection only works when one label matches two columns, so
# 'Team' is expected to be duplicated here; the two copies become Team1/Team2.
kept_labels = ['Yr', 'Series', 'Team']
playoffs = playoffs.loc[:, kept_labels]
playoffs.columns = ['Year', 'Series', 'Team1', 'Team2']

In [99]:
def fix_series(row):
    """Map a series description to a short stage label.

    The markers are tested in order with the ``in`` operator and the first
    one found decides the result:

    * "First Round" -> 'Playoff'
    * "Semifinals"  -> 'Quarter'
    * "Conf Finals" -> 'Semi'

    Anything containing none of the markers is labelled "Final".

    Parameters
    ----------
    row : str
        Series description to classify (despite the name, this is the cell
        value, not a DataFrame row).

    Returns
    -------
    str
        One of 'Playoff', 'Quarter', 'Semi' or "Final".
    """
    stage_by_marker = (
        ("First Round", 'Playoff'),
        ("Semifinals", 'Quarter'),
        ("Conf Finals", 'Semi'),
    )
    for marker, stage in stage_by_marker:
        if marker in row:
            return stage
    return "Final"

In [100]:
# Replace each series description with its short stage label.
playoffs['Series'] = playoffs['Series'].map(fix_series)

In [101]:
# Team cells are expected to end with the series wins in parentheses, e.g.
# "Some Team (4)". Anchor on that trailing "(N)" instead of grabbing the first
# digit in the string: a team name that itself contains digits (such as
# "... 76ers (4)") would otherwise produce the wrong score. The same pattern
# is then used to strip the suffix, so the name is recovered correctly even
# if the suffix is not exactly four characters long.
score_suffix = r'\s*\((\d+)\)\s*$'
for side in ('Team1', 'Team2'):
    # expand=False returns a Series (one capture group) rather than a
    # one-column DataFrame.
    playoffs[side + 'Score'] = (
        playoffs[side].str.extract(score_suffix, expand=False).astype('int64')
    )
    playoffs[side] = playoffs[side].str.replace(score_suffix, '', regex=True)

In [104]:
# Build one row per (team, season) with the total number of playoff games the
# team won that year, summing its wins whether it was listed as Team1 or Team2.
# Per-season frames are collected in a list and concatenated once at the end,
# rather than growing a DataFrame with concat inside the loop.
season_frames = []
for year in range(1980, 2022):
    season_games = playoffs[playoffs.Year == year]
    if season_games.empty:
        # Nothing to aggregate for this year.
        continue

    # Stack both sides of every series into a single TEAM / SCORE table.
    wins_long = pd.concat(
        [
            season_games[['Team1', 'Team1Score']].rename(
                columns={'Team1': 'TEAM', 'Team1Score': 'SCORE'}
            ),
            season_games[['Team2', 'Team2Score']].rename(
                columns={'Team2': 'TEAM', 'Team2Score': 'SCORE'}
            ),
        ],
        ignore_index=True,
    )
    season_totals = wins_long.groupby('TEAM', as_index=False)['SCORE'].sum()

    # Season label spans two calendar years, e.g. year 1980 -> "1979-80".
    season_totals['SEASON'] = str(year - 1) + "-" + str(year)[-2:]
    season_frames.append(season_totals)

# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the expected columns.
if season_frames:
    glob = pd.concat(season_frames, ignore_index=True)
else:
    glob = pd.DataFrame(columns=['TEAM', 'SCORE', 'SEASON'])

In [107]:
# Persist the per-team, per-season win totals; the row index is not written.
glob.to_csv(path_or_buf='../data/playoffwins.csv', index=False)

---

In [10]:
# Series margin from Team1's point of view: positive when Team1 has more wins.
playoffs['difference'] = playoffs['Team1Score'].sub(playoffs['Team2Score'])

In [11]:
# Team2 is the winner only when the margin is negative; otherwise Team1.
team2_won = playoffs['difference'].lt(0)
playoffs['Winner'] = np.where(team2_won, playoffs['Team2'], playoffs['Team1'])

In [12]:
# Keep only the columns that are exported; the score and difference helper
# columns are dropped here.
playoffs = playoffs.loc[:, ['Year', 'Series', 'Team1', 'Team2', 'Winner']]

In [14]:
# Persist the series results. index=True is the to_csv default, spelled out
# here: the row index is written as the first, unnamed CSV column.
# NOTE(review): the other export in this file passes index=False and writes
# under '../data/' - confirm whether this difference is intentional.
playoffs.to_csv('data/playoffs.csv', index=True)