In [35]:

# Import libaries
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
import requests
from bs4 import BeautifulSoup


# Scraping player info for the slug for the boxscore url

In [36]:
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 500)
pd.set_option('display.min_rows', 500)

In [37]:
alphabet = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",\
            "o", "p", "q", "r", "s", "t", "u", "v", "w", "y", "z"] 

In [39]:

players = []
for a in alphabet:
    url2 = f'https://www.basketball-reference.com/players/{a}/'
    r2 = requests.get(url2)
    soup2 = BeautifulSoup(r2.content, 'lxml')
    table2 = soup2.find('table', {'id' : 'players'})
    
    for row in table2.find('tbody').find_all('tr'):
        player= {}
        try:
            player['slug'] = row.find('th').attrs['data-append-csv']
        except:
            player['slug'] = np.nan
        try:
            player['name'] = row.find('a').text
        except:
            player['name'] = np.nan
        try:
            player['year_start'] = row.find('td', {'data-stat' : 'year_min'}).text
        except:
            player['year_start'] = np.nan
        try:
            player['year_end'] = row.find('td', {'data-stat' : 'year_max'}).text
        except:
            player['year_end'] = np.nan
        try:
            player['position'] = row.find('td', {'data-stat' : 'pos'}).text
        except:
            player['position'] = np.nan
        try:
            player['height'] = row.find('td', {'data-stat' : 'height'}).text
        except:
            player['height'] = np.nan
        try:
            player['weight'] = row.find('td', {'data-stat' : 'weight'}).text
        except:
            player['weight'] = np.nan
        try:
            player['birth_date'] = row.find('td', {'data-stat' : 'birth_date'}).text
        except:
            player['birth_date'] = np.nan
        try:
            player['college'] = row.find('td', {'data-stat' : 'colleges'}).text
        except:
            player['college'] = np.nan
    
    

        players.append(player)
        
    

# Making players DataFrame

In [40]:
players_df = pd.DataFrame(players)

In [41]:
players_df.head()

Unnamed: 0,slug,name,year_start,year_end,position,height,weight,birth_date,college
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar,1970,1989,C,7-2,225,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223,"November 3, 1974","Michigan, San Jose State"


In [42]:
players_df.dtypes
#all are strings, need to convert some to ints

slug          object
name          object
year_start    object
year_end      object
position      object
height        object
weight        object
birth_date    object
college       object
dtype: object

In [43]:
#

In [44]:
players_df = players_df.astype({'year_start': 'int64'})

In [45]:
players_df = players_df.astype({'year_end': 'int64'})

# Creating a Dataframe of just current players

In [46]:
current_players_df = players_df.loc[players_df['year_end'] > 2019]

In [47]:
nineteen_df = players_df[(players_df['year_end'] > 2018) & (players_df['year_start'] < 2019)]

In [48]:
current_players_list = current_players_df.to_dict('list')
#making it into a list of dictionaires so that can loop throught= them

In [49]:
current_players_list.keys()

dict_keys(['slug', 'name', 'year_start', 'year_end', 'position', 'height', 'weight', 'birth_date', 'college'])

# Getting Team Averages

In [50]:
team_average_years = ['2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',\
                      '2012', '2013', '2013', '2015', '2016', '2017', '2018', '2019', '2020']

In [51]:
game_months = ['11', '12', '1', '2', '3', '4']

In [52]:
team_def_stats = []


for y in team_average_years:
    for m in game_months:
        url3 = f'https://www.teamrankings.com/nba/stat/defensive-efficiency?date=2020-05-25'
        r3 = requests.get(url3)
        soup3 = BeautifulSoup(r3.content, 'lxml')
        div_main = soup3.find('div', {'class': 'main-wrapper clearfix has-left-sidebar'})
        main_scrape= div_main.find('main', {'role': 'main'})
        table_scrape = main_scrape.find('table')
        date_scrape = main_scrape.find('div', {'class': 'table-filters clearfix'})
        for row in table_scrape.find('tbody').find_all('tr'):
                stats = {}
                stats['date'] = date_scrape.find('input').attrs['value']
                stats['name'] = row.find_all('td')[1].attrs['data-sort']
                stats['current_year_def'] = row.find_all('td')[2].attrs['data-sort']
                stats['last_3'] = row.find_all('td')[3].attrs['data-sort']
                stats['last_1'] = row.find_all('td')[4].attrs['data-sort']
                stats['home'] = row.find_all('td')[5].attrs['data-sort']
                stats['away'] = row.find_all('td')[6].attrs['data-sort']
                stats['last_year_def'] = row.find_all('td')[7].attrs['data-sort']



                team_def_stats.append(stats)


In [53]:
team_def_df = pd.DataFrame(team_def_stats)

In [54]:
team_def_df.head()

Unnamed: 0,date,name,current_year_def,last_3,last_1,home,away,last_year_def
0,05/25/2020,Milwaukee,0.986428,0.934015,0.89817,0.988964,0.98389,1.01909
1,05/25/2020,Toronto,1.01821,1.10336,1.04828,1.02177,1.01427,1.03267
2,05/25/2020,LA Lakers,1.02687,1.0752,1.09715,1.0114,1.04129,1.06253
3,05/25/2020,Boston,1.03358,1.07079,1.11082,1.03613,1.03109,1.04194
4,05/25/2020,LA Clippers,1.03823,1.07961,0.936722,1.02638,1.05045,1.08597


In [55]:
# team_ratings = []
# url4 = 'https://www.basketball-reference.com/leagues/NBA_2019_ratings.html'
# r4 = requests.get(url4)
# soup4 = BeautifulSoup(r4.content, 'lxml')
# team_div = soup4.find('div', {'id' : 'div_ratings'})
# table4 = team_div.find('table', {'class' : 'sortable'})
# for row in table4.find('tbody').find_all('tr'):
#     ratings = {}
#     ratings['team'] = row.find('td', {'data-stat' : 'team_name'}).text
#     ratings['wins'] = row.find('td', {'data-stat' : 'wins'}).text
#     ratings['losses'] = row.find('td', {'data-stat' : 'losses'}).text
#     ratings['win_pct'] = row.find('td', {'data-stat' : 'win_loss_pct'}).text
#     ratings['margin_of_victory'] = row.find('td', {'data-stat' : 'mov'}).text
#     ratings['offensive_rating'] = row.find('td', {'data-stat' : 'off_rtg'}).text
#     ratings['defensive_rating'] = row.find('td', {'data-stat' : 'def_rtg'}).text
#     ratings['net_rating'] = row.find('td', {'data-stat' : 'net_rtg'}).text
#     ratings['adjusted_margin_of_victory'] = row.find('td', {'data-stat' : 'mov_adj'}).text
#     ratings['adjusted_offensive_ratings'] = row.find('td', {'data-stat' : 'off_rtg_adj'}).text
#     ratings['adjusted_defensive_ratings'] = row.find('td', {'data-stat' : 'def_rtg_adj'}).text
#     ratings['adjusted_net_rating'] = row.find('td', {'data-stat' : 'net_rtg_adj'}).text
#     team_ratings.append(ratings)
    


In [56]:
# for index, row in teams_df.iterrows():
#     print(row[0])
#     if row[0] == 'Milwaukee Bucks':
#              row['test'] = row[5]
#     else:
#              row['test'] = 'Not Working'

In [57]:
team_def_df['name'].unique()

array(['Milwaukee', 'Toronto', 'LA Lakers', 'Boston', 'LA Clippers',
       'Philadelphia', 'Indiana', 'Brooklyn', 'Orlando', 'Denver',
       'Okla City', 'Miami', 'Houston', 'Utah', 'Chicago', 'Memphis',
       'Dallas', 'Sacramento', 'Phoenix', 'Minnesota', 'New Orleans',
       'Detroit', 'New York', 'Charlotte', 'Golden State', 'Portland',
       'San Antonio', 'Atlanta', 'Cleveland', 'Washington'], dtype=object)

In [58]:
home_abr = []
for row in team_def_df['name']:
        if row == 'New Orleans':
            home_abr.append('NOP')
        elif row == 'LA Lakers':
            home_abr.append('LAL')
        elif row == 'Chicago':
            home_abr.append('CHI')
        elif row == 'Detroit':
            home_abr.append('DET')
        elif row == 'Cleveland':
            home_abr.append('CLE')
        elif row == 'Minnesota':
            home_abr.append('MIN')
        elif row == 'Memphis':
            home_abr.append('MEM')
        elif row == 'Washington':
            home_abr.append('WAS')
        elif row == 'New York':
            home_abr.append('NYK')
        elif row == 'Okla City':
            home_abr.append('OKC')
        elif row == 'Sacramento':
            home_abr.append('SAC')
        elif row == 'Denver':
            home_abr.append('DEN')
        elif row == 'Atlanta':
            home_abr.append('ATL')
        elif row == 'Milwaukee':
            home_abr.append('MIL')
        elif row == 'LA Clippers':
            home_abr.append('LAC')
        elif row == 'Toronto':
            home_abr.append('TOR')
        elif row == 'Dallas':
            home_abr.append('DAL')
        elif row == 'Phoenix':
            home_abr.append('PHO')
        elif row == 'Portland':
            home_abr.append('POR')
        elif row == 'Utah':
            home_abr.append('UTA')
        elif row == 'Miami':
            home_abr.append('MIA')
        elif row == 'Philadelphia':
            home_abr.append('PHI')
        elif row == 'Orlando':
            home_abr.append('ORL')
        elif row == 'Boston':
            home_abr.append('BOS')
        elif row == 'Indiana':
            home_abr.append('IND')
        elif row == 'Brooklyn':
            home_abr.append('BKN')
        elif row == 'Charlotte':
            home_abr.append('CHA')
        elif row == 'Houston':
            home_abr.append('HOU')
        elif row == 'San Antonio':
            home_abr.append('SAS')
        elif row == 'Golden State':
            home_abr.append('GSW')
        else:
            home_abr.append('Forgot to add team!')

team_def_df['team_abr'] = home_abr      

In [None]:
#if month number - 1 = month of main_df & opponent = team_abr, main_df['opponent_def'] = team_def_df['current_year_def']

In [240]:
team_def_df.head()

Unnamed: 0,date,name,current_year_def,last_3,last_1,home,away,last_year_def,team_abr
0,11/25/2004,San Antonio,0.946133,0.969369,0.832033,0.869777,1.00031,0.920566,SAS
1,11/25/2004,Cleveland,0.946611,0.943888,0.883336,0.91827,0.982198,1.01009,CLE
2,11/25/2004,Orlando,0.958222,0.939126,0.943171,0.931939,0.985147,1.06937,ORL
3,11/25/2004,Golden State,0.965597,0.917961,0.926871,0.948742,0.986601,1.0099,GSW
4,11/25/2004,Dallas,0.975114,0.972964,0.977639,0.964967,0.983954,1.04071,DAL


In [29]:
team_def_df.dtypes

date                object
name                object
current_year_def    object
last_3              object
last_1              object
home                object
away                object
last_year_def       object
dtype: object

In [59]:
years = ['2004', '2005', '2006', '2007', '2008',\
         '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018',\
         '2019', '2020']

# Scraping the boxscore for each game for every active player in the NBA

In [60]:
all_players_stats = []
counter = 0
for y in years:
    for a in alphabet:
        for w in players_df['slug'].unique():
            first_letter = w[0]
            if first_letter == a:
                try:
                    url = f'https://www.basketball-reference.com/players/{a}/{w}/gamelog/{y}'
                    r = requests.get(url)
                    soup = BeautifulSoup(r.content, 'lxml')
                    table = soup.find('table', {'id': 'pgl_basic'})
                    div = soup.find('div', {'itemtype' : 'https://schema.org/Person'})
                    name = div.find('h1', {'itemprop' : 'name'})
                    for row in table.find('tbody').find_all('tr'):
                        game = {}
                        game['slug'] = w
                        try:
                            game['date'] = row.find('td', {'data-stat' : 'date_game'}).text
                        except:
                            game['date'] = np.nan
                        try:
                            game['name'] = name.text
                        except:
                            game['name'] = np.nan
                        try:
                            game['age'] = row.find('td', {'data-stat' : 'age'}).text
                        except:
                            game['age'] = np.nan
                        try:
                            game['team'] = row.find('td', {'data-stat' : 'team_id'}).text
                        except:
                            game['team'] = np.nan
                        try:
                            game['away'] = row.find('td', {'data-stat' : 'game_location'}).text
                        except:
                            game['away'] = np.nan
                        try:
                            game['opponent'] = row.find('td', {'data-stat' : 'opp_id'}).text
                        except:
                            game['opponent'] = np.nan
                        try:
                            game['game_result'] = row.find('td', {'data-stat' : 'game_result'}).text
                        except:
                            game['game_result'] = np.nan
                        try:
                            game['minutes_played'] = row.find('td', {'data-stat' : 'mp'}).text
                        except:
                            game['minutes_played'] = np.nan
                        try:
                            game['field_goals'] = row.find('td', {'data-stat' : 'fg'}).text
                        except:
                            game['field_goals'] = np.nan
                        try:
                            game['field_goals_att'] = row.find('td', {'data-stat' : 'fga'}).text
                        except:
                            game['field_goals_att'] = np.nan
                        try:
                            game['field_goal_perc'] = row.find('td', {'data-stat' : 'fg_pct'}).text
                        except:
                            game['field_goal_perc'] = np.nan
                        try:
                            game['three_point_made'] = row.find('td', {'data-stat' : 'fg3'}).text
                        except:
                            game['three_point_made'] = np.nan
                        try:
                            game['three_point_att'] = row.find('td', {'data-stat' : 'fg3a'}).text
                        except:
                            game['three_point_att'] = np.nan
                        try:
                            game['three_point_perc'] = row.find('td', {'data-stat' : 'fg3_pct'}).text
                        except:
                            game['three_point_perc'] = np.nan
                        try:
                            game['free_throws_made'] = row.find('td', {'data-stat' : 'ft'}).text
                        except:
                            game['free_throws_made'] = np.nan
                        try:
                            game['free_throws_att'] = row.find('td', {'data-stat' : 'fta'}).text
                        except:
                            game['free_throws_att'] = np.nan
                        try:
                            game['free_throw_perc'] = row.find('td', {'data-stat' : 'ft_pct'}).text
                        except:
                            game['free_throw_perc'] = np.nan
                        try:
                            game['offensive_rebound'] = row.find('td', {'data-stat' : 'orb'}).text
                        except:
                            game['offensive_rebound'] = np.nan
                        try:
                            game['defensive_rebound'] = row.find('td', {'data-stat' : 'drb'}).text
                        except:
                            game['defensive_rebound'] = np.nan
                        try:
                            game['total_rebounds'] = row.find('td', {'data-stat' : 'trb'}).text
                        except:
                            game['total_rebounds'] = np.nan
                        try:
                            game['assists'] = row.find('td', {'data-stat' : 'ast'}).text
                        except:
                            game['assists'] = np.nan
                        try:
                            game['steals'] = row.find('td', {'data-stat' : 'stl'}).text
                        except:
                            game['steals'] = np.nan
                        try:
                            game['blocks'] = row.find('td', {'data-stat' : 'blk'}).text
                        except:
                            game['blocks'] = np.nan
                        try:
                            game['turnovers'] = row.find('td', {'data-stat' : 'tov'}).text
                        except:
                            game['turnovers'] = np.nan
                        try:
                            game['personal_fouls'] = row.find('td', {'data-stat' : 'pf'}).text
                        except:
                            game['personal_fouls'] = np.nan
                        try:
                            game['points'] = row.find('td', {'data-stat' : 'pts'}).text
                        except:
                            game['points'] = np.nan
                        try:
                            game['game_score'] = row.find('td', {'data-stat' : 'game_score'}).text
                        except:
                            game['game_score'] = np.nan
                        try:
                            game['plus_minus'] = row.find('td', {'data-stat' : 'plus_minus'}).text
                        except:
                            game['plus_minus'] = np.nan


                        all_players_stats.append(game)
                    else:
                        continue
                except:
                    table = np.nan
                else:
                    continue

# Making the players stats DF

In [286]:
game_stats_df = pd.DataFrame(all_players_stats)

In [30]:
game_stats_df.tail()

NameError: name 'game_stats_df' is not defined

In [287]:
final_df = game_stats_df_2020.merge(teams_df, how='left', left_on='team', right_on='team_abr')

# Stripping white space

In [52]:
list_of_columns = list(game_stats_df.columns.values)

In [54]:
for x in list_of_columns:
    game_stats_df[x] = game_stats_df[x].str.strip()

# Changing column types and other cleaning

In [55]:
def columntype_cleaning(df):
    df['name'] = df['name'].str[:-17]
    df['game_result'] = df['game_result'].str.strip("L ()")
    df['game_result'] = df['game_result'].str.strip("W ()")
    df['minutes_played'] = df['minutes_played'].str.replace(":", ".")
    df['away'] = np.where(df['away'] == "@", 1, 0)
    df['field_goal_perc'] = df['field_goal_perc'].replace("", np.nan)
    df['three_point_perc'] = df['three_point_perc'].replace("", np.nan)
    df['plus_minus'] = df['plus_minus'].replace("", np.nan)
    df['age'] = df['age'].astype(str)
    df['age'] = df['age'].str[:2]
    df['away'] = df['away'].astype(int)
    df['game_result'] = df['game_result'].astype(float)
    df['minutes_played'] = df['minutes_played'].astype(float)
    df['field_goals'] = df['field_goals'].astype(float)
    df['field_goals_att'] = df['field_goals_att'].astype(float)
    df['field_goal_perc'] = df['field_goal_perc'].astype(float)
    df['three_point_made'] = df['three_point_made'].astype(float)
    df['three_point_att'] = df['three_point_att'].astype(float)
    df['three_point_perc'] = df['three_point_perc'].astype(float)
    df['offensive_rebound'] = df['offensive_rebound'].astype(float)
    df['defensive_rebound'] = df['defensive_rebound'].astype(float)
    df['total_rebounds'] = df['total_rebounds'].astype(float)
    df['assists'] = df['assists'].astype(float)
    df['steals'] = df['steals'].astype(float)
    df['blocks'] = df['blocks'].astype(float)
    df['turnovers'] = df['turnovers'].astype(float)
    df['personal_fouls'] = df['personal_fouls'].astype(float)
    df['points'] = df['points'].astype(float)
    df['game_score'] = df['game_score'].astype(float)
    df['plus_minus'] = df['plus_minus'].astype(float)
#     df['wins'] = df['wins'].astype(float)
#     df['losses'] = df['losses'].astype(float)
#     df['win_pct'] = df['win_pct'].astype(float)
#     df['margin_of_victory'] = df['margin_of_victory'].astype(float)
#     df['offensive_rating'] = df['offensive_rating'].astype(float)
#     df['defensive_rating'] = df['defensive_rating'].astype(float)
#     df['net_rating'] = df['net_rating'].astype(float)
#     df['adjusted_margin_of_victory'] = df['adjusted_margin_of_victory'].astype(float)
#     df['adjusted_offensive_rating'] = df['adjusted_offensive_ratings'].astype(float)
#     df['adjusted_defensive_rating'] = df['adjusted_defensive_ratings'].astype(float)
#     df['adjusted_net_rating'] = df['adjusted_net_rating'].astype(float)
#     df.rename(columns = {'team_x' : 'team'}, inplace=True)
#     df = df.drop(columns = ['team_abr', 'team_y'])
    

In [56]:
columntype_cleaning(game_stats_df)

In [58]:
game_stats_df.isna().sum()

slug                      0
date                  10815
name                      0
age                       0
team                  10815
away                      0
opponent              10815
game_result           10815
minutes_played        64369
field_goals           64369
field_goals_att       64369
field_goal_perc       69239
three_point_made      64369
three_point_att       64369
three_point_perc     118078
free_throws_made      64369
free_throws_att       64369
free_throw_perc       64369
offensive_rebound     64369
defensive_rebound     64369
total_rebounds        64369
assists               64369
steals                64369
blocks                64369
turnovers             64369
personal_fouls        64369
points                64369
game_score            64369
plus_minus            64641
dtype: int64

In [60]:
game_stats_df.to_csv('../player_box_scores_clean.csv', index=False)

In [None]:
team_def_df.to_csv('../team_defensive_ratings.csv', index=False)