# 1. Data Scraping

First, we import the Python packages we'll be using in the entire notebook

In [1]:
#pip install progressbar
#pip install XXXXXXXXXXX [other packages needed]

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import sys
import string
import requests
import datetime
import progressbar
import time
import re
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from pandas_profiling import ProfileReport

import warnings
warnings.filterwarnings("ignore")

## 1.1 Player basic info 

Explicació

In [2]:
def player_basic_info():
    players = []
    base_url = 'http://www.basketball-reference.com/players/' # this is the page where we are going to find every player in the NBA and ABA history
    start_timer = time.perf_counter()
    for letter in string.ascii_lowercase:
        print('Scrapping basic info of players with letter ' + letter + '...')
        page_request = requests.get(base_url + letter) #we will navigate by letter
        soup = BeautifulSoup(page_request.text,"lxml")
        table = soup.find('table')
        if table:
            table_body = table.find('tbody')
            for row in table_body.findAll('tr'):
                player_url = row.find('a')
                player_names = player_url.text
                player_pages = player_url['href']
                cells = row.findAll('td') # all data for all players uniform across database
                #the information is displayed in this order:
                active_from = int(cells[0].text)
                active_to = int(cells[1].text)
                position = cells[2].text
                height = cells[3].text
                weight = cells[4].text
                birth_date = cells[5].text
                college = cells[6].text    
                player_entry = {'player_url': player_pages,
                                'player_name': player_names,
                                'active_from': active_from,
                                'active_to': active_to,
                                'position': position,
                                'college': college,# we will have to treat this info (+ one college, consistency in names...)
                                'height': height,
                                'weight': weight,
                                'birth_date': birth_date} # we will have to treat this info (date format)
                players.append(player_entry)
    end_timer = time.perf_counter()
    print(f"Scrapped the basic info of the " + str(len(players)) + f" NBA and ABA players in {end_timer - start_timer:0.4f} seconds")
    return pd.DataFrame(players)

#a[a['name'].str.contains('Rubio')] Proves that the dataset contains the info expected

## 1.2 NBA Player stats

Unas consideraciones previas:
- 1o hay que escoger que tablas vamos a scrapear-
- Hay que tener en cuenta que hay jugadores que van a tener muchas menos estadísticas que los actuales (a partir de finales de los 90). Por varias razones: antiguedad, aun NO EXISTIA el tiro de 3P, rebotes no contados hasta el 50, triples dobles popularizados en 1990, etc... Ejemplos: '/players/a/ablefo01.html', '/players/a/abdulza01.html' 
- Baixa't la funció de empty NaN's de PwC
- Considerar el borrar aquellos registros que no tengan suficientes datos
- Ver como tratamos los '' y los None del dataframe
- Scrapearia un botón de playoff para ver que análisis hago del jugador en concreto (me ahorro tiempo)

In [3]:
def player_info_per_game(player_url,table_id,extra_name): #table_id: 'per_game','playoffs_per_game'...
    
    #We will get the info at the Career footpart
    games = None
    games_started = None
    minutes_played_pg = None
    field_goals_pg = None
    field_goals_attempts_pg = None
    field_goals_percent_pg = None
    _3pts_goals_pg = None
    _3pts_goals_attempts_pg = None
    _3pts_goals_percent_pg = None
    _2pts_goals_pg = None
    _2pts_goals_attempts_pg = None
    _2pts_goals_percent_pg = None
    effective_field_goals_percent_pg = None
    FT_goals_pg = None
    FT_goals_attempts_pg = None
    FT_goals_percent_pg = None
    off_rebounds_pg = None
    def_rebounds_pg = None
    total_rebounds_pg = None
    assists_pg = None
    steals_pg = None
    blocks_pg = None
    turnovers_pg = None
    personal_foults_pg = None
    points_pg = None
    
    page_request = requests.get('https://www.basketball-reference.com' + str(player_url))
    soup = BeautifulSoup(page_request.text,"lxml")
    table = soup.find("table",{'id':table_id})
    
    if table:
            table_foot = table.find('tfoot')
            career_row = table_foot.find('tr')
            cells  = career_row.findAll('td')
            playerData = str(cells) #the indexes are not uniform across the database
            # It searches using regular expressions!!!!!
            games = re.search(r'data-stat="g">(.*?)</td>', playerData).group(1)
            games_started = re.search(r'data-stat="gs">(.*?)</td>', playerData).group(1)
            minutes_played_pg = re.search(r'data-stat="mp_per_g">(.*?)</td>', playerData).group(1)
            field_goals_pg = re.search(r'data-stat="fg_per_g">(.*?)</td>', playerData).group(1)
            field_goals_attempts_pg = re.search(r'data-stat="fga_per_g">(.*?)</td>', playerData).group(1)
            field_goals_percent_pg = re.search(r'data-stat="fg_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="fg3_per_g">(.*?)</td>', playerData) != None:
                _3pts_goals_pg = re.search(r'data-stat="fg3_per_g">(.*?)</td>', playerData).group(1)
                _3pts_goals_attempts_pg = re.search(r'data-stat="fg3a_per_g">(.*?)</td>', playerData).group(1)
                _3pts_goals_percent_pg = re.search(r'data-stat="fg3_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="fg2_per_g">(.*?)</td>', playerData) != None:
                _2pts_goals_pg = re.search(r'data-stat="fg2_per_g">(.*?)</td>', playerData).group(1)
                _2pts_goals_attempts_pg = re.search(r'data-stat="fg2a_per_g">(.*?)</td>', playerData).group(1)
                _2pts_goals_percent_pg = re.search(r'data-stat="fg2_pct">(.*?)</td>', playerData).group(1)
                effective_field_goals_percent_pg = re.search(r'data-stat="efg_pct">(.*?)</td>', playerData).group(1)
            FT_goals_pg = re.search(r'data-stat="ft_per_g">(.*?)</td>', playerData).group(1)
            FT_goals_attempts_pg = re.search(r'data-stat="fta_per_g">(.*?)</td>', playerData).group(1)
            FT_goals_percent_pg = re.search(r'data-stat="ft_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="orb_per_g">(.*?)</td>', playerData) != None:
                off_rebounds_pg = re.search(r'data-stat="orb_per_g">(.*?)</td>', playerData).group(1)
                def_rebounds_pg = re.search(r'data-stat="drb_per_g">(.*?)</td>', playerData).group(1)
            total_rebounds_pg = re.search(r'data-stat="trb_per_g">(.*?)</td>', playerData).group(1)
            assists_pg = re.search(r'data-stat="ast_per_g">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="stl_per_g">(.*?)</td>', playerData) != None:
                steals_pg = re.search(r'data-stat="stl_per_g">(.*?)</td>', playerData).group(1)
                blocks_pg = re.search(r'data-stat="blk_per_g">(.*?)</td>', playerData).group(1)
                try:
                    turnovers_pg = re.search(r'data-stat="tov_per_g">(.*?)</td>', playerData).group(1)
                except:
                    pass
            personal_foults_pg = re.search(r'data-stat="pf_per_g">(.*?)</td>', playerData).group(1)
            points_pg = re.search(r'data-stat="pts_per_g">(.*?)</td>', playerData).group(1)
        
    player_info = {#'player_url' : player_url,
                   #'games' + extra_name : [ games ],
                   'games_started' + extra_name : [ games_started ],
                   'minutes_played_pg' + extra_name : [ minutes_played_pg ],
                   'field_goals_pg' + extra_name : [ field_goals_pg ],
                   'field_goals_attempts_pg' + extra_name : [ field_goals_attempts_pg ],
                   'field_goals_percent_pg' + extra_name : [ field_goals_percent_pg ],
                   '_3pts_goals_pg' + extra_name : [ _3pts_goals_pg ],
                   '_3pts_goals_attempts_pg' + extra_name : [ _3pts_goals_attempts_pg ],
                   '_3pts_goals_percent_pg' + extra_name : [ _3pts_goals_percent_pg ],
                   '_2pts_goals_pg' + extra_name : [ _2pts_goals_pg ],
                   '_2pts_goals_attempts_pg' + extra_name : [ _2pts_goals_attempts_pg ],
                   '_2pts_goals_percent_pg' + extra_name : [ _2pts_goals_percent_pg ],
                   'effective_field_goals_percent_pg' + extra_name : [ effective_field_goals_percent_pg ],
                   'FT_goals_pg' + extra_name : [ FT_goals_pg ],
                   'FT_goals_attempts_pg' + extra_name : [ FT_goals_attempts_pg ],
                   'FT_goals_percent_pg' + extra_name : [ FT_goals_percent_pg ],
                   'off_rebounds_pg' + extra_name : [ off_rebounds_pg ],
                   'def_rebounds_pg' + extra_name : [ def_rebounds_pg ],
                   'total_rebounds_pg' + extra_name : [ total_rebounds_pg ],
                   'assists_pg' + extra_name : [ assists_pg ],
                   'steals_pg' + extra_name : [ steals_pg ],
                   'blocks_pg' + extra_name : [ blocks_pg ],
                   'turnovers_pg' + extra_name : [ turnovers_pg ],
                   'personal_foults_pg' + extra_name : [ personal_foults_pg ],
                   'points_pg' + extra_name : [ points_pg ]}
                
    return player_info

In [4]:
def player_info_totals(player_url,table_id,extra_name): #table_id: 'totals','playoffs_totals '...
    
    #We will get the info at the Career footpart
    games = None
    games_started = None
    minutes_played_total = None
    field_goals_total = None
    field_goals_attempts_total = None
    field_goals_percent_total = None
    _3pts_goals_total = None
    _3pts_goals_attempts_total = None
    _3pts_goals_percent_total = None
    _2pts_goals_total = None
    _2pts_goals_attempts_total = None
    _2pts_goals_percent_total = None
    effective_field_goals_percent_total = None
    FT_goals_total = None
    FT_goals_attempts_total = None
    FT_goals_percent_total = None
    off_rebounds_total = None
    def_rebounds_total = None
    total_rebounds_total = None
    assists_total = None
    steals_total = None
    blocks_total = None
    turnovers_total = None
    personal_foults_total = None
    points_total = None
    triple_doubles_total = None 
    
    
    page_request = requests.get('https://www.basketball-reference.com' + str(player_url))
    soup = BeautifulSoup(page_request.text,"lxml")
    table = soup.find("table",{'id':table_id})
    
    if table:
            table_foot = table.find('tfoot')
            career_row = table_foot.find('tr')
            cells  = career_row.findAll('td')
            playerData = str(cells) #the indexes are not uniform across the database
            # It searches using regular expressions!!!!!
            games = re.search(r'data-stat="g">(.*?)</td>', playerData).group(1)
            games_started = re.search(r'data-stat="gs">(.*?)</td>', playerData).group(1)
            minutes_played_total = re.search(r'data-stat="mp">(.*?)</td>', playerData).group(1)
            field_goals_total = re.search(r'data-stat="fg">(.*?)</td>', playerData).group(1)
            field_goals_attempts_total = re.search(r'data-stat="fga">(.*?)</td>', playerData).group(1)
            field_goals_percent_total = re.search(r'data-stat="fg_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="fg3">(.*?)</td>', playerData) != None:
                _3pts_goals_total = re.search(r'data-stat="fg3">(.*?)</td>', playerData).group(1)
                _3pts_goals_attempts_total = re.search(r'data-stat="fg3a">(.*?)</td>', playerData).group(1)
                _3pts_goals_percent_total = re.search(r'data-stat="fg3_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="fg2">(.*?)</td>', playerData) != None:
                _2pts_goals_total = re.search(r'data-stat="fg2">(.*?)</td>', playerData).group(1)
                _2pts_goals_attempts_total = re.search(r'data-stat="fg2a">(.*?)</td>', playerData).group(1)
                _2pts_goals_percent_total = re.search(r'data-stat="fg2_pct">(.*?)</td>', playerData).group(1)
                effective_field_goals_percent_total = re.search(r'data-stat="efg_pct">(.*?)</td>', playerData).group(1)
            FT_goals_total = re.search(r'data-stat="ft">(.*?)</td>', playerData).group(1)
            FT_goals_attempts_total = re.search(r'data-stat="fta">(.*?)</td>', playerData).group(1)
            FT_goals_percent_total = re.search(r'data-stat="ft_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="orb">(.*?)</td>', playerData) != None:
                off_rebounds_total = re.search(r'data-stat="orb">(.*?)</td>', playerData).group(1)
                def_rebounds_total = re.search(r'data-stat="drb">(.*?)</td>', playerData).group(1)
            total_rebounds_total = re.search(r'data-stat="trb">(.*?)</td>', playerData).group(1)
            assists_total = re.search(r'data-stat="ast">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="stl">(.*?)</td>', playerData) != None:
                steals_total = re.search(r'data-stat="stl">(.*?)</td>', playerData).group(1)
                blocks_total = re.search(r'data-stat="blk">(.*?)</td>', playerData).group(1)
                try:
                    turnovers_total = re.search(r'data-stat="tov">(.*?)</td>', playerData).group(1)
                except:
                    pass
            personal_foults_total = re.search(r'data-stat="pf">(.*?)</td>', playerData).group(1)
            points_total = re.search(r'data-stat="pts">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="trp_dbl">(.*?)</td>', playerData) != None:
                triple_doubles_total = re.search(r'data-stat="trp_dbl">(.*?)</td>', playerData).group(1)
        
    player_info = {#'player_url' : [player_url],
                   'games' + extra_name: [games],
                   'games_started' + extra_name: [games_started],
                   'minutes_played_total' + extra_name: [minutes_played_total],
                   'field_goals_total' + extra_name: [field_goals_total],
                   'field_goals_attempts_total' + extra_name: [field_goals_attempts_total],
                   'field_goals_percent_total' + extra_name: [field_goals_percent_total],
                   '_3pts_goals_total' + extra_name : [ _3pts_goals_total],
                   '_3pts_goals_attempts_total' + extra_name : [ _3pts_goals_attempts_total],
                   '_3pts_goals_percent_total' + extra_name : [ _3pts_goals_percent_total],
                   '_2pts_goals_total' + extra_name : [ _2pts_goals_total],
                   '_2pts_goals_attempts_total' + extra_name : [ _2pts_goals_attempts_total],
                   '_2pts_goals_percent_total' + extra_name : [ _2pts_goals_percent_total],
                   'effective_field_goals_percent_total' + extra_name : [ effective_field_goals_percent_total],
                   'FT_goals_total' + extra_name : [ FT_goals_total],
                   'FT_goals_attempts_total' + extra_name : [ FT_goals_attempts_total],
                   'FT_goals_percent_total' + extra_name : [ FT_goals_percent_total],
                   'off_rebounds_total' + extra_name : [ off_rebounds_total],
                   'def_rebounds_total' + extra_name : [ def_rebounds_total],
                   'total_rebounds_total' + extra_name : [ total_rebounds_total],
                   'assists_total' + extra_name : [ assists_total],
                   'steals_total' + extra_name : [ steals_total],
                   'blocks_total' + extra_name : [ blocks_total],
                   'turnovers_total' + extra_name : [ turnovers_total],
                   'personal_foults_total' + extra_name : [ personal_foults_total],
                   'points_total' + extra_name : [ points_total ],
                   'triple_doubles_total' + extra_name : [ triple_doubles_total ]}
                
    return player_info

In [5]:
def player_info_advanced(player_url,table_id,extra_name): #table_id: 'advanced','playoffs_advanced'...
    
    #We will get the info at the Career footpart
    player_eff_rating = None
    true_shooting_eff = None
    _3pts_attempts_rate = None
    FT_attempts_rate = None
    off_rebounds_percent = None
    def_rebounds_percent = None
    total_rebounds_percent = None
    assists_percent = None
    steals_percent = None
    blocks_percent = None
    turnover_percent = None
    usage_percent = None
    off_win_share = None
    def_win_share = None
    win_share = None
    win_share_48min = None
    off_box_pm = None
    def_box_pm = None
    box_pm = None
    value_over_replacement = None
    
    page_request = requests.get('https://www.basketball-reference.com' + str(player_url))
    soup = BeautifulSoup(page_request.text,"lxml")
    table = soup.find("table",{'id':table_id})
    
    if table:
            table_foot = table.find('tfoot')
            career_row = table_foot.find('tr')
            cells  = career_row.findAll('td')
            playerData = str(cells) #the indexes are not uniform across the database
            # It searches using regular expressions!!!!!
            player_eff_rating = re.search(r'data-stat="per">(.*?)</td>', playerData).group(1)
            true_shooting_eff = re.search(r'data-stat="ts_pct">(.*?)</td>', playerData).group(1)
            FT_attempts_rate = re.search(r'data-stat="fta_per_fga_pct">(.*?)</td>', playerData).group(1)
            off_rebounds_percent = re.search(r'data-stat="orb_pct">(.*?)</td>', playerData).group(1)
            def_rebounds_percent = re.search(r'data-stat="drb_pct">(.*?)</td>', playerData).group(1)
            total_rebounds_percent = re.search(r'data-stat="trb_pct">(.*?)</td>', playerData).group(1)
            assists_percent = re.search(r'data-stat="ast_pct">(.*?)</td>', playerData).group(1)
            off_win_share = re.search(r'data-stat="ows">(.*?)</td>', playerData).group(1)
            def_win_share = re.search(r'data-stat="dws">(.*?)</td>', playerData).group(1)
            win_share = re.search(r'data-stat="ws">(.*?)</td>', playerData).group(1)
            win_share_48min = re.search(r'data-stat="ws_per_48">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="fg3a_per_fga_pct">(.*?)</td>', playerData) != None:
                _3pts_attempts_rate = re.search(r'data-stat="fg3a_per_fga_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="stl_pct">(.*?)</td>', playerData) != None:
                steals_percent = re.search(r'data-stat="stl_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="blk_pct">(.*?)</td>', playerData) != None:
                blocks_percent = re.search(r'data-stat="blk_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="tov_pct">(.*?)</td>', playerData) != None:
                turnover_percent = re.search(r'data-stat="tov_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="usg_pct">(.*?)</td>', playerData) != None:
                usage_percent = re.search(r'data-stat="usg_pct">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="obpm">(.*?)</td>', playerData) != None:
                off_box_pm = re.search(r'data-stat="obpm">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="dbpm">(.*?)</td>', playerData) != None:
                def_box_pm = re.search(r'data-stat="dbpm">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="bpm">(.*?)</td>', playerData) != None:
                box_pm = re.search(r'data-stat="bpm">(.*?)</td>', playerData).group(1)
            if re.search(r'data-stat="vorp">(.*?)</td>', playerData) != None:
                value_over_replacement = re.search(r'data-stat="vorp">(.*?)</td>', playerData).group(1)
        
    player_info = {#'player_url' : [player_url],
                   'player_eff_rating' + extra_name: [player_eff_rating],
                   'true_shooting_eff' + extra_name: [true_shooting_eff],
                   '_3pts_attempts_rate' + extra_name: [_3pts_attempts_rate],
                   'FT_attempts_rate' + extra_name: [FT_attempts_rate],
                   'off_rebounds_percent' + extra_name: [off_rebounds_percent],
                   'def_rebounds_percent' + extra_name: [def_rebounds_percent],
                   'total_rebounds_percent' + extra_name: [total_rebounds_percent],
                   'assists_percent' + extra_name: [assists_percent],
                   'steals_percent' + extra_name: [steals_percent],
                   'blocks_percent' + extra_name: [blocks_percent],
                   'turnover_percent' + extra_name: [turnover_percent],
                   'usage_percent' + extra_name: [usage_percent],
                   'off_win_share' + extra_name: [off_win_share],
                   'def_win_share' + extra_name: [def_win_share],
                   'win_share' + extra_name: [win_share],
                   'win_share_48min' + extra_name: [win_share_48min],
                   'off_box_pm' + extra_name: [off_box_pm],
                   'def_box_pm' + extra_name: [def_box_pm],
                   'value_over_replacement' + extra_name: [box_pm]}
    
    return player_info

In [6]:
def player_info_extras(player_url): #ver si queremos scrapear alguna otra información
    
    Hall_Of_Fame = 'No'
    n_NBA_Champ = 0
    n_All_Star = 0
    college_url = None
    gone_to_college = 'No'
    
    page_request = requests.get('https://www.basketball-reference.com' + str(player_url))
    soup = BeautifulSoup(page_request.text,"lxml")
    extras = soup.find('ul',{'id':'bling'})
    links = str(soup.findAll('li'))
    
    if extras:
        if extras.find('li',string = re.compile('Hall of Fame')) != None:  
            Hall_Of_Fame = 'Yes'
        if extras.find('li',string = re.compile('NBA Champ')) != None:
            aux = extras.find('li',string = re.compile('NBA Champ')).text
            if aux.find('x') != -1:
                n_NBA_Champ = aux[:aux.find('x')]
            else:
                n_NBA_Champ = 1
        if extras.find('li',string = re.compile('All Star')) != None:  
            aux = extras.find('li',string = re.compile('All Star')).text
            n_All_Star = aux[:aux.find('x')]
        if re.search(r'<a href="(.*?)">College Basketball at Sports-Reference.com</a>', links) != None:
            college_url = re.search(r'<a href="(.*?)">College Basketball at Sports-Reference.com</a>', links).group(1)
            gone_to_college = 'Yes'
        
    player_info = {#'player_url' : [player_url],
                   'Hall_Of_Fame' : [Hall_Of_Fame],
                   'n_NBA_Champ' : [n_NBA_Champ],
                   'n_All_Star' : [n_All_Star],
                   'college_url' : [college_url],
                   'gone_to_college' : [gone_to_college]}
    
    return player_info

In [7]:
def get_teams_played(player_url): 
    
    page_request = requests.get('https://www.basketball-reference.com' + str(player_url))
    soup = BeautifulSoup(page_request.text,"lxml")
    teams_container = soup.find("div",{'class':'uni_holder bbr'})
    teams_raw_list = teams_container.findAll("a")
    teams = []
    teams_years_dict = {}
    
    if teams_container:
        
        aux = 0
        for team in teams_raw_list:
            
            tm_yr = team['data-tip']
            teams_years_dict['team_' + str(aux)]= [tm_yr[:tm_yr.find(',')]]
            teams.append(tm_yr[:tm_yr.find(',')])
            teams_years_dict['years_' + str(aux)]= [tm_yr[tm_yr.find(',')+2:]]
            
            aux = aux + 1
        number_teams = len(set(teams))
            
    player_info = {#'player_url' : [player_url],
                   'n_teams' : [number_teams],
                   'teams_years_dict':[teams_years_dict]}
    #Alternativa: player_info.update(teams_years_dict)
    
    return player_info

## 1.3 College player stats

Explicació

In [8]:
def player_info_per_game_NCAA(player_url,extra_name,scraping): 
    
    #We will get the info at the Career footpart
    games = None
    games_started = None
    minutes_played_pg = None
    field_goals_pg = None
    field_goals_attempts_pg = None
    field_goals_percent_pg = None
    _3pts_goals_pg = None
    _3pts_goals_attempts_pg = None
    _3pts_goals_percent_pg = None
    _2pts_goals_pg = None
    _2pts_goals_attempts_pg = None
    _2pts_goals_percent_pg = None
    FT_goals_pg = None
    FT_goals_attempts_pg = None
    FT_goals_percent_pg = None
    off_rebounds_pg = None
    def_rebounds_pg = None
    total_rebounds_pg = None
    assists_pg = None
    steals_pg = None
    blocks_pg = None
    turnovers_pg = None
    personal_foults_pg = None
    points_pg = None
    sos_pg = None
    
    if scraping == 'Yes':
        page_request = requests.get(str(player_url))
        soup = BeautifulSoup(page_request.text,"lxml")
        table = soup.find("table",{'id':'players_per_game'})

        if table:
                table_foot = table.find('tfoot')
                career_row = table_foot.find('tr')
                cells  = career_row.findAll('td')
                playerData = str(cells) #the indexes are not uniform across the database
                # It searches using regular expressions!!!!!
                games = re.search(r'data-stat="g">(.*?)</td>', playerData).group(1)
                games_started = re.search(r'data-stat="gs">(.*?)</td>', playerData).group(1)
                minutes_played_pg = re.search(r'data-stat="mp_per_g">(.*?)</td>', playerData).group(1)
                field_goals_pg = re.search(r'data-stat="fg_per_g">(.*?)</td>', playerData).group(1)
                field_goals_attempts_pg = re.search(r'data-stat="fga_per_g">(.*?)</td>', playerData).group(1)
                field_goals_percent_pg = re.search(r'data-stat="fg_pct">(.*?)</td>', playerData).group(1)
                if re.search(r'data-stat="fg3_per_g">(.*?)</td>', playerData) != None:
                    _3pts_goals_pg = re.search(r'data-stat="fg3_per_g">(.*?)</td>', playerData).group(1)
                    _3pts_goals_attempts_pg = re.search(r'data-stat="fg3a_per_g">(.*?)</td>', playerData).group(1)
                    _3pts_goals_percent_pg = re.search(r'data-stat="fg3_pct">(.*?)</td>', playerData).group(1)
                if re.search(r'data-stat="fg2_per_g">(.*?)</td>', playerData) != None:
                    _2pts_goals_pg = re.search(r'data-stat="fg2_per_g">(.*?)</td>', playerData).group(1)
                    _2pts_goals_attempts_pg = re.search(r'data-stat="fg2a_per_g">(.*?)</td>', playerData).group(1)
                    _2pts_goals_percent_pg = re.search(r'data-stat="fg2_pct">(.*?)</td>', playerData).group(1)
                FT_goals_pg = re.search(r'data-stat="ft_per_g">(.*?)</td>', playerData).group(1)
                FT_goals_attempts_pg = re.search(r'data-stat="fta_per_g">(.*?)</td>', playerData).group(1)
                FT_goals_percent_pg = re.search(r'data-stat="ft_pct">(.*?)</td>', playerData).group(1)
                if re.search(r'data-stat="orb_per_g">(.*?)</td>', playerData) != None:
                    off_rebounds_pg = re.search(r'data-stat="orb_per_g">(.*?)</td>', playerData).group(1)
                    def_rebounds_pg = re.search(r'data-stat="drb_per_g">(.*?)</td>', playerData).group(1)
                total_rebounds_pg = re.search(r'data-stat="trb_per_g">(.*?)</td>', playerData).group(1)
                assists_pg = re.search(r'data-stat="ast_per_g">(.*?)</td>', playerData).group(1)
                if re.search(r'data-stat="stl_per_g">(.*?)</td>', playerData) != None:
                    steals_pg = re.search(r'data-stat="stl_per_g">(.*?)</td>', playerData).group(1)
                    blocks_pg = re.search(r'data-stat="blk_per_g">(.*?)</td>', playerData).group(1)
                    turnovers_pg = re.search(r'data-stat="tov_per_g">(.*?)</td>', playerData).group(1)
                personal_foults_pg = re.search(r'data-stat="pf_per_g">(.*?)</td>', playerData).group(1)
                points_pg = re.search(r'data-stat="pts_per_g">(.*?)</td>', playerData).group(1)
                sos_pg = re.search(r'data-stat="sos">(.*?)</td>', playerData).group(1)
        
    player_info = {#'player_url' : [player_url],
                   'games' + extra_name : [games],
                   'games_started' + extra_name : [games_started],
                   'minutes_played_pg' + extra_name : [minutes_played_pg],
                   'field_goals_pg' + extra_name : [field_goals_pg],
                   'field_goals_attempts_pg' + extra_name : [field_goals_attempts_pg],
                   'field_goals_percent_pg' + extra_name : [field_goals_percent_pg],
                   '_3pts_goals_pg' + extra_name : [_3pts_goals_pg],
                   '_3pts_goals_attempts_pg' + extra_name : [_3pts_goals_attempts_pg],
                   '_3pts_goals_percent_pg' + extra_name : [_3pts_goals_percent_pg],
                   '_2pts_goals_pg' + extra_name : [_2pts_goals_pg],
                   '_2pts_goals_attempts_pg' + extra_name : [_2pts_goals_attempts_pg],
                   '_2pts_goals_percent_pg' + extra_name : [_2pts_goals_percent_pg],
                   'FT_goals_pg' + extra_name : [FT_goals_pg],
                   'FT_goals_attempts_pg' + extra_name : [FT_goals_attempts_pg],
                   'FT_goals_percent_pg' + extra_name : [FT_goals_percent_pg],
                   'off_rebounds_pg' + extra_name : [off_rebounds_pg],
                   'def_rebounds_pg' + extra_name : [def_rebounds_pg],
                   'total_rebounds_pg' + extra_name : [total_rebounds_pg],
                   'assists_pg' + extra_name : [assists_pg],
                   'steals_pg' + extra_name : [steals_pg],
                   'blocks_pg' + extra_name : [blocks_pg],
                   'turnovers_pg' + extra_name : [turnovers_pg],
                   'personal_foults_pg' + extra_name : [personal_foults_pg],
                   'points_pg' + extra_name : [points_pg],
                   'sos_pg' + extra_name : [sos_pg]}
                
    return player_info

In [9]:
def player_info_totals_NCAA(player_url,extra_name,selenium):
    
    #We will get the info at the Career footpart
    games = None
    games_started = None
    minutes_played_total = None
    field_goals_total = None
    field_goals_attempts_total = None
    field_goals_percent_total = None
    _3pts_goals_total = None
    _3pts_goals_attempts_total = None
    _3pts_goals_percent_total = None
    _2pts_goals_total = None
    _2pts_goals_attempts_total = None
    _2pts_goals_percent_total = None
    FT_goals_total = None
    FT_goals_attempts_total = None
    FT_goals_percent_total = None
    off_rebounds_total = None
    def_rebounds_total = None
    total_rebounds_total = None
    assists_total = None
    steals_total = None
    blocks_total = None
    turnovers_total = None
    personal_foults_total = None
    points_total = None
    
#     driver = webdriver.Chrome(ChromeDriverManager().install())
#     driver.get(str(player_url))
#     time.sleep(1)
#     driver.find_element_by_class_name("css-47sehv").click() #accept cookies button
    
    #now we search by force
    if selenium == 'Yes':
        try:
            games = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='g']").text
        except:
            pass
        try:
            games_started = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='gs']").text
        except:
            pass
        try:
            minutes_played_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='mp']").text
        except:
            pass
        try:
            field_goals_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg']").text
        except:
            pass
        try:
            field_goals_attempts_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fga']").text
        except:
            pass
        try:
            field_goals_percent_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg_pct']").text
        except:
            pass
        try:
            FT_goals_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='ft']").text
        except:
            pass
        try:
            FT_goals_attempts_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fta']").text
        except:
            pass
        try:
            FT_goals_percent_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='ft_pct']").text
        except:
            pass
        try:
            total_rebounds_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='trb']").text
        except:
            pass
        try:
            assists_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='ast']").text
        except:
            pass
        try:
            steals_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='stl']").text
        except:
            pass
        try:
            blocks_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='blk']").text
        except:
            pass
        try:
            turnovers_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='tov']").text
        except:
            pass
        try:
            personal_foults_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='pf']").text
        except:
            pass
        try:
            points_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='pts']").text
        except:
            pass
        try:
            _2pts_goals_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg2']").text
        except:
            pass
        try:
            _2pts_goals_attempts_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg2a']").text
        except:
            pass
        try:
            _2pts_goals_percent_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg2_pct']").text
        except:
            pass
        try:
            _3pts_goals_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg3']").text
        except:
            pass
        try:
            _3pts_goals_attempts_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg3a']").text
        except:
            pass
        try:
            _3pts_goals_percent_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='fg3_pct']").text
        except:
            pass
        try:
            off_rebounds_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='orb']").text
        except:
            pass
        try:
            def_rebounds_total = driver.find_element_by_xpath("//table[@id='players_totals']//tfoot//td[@data-stat='drb']").text
        except:
            pass
        
    player_info = {#'player_url' : [player_url],
                   'minutes_played_total' + extra_name : [minutes_played_total],
                   'field_goals_total' + extra_name : [field_goals_total],
                   'field_goals_attempts_total' + extra_name : [field_goals_attempts_total],
                   'field_goals_percent_total' + extra_name : [field_goals_percent_total],
                   '_3pts_goals_total' + extra_name : [_3pts_goals_total],
                   '_3pts_goals_attempts_total' + extra_name : [_3pts_goals_attempts_total],
                   '_3pts_goals_percent_total' + extra_name : [_3pts_goals_percent_total],
                   '_2pts_goals_total' + extra_name : [_2pts_goals_total],
                   '_2pts_goals_attempts_total' + extra_name : [_2pts_goals_attempts_total],
                   '_2pts_goals_percent_total' + extra_name : [_2pts_goals_percent_total],
                   'FT_goals_total' + extra_name : [FT_goals_total],
                   'FT_goals_attempts_total' + extra_name : [FT_goals_attempts_total],
                   'FT_goals_percent_total' + extra_name : [FT_goals_percent_total],
                   'off_rebounds_total' + extra_name : [off_rebounds_total],
                   'def_rebounds_total' + extra_name : [def_rebounds_total],
                   'total_rebounds_total' + extra_name : [total_rebounds_total],
                   'assists_total' + extra_name : [assists_total],
                   'steals_total' + extra_name : [steals_total],
                   'blocks_total' + extra_name : [blocks_total],
                   'turnovers_total' + extra_name : [turnovers_total],
                   'personal_foults_total' + extra_name : [personal_foults_total],
                   'points_total' + extra_name : [points_total]}
#     driver.close()
    
    return player_info

In [10]:
def player_info_advanced_NCAA(player_url,extra_name,selenium):
    
    #We will get the info at the Career footpart
    games = None
    games_started = None
    minutes_played_advanced = None
    player_eff_rating = None
    true_shooting_eff = None
    _3pts_attempts_rate = None
    FT_attempts_rate = None
    points_produced = None
    off_rebounds_percent = None
    def_rebounds_percent = None
    total_rebounds_percent = None
    assists_percent = None
    steals_percent = None 
    blocks_percent = None 
    turnover_percent = None 
    usage_percent = None 
    off_win_share = None
    def_win_share = None
    win_share = None
    win_share_40min = None
    off_box_pm = None
    def_box_pm = None
    box_pm = None
    
#     driver = webdriver.Chrome(ChromeDriverManager().install())
#     driver.get(str(player_url))
#     time.sleep(1)
#     driver.find_element_by_class_name("css-47sehv").click() #accept cookies button
    
    #now we search by force
    if selenium == 'Yes':
        try:
            games = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='g']").text
        except:
            pass
        try:
            games_started = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='gs']").text
        except:
            pass
        try:
            minutes_played_advanced = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='mp']").text
        except:
            pass
        try:
            true_shooting_eff = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='ts_pct']").text
        except:
            pass
        try:
            player_eff_rating = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='efg_pct']").text
        except:
            pass
        try:
            _3pts_attempts_rate = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='fg3a_per_fga_pct']").text
        except:
            pass
        try:
            FT_attempts_rate = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='fta_per_fga_pct']").text
        except:
            pass
        try:
            points_produced = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='pprod']").text
        except:
            pass
        try:
            off_rebounds_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='orb_pct']").text
        except:
            pass
        try:
            def_rebounds_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='drb_pct']").text
        except:
            pass
        try:
            total_rebounds_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='trb_pct']").text
        except:
            pass
        try:
            assists_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='ast_pct']").text
        except:
            pass
        try:
            steals_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='stl_pct']").text
        except:
            pass
        try:
            blocks_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='blk_pct']").text
        except:
            pass
        try:
            turnover_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='tov_pct']").text
        except:
            pass
        try:
            usage_percent = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='usg_pct']").text
        except:
            pass
        try:
            off_win_share = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='ows']").text
        except:
            pass
        try:
            def_win_share = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='dws']").text
        except:
            pass
        try:
            win_share = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='ws']").text
        except:
            pass
        try:
            win_share_40min = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='ws_per_40']").text
        except:
            pass
        try:
            off_box_pm = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='obpm']").text
        except:
            pass
        try:
            def_box_pm = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='dbpm']").text
        except:
            pass
        try:
            box_pm = driver.find_element_by_xpath("//table[@id='players_advanced']//tfoot//td[@data-stat='bpm']").text
        except:
            pass
        
    player_info = {#'player_url' : [player_url],
                   'minutes_played_advanced' + extra_name : [ minutes_played_advanced],
                   'player_eff_rating' + extra_name : [ player_eff_rating],
                   'true_shooting_eff' + extra_name : [ true_shooting_eff],
                   '_3pts_attempts_rate' + extra_name : [ _3pts_attempts_rate],
                   'FT_attempts_rate' + extra_name : [ FT_attempts_rate],
                   'points_produced' + extra_name : [ points_produced],
                   'off_rebounds_percent' + extra_name : [ off_rebounds_percent],
                   'def_rebounds_percent' + extra_name : [ def_rebounds_percent],
                   'total_rebounds_percent' + extra_name : [ total_rebounds_percent],
                   'assists_percent' + extra_name : [ assists_percent],
                   'steals_percent' + extra_name : [ steals_percent],
                   'blocks_percent' + extra_name : [ blocks_percent],
                   'turnover_percent' + extra_name : [ turnover_percent],
                   'usage_percent' + extra_name : [ usage_percent],
                   'off_win_share' + extra_name : [ off_win_share],
                   'def_win_share' + extra_name : [ def_win_share],
                   'win_share' + extra_name : [ win_share],
                   'win_share_40min' + extra_name : [ win_share_40min],
                   'off_box_pm' + extra_name : [ off_box_pm],
                   'def_box_pm' + extra_name : [ def_box_pm],
                   'box_pm' + extra_name : [ box_pm]}
#     driver.close()
    
    return player_info

In [11]:
def get_empty_df_s(player_url,extra_name):
    
    games = None
    games_started = None
    minutes_played_pg = None
    field_goals_pg = None
    field_goals_attempts_pg = None
    field_goals_percent_pg = None
    _3pts_goals_pg = None
    _3pts_goals_attempts_pg = None
    _3pts_goals_percent_pg = None
    _2pts_goals_pg = None
    _2pts_goals_attempts_pg = None
    _2pts_goals_percent_pg = None
    FT_goals_pg = None
    FT_goals_attempts_pg = None
    FT_goals_percent_pg = None
    off_rebounds_pg = None
    def_rebounds_pg = None
    total_rebounds_pg = None
    assists_pg = None
    steals_pg = None
    blocks_pg = None
    turnovers_pg = None
    personal_foults_pg = None
    points_pg = None
    sos_pg = None
    
    games = None
    games_started = None
    minutes_played_total = None
    field_goals_total = None
    field_goals_attempts_total = None
    field_goals_percent_total = None
    _3pts_goals_total = None
    _3pts_goals_attempts_total = None
    _3pts_goals_percent_total = None
    _2pts_goals_total = None
    _2pts_goals_attempts_total = None
    _2pts_goals_percent_total = None
    FT_goals_total = None
    FT_goals_attempts_total = None
    FT_goals_percent_total = None
    off_rebounds_total = None
    def_rebounds_total = None
    total_rebounds_total = None
    assists_total = None
    steals_total = None
    blocks_total = None
    turnovers_total = None
    personal_foults_total = None
    points_total = None
       
    games = None
    games_started = None
    minutes_played_advanced = None
    player_eff_rating = None
    true_shooting_eff = None
    _3pts_attempts_rate = None
    FT_attempts_rate = None
    points_produced = None
    off_rebounds_percent = None
    def_rebounds_percent = None
    total_rebounds_percent = None
    assists_percent = None
    steals_percent = None 
    blocks_percent = None 
    turnover_percent = None 
    usage_percent = None 
    off_win_share = None
    def_win_share = None
    win_share = None
    win_share_40min = None
    off_box_pm = None
    def_box_pm = None
    box_pm = None    
        
    player_info = {'games' + extra_name : [games],
                   'games_started' + extra_name : [games_started],
                   'minutes_played_pg' + extra_name : [minutes_played_pg],
                   'field_goals_pg' + extra_name : [field_goals_pg],
                   'field_goals_attempts_pg' + extra_name : [field_goals_attempts_pg],
                   'field_goals_percent_pg' + extra_name : [field_goals_percent_pg],
                   '_3pts_goals_pg' + extra_name : [_3pts_goals_pg],
                   '_3pts_goals_attempts_pg' + extra_name : [_3pts_goals_attempts_pg],
                   '_3pts_goals_percent_pg' + extra_name : [_3pts_goals_percent_pg],
                   '_2pts_goals_pg' + extra_name : [_2pts_goals_pg],
                   '_2pts_goals_attempts_pg' + extra_name : [_2pts_goals_attempts_pg],
                   '_2pts_goals_percent_pg' + extra_name : [_2pts_goals_percent_pg],
                   'FT_goals_pg' + extra_name : [FT_goals_pg],
                   'FT_goals_attempts_pg' + extra_name : [FT_goals_attempts_pg],
                   'FT_goals_percent_pg' + extra_name : [FT_goals_percent_pg],
                   'off_rebounds_pg' + extra_name : [off_rebounds_pg],
                   'def_rebounds_pg' + extra_name : [def_rebounds_pg],
                   'total_rebounds_pg' + extra_name : [total_rebounds_pg],
                   'assists_pg' + extra_name : [assists_pg],
                   'steals_pg' + extra_name : [steals_pg],
                   'blocks_pg' + extra_name : [blocks_pg],
                   'turnovers_pg' + extra_name : [turnovers_pg],
                   'personal_foults_pg' + extra_name : [personal_foults_pg],
                   'points_pg' + extra_name : [points_pg],
                   'sos_pg' + extra_name : [sos_pg],
                   'minutes_played_total' + extra_name : [minutes_played_total],
                   'field_goals_total' + extra_name : [field_goals_total],
                   'field_goals_attempts_total' + extra_name : [field_goals_attempts_total],
                   'field_goals_percent_total' + extra_name : [field_goals_percent_total],
                   '_3pts_goals_total' + extra_name : [_3pts_goals_total],
                   '_3pts_goals_attempts_total' + extra_name : [_3pts_goals_attempts_total],
                   '_3pts_goals_percent_total' + extra_name : [_3pts_goals_percent_total],
                   '_2pts_goals_total' + extra_name : [_2pts_goals_total],
                   '_2pts_goals_attempts_total' + extra_name : [_2pts_goals_attempts_total],
                   '_2pts_goals_percent_total' + extra_name : [_2pts_goals_percent_total],
                   'FT_goals_total' + extra_name : [FT_goals_total],
                   'FT_goals_attempts_total' + extra_name : [FT_goals_attempts_total],
                   'FT_goals_percent_total' + extra_name : [FT_goals_percent_total],
                   'off_rebounds_total' + extra_name : [off_rebounds_total],
                   'def_rebounds_total' + extra_name : [def_rebounds_total],
                   'total_rebounds_total' + extra_name : [total_rebounds_total],
                   'assists_total' + extra_name : [assists_total],
                   'steals_total' + extra_name : [steals_total],
                   'blocks_total' + extra_name : [blocks_total],
                   'turnovers_total' + extra_name : [turnovers_total],
                   'personal_foults_total' + extra_name : [personal_foults_total],
                   'points_total' + extra_name : [points_total],
                   'minutes_played_advanced' + extra_name : [ minutes_played_advanced],
                   'player_eff_rating' + extra_name : [ player_eff_rating],
                   'true_shooting_eff' + extra_name : [ true_shooting_eff],
                   '_3pts_attempts_rate' + extra_name : [ _3pts_attempts_rate],
                   'FT_attempts_rate' + extra_name : [ FT_attempts_rate],
                   'points_produced' + extra_name : [ points_produced],
                   'off_rebounds_percent' + extra_name : [ off_rebounds_percent],
                   'def_rebounds_percent' + extra_name : [ def_rebounds_percent],
                   'total_rebounds_percent' + extra_name : [ total_rebounds_percent],
                   'assists_percent' + extra_name : [ assists_percent],
                   'steals_percent' + extra_name : [ steals_percent],
                   'blocks_percent' + extra_name : [ blocks_percent],
                   'turnover_percent' + extra_name : [ turnover_percent],
                   'usage_percent' + extra_name : [ usage_percent],
                   'off_win_share' + extra_name : [ off_win_share],
                   'def_win_share' + extra_name : [ def_win_share],
                   'win_share' + extra_name : [ win_share],
                   'win_share_40min' + extra_name : [ win_share_40min],
                   'off_box_pm' + extra_name : [ off_box_pm],
                   'def_box_pm' + extra_name : [ def_box_pm],
                   'box_pm' + extra_name : [ box_pm]}
                
    return player_info

## 1.4 Complementary Dataframes to enrich player data

### Teams info (standings/stats over years)
To match with players

In [12]:
def get_all_teams(): 
    
    page_request = requests.get('https://www.basketball-reference.com/teams/')
    soup = BeautifulSoup(page_request.text,"lxml")
    teams_table = soup.find("table",{'id':'teams_active'})
    teams_with_link = teams_table.findAll("tr",{'class':'full_table'})
    
    teams_df = pd.DataFrame()

    for team in teams_with_link:
        
        teams_df = teams_df.append(get_team_data(team.find("a")['href']))
    
    return teams_df

In [13]:
def get_team_data(team_url): 
    
    three_letters = team_url[-4:-1]
    
    print('Scraping ' + three_letters + ' team info...')
    
    page_request = requests.get('https://www.basketball-reference.com' + str(team_url))
    soup = BeautifulSoup(page_request.text,"lxml")
    table = soup.find("table",{'id':three_letters})
    table_body = table.find("tbody")
    rows = table_body.findAll("tr")
    
    season = []
    league = []
    team_name = []
    wins = []
    losses = []
    w_l_pct = []
    
    
    for row in rows:
        season.append(row.find("th").get_text())
        columns = row.findAll("td")
    
        league.append(columns[0].get_text())
        team_name.append(columns[1].get_text())
        wins.append(columns[2].get_text())
        losses.append(columns[3].get_text())
        w_l_pct.append(columns[4].get_text())
    
    three_letters = [three_letters]*len(w_l_pct)
    
    team_df = pd.DataFrame({'team_3_lett':three_letters,'season':season,'league':league,'team_name':team_name,'wins':wins,'losses':losses,'w_l_pct':w_l_pct})

    return team_df

### Draft picks over the years
To match with players

In [14]:
def get_draft_dataframe():
    start_timer = time.perf_counter()
    page_request = requests.get('https://www.basketball-reference.com/draft/')
    soup = BeautifulSoup(page_request.text,"lxml")
    draft_init_table = soup.find_all("th",{"data-stat" : "draft"})[1:]
    draft_urls = []
    for row in draft_init_table:
        draft_urls.append(row.a['href'])
    
    data = []
    df = pd.DataFrame(data, columns=['draft_year','draft_round', 'draft_pick_number', 'draft_team','player_name','player_url','college'])
    for url in draft_urls:
        df = df.append(get_draft_info_by_year(url))
        
    end_timer = time.perf_counter()
    print(f"Scraped the Draft info of the " + str(len(df)) + f" drafted NBA players in {end_timer - start_timer:0.4f} seconds")    
    return df
    

In [15]:
def get_draft_info_by_year(url): #ojo pq hi ha files on l'equip va decidir no fer el pick (exemple 2002) 
    page_request = requests.get('https://www.basketball-reference.com/' + url)
    soup = BeautifulSoup(page_request.text,"lxml")
    
    year = url[-9:-5]
    print('Scraping ' + year + ' Draft info...')
    round = 1
    
    Round = []
    Pick_number = []
    Team = []
    Player_name = []
    Player_url = []
    College = []
    
    table = soup.find("table")
    table_body = table.find('tbody')
    
    for row in table_body.findAll('tr'):
        try:
            # primer comprovem que no hi ha cap error al scrapejar la info, després afegim
            cells = row.findAll('td')
            pick_number = cells[0].text
            team = cells[1].text
            player_name = cells[2].text
            player_url = cells[2].a['href']
            college = cells[3].text
            
            Pick_number.append(pick_number)
            Team.append(team)
            Player_name.append(player_name)
            Player_url.append(player_url)
            College.append(college)
            Round.append(round)
        except:
            try:
                if row.get('class')[0] == "over_header":
                    round = round + 1
            except:
                #print(len(Team))
                pass
    Year = [year] * len(Round)
    #print(len(Year))
    #print(len(Team))
    
    return pd.DataFrame({'draft_year':Year,'draft_round':Round, 'draft_pick_number':Pick_number, 'draft_team':Team,'player_name':Player_name,'player_url':Player_url,'college':College})
    

# Main Code

In [17]:
####

###### Basic info dataframe

In [18]:
df_basic_info = player_basic_info()
df_basic_info.to_excel('Scraped data/Basic_Info_Raw.xlsx')

Scrapping basic info of players with letter a...
Scrapping basic info of players with letter b...
Scrapping basic info of players with letter c...
Scrapping basic info of players with letter d...
Scrapping basic info of players with letter e...
Scrapping basic info of players with letter f...
Scrapping basic info of players with letter g...
Scrapping basic info of players with letter h...
Scrapping basic info of players with letter i...
Scrapping basic info of players with letter j...
Scrapping basic info of players with letter k...
Scrapping basic info of players with letter l...
Scrapping basic info of players with letter m...
Scrapping basic info of players with letter n...
Scrapping basic info of players with letter o...
Scrapping basic info of players with letter p...
Scrapping basic info of players with letter q...
Scrapping basic info of players with letter r...
Scrapping basic info of players with letter s...
Scrapping basic info of players with letter t...
Scrapping basic info

In [19]:
df_basic_info.head()

Unnamed: 0,player_url,player_name,active_from,active_to,position,college,height,weight,birth_date
0,/players/a/abdelal01.html,Alaa Abdelnaby,1991,1995,F-C,Duke,6-10,240,"June 24, 1968"
1,/players/a/abdulza01.html,Zaid Abdul-Aziz,1969,1978,C-F,Iowa State,6-9,235,"April 7, 1946"
2,/players/a/abdulka01.html,Kareem Abdul-Jabbar,1970,1989,C,UCLA,7-2,225,"April 16, 1947"
3,/players/a/abdulma02.html,Mahmoud Abdul-Rauf,1991,2001,G,LSU,6-1,162,"March 9, 1969"
4,/players/a/abdulta01.html,Tariq Abdul-Wahad,1998,2003,F,"Michigan, San Jose State",6-6,223,"November 3, 1974"


In [20]:
#We prove that the 'player_url' column is really an id column for each player
idd = df_basic_info.drop_duplicates(subset=['player_url'])
print(len(idd))
print(len(df_basic_info))

4985
4985


###### Every stat of each player (main dataframe)

In [24]:
df_basic_info = pd.read_excel('Scraped data/Basic_Info_Raw.xlsx',index_col=0)

try:
    df_scraped = pd.read_excel('Scraped data/Scraped_players.xlsx',index_col=0)
    df_basic_info = df_basic_info.merge(df_scraped,on='player_url')
    df_basic_info = df_basic_info[df_basic_info['check']=='No'] #we only want to scrap info from players not already scrapped
    df_final = pd.read_excel('Scraped data/NBA_Complete_Raw.xlsx',index_col=0)
except:
    df_basic_info['check'] = 'No' #To know which columns have been already scrapped #If it's the first time...
    df_scraped = df_basic_info[['player_url','check']]
    df_final = pd.DataFrame()

#bar = progressbar.ProgressBar(len(df_basic_info_subset)) # falta un enumerate i al for de després

driver = webdriver.Chrome(ChromeDriverManager().install())
for i,url in enumerate(df_basic_info.player_url):
        
        df_pg_RegS = pd.DataFrame(player_info_per_game(url,'per_game','_RegS'))
        df_pg_Playoff = pd.DataFrame(player_info_per_game(url,'playoffs_per_game','_Playoff'))
        df_tot_RegS = pd.DataFrame(player_info_totals(url,'totals','_RegS'))
        df_tot_Playoff = pd.DataFrame(player_info_totals(url,'playoffs_totals','_Playoff'))
        df_adv_RegS = pd.DataFrame(player_info_advanced(url,'advanced','_RegS'))
        df_adv_Playoff = pd.DataFrame(player_info_advanced(url,'playoffs_advanced','_Playoff'))
        df_extra = pd.DataFrame(player_info_extras(url))
        df_teams_played = pd.DataFrame(get_teams_played(url))
        
        df_aux = pd.concat([df_pg_RegS,df_pg_Playoff],axis=1)
        df_aux = pd.concat([df_aux,df_tot_RegS],axis=1)
        df_aux = pd.concat([df_aux,df_tot_Playoff],axis=1)
        df_aux = pd.concat([df_aux,df_adv_RegS],axis=1)
        df_aux = pd.concat([df_aux,df_adv_Playoff],axis=1)
        df_aux = pd.concat([df_aux,df_extra],axis=1)
        df_aux = pd.concat([df_aux,df_teams_played],axis=1)
        
        
#         df_aux = pd.DataFrame(player_info_per_game(url,'per_game','RegS'))
#         df_aux = pd.concat([df_aux,pd.DataFrame(player_info_per_game(url,'playoffs_per_game','_Playoff'))],axis=1)
#         df_aux = pd.concat([df_aux,pd.DataFrame(player_info_totals(url,'totals','_RegS'))],axis=1)
#         df_aux = pd.concat([df_aux,pd.DataFrame(player_info_totals(url,'playoffs_totals','_Playoff'))],axis=1)
#         df_aux = pd.concat([df_aux,pd.DataFrame(player_info_advanced(url,'advanced','RegS'))],axis=1)
#         df_aux = pd.concat([df_aux,pd.DataFrame(player_info_advanced(url,'playoffs_advanced','_Playoff'))],axis=1)
#         df_aux = pd.concat([df_aux,pd.DataFrame(player_info_extras(url))],axis=1) #OJOOO NAMEEE
#         df_aux = pd.concat([df_aux,pd.DataFrame(get_teams_played(url))],axis=1) #OJOOO NAMEEE
        
        college_url = df_aux['college_url'][0]
        if df_aux['gone_to_college'][0]=='Yes':
            
            df_pg_NCAA = pd.DataFrame(player_info_per_game_NCAA(college_url,'','Yes'))
            df_aux = pd.concat([df_aux,df_pg_NCAA],axis=1)
            
#             df_aux = pd.concat([df_aux,pd.DataFrame(player_info_per_game_NCAA(college_url,''))],axis=1)
            
            #Now we do scraping with the selenium functions
            driver.get(str(college_url))
            try:
                time.sleep(1)
                driver.find_element_by_class_name("css-47sehv").click() #accept cookies button
            except:
                pass
            
            df_tot_NCAA = pd.DataFrame(player_info_totals_NCAA(college_url,'','Yes'))
            df_adv_NCAA = pd.DataFrame(player_info_advanced_NCAA(college_url,'','Yes'))
            
            df_aux = pd.concat([df_aux,df_tot_NCAA],axis=1)
            df_aux = pd.concat([df_aux,df_adv_NCAA],axis=1)
            
#             df_aux = pd.concat([df_aux,pd.DataFrame(player_info_totals_NCAA(college_url,''))],axis=1)
#             df_aux = pd.concat([df_aux,pd.DataFrame(player_info_advanced_NCAA(college_url,''))],axis=1)
        else: #Not college
        
            df_pg_NCAA = pd.DataFrame(player_info_per_game_NCAA(college_url,'','No'))
            df_tot_NCAA = pd.DataFrame(player_info_totals_NCAA(college_url,'','No'))
            df_adv_NCAA = pd.DataFrame(player_info_advanced_NCAA(college_url,'','No'))
            
            df_aux = pd.concat([df_aux,df_pg_NCAA],axis=1)
            df_aux = pd.concat([df_aux,df_tot_NCAA],axis=1)
            df_aux = pd.concat([df_aux,df_adv_NCAA],axis=1)
        
#             df_aux = pd.concat([df_aux,pd.DataFrame(get_empty_df_s(college_url,''))],axis=1)
        df_aux['player_url'] = url
        df_final = df_final.loc[:,~df_final.columns.duplicated()]
        df_aux = df_aux.loc[:,~df_aux.columns.duplicated()]
        df_final = df_final.append(df_aux,ignore_index=True)
        df_scraped.loc[df_scraped['player_url']==url,'check'] = 'Yes'
        df_scraped.to_excel('Scraped data/Scraped_players.xlsx')
        df_final.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
        
#         df_pg_RegS.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_pg_Playoff.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_tot_RegS.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_tot_Playoff.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_adv_RegS.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_adv_Playoff.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_teams_played.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_extra.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_pg_NCAA.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_tot_NCAA.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
#         df_adv_NCAA.to_excel('Scraped data/NBA_Complete_Raw.xlsx')
        
        print(str(i) + ' players out of ' + str(len(df_basic_info)) + ' scraped...')
        
#         if i==1:
#             break

driver.close()
#bar.update(i)
#fer .loc amb check 'Yes'
#+ falta merge amb el basics al final
#Com a suffix posaria que _NBA i _NCAA



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [C:\Users\oriol\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


0 players out of 2462 scraped...
1 players out of 2462 scraped...
2 players out of 2462 scraped...
3 players out of 2462 scraped...
4 players out of 2462 scraped...
5 players out of 2462 scraped...
6 players out of 2462 scraped...
7 players out of 2462 scraped...
8 players out of 2462 scraped...
9 players out of 2462 scraped...
10 players out of 2462 scraped...
11 players out of 2462 scraped...
12 players out of 2462 scraped...
13 players out of 2462 scraped...
14 players out of 2462 scraped...
15 players out of 2462 scraped...
16 players out of 2462 scraped...
17 players out of 2462 scraped...
18 players out of 2462 scraped...
19 players out of 2462 scraped...
20 players out of 2462 scraped...
21 players out of 2462 scraped...
22 players out of 2462 scraped...
23 players out of 2462 scraped...
24 players out of 2462 scraped...
25 players out of 2462 scraped...
26 players out of 2462 scraped...
27 players out of 2462 scraped...
28 players out of 2462 scraped...
29 players out of 2462 s

238 players out of 2462 scraped...
239 players out of 2462 scraped...
240 players out of 2462 scraped...
241 players out of 2462 scraped...
242 players out of 2462 scraped...
243 players out of 2462 scraped...
244 players out of 2462 scraped...
245 players out of 2462 scraped...
246 players out of 2462 scraped...
247 players out of 2462 scraped...
248 players out of 2462 scraped...
249 players out of 2462 scraped...
250 players out of 2462 scraped...
251 players out of 2462 scraped...
252 players out of 2462 scraped...
253 players out of 2462 scraped...
254 players out of 2462 scraped...
255 players out of 2462 scraped...
256 players out of 2462 scraped...
257 players out of 2462 scraped...
258 players out of 2462 scraped...
259 players out of 2462 scraped...
260 players out of 2462 scraped...
261 players out of 2462 scraped...
262 players out of 2462 scraped...
263 players out of 2462 scraped...
264 players out of 2462 scraped...
265 players out of 2462 scraped...
266 players out of 2

473 players out of 2462 scraped...
474 players out of 2462 scraped...
475 players out of 2462 scraped...
476 players out of 2462 scraped...
477 players out of 2462 scraped...
478 players out of 2462 scraped...
479 players out of 2462 scraped...
480 players out of 2462 scraped...
481 players out of 2462 scraped...
482 players out of 2462 scraped...
483 players out of 2462 scraped...
484 players out of 2462 scraped...
485 players out of 2462 scraped...
486 players out of 2462 scraped...
487 players out of 2462 scraped...
488 players out of 2462 scraped...
489 players out of 2462 scraped...
490 players out of 2462 scraped...
491 players out of 2462 scraped...
492 players out of 2462 scraped...
493 players out of 2462 scraped...
494 players out of 2462 scraped...
495 players out of 2462 scraped...
496 players out of 2462 scraped...
497 players out of 2462 scraped...
498 players out of 2462 scraped...
499 players out of 2462 scraped...
500 players out of 2462 scraped...
501 players out of 2

708 players out of 2462 scraped...
709 players out of 2462 scraped...
710 players out of 2462 scraped...
711 players out of 2462 scraped...
712 players out of 2462 scraped...
713 players out of 2462 scraped...
714 players out of 2462 scraped...
715 players out of 2462 scraped...
716 players out of 2462 scraped...
717 players out of 2462 scraped...
718 players out of 2462 scraped...
719 players out of 2462 scraped...
720 players out of 2462 scraped...
721 players out of 2462 scraped...
722 players out of 2462 scraped...
723 players out of 2462 scraped...
724 players out of 2462 scraped...
725 players out of 2462 scraped...
726 players out of 2462 scraped...
727 players out of 2462 scraped...
728 players out of 2462 scraped...
729 players out of 2462 scraped...
730 players out of 2462 scraped...
731 players out of 2462 scraped...
732 players out of 2462 scraped...
733 players out of 2462 scraped...
734 players out of 2462 scraped...
735 players out of 2462 scraped...
736 players out of 2

943 players out of 2462 scraped...
944 players out of 2462 scraped...
945 players out of 2462 scraped...
946 players out of 2462 scraped...
947 players out of 2462 scraped...
948 players out of 2462 scraped...
949 players out of 2462 scraped...
950 players out of 2462 scraped...
951 players out of 2462 scraped...
952 players out of 2462 scraped...
953 players out of 2462 scraped...
954 players out of 2462 scraped...
955 players out of 2462 scraped...
956 players out of 2462 scraped...
957 players out of 2462 scraped...
958 players out of 2462 scraped...
959 players out of 2462 scraped...
960 players out of 2462 scraped...
961 players out of 2462 scraped...
962 players out of 2462 scraped...
963 players out of 2462 scraped...
964 players out of 2462 scraped...
965 players out of 2462 scraped...
966 players out of 2462 scraped...
967 players out of 2462 scraped...
968 players out of 2462 scraped...
969 players out of 2462 scraped...
970 players out of 2462 scraped...
971 players out of 2

1173 players out of 2462 scraped...
1174 players out of 2462 scraped...
1175 players out of 2462 scraped...
1176 players out of 2462 scraped...
1177 players out of 2462 scraped...
1178 players out of 2462 scraped...
1179 players out of 2462 scraped...
1180 players out of 2462 scraped...
1181 players out of 2462 scraped...
1182 players out of 2462 scraped...
1183 players out of 2462 scraped...
1184 players out of 2462 scraped...
1185 players out of 2462 scraped...
1186 players out of 2462 scraped...
1187 players out of 2462 scraped...
1188 players out of 2462 scraped...
1189 players out of 2462 scraped...
1190 players out of 2462 scraped...
1191 players out of 2462 scraped...
1192 players out of 2462 scraped...
1193 players out of 2462 scraped...
1194 players out of 2462 scraped...
1195 players out of 2462 scraped...
1196 players out of 2462 scraped...
1197 players out of 2462 scraped...
1198 players out of 2462 scraped...
1199 players out of 2462 scraped...
1200 players out of 2462 scr

1401 players out of 2462 scraped...
1402 players out of 2462 scraped...
1403 players out of 2462 scraped...
1404 players out of 2462 scraped...
1405 players out of 2462 scraped...
1406 players out of 2462 scraped...
1407 players out of 2462 scraped...
1408 players out of 2462 scraped...
1409 players out of 2462 scraped...
1410 players out of 2462 scraped...
1411 players out of 2462 scraped...
1412 players out of 2462 scraped...
1413 players out of 2462 scraped...
1414 players out of 2462 scraped...
1415 players out of 2462 scraped...
1416 players out of 2462 scraped...
1417 players out of 2462 scraped...
1418 players out of 2462 scraped...
1419 players out of 2462 scraped...
1420 players out of 2462 scraped...
1421 players out of 2462 scraped...
1422 players out of 2462 scraped...
1423 players out of 2462 scraped...
1424 players out of 2462 scraped...
1425 players out of 2462 scraped...
1426 players out of 2462 scraped...
1427 players out of 2462 scraped...
1428 players out of 2462 scr

1629 players out of 2462 scraped...
1630 players out of 2462 scraped...
1631 players out of 2462 scraped...
1632 players out of 2462 scraped...
1633 players out of 2462 scraped...
1634 players out of 2462 scraped...
1635 players out of 2462 scraped...
1636 players out of 2462 scraped...
1637 players out of 2462 scraped...
1638 players out of 2462 scraped...
1639 players out of 2462 scraped...
1640 players out of 2462 scraped...
1641 players out of 2462 scraped...
1642 players out of 2462 scraped...
1643 players out of 2462 scraped...
1644 players out of 2462 scraped...
1645 players out of 2462 scraped...
1646 players out of 2462 scraped...
1647 players out of 2462 scraped...
1648 players out of 2462 scraped...
1649 players out of 2462 scraped...
1650 players out of 2462 scraped...
1651 players out of 2462 scraped...
1652 players out of 2462 scraped...
1653 players out of 2462 scraped...
1654 players out of 2462 scraped...
1655 players out of 2462 scraped...
1656 players out of 2462 scr

1857 players out of 2462 scraped...
1858 players out of 2462 scraped...
1859 players out of 2462 scraped...
1860 players out of 2462 scraped...
1861 players out of 2462 scraped...
1862 players out of 2462 scraped...
1863 players out of 2462 scraped...
1864 players out of 2462 scraped...
1865 players out of 2462 scraped...
1866 players out of 2462 scraped...
1867 players out of 2462 scraped...
1868 players out of 2462 scraped...
1869 players out of 2462 scraped...
1870 players out of 2462 scraped...
1871 players out of 2462 scraped...
1872 players out of 2462 scraped...
1873 players out of 2462 scraped...
1874 players out of 2462 scraped...
1875 players out of 2462 scraped...
1876 players out of 2462 scraped...
1877 players out of 2462 scraped...
1878 players out of 2462 scraped...
1879 players out of 2462 scraped...
1880 players out of 2462 scraped...
1881 players out of 2462 scraped...
1882 players out of 2462 scraped...
1883 players out of 2462 scraped...
1884 players out of 2462 scr

2085 players out of 2462 scraped...
2086 players out of 2462 scraped...
2087 players out of 2462 scraped...
2088 players out of 2462 scraped...
2089 players out of 2462 scraped...
2090 players out of 2462 scraped...
2091 players out of 2462 scraped...
2092 players out of 2462 scraped...
2093 players out of 2462 scraped...
2094 players out of 2462 scraped...
2095 players out of 2462 scraped...
2096 players out of 2462 scraped...
2097 players out of 2462 scraped...
2098 players out of 2462 scraped...
2099 players out of 2462 scraped...
2100 players out of 2462 scraped...
2101 players out of 2462 scraped...
2102 players out of 2462 scraped...
2103 players out of 2462 scraped...
2104 players out of 2462 scraped...
2105 players out of 2462 scraped...
2106 players out of 2462 scraped...
2107 players out of 2462 scraped...
2108 players out of 2462 scraped...
2109 players out of 2462 scraped...
2110 players out of 2462 scraped...
2111 players out of 2462 scraped...
2112 players out of 2462 scr

2313 players out of 2462 scraped...
2314 players out of 2462 scraped...
2315 players out of 2462 scraped...
2316 players out of 2462 scraped...
2317 players out of 2462 scraped...
2318 players out of 2462 scraped...
2319 players out of 2462 scraped...
2320 players out of 2462 scraped...
2321 players out of 2462 scraped...
2322 players out of 2462 scraped...
2323 players out of 2462 scraped...
2324 players out of 2462 scraped...
2325 players out of 2462 scraped...
2326 players out of 2462 scraped...
2327 players out of 2462 scraped...
2328 players out of 2462 scraped...
2329 players out of 2462 scraped...
2330 players out of 2462 scraped...
2331 players out of 2462 scraped...
2332 players out of 2462 scraped...
2333 players out of 2462 scraped...
2334 players out of 2462 scraped...
2335 players out of 2462 scraped...
2336 players out of 2462 scraped...
2337 players out of 2462 scraped...
2338 players out of 2462 scraped...
2339 players out of 2462 scraped...
2340 players out of 2462 scr

Per al primer estudi de les dades

In [17]:
# first_X_column  = df_final.iloc[: , :26]
# first_XX_column  = df_final.iloc[: , 27:52]
# first_XXX_column  = df_final.iloc[: , 53:79]
# first_XXXX_column  = df_final.iloc[: , 80:106]
# first_XXXXX_column  = df_final.iloc[: , 107:126]

In [18]:
# profile_1 = first_X_column.profile_report()
# profile_1.to_file("report_0_26.html")

In [19]:
# profile_2 = first_XX_column.profile_report()
# profile_2.to_file("report_27_52.html")

In [20]:
# profile_3 = first_XXX_column.profile_report()
# profile_3.to_file("report_53_79.html")

In [21]:
# profile_4 = first_XXXX_column.profile_report()
# profile_4.to_file("report_80_106.html")

In [22]:
# profile_5 = first_XXXXX_column.profile_report()
# profile_5.to_file("report_107_126.html")

Coses generals:
- Transformar a númeric i % les corresponents columnes
- Separar en 2pts (aquí van els fg dels que no tenen 3pt) i 3pts, et quedes amb el total de tirats i si vols algun estadístic de pg.
- Veure l'any mig dels missings o el repartiment -> FER TALL

#### player_info_per_game (RegS)
- games_started: Observamos que hay algunos con GS 0 o vacío, se entiende que los vacíos se pueden rellenar con 0's.
- minutes_played: maybe we could infer in it with KNN (low % of nullity), mira si es pot amb el dendogram omplir segons unes x variables.
- fg: we will choose attemps or total (similar distribution) + pct
- 3pts: 20% de missing, aquí no té molt sentit omplir. Alomillor un -1, per dir que no tenien linia de 3!!!
- 2pts: ídem que amb fg, aquí hi ha molts missings, per això el que fem es completar amb els fg en aquells buits i ens quedem només amb això enlloc de fg també.
- effective_pct: fora, perq té molta correlació amb 2pts i fg
- els rebounds: tenim un 20% de missing als off i deff, per això, ens quedem amb els totals, i els que siguin missing posem un -1 perquè no es recollia la estadística 
- assists: re
- steals: hi ha missing (20%) que podem posar amb -1, tot i que es una dada que podem arribar a treure.
- blocks: hi ha missing (20%) que podem posar amb -1, tot i que es una dada que podem arribar a treure.
- turnovers: hi ha missing (20%) que podem posar amb -1, tot i que es una dada que podem arribar a treure.
#### player_info_per_game (Playoff)
- Hi ha un nombre molt elevat (40-50%) de jugadors sense dades de playoffs. Aquí podem procedir fent subclusters a part de crear la variable has played play-off or not.
- La resta, fer com abans.
#### player_info_totals (RegS)
- games_started: Observamos que hay algunos con GS 0 o vacío, se entiende que los vacíos se pueden rellenar con 0's.
- minutes_played: maybe we could infer in it with KNN (low % of nullity), mira si es pot amb el dendogram omplir segons unes x variables. JO crec que la majoria son del 46 al 49
- El FG/FT pct el rellenem amb -1 si no en té!!!
- Triple_double, eliminar...

#### advanced stats (Reg and playoff)
- eff_rating-> completar amb KNN
- pillaria el true shooting efficency i els de 2pt y 3pt me'ls petaria
- dels steals, assists... valorar si es poden pillar d'aquí o bé de les podem arribar a treure
- el win share de 48min fa pinta de tema de ABA, la resta podem quedar-nos amb tots 3.
- box_pm i value over replacement en un 20% no ho tenim -> aquí posar un -1 no té sentit, alomillor KNN si baixa el % de None

#### College
- FALTAAAAA

###### Complementary dataframes to enrich data

In [17]:
df_draft = get_draft_dataframe()
print(str(len(df_draft)) + ' players drafted over the years!')
df_draft.to_excel('Scraped data/All_drafts_raw.xlsx')

Scrapping 2021 Draft info...
Scrapping 2020 Draft info...
Scrapping 2019 Draft info...
Scrapping 2018 Draft info...
Scrapping 2017 Draft info...
Scrapping 2016 Draft info...
Scrapping 2015 Draft info...
Scrapping 2014 Draft info...
Scrapping 2013 Draft info...
Scrapping 2012 Draft info...
Scrapping 2011 Draft info...
Scrapping 2010 Draft info...
Scrapping 2009 Draft info...
Scrapping 2008 Draft info...
Scrapping 2007 Draft info...
Scrapping 2006 Draft info...
Scrapping 2005 Draft info...
Scrapping 2004 Draft info...
Scrapping 2003 Draft info...
Scrapping 2002 Draft info...
Scrapping 2001 Draft info...
Scrapping 2000 Draft info...
Scrapping 1999 Draft info...
Scrapping 1998 Draft info...
Scrapping 1997 Draft info...
Scrapping 1996 Draft info...
Scrapping 1995 Draft info...
Scrapping 1994 Draft info...
Scrapping 1993 Draft info...
Scrapping 1992 Draft info...
Scrapping 1991 Draft info...
Scrapping 1990 Draft info...
Scrapping 1989 Draft info...
Scrapping 1988 Draft info...
Scrapping 1987

### Observamos que hay muchos mas jugadores drafteados que del df original (jugadores con estadisticas). 

Razones: drafteado y no jugado, no drafteados (free agent), muy antiguos...

Aún así, hacemos el merge con el original

In [18]:
df_teams = get_all_teams()
print(str(len(df_teams)) + ' franchises in NBA nowadays!')
df_teams.to_excel('Scraped data/All_time_teams_raw.xlsx')

Scraping ATL team info
Scraping BOS team info
Scraping NJN team info
Scraping CHA team info
Scraping CHI team info
Scraping CLE team info
Scraping DAL team info
Scraping DEN team info
Scraping DET team info
Scraping GSW team info
Scraping HOU team info
Scraping IND team info
Scraping LAC team info
Scraping LAL team info
Scraping MEM team info
Scraping MIA team info
Scraping MIL team info
Scraping MIN team info
Scraping NOH team info
Scraping NYK team info
Scraping OKC team info
Scraping ORL team info
Scraping PHI team info
Scraping PHO team info
Scraping POR team info
Scraping SAC team info
Scraping SAS team info
Scraping TOR team info
Scraping UTA team info
Scraping WAS team info
1603 franchises in NBA nowadays!


In [19]:
df_teams

Unnamed: 0,team_3_lett,season,league,team_name,wins,losses,w_l_pct
0,ATL,2021-22,NBA,Atlanta Hawks,16,19,.457
1,ATL,2020-21,NBA,Atlanta Hawks*,41,31,.569
2,ATL,2019-20,NBA,Atlanta Hawks,20,47,.299
3,ATL,2018-19,NBA,Atlanta Hawks,29,53,.354
4,ATL,2017-18,NBA,Atlanta Hawks,24,58,.293
...,...,...,...,...,...,...,...
56,WAS,1965-66,NBA,Baltimore Bullets*,38,42,.475
57,WAS,1964-65,NBA,Baltimore Bullets*,37,43,.463
58,WAS,1963-64,NBA,Baltimore Bullets,31,49,.388
59,WAS,1962-63,NBA,Chicago Zephyrs,25,55,.313


## PROVES (13/05)
Extreure estadístiques del jugador per temporada

In [93]:
def player_info_per_game_v2(player_url,table_id,extra_name): #table_id: 'per_game','playoffs_per_game'...
    
    # Ahora cogemos la información por temporada, las medias las podemos calcular nosotros 
    # (de esta forma evitamos tiempo de escrapeado)
    # también obtenemos los equipos en los que ha jugado de forma directa
    season_l = []
    team_l = []
    games_l = []
    games_started_l = []
    minutes_played_pg_l = []
    field_goals_pg_l = []
    field_goals_attempts_pg_l = []
    field_goals_percent_pg_l = []
    _3pts_goals_pg_l = []
    _3pts_goals_attempts_pg_l = []
    _3pts_goals_percent_pg_l = []
    _2pts_goals_pg_l = []
    _2pts_goals_attempts_pg_l = []
    _2pts_goals_percent_pg_l = []
    effective_field_goals_percent_pg_l = []
    FT_goals_pg_l = []
    FT_goals_attempts_pg_l = []
    FT_goals_percent_pg_l = []
    off_rebounds_pg_l = []
    def_rebounds_pg_l = []
    total_rebounds_pg_l = []
    assists_pg_l = []
    steals_pg_l = []
    blocks_pg_l = []
    turnovers_pg_l = []
    personal_foults_pg_l = []
    points_pg_l = []

    season = None
    team = None
    games = None
    games_started = None
    minutes_played_pg = None
    field_goals_pg = None
    field_goals_attempts_pg = None
    field_goals_percent_pg = None
    _3pts_goals_pg = None
    _3pts_goals_attempts_pg = None
    _3pts_goals_percent_pg = None
    _2pts_goals_pg = None
    _2pts_goals_attempts_pg = None
    _2pts_goals_percent_pg = None
    effective_field_goals_percent_pg = None
    FT_goals_pg = None
    FT_goals_attempts_pg = None
    FT_goals_percent_pg = None
    off_rebounds_pg = None
    def_rebounds_pg = None
    total_rebounds_pg = None
    assists_pg = None
    steals_pg = None
    blocks_pg = None
    turnovers_pg = None
    personal_foults_pg = None
    points_pg = None
    
    page_request = requests.get('https://www.basketball-reference.com' + str(player_url))
    soup = BeautifulSoup(page_request.text,"lxml")
    table = soup.find("table",{'id':table_id})

    if table:
                table_foot = table.find('tbody')
                season_rows = table_foot.findAll('tr')
                for row in season_rows:
                    #La temmporada se debe buscar diferente
                    season = row.find('th')
                    #print(season)
                    if season!= None: #Sinó es un did not play
                        season = season.find('a').string
                        #print(season)
                        cells  = row.findAll('td')
                        playerData = str(cells) #the indexes are not uniform across the database
                        # It searches using regular expressions!!!!!
                        #print(playerData)
                        games = re.search(r'data-stat="g">(.*?)</td>', playerData).group(1)
                        team = re.search(r'data-stat="team_id">(.*?)</a>', playerData).group(1)[-3:] #aquest dóna problemes
                        age = re.search(r'data-stat="age">(.*?)</td>', playerData).group(1)
                        games_started = re.search(r'data-stat="gs">(.*?)</td>', playerData).group(1)
                        minutes_played_pg = re.search(r'data-stat="mp_per_g">(.*?)</td>', playerData).group(1)
                        field_goals_pg = re.search(r'data-stat="fg_per_g">(.*?)</td>', playerData).group(1)
                        field_goals_attempts_pg = re.search(r'data-stat="fga_per_g">(.*?)</td>', playerData).group(1)
                        field_goals_percent_pg = re.search(r'data-stat="fg_pct">(.*?)</td>', playerData).group(1)
                        if re.search(r'data-stat="fg3_per_g">(.*?)</td>', playerData) != None:
                            _3pts_goals_pg = re.search(r'data-stat="fg3_per_g">(.*?)</td>', playerData).group(1)
                            _3pts_goals_attempts_pg = re.search(r'data-stat="fg3a_per_g">(.*?)</td>', playerData).group(1)
                            _3pts_goals_percent_pg = re.search(r'data-stat="fg3_pct">(.*?)</td>', playerData).group(1)
                        if re.search(r'data-stat="fg2_per_g">(.*?)</td>', playerData) != None:
                            _2pts_goals_pg = re.search(r'data-stat="fg2_per_g">(.*?)</td>', playerData).group(1)
                            _2pts_goals_attempts_pg = re.search(r'data-stat="fg2a_per_g">(.*?)</td>', playerData).group(1)
                            _2pts_goals_percent_pg = re.search(r'data-stat="fg2_pct">(.*?)</td>', playerData).group(1)
                            effective_field_goals_percent_pg = re.search(r'data-stat="efg_pct">(.*?)</td>', playerData).group(1)
                        FT_goals_pg = re.search(r'data-stat="ft_per_g">(.*?)</td>', playerData).group(1)
                        FT_goals_attempts_pg = re.search(r'data-stat="fta_per_g">(.*?)</td>', playerData).group(1)
                        FT_goals_percent_pg = re.search(r'data-stat="ft_pct">(.*?)</td>', playerData).group(1)
                        if re.search(r'data-stat="orb_per_g">(.*?)</td>', playerData) != None:
                            off_rebounds_pg = re.search(r'data-stat="orb_per_g">(.*?)</td>', playerData).group(1)
                            def_rebounds_pg = re.search(r'data-stat="drb_per_g">(.*?)</td>', playerData).group(1)
                        total_rebounds_pg = re.search(r'data-stat="trb_per_g">(.*?)</td>', playerData).group(1)
                        assists_pg = re.search(r'data-stat="ast_per_g">(.*?)</td>', playerData).group(1)
                        if re.search(r'data-stat="stl_per_g">(.*?)</td>', playerData) != None:
                            steals_pg = re.search(r'data-stat="stl_per_g">(.*?)</td>', playerData).group(1)
                            blocks_pg = re.search(r'data-stat="blk_per_g">(.*?)</td>', playerData).group(1)
                            try:
                                turnovers_pg = re.search(r'data-stat="tov_per_g">(.*?)</td>', playerData).group(1)
                            except:
                                pass
                        personal_foults_pg = re.search(r'data-stat="pf_per_g">(.*?)</td>', playerData).group(1)
                        points_pg = re.search(r'data-stat="pts_per_g">(.*?)</td>', playerData).group(1)
                            
                        season_l.append(season)
                        team_l.append(team)
                        games_l.append(games)
                        games_started_l.append(games_started)
                        minutes_played_pg_l.append(minutes_played_pg)
                        field_goals_pg_l.append(field_goals_pg)
                        field_goals_attempts_pg_l.append(field_goals_attempts_pg)
                        field_goals_percent_pg_l.append(field_goals_percent_pg)
                        _3pts_goals_pg_l.append(_3pts_goals_pg)
                        _3pts_goals_attempts_pg_l.append(_3pts_goals_attempts_pg)
                        _3pts_goals_percent_pg_l.append(_3pts_goals_percent_pg)
                        _2pts_goals_pg_l.append(_2pts_goals_pg)
                        _2pts_goals_attempts_pg_l.append(_2pts_goals_attempts_pg)
                        _2pts_goals_percent_pg_l.append(_2pts_goals_percent_pg)
                        effective_field_goals_percent_pg_l.append(effective_field_goals_percent_pg)
                        FT_goals_pg_l.append(FT_goals_pg)
                        FT_goals_attempts_pg_l.append(FT_goals_attempts_pg)
                        FT_goals_percent_pg_l.append(FT_goals_percent_pg)
                        off_rebounds_pg_l.append(off_rebounds_pg)
                        def_rebounds_pg_l.append(def_rebounds_pg)
                        total_rebounds_pg_l.append(total_rebounds_pg)
                        assists_pg_l.append(assists_pg)
                        steals_pg_l.append(steals_pg)
                        blocks_pg_l.append(blocks_pg)
                        turnovers_pg_l.append(turnovers_pg)
                        personal_foults_pg_l.append(personal_foults_pg)
                        points_pg_l.append(points_pg)
                            
    player_info = {#'player_url' : player_url,
                   #'games' + extra_name : [ games ],
                   'season' + extra_name : season_l,
                   'team' + extra_name : team_l,
                   'games_started' + extra_name : games_started_l,
                   'minutes_played_pg' + extra_name : minutes_played_pg_l,
                   'field_goals_pg' + extra_name : field_goals_pg_l,
                   'field_goals_attempts_pg' + extra_name : field_goals_attempts_pg_l,
                   'field_goals_percent_pg' + extra_name : field_goals_percent_pg_l,
                   '_3pts_goals_pg' + extra_name : _3pts_goals_pg_l,
                   '_3pts_goals_attempts_pg' + extra_name : _3pts_goals_attempts_pg_l,
                   '_3pts_goals_percent_pg' + extra_name : _3pts_goals_percent_pg_l,
                   '_2pts_goals_pg' + extra_name : _2pts_goals_pg_l,
                   '_2pts_goals_attempts_pg' + extra_name : _2pts_goals_attempts_pg_l,
                   '_2pts_goals_percent_pg' + extra_name : _2pts_goals_percent_pg_l,
                   'effective_field_goals_percent_pg' + extra_name : effective_field_goals_percent_pg_l,
                   'FT_goals_pg' + extra_name : FT_goals_pg_l,
                   'FT_goals_attempts_pg' + extra_name : FT_goals_attempts_pg_l,
                   'FT_goals_percent_pg' + extra_name : FT_goals_percent_pg_l,
                   'off_rebounds_pg' + extra_name : off_rebounds_pg_l,
                   'def_rebounds_pg' + extra_name : def_rebounds_pg_l,
                   'total_rebounds_pg' + extra_name : total_rebounds_pg_l,
                   'assists_pg' + extra_name : assists_pg_l,
                   'steals_pg' + extra_name : steals_pg_l,
                   'blocks_pg' + extra_name : blocks_pg_l,
                   'turnovers_pg' + extra_name : turnovers_pg_l,
                   'personal_foults_pg' + extra_name : personal_foults_pg_l,
                   'points_pg' + extra_name : points_pg_l}
    
    return player_info

In [94]:
table_id = 'per_game'
player_url = '/players/r/rubiori01.html'
player_info_per_game_v2(player_url,table_id,'RegS')

{'seasonRegS': ['2011-12',
  '2012-13',
  '2013-14',
  '2014-15',
  '2015-16',
  '2016-17',
  '2017-18',
  '2018-19',
  '2019-20',
  '2020-21',
  '2021-22'],
 'teamRegS': ['MIN',
  'MIN',
  'MIN',
  'MIN',
  'MIN',
  'MIN',
  'UTA',
  'UTA',
  'PHO',
  'MIN',
  'CLE'],
 'games_startedRegS': ['31',
  '47',
  '82',
  '22',
  '76',
  '75',
  '77',
  '67',
  '65',
  '51',
  '8'],
 'minutes_played_pgRegS': ['34.2',
  '29.7',
  '32.2',
  '31.5',
  '30.6',
  '32.9',
  '29.3',
  '27.9',
  '31.0',
  '26.1',
  '28.5'],
 'field_goals_pgRegS': ['3.4',
  '3.2',
  '3.1',
  '3.5',
  '2.9',
  '3.5',
  '4.5',
  '4.3',
  '4.4',
  '2.8',
  '4.4'],
 'field_goals_attempts_pgRegS': ['9.5',
  '9.0',
  '8.2',
  '10.0',
  '7.7',
  '8.7',
  '10.7',
  '10.7',
  '10.6',
  '7.3',
  '12.1'],
 'field_goals_percent_pgRegS': ['.357',
  '.360',
  '.381',
  '.356',
  '.374',
  '.402',
  '.418',
  '.404',
  '.415',
  '.388',
  '.363'],
 '_3pts_goals_pgRegS': ['0.8',
  '0.5',
  '0.5',
  '0.6',
  '0.8',
  '0.8',
  '1.2',
 

Properes passes:
- Cal comprovar que aquesta funció va bé per a un petit nombre de jugadors, per veure si es prou general
- La funció es podria optimitzar a l'hora de guardar la info (enlloc de llistes, usar DataFrames, no sé...)
- Cal fer aquesta mateixa funció per la taula de Advanced y de Playoff
- Crec que per NCAA no cal fer una funció d'aquest tipus