In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# Importamos las librerías necesarias para la ejecución del notebook

# Run 'pip install XX' as needed

from bs4 import BeautifulSoup
from datetime import datetime
from IPython.display import Image
import pandas as pd
import re
import requests as rq
import string
import time

from multiprocessing import Pool

# 1. Data Scraping

En el contexto del proyecto desarrollado *Clusterización de jugadores de la NBA para posterior aproximación/predicción de la tipología de jugadores que participaran en el Draft de '22*, el apartada de *Data Gathering* (obtención de datos) se ha resuelto mediante el scrapping de datos. Principalmente se han extraído, mediante librerías específicas de python, estadísticas medias por temporada para cada jugador de la NBA, de la página web [Basketball Reference](https://www.basketball-reference.com/).

El **data scrapping** es una técnica de programación que consiste en la extracción masiva, mediante ingeniería inversa, de datos mediante la descarga de la información subyacente en objetos que almacenan datos generalmente no estructurados (por ejemplo, y en nuestro caso, datos de los arxivos HTML fuente de las páginas de Basketball Reference).

<img src="./Pictures/big_picture.jpg"/>


En referencia a la **metodologia** utilizada en este notebook, basándonos principalmente en las librerías y frameworks propuestos por *Pandas* y *BeautifulSoup*, descargamos la información de los HTML de las distintas webs, recorremos la estrucutura de árbol hasta capturar las tablas u objetos con datos que sean de nuestro interés. A continuación los almazenamos en DataFrames y acabamos obteniendo objetos tabulares que aglutinan toda la información que usaremos a posteriori en otros notebooks.

## 1.1 Get all players

Función que permite navegar al *diccionario de jugadores* de [Basketball Reference](https://www.basketball-reference.com/players/) y, pasando por cada una de las letras del diccionario accede a las subpáginas con los jugadores que empiezan por esa mismaletra. En esta segunda página, busca un objeto tabla con información básica del jugador: nombre, fecha de nacimiento, college, etc. Pero sobretodo, se extrae la **URL del jugador** que, posteriormente, nos servirá para poder efectuar el scrapping de sus estadísticas concretas.

**Argumentos**: (*vacío*)

<img src="./Pictures/Data Scrapping 1.png"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 10px;" />

In [3]:
def get_all_players():
    players = []
    base_url = 'http://www.basketball-reference.com/players/'
    start_timer = time.perf_counter()
    for letter in string.ascii_lowercase:
        print('Scraping players with letter ' + letter + '...')
        pr = rq.get(base_url + letter)
        if pr.status_code == 200:
            soup = BeautifulSoup(pr.content, 'html.parser')
            table = soup.find('table')
            if table is None:
                return pd.DataFrame()
            tbody = table.find('tbody')
            for row in tbody.findAll('tr'):
                _url = row.find('a')
                name = _url.text
                url = _url['href']
                cells = row.findAll('td')
                active_from = int(cells[0].text)
                active_to = int(cells[1].text)
                position = cells[2].text
                height = cells[3].text
                weight = cells[4].text
                birth_date = cells[5].text
                college = cells[6].text
                players.append({
                    'URL': url,
                    'NAME': name,
                    'ACTIVE_FROM': active_from,
                    'ACTIVE_TO': active_to,
                    'POSITION': position,
                    'COLLEGE': college,# we will have to treat this info (+ one college, consistency in names...)
                    'HEIGHT': height,
                    'WEIGHT': weight,
                    'BIRTH_DATE': pd.to_datetime(birth_date) # we will have to treat this info (date format)
                })
    end_timer = time.perf_counter()
    print(f"Scraped " + str(len(players)) + f" NBA and ABA players in {end_timer - start_timer:0.4f} seconds")
    return pd.DataFrame(players)

In [4]:
# Testing cell
df_all_players = get_all_players()
df_all_players.head()

Scraping players with letter a...
Scraping players with letter b...
Scraping players with letter c...
Scraping players with letter d...
Scraping players with letter e...
Scraping players with letter f...
Scraping players with letter g...
Scraping players with letter h...
Scraping players with letter i...
Scraping players with letter j...
Scraping players with letter k...
Scraping players with letter l...
Scraping players with letter m...
Scraping players with letter n...
Scraping players with letter o...
Scraping players with letter p...
Scraping players with letter q...
Scraping players with letter r...
Scraping players with letter s...
Scraping players with letter t...
Scraping players with letter u...
Scraping players with letter v...
Scraping players with letter w...
Scraping players with letter x...
Scraping players with letter y...
Scraping players with letter z...
Scraped 5023 NBA and ABA players in 32.1955 seconds


Unnamed: 0,URL,NAME,ACTIVE_FROM,ACTIVE_TO,POSITION,COLLEGE,HEIGHT,WEIGHT,BIRTH_DATE
0,/players/a/abdelal01.html,Alaa Abdelnaby,1991,1995,F-C,Duke,6-10,240,1968-06-24
1,/players/a/abdulza01.html,Zaid Abdul-Aziz,1969,1978,C-F,Iowa State,6-9,235,1946-04-07
2,/players/a/abdulka01.html,Kareem Abdul-Jabbar,1970,1989,C,UCLA,7-2,225,1947-04-16
3,/players/a/abdulma02.html,Mahmoud Abdul-Rauf,1991,2001,G,LSU,6-1,162,1969-03-09
4,/players/a/abdulta01.html,Tariq Abdul-Wahad,1998,2003,F,"Michigan, San Jose State",6-6,223,1974-11-03


## 1.2 Get player stats

Función que **captura el objeto tabla** determinado en los argumentos (según el tipo de estadística que se quiera capturar) presente en la página web del jugador del que se quiere scrappear las **estadísticas NBA** (por ejemplo, [Ricky Rubio](https://www.basketball-reference.com/players/r/rubiori01.html)). Se podrá decidir si se quiere obtener estadísticas medias por temporada o medias totales de carrera, si son estadísticas de Regular season o de Playoffs y si se quiere extraer dichas estadísticas en general o por equipo en el que haya jugado. 


**Argumentos:**
- '_player_url': URL del jugador obtenida en el scrapping de todos los jugadores. Ej: '/players/a/abdulka01.html'
- 'stat_type': Tabla que queremos recuperar de basketball-reference. Las siguientes tablas son válidas:
    - 'PER_GAME'
    - 'TOTALS'
    - 'PER_MINUTE'
    - 'PER_POSS'
    - 'ADVANCED'
- 'playoffs': True/False
    - True: se recuperan estadísticas referentes a los playoff
    - False: se recuperan estadísticas referentes a la Regular season
- 'career': True/False
    - True: estadísticas medias de la carrera
    - False: estadísticas medias por temporada
- 'career_by_team': True/False
    - True: estadísticas medias por equipo jugado
    - False: estadísticas medias de la carrera

In [5]:
def get_player_stats(_player_url, stat_type='PER_GAME', playoffs=False, career=False, career_by_team=False):
    suffix = _player_url.replace('/', '%2F')
    selector = stat_type.lower()
    if playoffs:
        selector = 'playoffs_'+ selector
    pr = rq.get(f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url={suffix}&div=div_{selector}')
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            return pd.DataFrame()
        df = pd.read_html(str(table))[0]
        df.rename(columns={'Season': 'SEASON', 'Age': 'AGE',
                  'Tm': 'TEAM', 'Lg': 'LEAGUE', 'Pos': 'POS'}, inplace=True)
        if 'FG.1' in df.columns:
            df.rename(columns={'FG.1': 'FG%'}, inplace=True)
        if 'eFG' in df.columns:
            df.rename(columns={'eFG': 'eFG%'}, inplace=True)
        if 'FT.1' in df.columns:
            df.rename(columns={'FT.1': 'FT%'}, inplace=True)
        df.dropna(axis=0, how="all", inplace=True)
        df.dropna(axis=1, how="all", inplace=True)
        career_index = df[df['SEASON']=='Career'].index[0]
        df['URL'] = _player_url #nos permitirá hacer el join
        if career:
            if career_by_team:
                df = df.iloc[career_index+1:, :]
            else:
                df = df.iloc[career_index, :]
        else:
            df = df.iloc[:career_index, :]  
        df.reset_index().drop('index', axis=1, inplace=True)
        return df
    return pd.DataFrame()

In [6]:
# Testing cell
player_stats = get_player_stats('/players/a/abdulka01.html','PER_GAME',False,False,False)
player_stats

Unnamed: 0,SEASON,AGE,TEAM,LEAGUE,POS,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,URL
0,1969-70,22.0,MIL,NBA,C,82.0,,43.1,11.4,22.1,...,,,14.5,4.1,,,,3.5,28.8,/players/a/abdulka01.html
1,1970-71,23.0,MIL,NBA,C,82.0,,40.1,13.0,22.5,...,,,16.0,3.3,,,,3.2,31.7,/players/a/abdulka01.html
2,1971-72,24.0,MIL,NBA,C,81.0,,44.2,14.3,24.9,...,,,16.6,4.6,,,,2.9,34.8,/players/a/abdulka01.html
3,1972-73,25.0,MIL,NBA,C,76.0,,42.8,12.9,23.3,...,,,16.1,5.0,,,,2.7,30.2,/players/a/abdulka01.html
4,1973-74,26.0,MIL,NBA,C,81.0,,43.8,11.7,21.7,...,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0,/players/a/abdulka01.html
5,1974-75,27.0,MIL,NBA,C,65.0,,42.3,12.5,24.4,...,3.0,11.0,14.0,4.1,1.0,3.3,,3.2,30.0,/players/a/abdulka01.html
6,1975-76,28.0,LAL,NBA,C,82.0,,41.2,11.1,21.1,...,3.3,13.5,16.9,5.0,1.5,4.1,,3.6,27.7,/players/a/abdulka01.html
7,1976-77,29.0,LAL,NBA,C,82.0,,36.8,10.8,18.7,...,3.2,10.0,13.3,3.9,1.2,3.2,,3.2,26.2,/players/a/abdulka01.html
8,1977-78,30.0,LAL,NBA,C,62.0,,36.5,10.7,19.4,...,3.0,9.9,12.9,4.3,1.7,3.0,3.4,2.9,25.8,/players/a/abdulka01.html
9,1978-79,31.0,LAL,NBA,C,80.0,,39.5,9.7,16.8,...,2.6,10.2,12.8,5.4,1.0,4.0,3.5,2.9,23.8,/players/a/abdulka01.html


## 1.3 Get player picture

Función que permite obtener un archivo .jpg con la imagen del jugador seleccionado. *Probablemente no acabemos usando estos datos*.

**Argumentos:**
- '_player_url': URL del jugador obtenida en el scrapping de todos los jugadores. Ej: '/players/a/abdulka01.html'

In [7]:
def get_player_picture(_player_url):
    jpg = _player_url.split('/')[-1].replace('html', 'jpg')
    url = 'https://d2cwpp38twqe55.cloudfront.net/req/202006192/images/players/' + jpg
    return url

In [8]:
# Testing cell
player_picture = get_player_picture('/players/a/abdulka01.html')
Image(url=player_picture)

## 1.4 Get game logs

Función que permite recuperar datos y estadísticas de todos los partidos jugados por un jugador en una temporada concreta, referentes a Playoff o Regular season. *Probablemente no acabemos usando estos datos*.

**Argumentos:**
- '_player_url': URL del jugador obtenida en el scrapping de todos los jugadores. Ej: '/players/a/abdulka01.html'
- 'year': año del que se quieren recuperar las estadísticas de los partidos
- 'playoffs': True/False
    - True: los partidos son de Playoff
    - False: los partidos son de Regular season

In [9]:
def get_game_logs(_player_url, year, playoffs=False):
    suffix = _player_url.replace('/', '%2F').replace('.html', '')
    if playoffs:
        selector = 'div_pgl_basic_playoffs'
    else:
        selector = 'div_pgl_basic'
    pr = rq.get(f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url={suffix}%2Fgamelog%2F{year}%2F&div={selector}')
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            return pd.DataFrame()
        df = pd.read_html(str(table))[0]
        df.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
            'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
        df['HOME/AWAY'] = df['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')
        df = df[df['Rk']!='Rk']
        df = df.drop(['Rk', 'G'], axis=1)
        df['DATE'] = pd.to_datetime(df['DATE'])
        # df = df[df['GS'] == '1'].reset_index(drop=True)
        df.dropna(axis=0, how="all", inplace=True)
        df.dropna(axis=1, how="all", inplace=True)
        df.reset_index().drop('index', axis=1, inplace=True)
        return df
    return pd.DataFrame()

In [10]:
# Testing cell
game_logs = get_game_logs('/players/a/abdulka01.html', '1970', False)
game_logs

Unnamed: 0,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,MP,FG,FGA,FG%,FT,FTA,FT%,TRB,AST,PF,PTS
0,1969-10-18,22-185,MIL,HOME,DET,W (+9),48:00,12,27,.444,5,8,.625,12,6,5,29
1,1969-10-19,22-186,MIL,HOME,SEA,W (+24),29:00,9,14,.643,1,1,1.000,11,5,6,19
2,1969-10-22,22-189,MIL,AWAY,SDR,W (+13),42:00,15,26,.577,6,10,.600,19,3,6,36
3,1969-10-24,22-191,MIL,AWAY,LAL,L (-11),48:00,9,21,.429,5,9,.556,20,2,3,23
4,1969-10-25,22-192,MIL,AWAY,SFW,L (-14),42:00,7,20,.350,2,3,.667,5,2,1,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,1970-03-10,22-328,MIL,AWAY,ATL,L (-13),43:00,16,24,.667,8,12,.667,13,3,4,40
81,1970-03-15,22-333,MIL,HOME,CHI,L (-23),35:00,11,16,.688,8,10,.800,12,6,1,30
82,1970-03-18,22-336,MIL,AWAY,NYK,W (+8),44:00,10,20,.500,6,9,.667,16,2,4,26
84,1970-03-20,22-338,MIL,AWAY,PHI,W (+26),39:00,16,18,.889,9,9,1.000,15,6,2,41


## 1.5 Get player extra information

Función que permite recuperar información extra/específica de los jugadores que no se encuentra presente en objetos tabulares, sinó en objetos rectangulares específicos del HTML. Se quiere poder obtener información relacionada con títulos y reconocimientos individuales (MVP, Hall-of-Fame, ROY, etc.) y colectivos (campeón NBA, campeón de conferencia, etc.).

Para capturar esta información usaremos, a parte del framework del resto de funciones, las expresiones regulares, dado que los campos a capturar fluctuan bastante en cuanto a formato.

**Argumentos:**
- '_player_url': URL del jugador obtenida en el scrapping de todos los jugadores. Ej: '/players/a/abdulka01.html'
<img src="./Pictures/Kareem extra 1.png" width="350" style="float: left"/>

In [11]:
def get_player_extra_info(_player_url):
  pr = rq.get('https://www.basketball-reference.com' + str(_player_url))
  if pr.status_code == 200:
    soup = BeautifulSoup(pr.content, 'html.parser')
    df = pd.DataFrame()
    try:
        extras = soup.find('ul', {'id':'bling'}).findAll('li')
    #     if extras is None:
    #       return pd.DataFrame()
        for each in extras:
          match = re.search(r'\S*\d((x)|(-)\d*)\S*', each.text)
          if match:
            value = each.text[match.start():match.end()]
            if(value.endswith('x')):
                value = value[:-1]
            data = each.text[match.end()+1:].upper().replace(' ', '_')
          else:
            data = each.text.upper().replace(' ', '_')
            value = 1
          df[data] = [value]
        df['COLLEGE_URL'] = soup.find('a', string = re.compile('College Basketball at Sports-Reference.com')).get('href').replace('https://www.sports-reference.com/cbb', '')
        df.reset_index().drop('index', axis=1, inplace=True)
        df['URL'] = _player_url
        return df
    except:
        return df.append({'URL':_player_url},ignore_index=True)
  return pd.DataFrame()

In [12]:
# Testing cell
player_extras = get_player_extra_info('/players/j/jamesle01.html')
player_extras

Unnamed: 0,ALL_STAR,SCORING_CHAMP,AST_CHAMP,NBA_CHAMP,ALL-NBA,ALL-ROOKIE,ROY,AS_MVP,ALL-DEFENSIVE,MVP,FINALS_MVP,NBA_75TH_ANNIV._TEAM,URL
0,18.0,2007-08,2019-20,4.0,18.0,2003-04,2003-04,3.0,6.0,4.0,4.0,1.0,
1,,,,,,,,,,,,,/players/j/jamesle01.html


## 1.6 Get player teams

Función que permite recuperar en qué equipos y qué intérvalos de tiempo los jugadores formaron parte de ciertos equipos. Los objetos con esta información no son objetos tabulares, sinó  objetos rectangulares específicos del HTML.

Para capturar esta información usaremos, a parte del framework del resto de funciones, las expresiones regulares, dado que los campos a capturar fluctuan bastante en cuanto a formato.

**Argumentos:**
- '_player_url': URL del jugador obtenida en el scrapping de todos los jugadores. Ej: '/players/a/abdulka01.html'

<img src="./Pictures/Kareem extra 2.png" width="350" style="float: left"/>

In [13]:
def get_player_teams(_player_url):
  pr = rq.get('https://www.basketball-reference.com' + str(_player_url))
  if pr.status_code == 200:
    soup = BeautifulSoup(pr.content, 'html.parser')
    teams = soup.find('div', {'class': 'uni_holder bbr'}).findAll('a')
    if teams is None:
      return pd.DataFrame()
    teams_array = []
    for each in teams:
        match = re.search(r'[,]\s\d*', each.get('data-tip'))
        if match:
            team = each.get('data-tip')[:match.start()]
            played_from = each.get('data-tip')[match.start()+2:match.end()]
            match = re.search(r'[-]\d*\b', each.get('data-tip'))
            if match:
                played_to = each.get('data-tip')[match.start()+1:]
            else:
                played_to = played_from
            teams_array.append({
              'TEAM': team,
              'FROM': played_from,
              'TO': played_to
            })
    return pd.DataFrame(teams_array)
  return pd.DataFrame()

In [14]:
# Testing cell
player_teams = get_player_teams('/players/a/abdulka01.html')
player_teams

Unnamed: 0,TEAM,FROM,TO
0,Milwaukee Bucks,1970,1975
1,Los Angeles Lakers,1976,1989


## 1.7 Get player college stats

## 1.2 Get player stats

Función que **captura el objeto tabla** determinado en los argumentos (según el tipo de estadística que se quiera capturar) presente en la página web del jugador del que se quiere scrappear las **estadísticas de college** (por ejemplo, [Kyler Edwards](https://www.sports-reference.com/cbb/players/kyler-edwards-1.html)). Se podrá decidir si se quiere obtener estadísticas medias por temporada o medias totales de carrera, estadísticas medias por temporada o torneo de conferencia (en caso de tenerlas, dada su recencia de la *March Madness*) y si se quiere extraer dichas estadísticas en general o por equipo en el que haya jugado. 

**Argumentos:**
- '_player_college_url': URL del jugador obtenida en el scrapping extra de los jugadores. Ej: '/players/jared-butler-1.html'
- 'stat_type': Tabla que queremos recuperar de basketball-reference. Las siguientes tablas son válidas:
    - 'PER_GAME'
    - 'TOTALS'
    - 'PER_MIN'
    - 'PER_POSS'
    - 'ADVANCED'
- 'conference': True/False
    - True: se recuperan estadísticas referentes a los torneos de ocnferencia
    - False: se recuperan estadísticas referentes a la Regular season
- 'career': True/False
    - True: estadísticas medias de la carrera
    - False: estadísticas medias por temporada
- 'career_by_team': True/False
    - True: estadísticas medias por equipo jugado
    - False: estadísticas medias de la carrera
    
En este caso, las estadísticas de college se encuentran el la web matriz de Basketball Reference: [Sports Reference](https://www.sports-reference.com/). Concretamente en la sección de [Basketball (college)](https://www.sports-reference.com/cbb/)

In [15]:
def get_player_college_stats(_player_college_url, stat_type='PER_GAME', conference=False, career=False, career_by_team=False):
  suffix = _player_college_url.replace('/', '%2F')
  selector = stat_type.lower()
  if conference:
    selector = selector + '_conf'
  pr = rq.get(f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=cbb&url=%2Fcbb{suffix}&div=div_players_{selector}')
  if pr.status_code == 200:
    soup = BeautifulSoup(pr.content, 'html.parser')
    table = soup.find('table')
    if table is None:
      return pd.DataFrame()
    df = pd.read_html(str(table))[0]
    df.rename(columns={'Season': 'SEASON', 'School': 'SCHOOL', 'Conf': 'CONF'}, inplace=True)
    if 'FG.1' in df.columns:
      df.rename(columns={'FG.1': 'FG%'}, inplace=True)
    if 'eFG' in df.columns:
      df.rename(columns={'eFG': 'eFG%'}, inplace=True)
    if 'FT.1' in df.columns:
      df.rename(columns={'FT.1': 'FT%'}, inplace=True)
    df.dropna(axis=0, how="all", inplace=True)
    df.dropna(axis=1, how="all", inplace=True)
    career_index = df[df['SEASON']=='Career'].index[0]
    multiple_college = False
    if len(df[df['SCHOOL']=='Overall']) != 0:
      multiple_college = True
    if career:
      if career_by_team and multiple_college:
        df = df.iloc[career_index+1:, :]
      else:
        df = df.iloc[career_index, :]
    else:
      df = df.iloc[:career_index, :]
    df['COLLEGE_URL'] = _player_college_url #nos permitirá hacer el join
    df.reset_index().drop('index', axis=1, inplace=True)
    return df
  return pd.DataFrame()

In [16]:
player_college_stats = get_player_college_stats('/players/anthony-fair-1.html', 'ADVANCED', False, True, True)
player_college_stats

Unnamed: 0,SEASON,SCHOOL,CONF,G,GS,MP,TS%,eFG%,3PAr,FTr,TRB%,AST%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/40,COLLEGE_URL
3,,Delaware State,,5,0,14,,,0.0,1.0,,,,40.4,,-0.1,0.0,0.0,-0.099,/players/anthony-fair-1.html
4,,Towson,,1,0,1,,,,,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,/players/anthony-fair-1.html


## 1.8 Get college game logs
Función que permite recuperar datos y estadísticas de todos los partidos jugados por un jugador en una temporada concreta, referentes a sus años de college y por tipología de partido. *Probablemente no acabemos usando estos datos*.

**Argumentos:**
- '_player_college_url': URL de college del jugador obtenida en el scrapping de player extra info. Ej: '/players/jared-butler-1.html'
- 'year': año del que se quieren recuperar las estadísticas de los partidos
- 'game_type'
    - 'REG' Regular Season
    - 'CTOURN' Conference Tournament
    - 'NCAA' NCAA Tournament
    - 'NIT' National Invitation Tournament
    - 'CBI' College Basketball Invitational
    - 'CIT' CollegeInsider.com Tournament

In [17]:
def get_college_game_logs(_player_college_url, year, game_type='REG'):
    suffix = _player_college_url.replace('/', '%2F').replace('.html', '')
    selector = 'gamelog'
    pr = rq.get(f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=cbb&url=%2Fcbb{suffix}%2Fgamelog%2F{year}%2F&div=div_{selector}')
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            return pd.DataFrame()
        df = pd.read_html(str(table))[0]
        df.rename(columns = {'Date': 'DATE', 'School': 'SCHOOL', 'Unnamed: 3': 'HOME/AWAY/NEUTRAL', 'Opponent': 'OPPONENT',
            'Type': 'TYPE', 'Unnamed: 6': 'RESULT'}, inplace=True)
        df['HOME/AWAY/NEUTRAL'] = df['HOME/AWAY/NEUTRAL'].apply(lambda x: 'AWAY' if x=='@' else ('NEUTRAL' if x=='N' else 'HOME'))
        df = df[df['Rk'] != '']
        df = df.drop(['Rk'], axis=1)
        df = df.drop(df[df['TYPE'] != game_type].index)
        df['DATE'] = pd.to_datetime(df['DATE'])
        #df = df[df['GS'] == '1'].reset_index(drop=True)
        df.dropna(axis=0, how="all", inplace=True)
        df.dropna(axis=1, how="all", inplace=True)
        df.reset_index().drop('index', axis=1, inplace=True)
        
        return df
    return pd.DataFrame()

In [18]:
college_game_logs = get_college_game_logs('/players/jared-butler-1.html', 2019, 'REG')
college_game_logs

Unnamed: 0,DATE,SCHOOL,HOME/AWAY/NEUTRAL,OPPONENT,TYPE,RESULT,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2018-11-06,Baylor,HOME,Texas Southern,REG,L,1,21,1,6,...,,0,0,0,1,2,0,3,4,2
1,2018-11-10,Baylor,HOME,Southern,REG,W,0,20,3,7,...,0.5,0,2,2,5,1,0,0,2,9
2,2018-11-12,Baylor,HOME,Prairie View,REG,W,0,31,7,10,...,0.833,1,3,4,1,0,1,1,2,22
3,2018-11-16,Baylor,HOME,Nicholls State,REG,W,0,25,1,9,...,1.0,0,4,4,1,0,0,1,1,5
4,2018-11-23,Baylor,NEUTRAL,Ole Miss,REG,L,0,33,4,11,...,1.0,0,5,5,1,2,0,2,2,11
5,2018-11-24,Baylor,NEUTRAL,George Mason,REG,W,0,22,0,5,...,1.0,0,0,0,1,1,0,1,4,4
6,2018-11-27,Baylor,HOME,South Dakota,REG,W,0,16,1,5,...,,0,3,3,0,1,1,0,3,2
7,2018-12-01,Baylor,AWAY,Wichita State,REG,L,0,12,2,3,...,,0,0,0,0,0,0,1,0,5
8,2018-12-15,Baylor,AWAY,Arizona,REG,W,0,13,2,4,...,,0,1,1,0,1,1,1,1,5
9,2018-12-18,Baylor,HOME,Stephen F. Austin,REG,L,0,14,0,5,...,1.0,0,1,1,1,1,0,0,1,2


## 1.9 Get player college extra info
Función que permite recuperar información extra/específica de los jugadores en su época de college que no se encuentra presente en objetos tabulares, sinó en objetos rectangulares específicos del HTML. Se quiere poder obtener información relacionada con títulos y reconocimientos individuales (BIG_EAST_TOURNEY_MVP, etc.) y colectivos (campeón NCAA, campeón de conferencia, etc.).

Para capturar esta información usaremos, a parte del framework del resto de funciones, las expresiones regulares, dado que los campos a capturar fluctuan bastante en cuanto a formato.

**Argumentos:**
- '_player_college_url': URL de college del jugador obtenida en el scrapping de todos los jugadores. Ej: '/players/josh-hart-1.html'

In [19]:
def get_player_college_extra_info(_player_college_url):
  pr = rq.get('https://www.sports-reference.com/cbb' + str(_player_college_url))
  df = pd.DataFrame()
  try:  
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        extras = soup.find('ul', {'id':'bling'}).findAll('li')
    #     if extras is None:
    #       return pd.DataFrame()

        for each in extras:
          match = re.search(r'\S*\d((x)|(-)\d*)\S*', each.text)
          if match:
            value = each.text[match.start():match.end()]
            if(value.endswith('x')):
                value = value[:-1]
            data = each.text[match.end()+1:].upper().replace(' ', '_')
          else:
            data = each.text.upper().replace(' ', '_')
            value = 1
          df[data] = [value]
        df['COLLEGE_URL'] = _player_college_url
        return df
  except:
        return df.append({'COLLEGE_URL':_player_college_url},ignore_index=True)

In [20]:
player_college_extra_info = get_player_college_extra_info('/players/josh-hart-1.html')
player_college_extra_info

Unnamed: 0,NCAA_CHAMPION,NCAA_ALL-TOURNEY,NCAA_ALL-REGION,CONSENSUS_AA,BIG_EAST_POY,ALL-BIG_EAST,ALL-BIG_EAST_TOURNEY,BIG_EAST_ALL-FRESHMAN,BIG_EAST_DPOY,BIG_EAST_6MOY,BIG_EAST_TOURNEY_MVP,COLLEGE_URL
0,1,1,1,1,2016-17,2,3,1,2016-17,2014-15,2,/players/josh-hart-1.html


## 1.10 Get player college teams
Permite recuperar los equipos y rango de años que jugó en cada equipo para un jugador de college.

**Argumentos:**
- '_player_college_url': URL de college del jugador obtenida en el scrapping de todos los jugadores. Ej: '/players/josh-hart-1.html'

In [21]:
def get_player_college_teams(_player_college_url):
  pr = rq.get('https://www.sports-reference.com/cbb' + str(_player_college_url))
  if pr.status_code == 200:
    soup = BeautifulSoup(pr.content, 'html.parser')
    teams = soup.find('div', {'class': 'uni_holder cbb'}).findAll('a')
    if teams is None:
      return pd.DataFrame()
    teams_array = []
    for each in teams:
        match = re.search(r'[,]\s\d*', each.get('data-tip'))
        if match:
            team = each.get('data-tip')[:match.start()]
            played_from = each.get('data-tip')[match.start()+2:match.end()]
            match = re.search(r'[-]\d*\b', each.get('data-tip'))
            if match:
                played_to = each.get('data-tip')[match.start()+1:]
            else:
                played_to = played_from
            teams_array.append({
                'TEAM': team,
                'FROM': played_from,
                'TO': played_to
            })
    return pd.DataFrame(teams_array)
  return pd.DataFrame()

In [22]:
player_college_teams = get_player_college_teams('/players/josh-hart-1.html')
player_college_teams

Unnamed: 0,TEAM,FROM,TO
0,Villanova,2014,2017


## 1.11 Get all teams
Permite obtener un tablón con todos los equipos de la história de la NBA y algunas estadísticas, por temporada, associadas al equipo: victorias, derroatas, resutado en playoff, etc.

**Argumentos**

In [23]:
def get_all_teams():
    start_timer = time.perf_counter()
    pr = rq.get('https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fteams%2F&div=div_teams_active')
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            return pd.DataFrame()
        rows = table.findAll('tr', {'class':'full_table'})
        
        df = pd.DataFrame()
        for each in rows:
            df = df.append(get_team_info(each.find('a').get('href').replace('https://www.basketball-reference.com/teams', '')), ignore_index=True) 
        end_timer = time.perf_counter()
        print(f"Scraped " + str(len(df)) + f" teams in {end_timer - start_timer:0.4f} seconds")
        return df
    return pd.DataFrame()

def get_team_info(_team_url):
    team_3_letters = str(_team_url[-4:-1])
    suffix = _team_url.replace('/', '%2F')
    selector = team_3_letters
    print(f'Scraping {team_3_letters} team info...')
    pr = rq.get(f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fteams{suffix}&div=div_{selector}')
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            return pd.DataFrame()
        df = pd.read_html(str(table))[0]
        df.insert(0, 'TEAM_3_LETTERS', team_3_letters)
        df.rename(columns = {'Season': 'SEASON', 'Lg': 'LEAGUE', 'Finish': 'FINISH', 'Pace': 'PACE',
            'Rel Pace': 'REL_PACE', 'ORRtg': 'ORTG', 'Rel ORtg': 'REL_ORTG', 'DRtg': 'DRTG',
            'Rel DRtg': 'REL_DRTG', 'Playoffs': 'PLAYOFFS'}, inplace=True)
        #df['PLAYOFFS'] = df['PLAYOFFS'].apply(lambda x: True if not pd.isnull(x) else False)
        df = df.drop(['Team'], axis=1)
        df = df.drop(['Coaches'], axis=1)
        df = df.drop(['Top WS'], axis=1)
        df.dropna(axis=0, how="all", inplace=True)
        df.dropna(axis=1, how="all", inplace=True)
        df.reset_index().drop('index', axis=1, inplace=True)
        return df
    return pd.DataFrame()
    

In [24]:
# Testing cell
all_teams = get_all_teams()
all_teams

Scraping ATL team info...
Scraping BOS team info...
Scraping NJN team info...
Scraping CHA team info...
Scraping CHI team info...
Scraping CLE team info...
Scraping DAL team info...
Scraping DEN team info...
Scraping DET team info...
Scraping GSW team info...
Scraping HOU team info...
Scraping IND team info...
Scraping LAC team info...
Scraping LAL team info...
Scraping MEM team info...
Scraping MIA team info...
Scraping MIL team info...
Scraping MIN team info...
Scraping NOH team info...
Scraping NYK team info...
Scraping OKC team info...
Scraping ORL team info...
Scraping PHI team info...
Scraping PHO team info...
Scraping POR team info...
Scraping SAC team info...
Scraping SAS team info...
Scraping TOR team info...
Scraping UTA team info...
Scraping WAS team info...
Scraped 1603 teams in 26.5174 seconds


Unnamed: 0,TEAM_3_LETTERS,SEASON,LEAGUE,W,L,W/L%,FINISH,SRS,PACE,REL_PACE,ORtg,REL_ORTG,DRTG,REL_DRTG,PLAYOFFS
0,ATL,2021-22,NBA,43,39,0.524,2nd of 5,1.55,97.7,-0.5,116.5,4.5,114.9,2.9,Lost E. Conf. 1st Rnd.
1,ATL,2020-21,NBA,41,31,0.569,1st of 5,2.14,97.6,-1.6,115.7,3.4,113.3,1.0,Lost E. Conf. Finals
2,ATL,2019-20,NBA,20,47,0.299,5th of 5,-7.71,103.0,2.7,107.2,-3.4,114.8,4.2,
3,ATL,2018-19,NBA,29,53,0.354,5th of 5,-6.06,103.9,3.9,108.1,-2.3,113.9,3.5,
4,ATL,2017-18,NBA,24,58,0.293,5th of 5,-5.30,98.3,1.0,105.0,-3.6,110.6,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598,WAS,1965-66,NBA,38,42,0.475,2nd of 5,-1.06,123.1,1.7,95.8,0.9,96.8,1.9,Lost W. Div. Semis
1599,WAS,1964-65,NBA,37,43,0.463,3rd of 5,-1.97,116.2,-1.1,96.8,3.2,98.7,5.1,Lost W. Div. Finals
1600,WAS,1963-64,NBA,31,49,0.388,4th of 5,-1.59,117.0,3.8,95.0,0.4,96.6,2.0,
1601,WAS,1962-63,NBA,25,55,0.313,5th of 5,-3.63,112.2,-2.8,97.5,1.6,101.1,5.2,


## 1.12 Get all drafted players
Permite obtener un Dataframe con un registro para jugador NBA drafteado, con información básica sobre el Draft (número de pick, college, etc.)

**Solo cogemos drafts desde 1950 (descarto pre NBA)**

In [25]:
def get_all_drafted_players():
    start_timer = time.perf_counter()
    pr = rq.get('https://www.basketball-reference.com/draft/')
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        table = soup.find('table', {'id': 'first_overall'})
        if table is None:
            return pd.DataFrame()
        rows = table.findAll('th', {'data-stat': 'draft'})[1:-3]    
        df = pd.DataFrame()
        for each in rows:
            df = df.append(get_drafted_players_from_draft(each.a.get('href')), ignore_index=True)
        end_timer = time.perf_counter()
        print(f"Scraped the draft info of " + str(len(df)) + f" players in {end_timer - start_timer:0.4f} seconds")
        return df
    return pd.DataFrame()

def get_drafted_players_from_draft(_draft_url):
    draft_year = _draft_url[-9:-5]
    suffix = _draft_url.replace('/', '%2F')
    print(f'Scraping {_draft_url} draft...')
    pr = rq.get(f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url={suffix}&div=div_stats')
    if pr.status_code == 200:
        soup = BeautifulSoup(pr.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            return pd.DataFrame()
        df = pd.read_html(str(table))[0]
        df.columns = df.columns.droplevel(level=0)
        df = df[df['Player'] != 'Player']
        df = df[~df['Player'].str.contains('Round\s\d', na=False)]
        df.dropna(subset=['Player'], axis=0, how="all", inplace=True)
        df.insert(0, 'DRAFT_YEAR', draft_year)
        player_url = []
        for each in table.findAll('td', {'data-stat': 'player'}):
            player_url.append(each.find('a').get('href').replace('https://www.basketball-reference.com', ''))
        df.insert(5, 'PLAYER_URL', player_url)
        df.rename(columns = {'Pk': 'PICK', 'Tm': 'TEAM', 'Player': 'PLAYER', 'College': 'COLLEGE',
            'Yrs': 'YEARS_IN_LEAGUE'}, inplace=True)
        df = df.drop(['Rk'], axis=1)
        df = df.iloc[:, :6]
        df.dropna(axis=0, how="all", inplace=True)
        df.dropna(axis=1, how="all", inplace=True)
        df.reset_index().drop('index', axis=1, inplace=True)
        return df
    return pd.DataFrame()

In [26]:
# Testing cell
all_drafted_players = get_all_drafted_players()
all_drafted_players

Scraping /draft/NBA_2022.html draft...
Scraping /draft/NBA_2021.html draft...
Scraping /draft/NBA_2020.html draft...
Scraping /draft/NBA_2019.html draft...
Scraping /draft/NBA_2018.html draft...
Scraping /draft/NBA_2017.html draft...
Scraping /draft/NBA_2016.html draft...
Scraping /draft/NBA_2015.html draft...
Scraping /draft/NBA_2014.html draft...
Scraping /draft/NBA_2013.html draft...
Scraping /draft/NBA_2012.html draft...
Scraping /draft/NBA_2011.html draft...
Scraping /draft/NBA_2010.html draft...
Scraping /draft/NBA_2009.html draft...
Scraping /draft/NBA_2008.html draft...
Scraping /draft/NBA_2007.html draft...
Scraping /draft/NBA_2006.html draft...
Scraping /draft/NBA_2005.html draft...
Scraping /draft/NBA_2004.html draft...
Scraping /draft/NBA_2003.html draft...
Scraping /draft/NBA_2002.html draft...
Scraping /draft/NBA_2001.html draft...
Scraping /draft/NBA_2000.html draft...
Scraping /draft/NBA_1999.html draft...
Scraping /draft/NBA_1998.html draft...
Scraping /draft/NBA_1997.

Unnamed: 0,DRAFT_YEAR,PICK,TEAM,PLAYER,PLAYER_URL,COLLEGE
0,2022,1,ORL,Paolo Banchero,/players/b/banchpa01.html,Duke
1,2022,2,OKC,Chet Holmgren,/players/h/holmgch01.html,Gonzaga
2,2022,3,HOU,Jabari Smith Jr.,/players/s/smithja05.html,Auburn
3,2022,4,SAC,Keegan Murray,/players/m/murrake02.html,Iowa
4,2022,5,DET,Jaden Ivey,/players/i/iveyja01.html,Purdue
...,...,...,...,...,...,...
7929,1950,117,MNL,Andy Butchko,/players/b/butchan01.html,Purdue
7930,1950,118,SYR,Glenn Wilkes,/players/w/wilkegl01.html,Mercer University
7931,1950,119,PHW,Leo Wolfe,/players/w/wolfele01.html,Villanova
7932,1950,120,INO,Jimmy Doyle,/players/d/doyleji01.html,Butler


# 2.0 Main scraper code

A continuación usamos el suite de funciones definidas anteriormente para poder recopilar la información que usaremos para los posteriores notebooks y modelado de datos.

### 2.1 NBA scraper code
Iniciamos con la descraga de algunas tablas básicas: tabla de jugadores, tabla del draft y la unión de ambas.

In [27]:
try:
    
    all_players = pd.read_excel('./Scraped data/all_players.xlsx',index_col=0)
    all_drafted_players = pd.read_excel('./Scraped data/all_drafted_players.xlsx',index_col=0) 
    df = pd.read_excel('./Scraped data/combined_drafted_and_all_players.xlsx',index_col=0)
    
    print('Documentos ya descargados...')
    
except:
    print('Descargando documentos...')
    all_players = get_all_players()
    all_players.to_excel('./Scraped data/all_players.xlsx')

    all_drafted_players = get_all_drafted_players()
    all_drafted_players.to_excel('./Scraped data/all_drafted_players.xlsx')

    df = pd.merge(all_players, all_drafted_players, left_on='NAME', right_on='PLAYER', how='left') #El resultado de esto, añade a la tabla inicial de jugadores sus datos de Draft
    df = df.drop(['PLAYER'], axis=1)
    df = df.drop(['PLAYER_URL'], axis=1)
    df = df.drop(['COLLEGE_y'], axis=1)
    df.rename(columns = {'PICK': 'DRAFT_PICK', 'COLLEGE_x': 'COLLEGE', 'TEAM': 'DRAFT_TEAM'}, inplace=True)
    df.to_excel('./Scraped data/combined_drafted_and_all_players.xlsx') #Esta es la tabla maestra de jugadores resultante, dado que continene sólo jugadores con estadísticas NBA

Documentos ya descargados...


In [28]:
all_players.head()

Unnamed: 0,URL,NAME,ACTIVE_FROM,ACTIVE_TO,POSITION,COLLEGE,HEIGHT,WEIGHT,BIRTH_DATE
0,/players/a/abdelal01.html,Alaa Abdelnaby,1991,1995,F-C,Duke,6-10,240.0,1968-06-24
1,/players/a/abdulza01.html,Zaid Abdul-Aziz,1969,1978,C-F,Iowa State,6-9,235.0,1946-04-07
2,/players/a/abdulka01.html,Kareem Abdul-Jabbar,1970,1989,C,UCLA,7-2,225.0,1947-04-16
3,/players/a/abdulma02.html,Mahmoud Abdul-Rauf,1991,2001,G,LSU,6-1,162.0,1969-03-09
4,/players/a/abdulta01.html,Tariq Abdul-Wahad,1998,2003,F,"Michigan, San Jose State",6-6,223.0,1974-11-03


In [29]:
all_drafted_players.head()

Unnamed: 0,DRAFT_YEAR,PICK,TEAM,PLAYER,PLAYER_URL,COLLEGE
0,2021,1.0,DET,Cade Cunningham,/players/c/cunnica01.html,Oklahoma State
1,2021,2.0,HOU,Jalen Green,/players/g/greenja05.html,
2,2021,3.0,CLE,Evan Mobley,/players/m/mobleev01.html,USC
3,2021,4.0,TOR,Scottie Barnes,/players/b/barnesc01.html,Florida State
4,2021,5.0,ORL,Jalen Suggs,/players/s/suggsja01.html,Gonzaga


In [30]:
df.head()

Unnamed: 0,URL,NAME,ACTIVE_FROM,ACTIVE_TO,POSITION,COLLEGE,HEIGHT,WEIGHT,BIRTH_DATE,DRAFT_YEAR,DRAFT_PICK,DRAFT_TEAM
0,/players/a/abdelal01.html,Alaa Abdelnaby,1991,1995,F-C,Duke,6-10,240.0,1968-06-24,1990.0,25.0,POR
1,/players/a/abdulza01.html,Zaid Abdul-Aziz,1969,1978,C-F,Iowa State,6-9,235.0,1946-04-07,1968.0,5.0,CIN
2,/players/a/abdulka01.html,Kareem Abdul-Jabbar,1970,1989,C,UCLA,7-2,225.0,1947-04-16,1969.0,1.0,MIL
3,/players/a/abdulma02.html,Mahmoud Abdul-Rauf,1991,2001,G,LSU,6-1,162.0,1969-03-09,1990.0,3.0,DEN
4,/players/a/abdulta01.html,Tariq Abdul-Wahad,1998,2003,F,"Michigan, San Jose State",6-6,223.0,1974-11-03,1997.0,11.0,SAC


A continuación la tabla con la información referente a todos los equipos de la NBA y sus resultados por temporada.

In [31]:
try:
    
    all_teams = pd.read_excel('./Scraped data/all_teams.xlsx',index_col=0)
    print('Documentos ya descargados...')
    
except:
    print('Descargando documentos...')
    all_teams = get_all_teams()
    all_teams.to_excel('./Scraped data/all_teams.xlsx')

Documentos ya descargados...


In [32]:
all_teams.head()

Unnamed: 0,TEAM_3_LETTERS,SEASON,LEAGUE,W,L,W/L%,FINISH,SRS,PACE,REL_PACE,ORtg,REL_ORTG,DRTG,REL_DRTG,PLAYOFFS
0,ATL,2021-22,NBA,43,39,0.524,2nd of 5,1.55,97.7,-0.5,116.5,4.5,114.9,2.9,Lost E. Conf. 1st Rnd.
1,ATL,2020-21,NBA,41,31,0.569,1st of 5,2.14,97.6,-1.6,115.7,3.4,113.3,1.0,Lost E. Conf. Finals
2,ATL,2019-20,NBA,20,47,0.299,5th of 5,-7.71,103.0,2.7,107.2,-3.4,114.8,4.2,
3,ATL,2018-19,NBA,29,53,0.354,5th of 5,-6.06,103.9,3.9,108.1,-2.3,113.9,3.5,
4,ATL,2017-18,NBA,24,58,0.293,5th of 5,-5.3,98.3,1.0,105.0,-3.6,110.6,2.0,


En las próximas celdas,scrapearemos estadísticas de jugadores (tanto de regular season como de playoff) de diferentes tablas (totales, avanzadas y por partido) para jugadores posteriores a 1986 (año en que se instauró la línea de 3pt en la NCAA).

In [33]:
Filtro_3pt_NBA = 1986

In [34]:
#Filtramos por la aparición del tiro de 3

print('Pasamos de ' + str(len(df)) + ' jugadores...')
df_1 = df.copy()
df_1 = df_1[df_1['ACTIVE_FROM']>=Filtro_3pt_NBA].reset_index(drop=True)
print('...a ' + str(len(df_1)) + ' jugadores')

Pasamos de 5250 jugadores...
...a 2877 jugadores


In [35]:
df_1.head()

Unnamed: 0,URL,NAME,ACTIVE_FROM,ACTIVE_TO,POSITION,COLLEGE,HEIGHT,WEIGHT,BIRTH_DATE,DRAFT_YEAR,DRAFT_PICK,DRAFT_TEAM
0,/players/a/abdelal01.html,Alaa Abdelnaby,1991,1995,F-C,Duke,6-10,240.0,1968-06-24,1990.0,25.0,POR
1,/players/a/abdulma02.html,Mahmoud Abdul-Rauf,1991,2001,G,LSU,6-1,162.0,1969-03-09,1990.0,3.0,DEN
2,/players/a/abdulta01.html,Tariq Abdul-Wahad,1998,2003,F,"Michigan, San Jose State",6-6,223.0,1974-11-03,1997.0,11.0,SAC
3,/players/a/abdursh01.html,Shareef Abdur-Rahim,1997,2008,F,California,6-9,225.0,1976-12-11,1996.0,3.0,VAN
4,/players/a/abrinal01.html,Álex Abrines,2017,2019,G-F,,6-6,200.0,1993-08-01,,,


In [36]:
try:
   df_1 = pd.read_excel('./Scraped data/SCRAPPING_CHKPT.xlsx',index_col=0) 
   print('Cargando checkpoint...')
except:
    df_1['Scrapped'] = 'No'
    df_1[['URL','NAME','Scrapped']].to_excel('./Scraped data/SCRAPPING_CHKPT.xlsx')
    print('Creando checkpoint...')

Cargando checkpoint...


In [37]:
print('Faltan scrapear ' + str(len(df_1[df_1['Scrapped']=='No'])) + ' jugadores')

Faltan scrapear 0 jugadores


In [38]:
df_2 = df_1[df_1['Scrapped']=='No']
df_2.head()

Unnamed: 0,URL,NAME,Scrapped


**No se recomienda ejecutar el scraper, puede tardar varios minutos...**

In [39]:
#Nomenclatura para los siguientes Dataframes:

#stats: Referente a estadísticas en la NBA

# RS: Regular season
# PFF: Playoff

# PG: Per game
# TOT: Total
# ADV: Advanced

# CAR: career total
# BS: By season
# BT: By team

try:
    stats_RS_PG_BS_df = pd.read_csv('./Scraped data/stats_RS_PG_BS_df.csv',index_col=0) 
    stats_PFF_PG_BS_df = pd.read_csv('./Scraped data/stats_PFF_PG_BS_df.csv',index_col=0) 

    stats_RS_TOT_BS_df = pd.read_csv('./Scraped data/stats_RS_TOT_BS_df.csv',index_col=0) 
    stats_PFF_TOT_BS_df = pd.read_csv('./Scraped data/stats_PFF_TOT_BS_df.csv',index_col=0) 

    stats_RS_ADV_BS_df = pd.read_csv('./Scraped data/stats_RS_ADV_BS_df.csv',index_col=0) 
    stats_PFF_ADV_BS_df = pd.read_csv('./Scraped data/stats_PFF_ADV_BS_df.csv',index_col=0) 
    
    player_extra_info_df = pd.read_csv('./Scraped data/player_extra_info_df.csv',index_col=0)

except:
    
    print('Algun problema')
    
    stats_RS_PG_BS_df = pd.DataFrame()
    stats_PFF_PG_BS_df = pd.DataFrame()

    stats_RS_TOT_BS_df = pd.DataFrame()
    stats_PFF_TOT_BS_df = pd.DataFrame()

    stats_RS_ADV_BS_df = pd.DataFrame()
    stats_PFF_ADV_BS_df = pd.DataFrame()
    
    player_extra_info_df = pd.DataFrame()

In [40]:
player_extra_info_df.columns

Index(['URL', 'ALL-ROOKIE', 'MOST_IMPROVED', 'COLLEGE_URL', 'ALL_STAR',
       'ALL-DEFENSIVE', 'ALL-NBA', 'HALL_OF_FAME', 'NBA_CHAMP',
       'NBA_75TH_ANNIV._TEAM', '2008_NBA_CHAMP', '2013_NBA_CHAMP',
       '2006_NBA_CHAMP', '2021_NBA_CHAMP', 'MVP', 'DEF._POY', 'AS_MVP',
       'FINALS_MVP', '2020_NBA_CHAMP', 'SCORING_CHAMP', '2019_NBA_CHAMP',
       '2009_NBA_CHAMP', 'SIXTH_MAN', '2014_NBA_CHAMP', 'ROY',
       '2015_NBA_CHAMP', '2011_NBA_CHAMP', '2017_NBA_CHAMP', '2003_NBA_CHAMP',
       '1990_NBA_CHAMP', '2018_NBA_CHAMP', '2004_NBA_CHAMP', 'STL_CHAMP',
       'BLK_CHAMP', '1998_NBA_CHAMP', '1987_NBA_CHAMP', '1995_NBA_CHAMP',
       '1994_NBA_CHAMP', '2005_NBA_CHAMP', '2007_NBA_CHAMP', 'TRB_CHAMP',
       '2000_NBA_CHAMP', '2012_NBA_CHAMP', '1999_NBA_CHAMP', '1997_NBA_CHAMP',
       '2016_NBA_CHAMP', '1989_NBA_CHAMP', '2001_NBA_CHAMP', '1996_NBA_CHAMP',
       'AST_CHAMP', '1991_NBA_CHAMP', '2002_NBA_CHAMP', '1986_NBA_CHAMP',
       '1988_NBA_CHAMP', '1993_NBA_CHAMP', '2010_NBA_CH

In [41]:
player_extra_info_df[player_extra_info_df['ALL-ROOKIE'].isna()==False]

Unnamed: 0,URL,ALL-ROOKIE,MOST_IMPROVED,COLLEGE_URL,ALL_STAR,ALL-DEFENSIVE,ALL-NBA,HALL_OF_FAME,NBA_CHAMP,NBA_75TH_ANNIV._TEAM,...,1989_NBA_CHAMP,2001_NBA_CHAMP,1996_NBA_CHAMP,AST_CHAMP,1991_NBA_CHAMP,2002_NBA_CHAMP,1986_NBA_CHAMP,1988_NBA_CHAMP,1993_NBA_CHAMP,2010_NBA_CHAMP
1,/players/a/abdulma02.html,1990-91,1992-93,/players/mahmoud-abdul-rauf-1.html,,,,,,,...,,,,,,,,,,
3,/players/a/abdursh01.html,1996-97,,/players/shareef-abdur-rahim-1.html,1.0,,,,,,...,,,,,,,,,,
13,/players/a/adamsst01.html,2013-14,,/players/steven-adams-1.html,,,,,,,...,,,,,,,,,,
29,/players/a/aldrila01.html,2006-07,,/players/lamarcus-aldridge-1.html,7.0,,5,,,,...,,,,,,,,,,
32,/players/a/alexaco02.html,2000-01,,/players/courtney-alexander-1.html,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999,/players/w/wrighsh01.html,1994-95,,/players/sharone-wright-1.html,,,,,,,...,,,,,,,,,,
3012,/players/y/youngth01.html,2007-08,,/players/thaddeus-young-1.html,,,,,,,...,,,,,,,,,,
3014,/players/y/youngtr01.html,2018-19,,/players/trae-young-1.html,2.0,,2021-22,,,,...,,,,,,,,,,
3018,/players/z/zelleco01.html,2013-14,,/players/cody-zeller-1.html,,,,,,,...,,,,,,,,,,


In [42]:
#Solo escrapeamos por temporada, dado que la media total y por equipo derivan de aquí !!!

for index, row in df_2.iterrows():
    
    stats_RS_PG_BS_df = stats_RS_PG_BS_df.append(get_player_stats(row['URL'], stat_type='PER_GAME', playoffs=False, career=False, career_by_team=False), ignore_index=True)
    
    stats_PFF_PG_BS_df = stats_PFF_PG_BS_df.append(get_player_stats(row['URL'], stat_type='PER_GAME', playoffs=True, career=False, career_by_team=False), ignore_index=True)
    
    stats_RS_TOT_BS_df = stats_RS_TOT_BS_df.append(get_player_stats(row['URL'], stat_type='TOTALS', playoffs=False, career=False, career_by_team=False), ignore_index=True)
    
    stats_PFF_TOT_BS_df = stats_PFF_TOT_BS_df.append(get_player_stats(row['URL'], stat_type='TOTALS', playoffs=True, career=False, career_by_team=False), ignore_index=True)
    
    stats_RS_ADV_BS_df = stats_RS_ADV_BS_df.append(get_player_stats(row['URL'], stat_type='ADVANCED', playoffs=False, career=False, career_by_team=False), ignore_index=True)
    
    stats_PFF_ADV_BS_df = stats_PFF_ADV_BS_df.append(get_player_stats(row['URL'], stat_type='ADVANCED', playoffs=True, career=False, career_by_team=False), ignore_index=True)

    player_extra_info_df = player_extra_info_df.append(get_player_extra_info(row['URL']), ignore_index=True)
    
    stats_RS_PG_BS_df.to_csv('./Scraped data/stats_RS_PG_BS_df.csv')
    stats_PFF_PG_BS_df.to_csv('./Scraped data/stats_PFF_PG_BS_df.csv')
    stats_RS_TOT_BS_df.to_csv('./Scraped data/stats_RS_TOT_BS_df.csv')
    stats_PFF_TOT_BS_df.to_csv('./Scraped data/stats_PFF_TOT_BS_df.csv')
    stats_RS_ADV_BS_df.to_csv('./Scraped data/stats_RS_ADV_BS_df.csv')
    stats_PFF_ADV_BS_df.to_csv('./Scraped data/stats_PFF_ADV_BS_df.csv')
    player_extra_info_df.to_csv('./Scraped data/player_extra_info_df.csv')
    
    df_1.loc[index,'Scrapped'] = 'Yes'
    df_1.to_excel('./Scraped data/SCRAPPING_CHKPT.xlsx') #para ir actualizando el checkpoint de los scrapeados
    
    print(index)
    #time.sleep(1)

In [43]:
stats_PFF_TOT_BS_df.head()

Unnamed: 0,SEASON,AGE,TEAM,LEAGUE,POS,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,URL,3P%,Trp Dbl
0,1990-91,22.0,POR,NBA,PF,5.0,0.0,13.0,2.0,6.0,...,3.0,0.0,0.0,0.0,0.0,0.0,4.0,/players/a/abdelal01.html,,
1,1991-92,23.0,POR,NBA,PF,8.0,0.0,25.0,5.0,10.0,...,4.0,2.0,0.0,0.0,2.0,4.0,12.0,/players/a/abdelal01.html,,
2,1992-93,24.0,BOS,NBA,PF,4.0,4.0,68.0,11.0,24.0,...,13.0,1.0,0.0,1.0,9.0,7.0,22.0,/players/a/abdelal01.html,,
3,1993-94,24.0,DEN,NBA,PG,12.0,12.0,339.0,57.0,154.0,...,18.0,30.0,5.0,1.0,14.0,29.0,155.0,/players/a/abdulma02.html,0.324,
4,1994-95,25.0,DEN,NBA,PG,3.0,2.0,76.0,12.0,33.0,...,5.0,5.0,2.0,0.0,8.0,8.0,40.0,/players/a/abdulma02.html,0.167,


In [44]:
player_extra_info_df.head()

Unnamed: 0,URL,ALL-ROOKIE,MOST_IMPROVED,COLLEGE_URL,ALL_STAR,ALL-DEFENSIVE,ALL-NBA,HALL_OF_FAME,NBA_CHAMP,NBA_75TH_ANNIV._TEAM,...,1989_NBA_CHAMP,2001_NBA_CHAMP,1996_NBA_CHAMP,AST_CHAMP,1991_NBA_CHAMP,2002_NBA_CHAMP,1986_NBA_CHAMP,1988_NBA_CHAMP,1993_NBA_CHAMP,2010_NBA_CHAMP
0,/players/a/abdelal01.html,,,,,,,,,,...,,,,,,,,,,
1,/players/a/abdulma02.html,1990-91,1992-93,/players/mahmoud-abdul-rauf-1.html,,,,,,,...,,,,,,,,,,
2,/players/a/abdulta01.html,,,,,,,,,,...,,,,,,,,,,
3,/players/a/abdursh01.html,1996-97,,/players/shareef-abdur-rahim-1.html,1.0,,,,,,...,,,,,,,,,,
4,/players/a/abrinal01.html,,,,,,,,,,...,,,,,,,,,,


### 2.1 COLLEGE scrapper code
En este caso, scrapeamos estadísticas referentes a los años de college de jugadores NBA, también para diferentes tablas y aplicando el filtro ya mencionado anteriormente.

In [45]:
Filtro_3pt_college = 1986 #se puede cambiar a 1979

In [46]:
df_c = player_extra_info_df[['URL','COLLEGE_URL']]
df_c

Unnamed: 0,URL,COLLEGE_URL
0,/players/a/abdelal01.html,
1,/players/a/abdulma02.html,/players/mahmoud-abdul-rauf-1.html
2,/players/a/abdulta01.html,
3,/players/a/abdursh01.html,/players/shareef-abdur-rahim-1.html
4,/players/a/abrinal01.html,
...,...,...
3024,/players/z/zimmede01.html,
3025,/players/z/zimmest01.html,
3026,/players/z/zipsepa01.html,
3027,/players/z/zizican01.html,


In [47]:
df_c_1 = df_c.dropna(subset=['COLLEGE_URL']).reset_index(drop=True)
df_c_1

Unnamed: 0,URL,COLLEGE_URL
0,/players/a/abdulma02.html,/players/mahmoud-abdul-rauf-1.html
1,/players/a/abdursh01.html,/players/shareef-abdur-rahim-1.html
2,/players/a/adamsmi01.html,/players/michael-adams-1.html
3,/players/a/adamsst01.html,/players/steven-adams-1.html
4,/players/a/adebaba01.html,/players/edrice-adebayo-1.html
...,...,...
580,/players/y/youngni01.html,/players/nick-young-1.html
581,/players/y/youngth01.html,/players/thaddeus-young-1.html
582,/players/y/youngtr01.html,/players/trae-young-1.html
583,/players/z/zelleco01.html,/players/cody-zeller-1.html


In [48]:
# No hace falta filtrar por la aparición del tiro de 3, ya se ha hecho anteriormente

print('Tenemos ' + str(len(df_c_1)) + ' jugadores...')

Tenemos 585 jugadores...


In [49]:
try:
   df_c_1 = pd.read_excel('./Scraped data/SCRAPPING_CHKPT_college.xlsx',index_col=0) 
   print('Cargando checkpoint...')
except:
    df_c_1['Scrapped'] = 'No'
    df_c_1[['URL','COLLEGE_URL','Scrapped']].to_excel('./Scraped data/SCRAPPING_CHKPT_college.xlsx')
    print('Creando checkpoint...')

Cargando checkpoint...


In [50]:
print('Faltan scrapear ' + str(len(df_c_1[df_c_1['Scrapped']=='No'])) + ' jugadores')

Faltan scrapear 0 jugadores


In [51]:
df_c_2 = df_c_1[df_c_1['Scrapped']=='No']
df_c_2.head()

Unnamed: 0,URL,COLLEGE_URL,Scrapped


**No se recomienda ejecutar el scraper, puede tardar varios minutos...**

In [52]:
#Nomenclatura para los siguientes Dataframes:

#stats: Referente a estadísticas en la NBA

# RS: Regular season
# CONF: Playoff

# PG: Per game
# TOT: Total
# ADV: Advanced

# CAR: career total
# BS: By season
# BT: By team

try:
    college_stats_RS_PG_BS_df = pd.read_csv('./Scraped data/college_stats_RS_PG_BS_df.csv',index_col=0) 
    college_stats_CONF_PG_BS_df = pd.read_csv('./Scraped data/college_stats_CONF_PG_BS_df.csv',index_col=0) 

    college_stats_RS_TOT_BS_df = pd.read_csv('./Scraped data/college_stats_RS_TOT_BS_df.csv',index_col=0) 
    college_stats_CONF_TOT_BS_df = pd.read_csv('./Scraped data/college_stats_CONF_TOT_BS_df.csv',index_col=0) 

    college_stats_RS_ADV_BS_df = pd.read_csv('./Scraped data/college_stats_RS_ADV_BS_df.csv',index_col=0) 
    college_stats_CONF_ADV_BS_df = pd.read_csv('./Scraped data/college_stats_CONF_ADV_BS_df.csv',index_col=0) 
    
    college_player_extra_info_df = pd.read_csv('./Scraped data/college_player_extra_info_df.csv',index_col=0)

except:
    
    print('Algun problema')
    
    college_stats_RS_PG_BS_df = pd.DataFrame()
    college_stats_CONF_PG_BS_df = pd.DataFrame()

    college_stats_RS_TOT_BS_df = pd.DataFrame()
    college_stats_CONF_TOT_BS_df = pd.DataFrame()

    college_stats_RS_ADV_BS_df = pd.DataFrame()
    college_stats_CONF_ADV_BS_df = pd.DataFrame()
    
    college_player_extra_info_df = pd.DataFrame()

In [53]:
#Solo escrapeamos por temporada, dado que la media total y por equipo derivan de aquí !!!

for index, row in df_c_2.iterrows():
    
    college_stats_RS_PG_BS_df = college_stats_RS_PG_BS_df.append(get_player_college_stats(row['COLLEGE_URL'], stat_type='PER_GAME', conference=False, career=False, career_by_team=False), ignore_index=True)
    
    college_stats_CONF_PG_BS_df = college_stats_CONF_PG_BS_df.append(get_player_college_stats(row['COLLEGE_URL'], stat_type='PER_GAME', conference=True, career=False, career_by_team=False), ignore_index=True)
    
    college_stats_RS_TOT_BS_df = college_stats_RS_TOT_BS_df.append(get_player_college_stats(row['COLLEGE_URL'], stat_type='TOTALS', conference=False, career=False, career_by_team=False), ignore_index=True)
    
    college_stats_CONF_TOT_BS_df = college_stats_CONF_TOT_BS_df.append(get_player_college_stats(row['COLLEGE_URL'], stat_type='TOTALS', conference=True, career=False, career_by_team=False), ignore_index=True)
    
    college_stats_RS_ADV_BS_df = college_stats_RS_ADV_BS_df.append(get_player_college_stats(row['COLLEGE_URL'], stat_type='ADVANCED', conference=False, career=False, career_by_team=False), ignore_index=True)
    
    college_stats_CONF_ADV_BS_df = college_stats_CONF_ADV_BS_df.append(get_player_college_stats(row['COLLEGE_URL'], stat_type='ADVANCED', conference=True, career=False, career_by_team=False), ignore_index=True)

    college_player_extra_info_df = college_player_extra_info_df.append(get_player_college_extra_info(row['COLLEGE_URL']), ignore_index=True)
    
    college_stats_RS_PG_BS_df.to_csv('./Scraped data/college_stats_RS_PG_BS_df.csv')
    college_stats_CONF_PG_BS_df.to_csv('./Scraped data/college_stats_CONF_PG_BS_df.csv')
    college_stats_RS_TOT_BS_df.to_csv('./Scraped data/college_stats_RS_TOT_BS_df.csv')
    college_stats_CONF_TOT_BS_df.to_csv('./Scraped data/college_stats_CONF_TOT_BS_df.csv')
    college_stats_RS_ADV_BS_df.to_csv('./Scraped data/college_stats_RS_ADV_BS_df.csv')
    college_stats_CONF_ADV_BS_df.to_csv('./Scraped data/college_stats_CONF_ADV_BS_df.csv')
    college_player_extra_info_df.to_csv('./Scraped data/college_player_extra_info_df.csv')
    
    df_c_1.loc[index,'Scrapped'] = 'Yes'
    df_c_1.to_excel('./Scraped data/SCRAPPING_CHKPT_college.xlsx') #para ir actualizando el checkpoint de los scrapeados
    
    print(index)
    #time.sleep(1)

In [54]:
college_stats_RS_TOT_BS_df

Unnamed: 0,SEASON,SCHOOL,CONF,G,MP,FG,FGA,FG%,2P,2PA,...,AST,STL,BLK,TOV,PF,PTS,COLLEGE_URL,GS,ORB,DRB
0,1988-89,LSU,SEC,32.0,1180.0,359.0,739.0,0.486,275.0,523.0,...,130.0,55.0,6.0,113.0,70.0,965.0,/players/mahmoud-abdul-rauf-1.html,,,
1,1989-90,LSU,SEC,32.0,1202.0,305.0,662.0,0.461,217.0,416.0,...,102.0,52.0,1.0,117.0,66.0,889.0,/players/mahmoud-abdul-rauf-1.html,,,
2,1995-96,California,Pac-10,28.0,972.0,206.0,398.0,0.518,198.0,377.0,...,29.0,52.0,35.0,87.0,58.0,590.0,/players/shareef-abdur-rahim-1.html,28.0,,
3,1981-82,Boston College,Big East,26.0,379.0,51.0,103.0,0.495,,,...,40.0,29.0,3.0,28.0,28.0,138.0,/players/michael-adams-1.html,0.0,,
4,1982-83,Boston College,Big East,32.0,1075.0,195.0,405.0,0.481,,,...,170.0,88.0,1.0,117.0,71.0,517.0,/players/michael-adams-1.html,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1702,2012-13,Indiana,Big Ten,36.0,1062.0,199.0,353.0,0.564,199.0,351.0,...,47.0,37.0,45.0,81.0,80.0,594.0,/players/cody-zeller-1.html,36.0,101.0,188.0
1703,2008-09,UNC,ACC,15.0,117.0,17.0,36.0,0.472,17.0,36.0,...,3.0,3.0,3.0,8.0,20.0,47.0,/players/tyler-zeller-1.html,2.0,11.0,19.0
1704,2009-10,UNC,ACC,27.0,468.0,100.0,192.0,0.521,100.0,190.0,...,7.0,14.0,24.0,34.0,49.0,252.0,/players/tyler-zeller-1.html,0.0,52.0,73.0
1705,2010-11,UNC,ACC,37.0,1041.0,211.0,384.0,0.549,211.0,384.0,...,23.0,27.0,45.0,50.0,100.0,583.0,/players/tyler-zeller-1.html,35.0,107.0,161.0


In [55]:
college_stats_RS_ADV_BS_df

Unnamed: 0,SEASON,SCHOOL,CONF,G,MP,TS%,eFG%,3PAr,FTr,TOV%,...,ORB%,DRB%,TRB%,AST%,STL%,BLK%,USG%,OBPM,DBPM,BPM
0,1988-89,LSU,SEC,32.0,1180.0,0.579,0.543,0.292,0.271,11.9,...,,,,,,,,,,
1,1989-90,LSU,SEC,32.0,1202.0,0.584,0.527,0.372,0.317,13.3,...,,,,,,,,,,
2,1995-96,California,Pac-10,28.0,972.0,0.571,0.528,0.053,0.626,14.4,...,,,,,,,,,,
3,1981-82,Boston College,Big East,26.0,379.0,0.523,,,0.592,17.5,...,,,,,,,,,,
4,1982-83,Boston College,Big East,32.0,1075.0,0.539,,,0.388,19.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1702,2012-13,Indiana,Big Ten,36.0,1062.0,0.624,0.564,0.006,0.734,14.5,...,12.3,20.4,16.6,9.5,2.1,4.4,26.5,7.2,3.8,11.1
1703,2008-09,UNC,ACC,15.0,117.0,0.533,0.472,0.000,0.472,15.4,...,,,13.3,4.0,,1.8,19.7,,,
1704,2009-10,UNC,ACC,27.0,468.0,0.557,0.521,0.010,0.375,13.1,...,11.6,15.7,13.7,3.2,1.7,4.9,25.4,,,
1705,2010-11,UNC,ACC,37.0,1041.0,0.601,0.549,0.000,0.552,9.4,...,10.8,15.2,13.1,4.4,1.4,4.2,23.6,6.1,3.6,9.7


In [56]:
college_player_extra_info_df

Unnamed: 0,HS_PARADE_AA,CONSENSUS_AA,SEC_POY,ALL-SEC,SEC_ALL-FRESHMAN,COLLEGE_URL,PAC-10_POY,ALL-PAC-12,PAC-12_ALL-FRESHMAN,PAC-10_ROY,...,PAC-10_MIP,2011_ALL-BIG_EAST_TOURNEY,2002_ALL-PAC-12_TOURNEY,2019_ALL-ACC_TOURNEY,2014_ALL-BIG_12_TOURNEY,1989_NIT_MVP,1988_ALL-ACC_TOURNEY,1998_ALL-BIG_EAST_TOURNEY,1993_ALL-ACC_TOURNEY,2013_ALL-BIG_TEN_TOURNEY
0,1.0,2.0,2,2,1.0,/players/mahmoud-abdul-rauf-1.html,,,,,...,,,,,,,,,,
1,1.0,,,,,/players/shareef-abdur-rahim-1.html,1995-96,1995-96,1.0,1995-96,...,,,,,,,,,,
2,,,,,,/players/michael-adams-1.html,,,,,...,,,,,,,,,,
3,,,,,,/players/steven-adams-1.html,,,,,...,,,,,,,,,,
4,,,,2016-17,1.0,/players/edrice-adebayo-1.html,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,,,,,,/players/nick-young-1.html,,2,,,...,,,,,,,,,,
573,1.0,,,,,/players/thaddeus-young-1.html,,,,,...,,,,,,,,,,
574,,1.0,,,,/players/trae-young-1.html,,,,,...,,,,,,,,,,
575,1.0,1.0,,,,/players/cody-zeller-1.html,,,,,...,,,,,,,,,,1.0
