In [1]:
import os
import pandas as pd
import requests
import src.downloading_and_cleaning as dac
from bs4 import BeautifulSoup

## 1. Downloading and cleaning

<img src="https://media.giphy.com/media/fMA8PgiJnCkNO92avj/giphy.gif" width="500" align="center">

In [2]:
# NOTE(review): the download step is disabled; the next cell reads
# "nba_data/dataset.csv", so presumably the file was fetched by an earlier
# run — confirm before a fresh Restart & Run All.
#dac.download_dataset()

In [3]:
# Load the raw NBA dataset from disk; the file is decoded as ISO-8859-1.
data = pd.read_csv(
    "nba_data/dataset.csv",
    encoding="ISO-8859-1",
)

As we can see, there are no missing values in the dataset, so we head onto the court by simply renaming some columns and dropping one of them: the classic "Unnamed: 0".

In [5]:
# NOTE(review): the helper call is disabled and the next cell cleans `data`
# inline instead — presumably the same steps as dac.cleaning_data(); verify.
#dac.cleaning_data()

In [6]:
# Remove the "Unnamed: 0" column and upper-case every column name.
# Re-assigning (instead of inplace=True) together with errors="ignore" keeps
# the cell re-runnable: a second execution used to raise KeyError because the
# column was already gone.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.columns = data.columns.str.upper()

Now we are on the court. We are playing a clean game with no fouls but... we need to enrich our stats. How? With information from NBA stats. We are going to add average statistics from this season. Because, you know, it's 2021. 

In [7]:
# stats.nba.com "leaguedashplayerbiostats" endpoint with its full query string
# (per-game mode, 2020-21 regular season). Adjacent literals are concatenated
# by the parser, so the resulting value is one single URL string.
url_bio = (
    "https://stats.nba.com/stats/leaguedashplayerbiostats"
    "?College=&Conference=&Country=&DateFrom=&DateTo=&Division="
    "&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height="
    "&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0"
    "&Outcome=&PORound=0&PerMode=PerGame&Period=0"
    "&PlayerExperience=&PlayerPosition=&Season=2020-21"
    "&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange="
    "&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight="
)

In [8]:
# Browser-like HTTP request headers passed to the stats.nba.com request.
# Built from (name, value) pairs; insertion order matches the listing below.
# NOTE(review): the name `headers` is rebound to a list of table column labels
# in the scraping section further down — a distinct name would avoid confusion.
headers = dict([
    ("Accept", "application/json, text/plain, */*"),
    ("Accept-Encoding", "gzip, deflate, br"),
    ("Accept-Language", "es-ES,es;q=0.9"),
    ("Origin", "https://www.nba.com"),
    ("Referer", "https://www.nba.com/"),
    ("Sec-Fetch-Dest", "empty"),
    ("Sec-Fetch-Mode", "cors"),
    ("Sec-Fetch-Site", "same-site"),
    (
        "User-Agent",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
    ),
    ("x-nba-stats-origin", "stats"),
    ("x-nba-stats-token", "true"),
])

In [9]:
# Call the player-bio endpoint and decode the JSON payload.
# timeout: without one, requests waits indefinitely on a stalled connection.
# raise_for_status: surface HTTP errors here instead of failing later with a
# confusing JSON-decoding error on an error page.
_bio_http_response = requests.get(url_bio, headers=headers, timeout=30)
_bio_http_response.raise_for_status()
response_bio = _bio_http_response.json()

In [10]:
# Build the DataFrame from the first result set: "rowSet" holds the rows and
# "headers" the matching column names.
# Passing columns= to the constructor (rather than assigning .columns after)
# also handles an empty rowSet: the result is an empty frame with the right
# columns instead of a length-mismatch ValueError.
_bio_result_set = response_bio['resultSets'][0]
frame_bio = pd.DataFrame(_bio_result_set['rowSet'], columns=_bio_result_set['headers'])

In [11]:
# Remove the id and height/weight columns, then tag every row with the season
# the request asked for.
# Re-assigning with errors="ignore" (instead of inplace=True) makes the cell
# safe to run twice; the in-place drop raised KeyError on the second run.
frame_bio = frame_bio.drop(
    columns=["PLAYER_ID", "TEAM_ID", "PLAYER_HEIGHT_INCHES", "PLAYER_HEIGHT", "PLAYER_WEIGHT"],
    errors="ignore",
)
frame_bio["SEASON"] = "2020-21"

In [72]:
# Preview the first five rows of the season frame.
frame_bio.head(n=5)

Unnamed: 0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,COLLEGE,COUNTRY,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,GP,PTS,REB,AST,NET_RATING,OREB_PCT,DREB_PCT,USG_PCT,TS_PCT,AST_PCT,SEASON
0,Aaron Gordon,DEN,25.0,Arizona,USA,2014,1,4,33,13.8,6.0,3.8,1.5,0.047,0.158,0.217,0.553,0.191,2020-21
1,Aaron Holiday,IND,24.0,UCLA,USA,2018,1,23,50,7.4,1.2,1.7,-0.9,0.009,0.058,0.193,0.506,0.131,2020-21
2,Aaron Nesmith,BOS,21.0,Vanderbilt,USA,2020,1,14,30,3.4,2.2,0.3,3.6,0.034,0.122,0.122,0.51,0.033,2020-21
3,Abdel Nader,PHX,27.0,Iowa State,Egypt,2016,2,58,24,6.7,2.6,0.8,5.0,0.02,0.151,0.183,0.605,0.078,2020-21
4,Adam Mokoka,CHI,22.0,,France,Undrafted,Undrafted,Undrafted,9,1.4,0.4,0.4,-13.6,0.02,0.073,0.148,0.433,0.167,2020-21


At this point, we have a new DataFrame with information similar to the original one, but updated with data from this season. Since we have made sure that both DataFrames have the same columns, the next step is to concatenate them. Afterwards, we will export the result as a .csv file.

In [12]:
# Stack the historical dataset and the 2020-21 frame into a single table.
# ignore_index=True renumbers the rows 0..n-1; without it each frame keeps its
# own 0-based labels and the result carries duplicate index values.
# Columns present in only one of the frames are kept and filled with NaN.
data_updated = pd.concat([data, frame_bio], ignore_index=True)

In [13]:
# Persist the combined table as a CSV file in the working directory.
# NOTE(review): to_csv writes the index by default, which comes back as an
# "Unnamed: 0" column when the file is re-read — consider index=False.
data_updated.to_csv(path_or_buf="nba_data_2021.csv")

## 2. Scraping

<img src="https://media.giphy.com/media/c14TAuvWFjnhe/giphy.gif" width="500" align="center">

What's our idea? Calling the NBA Stats API was easy, so we are going to do some scraping to create a new database, pretty similar to the original one, but with info from the Spanish League. To do so, we are going to dive into the acb.com website. At the end of the process, we will export the new DataFrame as a .csv file.

In [86]:
# Base address of the acb.com club statistics pages; the scraping loop appends
# "<team>/temporada_id/<year>" to it.
url = (
    "https://www.acb.com"
    "/club/estadisticas/id/"
)

In [87]:
# Scrape the statistics table of every (season, team) page on acb.com.
# Each table row is stored in stats_list as a list of stripped cell texts.
stats_list = []
for year in range(2000, 2021):
    for team in range(1, 20):
        # A timeout keeps one stalled response from hanging the whole crawl.
        html = requests.get("{}{}/temporada_id/{}".format(url, team, year), timeout=30)
        if not html.ok:
            # Error response for this (team, season) combination: nothing to parse.
            continue
        page = BeautifulSoup(html.content, "html.parser")
        # TODO (original author): a range over the players is still missing here.
        table = page.find("tbody")
        if table is None:
            # Page without a stats table; this used to raise AttributeError on
            # the row lookup and abort the whole loop.
            continue
        # `res` is read again by the header-extraction cell, so only point it
        # at pages that actually contain a table.
        res = page
        rows = table.find_all("tr")
        for r in rows:
            elements = r.find_all("td")
            jugador = [e.get_text().strip() for e in elements]
            stats_list.append(jugador)

In [88]:
# Look up the header labels: take the second row of the first <thead>, turn
# its newlines into commas and split on commas to get the list we want.
header_row = res.findAll("thead")[0].findAll("tr")[1]
header_text = header_row.text.replace("\n", ",")
headers = header_text.split(",")

In [89]:
# Drop the surplus (empty / non-breaking-space) entries and add the "Val" label.
unwanted = {"", "\xa0"}
headers = [label for label in headers if label not in unwanted] + ["Val"]
len(headers)

25

In [90]:
# I got in a muddle here, so I copied and pasted the "headers.columns" output
# and added a couple of extra columns.
# This hard-coded list replaces whatever `headers` held before this cell.
headers = [
    'NADA', 'PLAYER_NAME', 'GP', 'MIN/G', '5i', 'PPG',
    '3FGM', '3FGA', '3FG%',
    '2FGM', '2FGA', '2FG%',
    'FTM', 'FTA', 'FT%',
    'DREB', 'OREB', 'TREB',
    'ASIST', 'ST', 'TO', 'BLK', 'RBLK', 'DUNKS',
    'PF', 'RPF', '+/-', 'VAL',
]

In [104]:
# Turn the scraped rows into a DataFrame labelled with the hard-coded headers,
# remove the per-team "Totales" summary rows, then discard the two columns
# that are not wanted ("NADA" and "5i").
acb_dataset = pd.DataFrame(stats_list, columns=headers)
index = acb_dataset.index[acb_dataset["PLAYER_NAME"] == "Totales"]
acb_dataset = acb_dataset.drop(index=index).drop(columns=["NADA", "5i"])

In [106]:
# Export the ACB table as CSV next to the notebook.
# NOTE(review): the index is written as well (to_csv default) and has gaps
# where the "Totales" rows were removed — consider index=False.
acb_dataset.to_csv(path_or_buf="./acb_dataset.csv")