### Imports básicos

In [100]:
import requests
from bs4 import BeautifulSoup
import math
import random
from nltk.tokenize import word_tokenize
import string

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from datetime import datetime
import time

In [3]:
today = datetime.today().strftime('%Y-%m-%d')

In [4]:
year = today[0:4]
month = today[5:7]
day = int(today[8:10]) -1

In [5]:
print(year)
print(month)
print(day)

2024
11
0


# Explorando os dados

## Funções para o Download das informações

In [6]:
#Função para armazenar os jogos do dia
def get_games(d, m, y, url_list):
    
    #adicionando um timer para não sobrecarregar o site
    time.sleep(random.randint(8, 12))
    
    #Definindo a url do dia
    url = f"https://www.basketball-reference.com/boxscores/?month={m}&day={d}&year={y}"
    
    #Download do site
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    #Lista de jogos do dia
    games = soup.find('div', id='content').find('div', class_='game_summaries'
                        ).find_all('div', class_='game_summary expanded nohover')
    
    #Adicionando Urls do play_by_play
    for i in games:
        url_list.append(i.find('p', class_='links').find_all('a')[1].attrs['href'])
    

In [7]:
#Função para coletar os dados de cada jogo para a tabelas de jogos
def get_game_infos(d, m, y, games_dict, url):
    
    #adicionando um timer para não sobrecarregar o site
    time.sleep(random.randint(8, 12))
    
    #Fazendo Download do site
    pbp_url = f'https://www.basketball-reference.com{url}'
    response_pbp = requests.get(pbp_url)
    soup_pbp = BeautifulSoup(response_pbp.content, "html.parser")
    
    #Getting dict keys
    dict_keys = list(games_dict)
    
    #Adicionando chave e infos do dias
    games_dict[dict_keys[0]].append(url)
    games_dict[dict_keys[1]].append(d)
    games_dict[dict_keys[2]].append(m)
    games_dict[dict_keys[3]].append(y)    
    
    #Adicionando infos do away team
    away_team_name = soup_pbp.find('div', class_="scorebox").find_all('div')[0].find('strong').find('a').text
    games_dict[dict_keys[4]].append(away_team_name)
    away_team_score = soup_pbp.find('div', class_="scorebox").find_all('div')[0].find('div', class_ = 'score').text
    games_dict[dict_keys[5]].append(away_team_score)
    away_team_key = soup_pbp.find('div', class_="scorebox").find_all('div')[0].find('strong').find('a').attrs['href']
    games_dict[dict_keys[6]].append(away_team_key)
    
    #Adicionando infos do home team
    home_team_name = soup_pbp.find('div', class_="scorebox").find_all('div')[7].find('strong').find('a').text
    games_dict[dict_keys[8]].append(home_team_name)
    home_team_score = soup_pbp.find('div', class_="scorebox").find_all('div')[7].find('div', class_ = 'score').text
    games_dict[dict_keys[9]].append(home_team_score)
    home_team_key = soup_pbp.find('div', class_="scorebox").find_all('div')[7].find('strong').find('a').attrs['href']
    games_dict[dict_keys[10]].append(home_team_key)
    
    #Calculando o Ganhador
    if int(away_team_score) > int(home_team_score):
        games_dict[dict_keys[7]].append('W')
        games_dict[dict_keys[11]].append('L')
    else:
        games_dict[dict_keys[11]].append('W')
        games_dict[dict_keys[7]].append('L')    
        
    #Coletando o play by play
    plays = soup_pbp.find('table').find_all('tr')   
    pbp = get_pbp(plays, url, away_team_name, home_team_name)
    games_dict[dict_keys[12]].append(pbp)

In [8]:
# Função para coletar o play by play
def get_pbp(pbp_table, key, at, ht):

    #Dicionários com as info básicas de cada jogo
    dict_pbp = {
    'key' : [],
    'quarter' : [],
    'time' : [],
    'score' : [],
    'team_play' : [],
    'play': [],
    }
    
    #Apoio para calcular o quarto
    q = 1
    
    #Loop para fazer o download das jogadas
    for i in pbp_table[3:]:
    
        #Checar primeiro se uma linha da tabela com jogadas
        if len(i.find_all('td')) == 6:
            dict_pbp['key'].append(key)
            dict_pbp['quarter'].append(f"Q{q}")
            dict_pbp['time'].append(i.find_all('td')[0].text)
            dict_pbp['score'].append(i.find_all('td')[3].text)
          
            #Checando se a jogada foi do time visitante ou do time da casa
            if len(i.find_all('td')[1].text) > 2:
                dict_pbp['team_play'].append(at)
                dict_pbp['play'].append(i.find_all('td')[1].text)            
            
            elif len(i.find_all('td')[5].text) > 2:
                dict_pbp['team_play'].append(ht)
                dict_pbp['play'].append(i.find_all('td')[5].text)   
            
        #Bucando as linhas da tabela que indicam a mudança de quarto
        elif len(i.find_all('th')) == 1:
            q += 1
    
    return dict_pbp
    

In [9]:
games_table = {
    'key' : [],
    'day' : [],
    'month' : [],
    'year' : [],
    'away_team' : [],
    'away_team_score' : [],
    'away_team_key' : [],
    'away_team_W/L': [],
    'home_team' : [],
    'home_team_score' : [],
    'home_team_key' : [],
    'home_team_W/L': [],
    'game_pbp' : []
}

## Checando as DFs para ver se o downaload foi eficaz

### Coletando os dados

In [10]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='30', m='10', y='2024', url_list=today_urls)

In [11]:
for i in today_urls:
    get_game_infos(d='30', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410300CHI.html
/boxscores/pbp/202410300CHO.html
/boxscores/pbp/202410300CLE.html
/boxscores/pbp/202410300GSW.html
/boxscores/pbp/202410300IND.html
/boxscores/pbp/202410300LAC.html
/boxscores/pbp/202410300MEM.html
/boxscores/pbp/202410300MIA.html
/boxscores/pbp/202410300OKC.html
/boxscores/pbp/202410300PHI.html
/boxscores/pbp/202410300WAS.html


In [12]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='29', m='10', y='2024', url_list=today_urls)

In [13]:
for i in today_urls:
    get_game_infos(d='29', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410290BRK.html
/boxscores/pbp/202410290GSW.html
/boxscores/pbp/202410290MIN.html
/boxscores/pbp/202410290UTA.html


In [14]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='28', m='10', y='2024', url_list=today_urls)

In [15]:
for i in today_urls:
    get_game_infos(d='28', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410280ATL.html
/boxscores/pbp/202410280BOS.html
/boxscores/pbp/202410280DAL.html
/boxscores/pbp/202410280MEM.html
/boxscores/pbp/202410280MIA.html
/boxscores/pbp/202410280NYK.html
/boxscores/pbp/202410280ORL.html
/boxscores/pbp/202410280PHO.html
/boxscores/pbp/202410280SAC.html
/boxscores/pbp/202410280SAS.html
/boxscores/pbp/202410280TOR.html


In [16]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='27', m='10', y='2024', url_list=today_urls)

In [17]:
for i in today_urls:
    get_game_infos(d='27', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410270BRK.html
/boxscores/pbp/202410270GSW.html
/boxscores/pbp/202410270IND.html
/boxscores/pbp/202410270OKC.html
/boxscores/pbp/202410270POR.html


In [18]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='26', m='10', y='2024', url_list=today_urls)

In [19]:
for i in today_urls:
    get_game_infos(d='26', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410260CHI.html
/boxscores/pbp/202410260CHO.html
/boxscores/pbp/202410260DEN.html
/boxscores/pbp/202410260DET.html
/boxscores/pbp/202410260LAL.html
/boxscores/pbp/202410260MEM.html
/boxscores/pbp/202410260MIN.html
/boxscores/pbp/202410260PHO.html
/boxscores/pbp/202410260SAS.html
/boxscores/pbp/202410260WAS.html


In [20]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='25', m='10', y='2024', url_list=today_urls)

In [21]:
for i in today_urls:
    get_game_infos(d='25', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410250ATL.html
/boxscores/pbp/202410250CLE.html
/boxscores/pbp/202410250HOU.html
/boxscores/pbp/202410250LAL.html
/boxscores/pbp/202410250MIL.html
/boxscores/pbp/202410250NYK.html
/boxscores/pbp/202410250ORL.html
/boxscores/pbp/202410250POR.html
/boxscores/pbp/202410250TOR.html
/boxscores/pbp/202410250UTA.html


In [22]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='24', m='10', y='2024', url_list=today_urls)

In [23]:
for i in today_urls:
    get_game_infos(d='24', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410240DAL.html
/boxscores/pbp/202410240DEN.html
/boxscores/pbp/202410240SAC.html
/boxscores/pbp/202410240WAS.html


In [24]:
#Usando um dia travado para não comprometer as análises futuras
today_urls = []
get_games(d='23', m='10', y='2024', url_list=today_urls)

In [25]:
for i in today_urls:
    get_game_infos(d='23', m='10', y='2024', games_dict=games_table, url=i)
    print(i)

/boxscores/pbp/202410230ATL.html
/boxscores/pbp/202410230DET.html
/boxscores/pbp/202410230HOU.html
/boxscores/pbp/202410230LAC.html
/boxscores/pbp/202410230MIA.html
/boxscores/pbp/202410230NOP.html
/boxscores/pbp/202410230PHI.html
/boxscores/pbp/202410230POR.html
/boxscores/pbp/202410230TOR.html
/boxscores/pbp/202410230UTA.html


### Checando estrutura da tabela de jogos

In [26]:
# Criando uma tabela apenas com so resultados dos jogos
table = games_table.copy()
table.pop('game_pbp')
games_df = pd.DataFrame(table)
games_df

Unnamed: 0,key,day,month,year,away_team,away_team_score,away_team_key,away_team_W/L,home_team,home_team_score,home_team_key,home_team_W/L
0,/boxscores/pbp/202410300CHI.html,30,10,2024,Orlando Magic,99,/teams/ORL/2025.html,L,Chicago Bulls,102,/teams/CHI/2025.html,W
1,/boxscores/pbp/202410300CHO.html,30,10,2024,Toronto Raptors,133,/teams/TOR/2025.html,L,Charlotte Hornets,138,/teams/CHO/2025.html,W
2,/boxscores/pbp/202410300CLE.html,30,10,2024,Los Angeles Lakers,110,/teams/LAL/2025.html,L,Cleveland Cavaliers,134,/teams/CLE/2025.html,W
3,/boxscores/pbp/202410300GSW.html,30,10,2024,New Orleans Pelicans,89,/teams/NOP/2025.html,L,Golden State Warriors,104,/teams/GSW/2025.html,W
4,/boxscores/pbp/202410300IND.html,30,10,2024,Boston Celtics,132,/teams/BOS/2025.html,L,Indiana Pacers,135,/teams/IND/2025.html,W
...,...,...,...,...,...,...,...,...,...,...,...,...
60,/boxscores/pbp/202410230NOP.html,23,10,2024,Chicago Bulls,111,/teams/CHI/2025.html,L,New Orleans Pelicans,123,/teams/NOP/2025.html,W
61,/boxscores/pbp/202410230PHI.html,23,10,2024,Milwaukee Bucks,124,/teams/MIL/2025.html,W,Philadelphia 76ers,109,/teams/PHI/2025.html,L
62,/boxscores/pbp/202410230POR.html,23,10,2024,Golden State Warriors,140,/teams/GSW/2025.html,W,Portland Trail Blazers,104,/teams/POR/2025.html,L
63,/boxscores/pbp/202410230TOR.html,23,10,2024,Cleveland Cavaliers,136,/teams/CLE/2025.html,W,Toronto Raptors,106,/teams/TOR/2025.html,L


In [104]:
games_df.to_csv('games.csv')

Checando A estrutura do Play - by - Play

In [27]:
# Criando um dataframe 
pbp_df = pd.DataFrame(games_table['game_pbp'][0])
for i in games_table['game_pbp'][1:]:
    temp_df = pd.DataFrame(i)
    pbp_df = pd.concat([pbp_df, temp_df])

pbp_df

Unnamed: 0,key,quarter,time,score,team_play,play
0,/boxscores/pbp/202410300CHI.html,Q1,11:44.0,3-0,Orlando Magic,K. Caldwell-Pope makes 3-pt jump shot from 26 ...
1,/boxscores/pbp/202410300CHI.html,Q1,11:27.0,3-2,Chicago Bulls,J. Giddey makes 2-pt jump shot from 20 ft
2,/boxscores/pbp/202410300CHI.html,Q1,11:15.0,3-2,Orlando Magic,P. Banchero misses 2-pt jump shot from 6 ft
3,/boxscores/pbp/202410300CHI.html,Q1,11:12.0,3-2,Chicago Bulls,Defensive rebound by J. Giddey
4,/boxscores/pbp/202410300CHI.html,Q1,11:01.0,3-2,Chicago Bulls,P. Williams misses 2-pt layup from 2 ft
...,...,...,...,...,...,...
531,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,125-121,Memphis Grizzlies,D. Bane makes free throw 1 of 2
532,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Memphis Grizzlies,D. Bane makes free throw 2 of 2
533,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Utah Jazz,Utah full timeout
534,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Utah Jazz,C. Sexton enters the game for W. Kessler


In [99]:
pbp_df.to_csv('PBP.csv')

Misturando as duas dfs

In [28]:
merge_df = pbp_df.merge(games_df[['key', 'away_team', 'home_team']], on='key')
merge_df

Unnamed: 0,key,quarter,time,score,team_play,play,away_team,home_team
0,/boxscores/pbp/202410300CHI.html,Q1,11:44.0,3-0,Orlando Magic,K. Caldwell-Pope makes 3-pt jump shot from 26 ...,Orlando Magic,Chicago Bulls
1,/boxscores/pbp/202410300CHI.html,Q1,11:27.0,3-2,Chicago Bulls,J. Giddey makes 2-pt jump shot from 20 ft,Orlando Magic,Chicago Bulls
2,/boxscores/pbp/202410300CHI.html,Q1,11:15.0,3-2,Orlando Magic,P. Banchero misses 2-pt jump shot from 6 ft,Orlando Magic,Chicago Bulls
3,/boxscores/pbp/202410300CHI.html,Q1,11:12.0,3-2,Chicago Bulls,Defensive rebound by J. Giddey,Orlando Magic,Chicago Bulls
4,/boxscores/pbp/202410300CHI.html,Q1,11:01.0,3-2,Chicago Bulls,P. Williams misses 2-pt layup from 2 ft,Orlando Magic,Chicago Bulls
...,...,...,...,...,...,...,...,...
31152,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,125-121,Memphis Grizzlies,D. Bane makes free throw 1 of 2,Memphis Grizzlies,Utah Jazz
31153,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Memphis Grizzlies,D. Bane makes free throw 2 of 2,Memphis Grizzlies,Utah Jazz
31154,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Utah Jazz,Utah full timeout,Memphis Grizzlies,Utah Jazz
31155,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Utah Jazz,C. Sexton enters the game for W. Kessler,Memphis Grizzlies,Utah Jazz


Função para Pré Processar os dados

In [29]:
# Função para limpar os nomes
def processing_text(text):
    
    text = text.replace('(', '').replace(')', '').replace(".",'').replace(".."
                    , '').replace(";", '').lower().replace('da silva', 'dasilva')
    
    text = word_tokenize(text)
    
    return text

In [30]:
merge_df['clean_play'] = merge_df['play'].apply(processing_text)
merge_df

Unnamed: 0,key,quarter,time,score,team_play,play,away_team,home_team,clean_play
0,/boxscores/pbp/202410300CHI.html,Q1,11:44.0,3-0,Orlando Magic,K. Caldwell-Pope makes 3-pt jump shot from 26 ...,Orlando Magic,Chicago Bulls,"[k, caldwell-pope, makes, 3-pt, jump, shot, fr..."
1,/boxscores/pbp/202410300CHI.html,Q1,11:27.0,3-2,Chicago Bulls,J. Giddey makes 2-pt jump shot from 20 ft,Orlando Magic,Chicago Bulls,"[j, giddey, makes, 2-pt, jump, shot, from, 20,..."
2,/boxscores/pbp/202410300CHI.html,Q1,11:15.0,3-2,Orlando Magic,P. Banchero misses 2-pt jump shot from 6 ft,Orlando Magic,Chicago Bulls,"[p, banchero, misses, 2-pt, jump, shot, from, ..."
3,/boxscores/pbp/202410300CHI.html,Q1,11:12.0,3-2,Chicago Bulls,Defensive rebound by J. Giddey,Orlando Magic,Chicago Bulls,"[defensive, rebound, by, j, giddey]"
4,/boxscores/pbp/202410300CHI.html,Q1,11:01.0,3-2,Chicago Bulls,P. Williams misses 2-pt layup from 2 ft,Orlando Magic,Chicago Bulls,"[p, williams, misses, 2-pt, layup, from, 2, ft]"
...,...,...,...,...,...,...,...,...,...
31152,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,125-121,Memphis Grizzlies,D. Bane makes free throw 1 of 2,Memphis Grizzlies,Utah Jazz,"[d, bane, makes, free, throw, 1, of, 2]"
31153,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Memphis Grizzlies,D. Bane makes free throw 2 of 2,Memphis Grizzlies,Utah Jazz,"[d, bane, makes, free, throw, 2, of, 2]"
31154,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Utah Jazz,Utah full timeout,Memphis Grizzlies,Utah Jazz,"[utah, full, timeout]"
31155,/boxscores/pbp/202410230UTA.html,Q4,0:06.0,126-121,Utah Jazz,C. Sexton enters the game for W. Kessler,Memphis Grizzlies,Utah Jazz,"[c, sexton, enters, the, game, for, w, kessler]"


In [101]:
merge_df.to_csv('merge_df.csv')

In [103]:
dict_df = pd.DataFrame(games_table)
dict_df.to_csv('dict_df.csv')

In [110]:
pbp_url = f'https://www.basketball-reference.com{today_urls[0]}'
response_pbp = requests.get(pbp_url)
soup_pbp = BeautifulSoup(response_pbp.content, "html.parser")
plays = soup_pbp.find('table').find_all('tr')   


<tr>
<td>11:41.0</td>
<td> </td><td> </td><td class="center">0-3</td><td class="bbr-play-score">+3</td><td class="bbr-play-score"><a href="/players/h/huntede01.html">D. Hunter</a> makes 3-pt jump shot from 25 ft</td>
</tr>

In [142]:
plays[8].find_all('a')[0].attrs['href']

'/players/j/johnsca02.html'