# <h1 style="font-family: Trebuchet MS; padding: 12px; font-size: 25px; color: #2D2926; text-align: center; line-height: 0.75;background-color: #41dcf7"><b>Web Scraping with Python - Brasileirão 2022</b><br></h1>

### Description:

This project extracts match information and player goal data for the 2022 Campeonato Brasileiro Série A from the CBF website, using the Requests and BeautifulSoup libraries in Python.

### Links:

[SITE CBF - Campeonato Brasileiro 2022](https://www.cbf.com.br/futebol-brasileiro/competicoes/campeonato-brasileiro-serie-a/2022)

# <center><div style="font-family: Trebuchet MS; background-color: #41dcf7; color: #2D2926; padding: 12px; line-height: 1;">Web Scraping</div></center>

### Importing Libraries:

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Download the CBF competition page and parse it into a BeautifulSoup tree.
url = 'https://www.cbf.com.br/futebol-brasileiro/competicoes/campeonato-brasileiro-serie-a/2022'
# A timeout keeps the notebook from hanging indefinitely if the server never answers.
resp = requests.get(url, timeout=30)
# Fail fast on HTTP errors (4xx/5xx) instead of silently parsing an error page.
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

### Extracting Game Information:

In [3]:
# Useful CSS class strings in the page HTML
class_rodada = 'swiper-slide'
class_NumJogo = 'partida-desc text-1 color-lightgray p-b-15 block uppercase text-center'
class_GameInfo = 'partida-desc text-1 color-lightgray block uppercase text-center'
class_Score = 'bg-blue color-white label-2'
class_GameLink = 'btn btn-xs btn-success m-t-5'
TableName = soup.title.text

game_columns = ['Date', 'Time', 'GameRound', 'GameNumber', 'HomeTeam', 'AwayTeam',
                'HomeTeamScore', 'AwayTeamScore', 'Stadium', 'GameCity', 'GameLink']

# Extracting Game Information
# Rows are collected in a plain list and turned into a DataFrame once at the end;
# appending with df.loc[len(df)] inside the loop re-allocates the frame on every game.
game_rows = []
for rodada in soup.find_all(class_=class_rodada):
    Rodada = rodada.find('h3').text
    for jogo in rodada.find_all('li'):
        # Header line of the game card: looked up once and reused for
        # the game number, the date and the time.
        header_text = jogo.find(class_=class_NumJogo).text

        # Game Number: the token right after the literal 'Jogo:'
        list_jogo = header_text.split()
        Num_jogo = list_jogo[list_jogo.index('Jogo:') + 1]

        # Date / Time: second and last tokens of the text before the first '-'
        when_tokens = header_text.split('-')[0].split()
        Date = when_tokens[1]
        Time = when_tokens[-1]

        # Team names: first two <img> alt texts, keeping the part before ' - '
        team_imgs = jogo.find_all('img')
        HomeTeam = team_imgs[0]['alt'].split(' - ')[0]
        AwayTeam = team_imgs[1]['alt'].split(' - ')[0]

        # Match score: '<home>x<away>'; strip so no stray whitespace reaches the CSV
        score_parts = jogo.find(class_=class_Score).text.split('x')
        HomeTeamScore = score_parts[0].strip()
        AwayTeamScore = score_parts[1].strip()

        # Match Info: '<stadium> - <city> ...'
        venue_parts = jogo.find(class_=class_GameInfo).text.split(' - ')
        Stadium = venue_parts[0].strip()
        GameCity = venue_parts[1].strip()
        GameLink = jogo.find(class_=class_GameLink)['href']

        game_rows.append([Date, Time, Rodada, Num_jogo, HomeTeam, AwayTeam,
                          HomeTeamScore, AwayTeamScore, Stadium, GameCity, GameLink])

df_GameInfo = pd.DataFrame(game_rows, columns=game_columns)

df_GameInfo

Unnamed: 0,Date,Time,GameRound,GameNumber,HomeTeam,AwayTeam,HomeTeamScore,AwayTeamScore,Stadium,GameCity,GameLink
0,09/04/2022,16:30,Rodada 1,2,Fluminense,Santos,0,0,Maracanã,Rio de Janeiro,https://www.cbf.com.br/futebol-brasileiro/comp...
1,09/04/2022,19:00,Rodada 1,8,Atlético,Flamengo,1,1,Antônio Accioly,Goiania,https://www.cbf.com.br/futebol-brasileiro/comp...
2,09/04/2022,21:00,Rodada 1,4,Palmeiras,Ceará,2,3,Allianz Parque,Sao Paulo,https://www.cbf.com.br/futebol-brasileiro/comp...
3,10/04/2022,11:00,Rodada 1,10,Coritiba,Goiás,3,0,Couto Pereira,Curitiba,https://www.cbf.com.br/futebol-brasileiro/comp...
4,10/04/2022,16:00,Rodada 1,1,Atlético Mineiro,Internacional,2,0,Mineirão,Belo Horizonte,https://www.cbf.com.br/futebol-brasileiro/comp...
...,...,...,...,...,...,...,...,...,...,...,...
375,13/11/2022,16:00,Rodada 38,376,Internacional,Palmeiras,3,0,Beira-Rio,Porto Alegre,https://www.cbf.com.br/futebol-brasileiro/comp...
376,13/11/2022,16:00,Rodada 38,377,Ceará,Juventude,4,1,Arena Castelão,Fortaleza,https://www.cbf.com.br/futebol-brasileiro/comp...
377,13/11/2022,16:00,Rodada 38,378,Goiás,São Paulo,0,4,Hailé Pinheiro,Goiania,https://www.cbf.com.br/futebol-brasileiro/comp...
378,13/11/2022,16:00,Rodada 38,379,Cuiabá Saf,Coritiba,2,1,Arena Pantanal,Cuiaba,https://www.cbf.com.br/futebol-brasileiro/comp...


### Extracting Player Information:

In [4]:
# Extracting Player Information
# One record per table row; the DataFrame is built once after the loop instead of
# being grown (and its 'Gols' column re-cast) on every iteration.
player_rows = []

# The last <tr> of the table body is skipped ([:-1]).
# NOTE(review): presumably a non-player footer row -- confirm against the page.
for player in soup.find(class_='table border-body').tbody.find_all('tr')[:-1]:
    player_rows.append({
        'PlayerName': player.a.text,
        # Team label is the part of the image title before the first '-'
        'Team': player.img['title'].split('-')[0].strip(),
        'Gols': player.th.text,
        'PlayerLink': player.a['href'],
    })

df_PlayerInfo = pd.DataFrame(player_rows, columns=['PlayerName', 'Team', 'Gols', 'PlayerLink'])
# Goals are scraped as text; convert the whole column once.
df_PlayerInfo['Gols'] = df_PlayerInfo['Gols'].astype(int)

df_PlayerInfo

Unnamed: 0,PlayerName,Team,Gols,PlayerLink
0,German Ezequiel Cano,Fluminense,27,https://www.cbf.com.br/futebol-brasileiro/atle...
1,Pedro Raul Garay da Silva,Goiás,19,https://www.cbf.com.br/futebol-brasileiro/atle...
2,Jonathan Calleri,São Paulo,18,https://www.cbf.com.br/futebol-brasileiro/atle...
3,Guilherme Bissoli Campos,Avaí,14,https://www.cbf.com.br/futebol-brasileiro/atle...
4,Marcos Leonardo Santos Almeida,Santos,13,https://www.cbf.com.br/futebol-brasileiro/atle...
...,...,...,...,...
300,Vanderlan Barbosa da Silva,Palmeiras,1,https://www.cbf.com.br/futebol-brasileiro/atle...
301,Romulo Otero Vasquez,Fortaleza,1,https://www.cbf.com.br/futebol-brasileiro/atle...
302,Khellven Douglas Silva Oliveira,Athletico Paranaense,1,https://www.cbf.com.br/futebol-brasileiro/atle...
303,Mateus Ferreira,Avaí,1,https://www.cbf.com.br/futebol-brasileiro/atle...


# <center><div style="font-family: Trebuchet MS; background-color: #41dcf7; color: #2D2926; padding: 12px; line-height: 1;">Data Cleaning</div></center>

In [6]:
# Data Cleaning
# Map the scraped team labels to the names used in the final table.
# DataFrame/Series.replace with a dict matches whole cell values only, so
# 'Atlético Mineiro' is not touched by the 'Atlético' entry.
TEAM_RENAMES = {'Cuiabá Saf': 'Cuiabá', 'América Fc Saf': 'América-MG', 'Atlético': 'Atlético-GO'}
df_PlayerInfo['Team'] = df_PlayerInfo['Team'].replace(TEAM_RENAMES)
df_GameInfo[['HomeTeam', 'AwayTeam']] = df_GameInfo[['HomeTeam', 'AwayTeam']].replace(TEAM_RENAMES)

# Scores are scraped as text; make them numeric before comparing.
df_GameInfo[['HomeTeamScore', 'AwayTeamScore']] = df_GameInfo[['HomeTeamScore', 'AwayTeamScore']].astype(int)

# Transforming Score Columns: 3 points for a win, 1 for a draw, 0 for a defeat
is_draw = df_GameInfo['HomeTeamScore'] == df_GameInfo['AwayTeamScore']
df_GameInfo['HomeTeamPoints'] = np.where(df_GameInfo['HomeTeamScore'] > df_GameInfo['AwayTeamScore'], 3,
                                         np.where(is_draw, 1, 0))
df_GameInfo['AwayTeamPoints'] = np.where(df_GameInfo['AwayTeamScore'] > df_GameInfo['HomeTeamScore'], 3,
                                         np.where(is_draw, 1, 0))

# Total Score
result1 = df_GameInfo.groupby('HomeTeam')['HomeTeamPoints'].sum()
result2 = df_GameInfo.groupby('AwayTeam')['AwayTeamPoints'].sum()
# fill_value=0 keeps a team that appears on only one side from turning into NaN;
# the index and value names are set explicitly rather than relying on what
# .add() happens to preserve.
df_Game_sum = (
    result1.add(result2, fill_value=0)
    .astype(int)
    .rename_axis('TeamName')
    .rename('Score')
    .reset_index()
)

# Wins, Draws and Defeats
# unstack(fill_value=0): a team with no game in some category (e.g. no home defeat)
# gets 0 instead of NaN, which would otherwise propagate through the sum below.
VED_1 = df_GameInfo.groupby('HomeTeam')['HomeTeamPoints'].value_counts().unstack(fill_value=0)
VED_2 = df_GameInfo.groupby('AwayTeam')['AwayTeamPoints'].value_counts().unstack(fill_value=0)
df_VED = (
    VED_1.add(VED_2, fill_value=0)
    # Guarantee all three outcome columns exist even if one never occurred.
    .reindex(columns=[0, 1, 3], fill_value=0)
    # Cells missing on both sides are still NaN after add(); count them as 0.
    .fillna(0)
    .astype(int)
    .rename(columns={3: 'Wins', 1: 'Draws', 0: 'Defeats'})
    .rename_axis(index='HomeTeam')
    .reset_index()
)

df_Game_sum = df_Game_sum.merge(df_VED, left_on='TeamName', right_on='HomeTeam')
df_Game_sum = df_Game_sum.drop('HomeTeam', axis=1)
# Explicit column order instead of reversing the last three columns by position.
new_order = ['TeamName', 'Score', 'Wins', 'Draws', 'Defeats']
df_Game_sum = df_Game_sum[new_order]

# Gols by team
# NOTE(review): this sums the goals of the players listed in df_PlayerInfo, which may
# differ from the goals in the match scores (e.g. own goals) -- confirm this is intended.
Gols = df_PlayerInfo.groupby('Team')['Gols'].sum()
df_Game_sum = df_Game_sum.merge(Gols, left_on='TeamName', right_on='Team')
# Share of the maximum attainable points (38 games x 3 points), truncated to an integer.
df_Game_sum['%'] = (df_Game_sum['Score']/(38*3)*100).astype(int)
df_Game_sum = df_Game_sum.sort_values(['Score', 'Wins', 'Draws', 'Gols'], ascending=False, ignore_index=True)

# <center><div style="font-family: Trebuchet MS; background-color: #41dcf7; color: #2D2926; padding: 12px; line-height: 1;">Exporting Data</div></center>

In [None]:
# Exporting data
# Each frame is written to its own CSV file (relative path), without the index column.
for csv_name, frame in (
    ('BrasileiraoSerie_A_2022_GameInfo.csv', df_GameInfo),
    ('BrasileiraoSerie_A_2022_PlayerInfo.csv', df_PlayerInfo),
    ('BrasileiraoSerie_A_2022_Table.csv', df_Game_sum),
):
    frame.to_csv(csv_name, index=False)