## 1. Scrape soccer tournaments from the UOL website

### 1.1 Scrape a dataset with all matches of a soccer double round-robin tournament

In [35]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd
import re

def uol_matches(url):
    """Scrape the list of matches from a tournament page.

    The page at ``url`` is downloaded, the text of its first
    ``<ul class="rounds-content">`` element is extracted, and that text is
    cut into one fragment per match using the weekday abbreviations
    (``Sáb``, ``Dom``, ``Seg``, ...) and the ``pós-jogo`` marker as
    separators. Each fragment is then split into its fields.

    Parameters
    ----------
    url : str
        Address of the tournament page.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``Date``, ``Time``, ``Place``,
        ``Team1``, ``Team2``, ``Score1`` and ``Score2``. Scores are strings;
        a ``-`` score is stored as an empty string, and cells equal to
        ``*`` are blanked.

    Raises
    ------
    IndexError
        For example when the page has no ``rounds-content`` list.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Use the response as a context manager so the connection is always closed.
    with urlopen(req) as response:
        html = response.read()
    page = soup(html, 'html.parser')
    containers = page.findAll('ul', class_='rounds-content')
    tag = containers[0].text

    # NOTE(review): presumably this marker introduces matches whose date and
    # venue are still undefined ("A definir") -- confirm against the page.
    # Those entries are rewritten with '*' placeholders so that they split
    # into the same number of fields as regular entries.
    undefined_marker = 'encerrado           A definir'
    if re.search(undefined_marker, tag) is not None:
        part1, part2 = re.split(undefined_marker, tag)[:2]
        part3 = []
        for part in re.split('A definir', part2):
            pieces = part.split('  ')
            part3.append(pieces[2] + ' - * - * - ' + pieces[1])
        tag = part1 + "            ".join(part3)

    # 'Domus' contains the separator 'Dom'; upper-case it so the split below
    # does not cut the text in the middle of that word.
    tag = re.sub('Domus', 'DOMUS', tag)
    games = re.split('Sáb|Dom|Seg|Ter|Qua|Qui|Sex|pós-jogo', tag)
    games = [g.replace(',', ' ').strip() for g in games if g.strip() != '']

    # A fragment without a leading "dd/mm - hhh" stamp borrows the stamp of
    # the fragment that follows it. The last fragment has no successor, so it
    # is left untouched (the original code hit an unbound variable here when
    # the list had a single, unstamped fragment).
    stamp = re.compile(r'^\d\d/\d\d\s-\s\d\dh\d*')
    for i, game in enumerate(games):
        if stamp.search(game) is not None:
            continue
        following = stamp.search(games[i + 1]) if i + 1 < len(games) else None
        if following is not None:
            games[i] = following[0] + ' - ' + game

    # Keep only fragments with at least three '-'-separated parts and turn
    # runs of 3 to 9 whitespace characters into the ' - ' field separator.
    games = [re.sub(r'\s{3,9}', ' - ', g).strip() for g in games if len(g.split('-')) > 2]
    games = [re.split(r' - |\(\)', g) for g in games]
    df = pd.DataFrame(games).iloc[:, :5]
    df.columns = ['Date', 'Time', 'Place', 'Team1', 'Team2']
    df[['Score1', 'Score2']] = ''

    for i in range(len(df)):
        # Drop the first three characters of Team1 and the last three of
        # Team2 (presumably the team abbreviation -- TODO confirm); the score
        # is then the trailing part of Team1 and the leading part of Team2.
        txt1 = df.loc[i, 'Team1'].strip()[3:]
        score1 = re.findall(r'\d$|-$', txt1)[0]
        if score1 == '-':
            score1 = ''
        txt2 = df.loc[i, 'Team2'].strip()[:-3]
        score2 = re.findall(r'^\d+|^-', txt2)[0]
        if score2 == '-':
            score2 = ''
        df.loc[i, 'Team1'] = re.split(r'\d+|-$', txt1)[0]
        df.loc[i, 'Team2'] = re.split(r'^\d+|^-', txt2)[1]
        df.loc[i, 'Score1'] = score1
        df.loc[i, 'Score2'] = score2
    # Blank the '*' placeholders inserted for undefined entries.
    df.replace('*', '', inplace=True)
    return df

### 1.2 Scrape a dataset with the ranking of a soccer double round-robin tournament

In [36]:
def uol_ranking(url):
    """Scrape the ranking table from a tournament page.

    The first two HTML tables found at ``url`` are placed side by side. The
    ``classificação`` column is renamed to ``Ranking`` and split at the first
    ``°`` into the position (``Ranking``) and the team name (``Team``); a
    trailing run of three capital letters is removed from the team name.

    Parameters
    ----------
    url : str
        Address of the tournament page.

    Returns
    -------
    pandas.DataFrame
        The combined table with ``Ranking`` as first column, ``Team`` as
        second column, followed by the remaining scraped columns.

    Raises
    ------
    IndexError
        If fewer than two tables are found on the page.
    """
    tables = pd.read_html(url)
    df = pd.concat([tables[0], tables[1]], axis=1)
    df = df.rename(columns={'classificação': 'Ranking'})
    # `n` must be given by keyword: pandas >= 2.0 accepts only `pat`
    # positionally and raises TypeError for `str.split('°', 1, ...)`.
    df[['Ranking', 'Team']] = df['Ranking'].str.split('°', n=1, expand=True)
    df['Team'] = df['Team'].apply(lambda x: re.split(r'[A-Z]{3}$', x)[0].strip())
    # 'Team' was appended last; move it right after the first column.
    cols = df.columns.tolist()
    df = df[[cols[0], cols[-1]] + cols[1:-1]]
    return df

### 1.3 Show part of the match and ranking datasets for different leagues

In [50]:
# Tournament pages to preview, one URL per league.
leagues = [
    'https://www.uol.com.br/esporte/futebol/campeonatos/brasileirao',
    'https://www.uol.com.br/esporte/futebol/campeonatos/serie-b',
    'https://www.uol.com.br/esporte/futebol/campeonatos/eliminatorias-sul-americanas',
    'https://www.uol.com.br/esporte/futebol/campeonatos/frances',
    'https://www.uol.com.br/esporte/futebol/campeonatos/alemao',
    'https://www.uol.com.br/esporte/futebol/campeonatos/la-liga',
    'https://www.uol.com.br/esporte/futebol/campeonatos/italiano',
    'https://www.uol.com.br/esporte/futebol/campeonatos/ingles',
]

# For every league print its name (last URL segment, capitalized) and show
# the first four rows of the ranking and of the match list.
for league in leagues:
    league_name = league.split('/')[-1].capitalize()
    print('League:', league_name)
    ranking_preview = uol_ranking(league).head(4)
    display(ranking_preview)
    matches_preview = uol_matches(league).head(4)
    display(matches_preview)
    print()

League: Brasileirao


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,Atlético-MG,71,32,22,5,5,22,51,29,74
1,2,Flamengo,63,32,19,6,7,28,62,34,66
2,3,Palmeiras,58,33,18,4,11,39,52,13,59
3,4,Red Bull Bragantino,52,34,13,13,8,40,50,10,51


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,29/05,19h,Arena Pantanal,Cuiabá,Juventude,2,2
1,29/05,20h,Pituaçu,Bahia,Santos,3,0
2,29/05,21h,Morumbi,São Paulo,Fluminense,0,0
3,30/05,11h,Mineirão,Atlético-MG,Fortaleza,1,2



League: Serie-b


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,Botafogo,66,36,19,9,8,29,53,24,61
1,2,Coritiba,64,36,18,10,8,31,47,16,59
2,3,Goiás,61,36,16,13,7,29,44,15,56
3,4,Guarani,59,36,16,11,9,37,52,15,55


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,28/05,16h,Bento Freitas,Brasil de Pelotas,Londrina,0,0
1,28/05,19h,Brinco de Ouro,Guarani,Vitória,1,1
2,28/05,21h30,OBA,Vila Nova,Botafogo,1,1
3,28/05,21h30,Aflitos,Náutico,CSA,1,0



League: Eliminatorias-sul-americanas


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,Brasil,35,13,11,2,0,4,27,23,90
1,2,Argentina,29,13,8,5,0,6,20,14,74
2,3,Equador,23,14,7,2,5,13,23,10,55
3,4,Colômbia,17,14,3,8,3,17,16,-1,40


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,08/10,19h30,Defensores del Chaco,Paraguai,Peru,2,2
1,08/10,19h45,Centenário (URU),Uruguai,Chile,2,1
2,08/10,21h30,La Bombonera,Argentina,Equador,1,0
3,09/10,20h30,Metropolitano Roberto Meléndez,Colômbia,Venezuela,3,0



League: Frances


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,PSG,34,13,11,1,1,13,29,16,87
1,2,Nice,24,13,7,3,3,10,23,13,62
2,3,Lens,24,13,7,3,3,14,25,11,62
3,4,Olympique,23,13,6,5,2,12,20,8,59


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,06/08,16h,Louis II,Monaco,Nantes,1,1
1,07/08,12h,Parc Olympique Lyonnais,Lyon,Brest,1,1
2,07/08,16h,De l'Aube,Troyes,PSG,1,2
3,08/08,08h,Roazhon Park,Rennes,Lens,1,1



League: Alemao


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,Bayern de Munique,28,12,9,1,2,13,41,28,78
1,2,Borussia Dortmund,24,11,8,0,3,17,28,11,73
2,3,Freiburg,22,11,6,4,1,9,18,9,67
3,4,Wolfsburg,19,11,6,1,4,12,12,0,58


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,13/08,15h30,Borussia Park,Borussia M'gladbach,Bayern de Munique,1,1
1,14/08,10h30,Volkswagen Arena,Wolfsburg,Bochum,1,0
2,14/08,10h30,An der Alten Försterei,Union Berlin,Bayer Leverkusen,1,1
3,14/08,10h30,Mercedes-Benz Arena,Stuttgart,Greuther Fürth,5,1



League: La-liga


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,Real Sociedad,28,13,8,4,1,10,19,9,72
1,2,Real Madrid,27,12,8,3,1,13,28,15,75
2,3,Sevilla,27,12,8,3,1,7,21,14,75
3,4,Atlético de Madri,23,12,6,5,1,13,21,8,64


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,13/08,16h,Mestalla,Valencia,Getafe,1,0
1,14/08,14h30,Visit Mallorca Estadi,Mallorca,Betis,1,1
2,14/08,14h30,Nuevo Mirandilla,Cádiz,Levante,1,1
3,14/08,17h,Mendizorroza,Alavés,Real Madrid,1,4



League: Italiano


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,Napoli,32,12,10,2,0,4,24,20,89
1,2,Milan,32,12,10,2,0,11,26,15,89
2,3,Internazionale,25,12,7,4,1,13,29,16,69
3,4,Atalanta,22,12,6,4,2,15,22,7,61


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,21/08,13h30,M.A. Bentegodi,Hellas Verona,Sassuolo,2,3
1,21/08,13h30,San Siro,Internazionale,Genoa,4,0
2,21/08,15h45,Carlo Castellani,Empoli,Lazio,1,3
3,21/08,15h45,Olímpico Grande Torino,Torino,Atalanta,1,2



League: Ingles


Unnamed: 0,Ranking,Team,PG,J,V,E,D,GC,GP,SG,%
0,1,Chelsea,26,11,8,2,1,4,27,23,79
1,2,Manchester City,23,11,7,2,2,6,22,16,70
2,3,West Ham,23,11,7,2,2,13,23,10,70
3,4,Liverpool,22,11,6,4,1,11,31,20,67


Unnamed: 0,Date,Time,Place,Team1,Team2,Score1,Score2
0,13/08,16h,Brentford Community Stadium,Brentford,Arsenal,2,0
1,14/08,08h30,Old Trafford,Manchester United,Leeds United,5,1
2,14/08,11h,Turf Moor,Burnley,Brighton,1,2
3,14/08,11h,Stamford Bridge,Chelsea,Crystal Palace,3,0



