# Scarica Storico Partite di Calcio

In [0]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [0]:
def get_matches(url):
    # DataFrame of matches
    df = pd.DataFrame()

    try:
        # Download the page
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        print(f'Scanning url:\n{url}')

        # Get tables and save as dataframes
        for table in soup.find_all(name='table', attrs={'class': 'table-2 table'}):
            tmp = pd.DataFrame(pd.read_html(table.prettify())[0])
            if 'Classifica' not in tmp.columns[0]:
                tmp = tmp.iloc[:, :-2]
                tmp.columns = ['date', 'team1', 'team2', 'score']
                df = pd.concat([df, tmp], axis=0, ignore_index=True)
    except:
        print('Url does not exist.')
        pass

    return df

## Serie A

In [3]:
df_matches_a = pd.DataFrame()

base_url = 'https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_'

for i in range(112, 130):
    url = base_url + f'{i}.html'
    new_matches = get_matches(url)
    df_matches_a = pd.concat([df_matches_a, new_matches], axis=0, ignore_index=True)

Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_112.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_113.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_114.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_115.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_116.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_117.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_118.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_119.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_120.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-A_71/anno_121.html
Scanning url:
https://www.statistichesulcalcio.com/campionat

In [4]:
df_matches_a['championship'] = 'A'
df_matches_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 5 columns):
date            3040 non-null object
team1           3040 non-null object
team2           3040 non-null object
score           2790 non-null object
championship    3040 non-null object
dtypes: object(5)
memory usage: 118.9+ KB


## Serie B

In [5]:
df_matches_b = pd.DataFrame()

base_url = 'https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_'

for i in range(112, 130):
    url = base_url + f'{i}.html'
    new_matches = get_matches(url)
    df_matches_b = pd.concat([df_matches_b, new_matches], axis=0, ignore_index=True)

Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_112.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_113.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_114.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_115.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_116.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_117.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_118.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_119.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_120.html
Scanning url:
https://www.statistichesulcalcio.com/campionati/Italia/Serie-B_72/anno_121.html
Scanning url:
https://www.statistichesulcalcio.com/campionat

In [6]:
df_matches_b['championship'] = 'B'
df_matches_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3494 entries, 0 to 3493
Data columns (total 5 columns):
date            3494 non-null object
team1           3494 non-null object
team2           3494 non-null object
score           3244 non-null object
championship    3494 non-null object
dtypes: object(5)
memory usage: 136.6+ KB


## Merge dei Dataset

In [0]:
df = pd.concat([df_matches_a, df_matches_b], axis=0, ignore_index=True)

In [0]:
df.date = pd.to_datetime(df.date)

In [0]:
df.sort_values(by=['date', 'championship'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [16]:
df.head()

Unnamed: 0,date,team1,team2,score,championship
0,2012-01-09,Torino,Pescara,3-0,A
1,2012-01-09,Bologna,Milan,1-3,A
2,2012-01-09,Novara,Empoli,2-2,B
3,2012-01-09,Ternana,Modena,0-1,B
4,2012-01-09,Ascoli,Bari,1-3,B


In [0]:
df.to_csv('serie_A_B_matches_2012_2020.csv', index=False)