In [0]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

In [0]:
def _numeric_if_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric columns (e.g. team names) are deliberately left as they are.
        return column


def get_table(current_round, max_rounds, league, season, flag):
    """Scrape one round's standings from worldfootball.net and add derived metrics.

    Parameters
    ----------
    current_round : int
        Round number appended to the schedule URL. Note that the returned
        ``current_round`` column is taken from the page's ``M.`` column, not
        from this argument.
    max_rounds : int
        Number of rounds in the season; stored in ``max_rounds`` and used to
        compute ``rounds_left``.
    league : str
        League slug used in the URL and stored in the ``league`` column.
    season : str
        Season slug used in the URL and stored in the ``season`` column.
    flag : str
        URL fragment placed between the season and the round number
        (e.g. ``'spieltag/'``).

    Returns
    -------
    pandas.DataFrame
        One row per team, restricted to the columns listed in ``columns`` below.
    """
    url = f'https://www.worldfootball.net/schedule/{league}-{season}-{flag}{current_round}'
    # The standings are read from the fourth table found on the page.
    df = pd.read_html(url, header=0)[3]
    df = df.drop(['Team'], axis=1)
    df = df.rename(index=str, columns={'#': 'pos', 'Team.1': 'team', 'M.': 'current_round', 'W': 'win',
                                       'D': 'draw', 'L': 'loss', 'goals': 'goals',
                                       'Dif.': 'goals_diff', 'Pt.': 'points_for'})

    # 'goals' arrives as 'for:against'; split it into two separate columns.
    goals = df['goals'].str.split(':', n=1, expand=True)
    df['goals_for'] = goals[0]
    df['goals_against'] = goals[1]
    df = df.drop(columns=['goals'])
    df = df.apply(_numeric_if_possible)

    # Position is derived from the row order of the scraped table.
    df['pos'] = pd.to_numeric(df.index) + 1
    df['max_rounds'] = max_rounds
    df['rounds_left'] = df['max_rounds'] - df['current_round']
    df['possible_points'] = df['current_round'] * 3

    # Share of possible points won, in percent. A division by zero that yields
    # +inf falls back to the raw points; 0/0 (NaN) becomes 0.
    performance = np.round((df['points_for'] / df['possible_points']) * 100, 2)
    df['performance'] = performance.mask(performance == np.inf, df['points_for']).fillna(0)

    df['total_possible_points'] = df['points_for'] + (df['rounds_left'] * 3)

    # Goals scored per goal conceded. With zero goals conceded the ratio is
    # +inf and falls back to goals scored; 0/0 (NaN) becomes 0.
    ratio = df['goals_for'] / df['goals_against']
    ratio = ratio.mask(ratio == np.inf, df['goals_for']).fillna(0)
    df['goals_for_against_ratio'] = np.round(ratio, 3)

    df['league'] = league
    df['season'] = season
    columns = ['pos', 'team', 'current_round', 'max_rounds', 'rounds_left', 'win', 'loss', 'draw',
               'goals_for', 'goals_against', 'goals_diff', 'goals_for_against_ratio', 'points_for',
               'possible_points', 'total_possible_points', 'performance', 'league', 'season']
    return df[columns]

In [0]:
def get_league(max_rounds, league, season, flag):
    """Collect the standings of every round of one season into a single frame.

    Rounds ``1..max_rounds`` are fetched through ``get_table`` on a joblib
    thread pool; the per-round tables are then stacked under a fresh index.
    """
    pool = Parallel(n_jobs=38, verbose=0, backend='threading')
    round_jobs = (
        delayed(get_table)(round_number, max_rounds, league, season, flag)
        for round_number in range(1, max_rounds + 1)
    )
    per_round_tables = pool(round_jobs)
    return pd.concat(per_round_tables, ignore_index=True)

In [0]:
def increment_season(season):
    """Return the season label that follows ``season``.

    A single-year label such as ``'2006'`` becomes ``'2007'``; a split label
    such as ``'2018-2019'`` has both years advanced, giving ``'2019-2020'``.
    """
    if '-' not in season:
        return str(int(season) + 1)

    # Only the first two dash-separated parts are used.
    start_year, end_year = season.split('-')[:2]
    return f'{int(start_year) + 1}-{int(end_year) + 1}'

In [0]:
def _season_start_year(season):
    """Return the first year of a season label (``'2006'`` or ``'2006-2007'``) as an int."""
    return int(season.split('-')[0])


def get_all_leagues(league_dict):
    """Scrape every season of every configured league and stack the results.

    Parameters
    ----------
    league_dict : dict
        Maps a league slug to ``[first_season, max_rounds]``. The mapping is
        only read; it is not modified, so the call can be repeated with the
        same configuration.

    Returns
    -------
    pandas.DataFrame
        All per-season tables returned by ``get_league`` concatenated under a
        fresh index, or an empty frame when no season needed scraping.
    """
    # These leagues use single-year season labels and stop before '2019';
    # every other league stops before '2019-2020'.
    calendar_year_leagues = ('bra-serie-a', 'bra-serie-b')

    season_tables = []
    for league in league_dict:
        season = league_dict[league][0]
        max_rounds = league_dict[league][1]
        stop_season = '2019' if league in calendar_year_leagues else '2019-2020'
        stop_year = _season_start_year(stop_season)

        # Comparing years (rather than testing `!=`) guarantees termination even
        # when the configured first season is already at or past the stop season.
        while _season_start_year(season) < stop_year:
            # This one league/season is requested with a different URL fragment.
            if league == 'esp-primera-division' and season == '2016-2017':
                flag = 'spieltag_2/'
            else:
                flag = 'spieltag/'
            print(league)
            print(season)
            season_tables.append(get_league(max_rounds, league, season, flag))
            season = increment_season(season)
            print('DONE!')

    if not season_tables:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(season_tables, ignore_index=True)

In [0]:
def add_final_points(df):
    """Attach each team's end-of-season points to every per-round row.

    For every (league, season) pair the rows of the highest ``current_round``
    are looked up and their ``points_for`` is merged back onto all rows of that
    season, by team, as ``final_points``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs at least the columns ``league``, ``season``, ``team``,
        ``current_round``, ``rounds_left``, ``loss`` and ``points_for``.
        The frame is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        The per-season frames concatenated, with an extra ``final_points``
        column. Index labels are strings and restart for every season.
    """
    patched = df.copy()

    # Manual data fixes keyed on row labels.
    # NOTE(review): these labels presumably refer to rows of the frame built by
    # get_all_leagues with this notebook's league_dict -- confirm after any re-scrape.
    renamed_rows = patched.index.isin([98550, 98564]) & (patched['team'] == 'Grêmio Prudente')
    patched.loc[renamed_rows, 'team'] = 'Grêmio Barueri - SP'

    # (current_round, rounds_left, loss) overrides; a fix is applied only when
    # its row label exists, so frames lacking these rows are handled as well.
    round_fixes = {96999: (38, 0, 10), 97006: (38, 0, 12)}
    for label, values in round_fixes.items():
        if label in patched.index:
            patched.loc[label, ['current_round', 'rounds_left', 'loss']] = list(values)

    season_frames = []
    for league in patched['league'].unique():
        league_rows = patched[patched['league'] == league]
        for season in league_rows['season'].unique():
            season_rows = league_rows[league_rows['season'] == season]
            last_round = season_rows['current_round'].max()
            final_points = (
                season_rows.loc[season_rows['current_round'] == last_round, ['team', 'points_for']]
                .rename(columns={'points_for': 'final_points'})
            )
            # Left merge: teams missing from the last round's rows get NaN.
            season_frames.append(season_rows.merge(final_points, how='left', on='team'))

    if not season_frames:
        # Nothing to merge (empty input); pd.concat would raise on an empty list.
        return patched.assign(final_points=pd.Series(dtype='float64'))

    final_df = pd.concat(season_frames)
    return final_df.rename(index=str)

In [0]:
# Scrape configuration: league slug -> [first season to fetch, rounds per season].
league_dict = {
    'eng-premier-league': ['1995-1996', 38],
    'esp-primera-division': ['1997-1998', 38],
    'fra-ligue-1': ['2002-2003', 38],
    'bundesliga': ['1995-1996', 34],
    'ita-serie-a': ['2004-2005', 38],
    'ned-eredivisie': ['1995-1996', 34],
    'bra-serie-a': ['2006', 38],
    'bra-serie-b': ['2011', 38],
}

In [0]:
# Scrape every league/season configured in league_dict (one web request per round)
# and stack the results; progress is printed per league and season.
final_dataframe = get_all_leagues(league_dict)

eng-premier-league
1995-1996
DONE!
eng-premier-league
1996-1997
DONE!
eng-premier-league
1997-1998
DONE!
eng-premier-league
1998-1999
DONE!
eng-premier-league
1999-2000
DONE!
eng-premier-league
2000-2001
DONE!
eng-premier-league
2001-2002
DONE!
eng-premier-league
2002-2003
DONE!
eng-premier-league
2003-2004
DONE!
eng-premier-league
2004-2005
DONE!
eng-premier-league
2005-2006
DONE!
eng-premier-league
2006-2007
DONE!
eng-premier-league
2007-2008
DONE!
eng-premier-league
2008-2009
DONE!
eng-premier-league
2009-2010
DONE!
eng-premier-league
2010-2011
DONE!
eng-premier-league
2011-2012
DONE!
eng-premier-league
2012-2013
DONE!
eng-premier-league
2013-2014
DONE!
eng-premier-league
2014-2015
DONE!
eng-premier-league
2015-2016
DONE!
eng-premier-league
2016-2017
DONE!
eng-premier-league
2017-2018
DONE!
eng-premier-league
2018-2019
DONE!
esp-primera-division
1997-1998
DONE!
esp-primera-division
1998-1999
DONE!
esp-primera-division
1999-2000
DONE!
esp-primera-division
2000-2001
DONE!
esp-primera-

In [0]:
# Spot-check five randomly drawn rows; no random_state is set, so the rows differ between runs.
final_dataframe.sample(5)

Unnamed: 0,pos,team,current_round,max_rounds,rounds_left,win,loss,draw,goals_for,goals_against,goals_diff,goals_for_against_ratio,points_for,possible_points,total_possible_points,performance,league,season
2439,20,Southampton FC,8,38,30,0,7,1,3,21,-18,0.143,1,24,91,4.17,eng-premier-league,1998-1999
93342,7,Flamengo RJ,7,38,31,3,1,3,13,11,2,1.182,12,21,105,57.14,bra-serie-a,2012
76629,16,NEC Nijmegen,13,34,21,2,10,1,16,32,-16,0.5,7,39,70,17.95,ned-eredivisie,1999-2000
45588,9,Angers SCO,38,38,0,13,14,11,40,38,2,1.053,50,114,50,43.86,fra-ligue-1,2015-2016
66118,11,SSC Napoli,26,38,12,10,11,5,32,30,2,1.067,35,78,71,44.87,ita-serie-a,2008-2009


In [0]:
# Summarise column dtypes and non-null counts of the scraped table.
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104616 entries, 0 to 104615
Data columns (total 18 columns):
pos                        104616 non-null int64
team                       104616 non-null object
current_round              104616 non-null int64
max_rounds                 104616 non-null int64
rounds_left                104616 non-null int64
win                        104616 non-null int64
loss                       104616 non-null int64
draw                       104616 non-null int64
goals_for                  104616 non-null int64
goals_against              104616 non-null int64
goals_diff                 104616 non-null int64
goals_for_against_ratio    104616 non-null float64
points_for                 104616 non-null int64
possible_points            104616 non-null int64
total_possible_points      104616 non-null int64
performance                104616 non-null float64
league                     104616 non-null object
season                     104616 non-null object


In [0]:
# Add each team's end-of-season points (final_points) to every per-round row.
df = add_final_points(final_dataframe)

In [0]:
# Spot-check five randomly drawn rows of the enriched table.
# NOTE(review): the saved output below lists final_pos and classe columns, which
# add_final_points as shown in this notebook does not create -- the output looks
# stale; re-run from a fresh kernel to confirm.
df.sample(5)

Unnamed: 0,pos,team,current_round,max_rounds,rounds_left,win,loss,draw,goals_for,goals_against,goals_diff,goals_for_against_ratio,points_for,possible_points,total_possible_points,performance,league,season,final_points,final_pos,classe
516,13,AZ Alkmaar,29,34,5,8,14,7,37,52,-15,0.712,31,87,46,35.63,ned-eredivisie,2000-2001,35,13,
705,6,Athletico Paranaense,36,38,2,16,12,8,41,44,-3,0.932,56,108,62,51.85,bra-serie-a,2010,60,5,
518,19,Juventude - RS,26,38,12,7,14,5,29,45,-16,0.644,26,78,62,33.33,bra-serie-a,2007,41,18,Z4
310,5,Feyenoord,18,34,16,9,5,4,29,21,8,1.381,31,54,79,57.41,ned-eredivisie,1997-1998,61,4,G4
190,11,Everton FC,10,38,28,3,3,4,13,13,0,1.0,13,30,97,43.33,eng-premier-league,2015-2016,47,11,


In [0]:
# Summarise column dtypes and non-null counts of the enriched table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 104616 entries, 0 to 759
Data columns (total 21 columns):
pos                        104616 non-null int64
team                       104616 non-null object
current_round              104616 non-null int64
max_rounds                 104616 non-null int64
rounds_left                104616 non-null int64
win                        104616 non-null int64
loss                       104616 non-null int64
draw                       104616 non-null int64
goals_for                  104616 non-null int64
goals_against              104616 non-null int64
goals_diff                 104616 non-null int64
goals_for_against_ratio    104616 non-null float64
points_for                 104616 non-null int64
possible_points            104616 non-null int64
total_possible_points      104616 non-null int64
performance                104616 non-null float64
league                     104616 non-null object
season                     104616 non-null object
final_po

In [0]:
# Write the enriched table to a semicolon-separated CSV, leaving out the index.
df.to_csv('all_leagues_seasons.csv', sep= ';', index = False)