### Data Processing

In [1]:
import numpy as np
import pandas as pd

data from season 2005/06

In [2]:
serieA = pd.read_csv('../data/serieAData.csv')
serieA.head(2)

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,Fiorentina,Sampdoria,2.0,1.0
1,Livorno,Lecce,2.0,1.0


In [3]:
# count distinct values
serieA['HomeTeam'].nunique()

41

41 different teams in the dataframe

In [4]:
len(serieA)

6465

In [5]:
games_per_season = 38
number_of_seasons = 17
matches_per_year = 380
len(serieA) / matches_per_year

17.013157894736842

aggregate the data

In [8]:
# aggregate and get results

df_agg = pd.DataFrame(columns=['team','opponent','sum_points','num_games'], dtype=float)
df_results = pd.DataFrame(columns=['team','opponent','result','num_occurences'], dtype=float)

for team in serieA['HomeTeam'].unique():
    df_interim = pd.DataFrame(columns=['team','opponent','result','home'], dtype=float)

    # get all the home games of the team
    home = serieA[serieA['HomeTeam'] == team]
    # for every such game, get the opponent and the result, from the perspective of the home team
    home.reset_index(inplace=True)
    for i in range(len(home)):
        opponent = home['AwayTeam'][i]
        if home['HomeGoals'][i] > home['AwayGoals'][i]: # if the home team won
            df_interim.loc[len(df_interim)]=[team, opponent, 3, 1 ]
        elif home['HomeGoals'][i] < home['AwayGoals'][i]: # if the home team lost
            df_interim.loc[len(df_interim)]=[team, opponent, 0, 1 ]
        else: # if it was a draw
            df_interim.loc[len(df_interim)]=[team, opponent, 1, 1 ]
    
    # get all the matches that this team was away
    away = serieA[serieA['AwayTeam']==team]
    away.reset_index(inplace=True)
    # for every such game, get the opponent and the result, from the perspective of the away team
    for i in range(len(away)):
        opponent = away['HomeTeam'][i]
        if away['HomeGoals'][i] > away['AwayGoals'][i]: # if the away team lost
            df_interim.loc[len(df_interim)]=[team, opponent, 0, 0 ]
        elif away['HomeGoals'][i] < away['AwayGoals'][i]: # if the away team won
            df_interim.loc[len(df_interim)]=[team, opponent, 3, 0 ]
        else: # if it was a draw
            df_interim.loc[len(df_interim)]=[team, opponent, 1, 0 ]
            
    # calculate the result summary and store that
    t = pd.DataFrame({'sum_points': df_interim.groupby(['team', 'opponent'])['result'].sum(),
                      'num_games' : df_interim.groupby(['team', 'opponent'])['result'].size()})
    t['pct_points'] = t['sum_points']/(t['num_games']*3)*100
    t.reset_index(inplace=True)
    df_agg = pd.concat([df_agg, t])
    
    # for more accuracy in reporting, calculate also the exact number of games for each result
    
    t = pd.DataFrame({'num_occurences' : df_interim.groupby(['team', 'opponent','result']).size()})
    t.reset_index(inplace=True)
    df_results = pd.concat([df_results, t])

In [9]:
df_results.to_csv('../data/results_per_team_and_opponent.csv', index=False)
df_agg.to_csv('../data/aggregated_result_per_team_and_opponent.csv', index=False)

In [10]:
df_results.head()

Unnamed: 0,team,opponent,result,num_occurences
0,Fiorentina,Ascoli,1.0,1.0
1,Fiorentina,Ascoli,3.0,3.0
2,Fiorentina,Atalanta,0.0,6.0
3,Fiorentina,Atalanta,1.0,9.0
4,Fiorentina,Atalanta,3.0,15.0


In [11]:
df_agg.head()

Unnamed: 0,team,opponent,sum_points,num_games,pct_points
0,Fiorentina,Ascoli,10.0,4.0,83.333333
1,Fiorentina,Atalanta,54.0,30.0,60.0
2,Fiorentina,Bari,7.0,4.0,58.333333
3,Fiorentina,Benevento,9.0,4.0,75.0
4,Fiorentina,Bologna,51.0,26.0,65.384615


get the total number of games that each team played across all these years

extract the teams that played at least 300 games across all these years

In [12]:
tot_number_of_games = pd.DataFrame({'total_num_games' : df_agg.groupby('team')['num_games'].sum()})
tot_number_of_games.reset_index(inplace = True)

print(len(tot_number_of_games['team']))
print(len(tot_number_of_games[tot_number_of_games['total_num_games'] > 300]))
pd.merge(tot_number_of_games[tot_number_of_games['total_num_games'] > 300], serieA[['HomeTeam']],
                            left_on='team', right_on='HomeTeam', how='inner').drop_duplicates().sort_values(by=['team'])

41
20


Unnamed: 0,team,total_num_games,HomeTeam
0,Atalanta,570.0,Atalanta
285,Bologna,494.0,Bologna
532,Cagliari,608.0,Cagliari
836,Catania,304.0,Catania
988,Chievo,494.0,Chievo
1235,Empoli,304.0,Empoli
1387,Fiorentina,646.0,Fiorentina
1710,Genoa,570.0,Genoa
1995,Inter,646.0,Inter
2318,Juventus,608.0,Juventus


In [13]:
print(games_per_season * number_of_seasons)

646


In [14]:
df_aggregated = pd.merge(tot_number_of_games.drop_duplicates(), df_agg, on ='team', how='inner')

In [15]:
df_aggregated.head()

Unnamed: 0,team,total_num_games,opponent,sum_points,num_games,pct_points
0,Ascoli,76.0,Atalanta,0.0,2.0,0.0
1,Ascoli,76.0,Cagliari,4.0,4.0,33.333333
2,Ascoli,76.0,Catania,2.0,2.0,33.333333
3,Ascoli,76.0,Chievo,5.0,4.0,41.666667
4,Ascoli,76.0,Empoli,6.0,4.0,50.0


In [16]:
df_aggregated.to_csv('../data/aggregated_result_per_team_and_opponent.csv')