In [31]:
# We import all required libraries:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import numpy as np
import matplotlib
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

init_notebook_mode(connected=True)

sys.path.append('../py_files') # <- py_files relative path
import functions

In [2]:
# Also, we read all required csv files:
fifa_17 = pd.read_csv("../csv_files/fifa17.csv", index_col=0)
fifa_18 = pd.read_csv("../csv_files/fifa18.csv", index_col=0)
fifa_19 = pd.read_csv("../csv_files/fifa19.csv", index_col=0)
fifa_21 = pd.read_csv("../csv_files/fifa21.csv", index_col=0)
fifa_22 = pd.read_csv("../csv_files/fifa22.csv", index_col=0)
fifa_gk = pd.read_csv("../csv_files/fifagk.csv", index_col=0)


In [4]:
# And we sort our dataframes by points obtained in Ballon d'Or ceremonies:
fifa_17 = fifa_17.sort_values(by=['points'], ascending = False)
fifa_18 = fifa_18.sort_values(by=['points'], ascending = False)
fifa_19 = fifa_19.sort_values(by=['points'], ascending = False)
fifa_21 = fifa_21.sort_values(by=['points'], ascending = False)
fifa_22 = fifa_22.sort_values(by=['points'], ascending = False)
last_5_years = pd.concat([fifa_17, fifa_18, fifa_19, fifa_21, fifa_22], ignore_index=True, sort=False)
last_5_years_with_gk = pd.concat([last_5_years, fifa_gk], ignore_index=True, sort=False)
# fifagk -> sorted by year edition

In [3]:
fifa_19

Unnamed: 0,player_id,fifa_version,short_name,long_name,player_positions,general_position,overall,value_eur,wage_eur,age,...,dribbling,defending,physic,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,points
0,20801,19,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,ST,ATT,94,77000000.0,400000.0,33,...,89.0,35.0,79.0,7,11,15,14,11,,476
1,158023,19,L. Messi,Lionel Andrés Messi Cuccittini,CF,ATT,94,110500000.0,575000.0,31,...,96.0,32.0,61.0,6,11,15,14,8,,686
2,188545,19,R. Lewandowski,Robert Lewandowski,ST,ATT,90,77000000.0,200000.0,29,...,85.0,41.0,82.0,15,6,12,8,10,,44
3,209331,19,M. Salah,Mohamed Salah Ghaly,RW,ATT,88,69500000.0,250000.0,26,...,89.0,45.0,72.0,14,14,9,11,14,,178
4,231747,19,K. Mbappé,Kylian Mbappé Lottin,RW,ATT,87,72000000.0,80000.0,19,...,89.0,39.0,72.0,13,5,7,11,6,,89
5,208722,19,S. Mané,Sadio Mané,LW,ATT,86,52000000.0,190000.0,26,...,87.0,42.0,73.0,10,10,15,7,14,,347
6,203376,19,V. van Dijk,Virgil van Dijk,CB,DEF,85,38500000.0,150000.0,26,...,70.0,85.0,84.0,13,10,13,11,11,,679
7,204485,19,R. Mahrez,Riyad Mahrez,RW,ATT,85,40500000.0,200000.0,27,...,90.0,37.0,59.0,15,9,13,11,6,,33
9,218667,19,Bernardo Silva,Bernardo Mota Veiga de Carvalho e Silva,RW,ATT,84,42500000.0,160000.0,23,...,89.0,46.0,58.0,9,10,14,12,9,,41


## Analysis and visualizations introduction 

According the FIFA ratings file, where we have performed the function with deviations from 2020's FIFA ratings dataset, we have decided that we will use 2019's FIFA ratings dataframe in order to run the regression model. Hence, we start inspecting this dataframe with variables' correlations with points, separating between players and goalkeepers.

From players' datasets, we obtain correlations between every numeric variable and points obtained in Ballon d'Or. This is a very important step since we can define statistical relationships between the dependent variable (points in Ballon d'Or ceremonies), and all explanatory variables (FIFA's ratings and player attributes).

In the case of players, we won't take into account the variables from goalkeeping_diving, since those are specific for goalkeepers. We will drop those columns (in case we want to recover them, they are available at the last_5_years dataframe). After that, we will check the correlations again and we will define the attributes that might be more interesting to study and visualize. 

For goalkeepers, we see that those attributes corresponding to players are already null, so we don't have to make any modification.

In [5]:
def drop_gk_attributes (player_df):
    """
    This function drops all elements belonging to goalkeeping attribute columns, since they can affect the correlation of other 
    meaningful player attributes.
    """
    player_df.drop(['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'], axis = 1, inplace = True)

    return player_df

In [6]:
fifa_19 = drop_gk_attributes(fifa_19)
fifa_19

Unnamed: 0,player_id,fifa_version,short_name,long_name,player_positions,general_position,overall,value_eur,wage_eur,age,...,nationality_name,preferred_foot,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,points
1,158023,19,L. Messi,Lionel Andrés Messi Cuccittini,CF,ATT,94,110500000.0,575000.0,31,...,Argentina,Left,226500000.0,88.0,91.0,88.0,96.0,32.0,61.0,686
6,203376,19,V. van Dijk,Virgil van Dijk,CB,DEF,85,38500000.0,150000.0,26,...,Netherlands,Right,74100000.0,71.0,60.0,67.0,70.0,85.0,84.0,679
0,20801,19,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,ST,ATT,94,77000000.0,400000.0,33,...,Portugal,Right,127100000.0,90.0,93.0,81.0,89.0,35.0,79.0,476
5,208722,19,S. Mané,Sadio Mané,LW,ATT,86,52000000.0,190000.0,26,...,Senegal,Right,100100000.0,94.0,80.0,76.0,87.0,42.0,73.0,347
3,209331,19,M. Salah,Mohamed Salah Ghaly,RW,ATT,88,69500000.0,250000.0,26,...,Egypt,Left,133800000.0,92.0,84.0,79.0,89.0,45.0,72.0,178
4,231747,19,K. Mbappé,Kylian Mbappé Lottin,RW,ATT,87,72000000.0,80000.0,19,...,France,Right,147600000.0,96.0,81.0,79.0,89.0,39.0,72.0,89
2,188545,19,R. Lewandowski,Robert Lewandowski,ST,ATT,90,77000000.0,200000.0,29,...,Poland,Right,127100000.0,78.0,89.0,75.0,85.0,41.0,82.0,44
9,218667,19,Bernardo Silva,Bernardo Mota Veiga de Carvalho e Silva,RW,ATT,84,42500000.0,160000.0,23,...,Portugal,Left,87100000.0,78.0,72.0,83.0,89.0,46.0,58.0,41
7,204485,19,R. Mahrez,Riyad Mahrez,RW,ATT,85,40500000.0,200000.0,27,...,Algeria,Left,74900000.0,85.0,79.0,81.0,90.0,37.0,59.0,33


We choosed the most similar to 2020 among the FIFA ratings dataframes since, in function of the year, the importance of FIFA attributes could dramatically. An example of that is the pace attribute; in 2017, the correlation between Ballon d'Or points and pace was very high (0.83). However, in 2022 it is even negative (-0.04).

Since the most similar of the datasets is the one corresponding to 2019, we print the correlations w.r.t. points below.
The conclusions are the following:
- Overall rating has some incidence to points -> there is low variance, all ratings are close to each other.
- Weight and wage are the variables with strongest correlation -> tough, well paid players seem to have advantage.
- Defending is the most punctuable skill.
- The other player stats were very relevant.

In [7]:
fifa_19_players_corr = pd.DataFrame(fifa_19.corr()['points'])
fifa_19_players_corr

  fifa_19_players_corr = pd.DataFrame(fifa_19.corr()['points'])


Unnamed: 0,points
player_id,-0.426212
fifa_version,
overall,0.449338
value_eur,0.318877
wage_eur,0.596492
age,0.496023
height_cm,0.254753
weight_kg,0.571978
club_jersey_number,-0.574633
release_clause_eur,0.345707


Last on this, we will take the following variables as explanatory for the regression model:
- Overall
- Wage (in €)
- Age
- Weight
- FIFA Stats: pace, shooting, passing, dribbling, defending and physic.

# Non-FIFA attribute data

## Player position
The first plot to start understanding the distribution of Ballon d'Or nominations is the following piechart. In it, we have the proportion of top 10 players for each generic position: attacker, midfielder, defender or goalkeeper. It is observable that attackers have been the most common players in Ballon d'Or top 10's. 

In [8]:
# Pie chart
piechart = pd.DataFrame(last_5_years_with_gk['general_position'].value_counts())
piechart.reset_index(drop=False, inplace=True)

fig = px.pie(piechart, values='general_position', names='index', title='Player Generic Position', color_discrete_sequence= px.colors.sequential.Blues_r)
fig.show()

In [9]:
stacked_positions = last_5_years_with_gk.groupby(['general_position', 'player_positions']).size()
stacked_positions = pd.DataFrame(stacked_positions)
stacked_positions.reset_index(drop=False, inplace=True)

stacked_positions

fig = px.bar(stacked_positions, x="general_position", y=0, color="player_positions", labels={"general_position": "Generic Position","player_positions": "Specific Positions",0: "Number of Players"},title="Distribution of Specific Positions", color_discrete_sequence= px.colors.sequential.Blues_r)
fig.show()


## Country
In the following histogram there is the distribution of last 5 years top 10 players' nationalities. Most of countries that appear are from Europe, except some exceptions from South America and some cases from Africa. Only 3 continents are represented.

In [10]:
fig = px.histogram(last_5_years_with_gk, x="nationality_name", nbins=1000, labels={"nationality_name": "Country"},color_discrete_sequence= px.colors.sequential.Blues_r)
fig.show()

## League
Concerning domestic competitions, all players in five last Ballon d'Or editions play in one of the big five leagues:
- La Liga (Spain)
- Premier League (UK)
- Ligue 1 (France)
- Bundesliga (Germany)
- Serie A (Italy)

La Liga and Premier League have the greatest representation.

In [11]:
league_piechart = pd.DataFrame(last_5_years_with_gk['league_name'].value_counts())
league_piechart.reset_index(drop=False, inplace=True)

fig = px.pie(league_piechart, values='league_name', names='index', title='Nominations per Competition', color_discrete_sequence= px.colors.sequential.Blues_r)
fig.show()

## Wage distribution
In the following graph we observe how wage is distributed among top 10 players. The plot is ordered by top 10 Ballon d'Or results by names, in the x-axis, being the firsts in the left and descending towards the right. it can be seen that Players in the first places have higher wages.

In [52]:
wage_dist = px.line(fifa_19, x="short_name", y="wage_eur", color_discrete_sequence= px.colors.sequential.Blues_r, labels={"short_name": "Short Name", "wage_eur": "Weekly Wage"})
wage_dist.show()

# FIFA attributes

Now we focus on the variables that we want to make explanatory for the results in a Ballon d'Or ceremony: the FIFA ratings and attributes. 
 

In [27]:
fig = px.bar(x=fifa_19['short_name'], y=fifa_19['points'], color_discrete_sequence= px.colors.sequential.Blues_r, labels={"x": "Short Name", "y": "Points"})
fig.show()

Here we have the classification of 2019's Ballon d'Or. A particularity is that a few players tend to get the wide makority of all votes. In this case, Leo Mesi, Virgil Van Dijk and Cristiano Ronaldo have most of the points.
Also, most players in the top 10 are attackers or attacking midfielders, so we can expect that our prediction has a similar trend.

In [43]:
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['overall'], name="Overall Rating"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=fifa_19['short_name'], y=fifa_19['points'], name="Ballon d'Or Points", opacity=0.7),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Player's Overall Rating and Ballon d'Or Points"
)

# Set x-axis title
fig.update_xaxes(title_text="Short Name")

# Set y-axes titles
fig.update_yaxes(title_text="<b>Rating</b> FIFA", secondary_y=False)
fig.update_yaxes(title_text="<b>Points</b> Ballon d'Or", secondary_y=True)

fig.show()

## Age and Points distribution
Age was one of the variables with strongest correlation to Ballon d'Or points. In the graph below, we see that the higher the age, the more points obtained in the Ballon d'Or classification.

In [55]:
#age_dist = px.bar(fifa_19, x="short_name", y="age", color_discrete_sequence= px.colors.sequential.Blues_r, labels={"short_name": "Short Name", "age": "Age"})
#age_dist.show()

age_points = px.scatter(fifa_19, x="age", y="points", trendline="ols", labels={"age": "Age", "points": "Ballon d'Or Points"})
age_points.show()

## Weight distribution
Now it is the turn of the variable with the strongest correlation to points: weight.
We observe that both lines run almost parallely, there are only two players that are in differ considerably from the expected value: Leo Messi, thinner than most of the rest, and Robert Lewandowski, who is above the mean.

In [58]:
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['weight_kg'], name="Weight"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['points'], name="Ballon d'Or Points", opacity=0.7),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Player's Weight and Ballon d'Or Points"
)

# Set x-axis title
fig.update_xaxes(title_text="Short Name")

# Set y-axes titles
fig.update_yaxes(title_text="<b>Weight</b> FIFA", secondary_y=False)
fig.update_yaxes(title_text="<b>Points</b> Ballon d'Or", secondary_y=True)

fig.show()

## Stats distribution
To finish, we plot the graph of player's full stats.
The attributes that get higher values tend to be those corresponding to attacking skills, which is expected due to the fact that most players roles are based on attacking (ST, RW, LW, etc.).
Defending skills have a low average, the only defender in 2019's top 10 was Van Dijk. However, since he is in second position, it somehow averages with Messi's and Cristiano's defending stats and then, the rest of players confirm a decreasing trend, which makes it be quite highly correlated with Points distribution.
In conclusion, since most of Ballon d'Or nominees use to play in offensive positions, it will be more probable that 2020's winner is also an attacker.

In [62]:
fig = make_subplots(specs=[[{"secondary_y": True}]])

'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'
# Add traces
fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['pace'], name="Pace"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['shooting'], name="Shooting", opacity=0.7),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['passing'], name="Passing", opacity=0.7),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['dribbling'], name="Dribbling", opacity=0.7),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['defending'], name="Defending", opacity=0.7),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=fifa_19['short_name'], y=fifa_19['physic'], name="Physique", opacity=0.7),
    secondary_y=False,
)

# Add figure title
fig.update_layout(
    title_text="Player's Stats"
)

# Set x-axis title
fig.update_xaxes(title_text="Short Name")

# Set y-axes titles
fig.update_yaxes(title_text="<b>Stats</b> FIFA", secondary_y=False)

fig.show()

# Regression Model

The last step of the project is to predict who would have been the winner of 2020's Ballon d'Or. To do so, we run a regression with some of the variables that could be more correlated with the distribution of points.

In [73]:
# X stands for explanatory variables
X = fifa_19[['overall', 'wage_eur', 'age', 'weight_kg']]
# Y stands for the dependent variable
Y = fifa_19['points']

In [79]:
# We add the constant term to the model
X = sm.add_constant(X)

# And run the regression with ordinary least square model
res = sm.OLS(Y, X).fit()

# The summary is the following
res.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=9



0,1,2,3
Dep. Variable:,points,R-squared:,0.872
Model:,OLS,Adj. R-squared:,0.744
Method:,Least Squares,F-statistic:,6.8
Date:,"Mon, 06 Feb 2023",Prob (F-statistic):,0.0451
Time:,17:56:17,Log-Likelihood:,-53.414
No. Observations:,9,AIC:,116.8
Df Residuals:,4,BIC:,117.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3208.8542,2030.328,1.580,0.189,-2428.239,8845.948
overall,-54.4014,26.259,-2.072,0.107,-127.308,18.506
wage_eur,0.0028,0.001,3.653,0.022,0.001,0.005
age,-31.8466,21.887,-1.455,0.219,-92.615,28.922
weight_kg,27.1219,6.849,3.960,0.017,8.106,46.138

0,1,2,3
Omnibus:,12.546,Durbin-Watson:,1.777
Prob(Omnibus):,0.002,Jarque-Bera (JB):,5.173
Skew:,1.625,Prob(JB):,0.0753
Kurtosis:,4.798,Cond. No.,12600000.0


The variables that have most incidence are wage and weight: we reject the null Hyphothesis that both of them do not have any effect on points attribution both at 10 and 5 significance level. Concerning overall and age, we cannot say the same, they are not that representative.

However, now that we've come this far, let's use the model to get a 2020 Ballon d'Or winner!

In [80]:
# The model has a constant term and 4 beta's:
alpha = res.params['const']

beta_overall = res.params['overall']

beta_wage = res.params['wage_eur']

beta_age = res.params['age']

beta_weight = res.params['weight_kg']

In [83]:
fifa_20 = pd.read_csv("../csv_files/fifa20.csv", index_col=0)
fifa_20_reg = fifa_20[["overall", "wage_eur", "age", "weight_kg"]]

In [82]:
points_prediction = []
for column in fifa_20_reg:
    points_prediction.append()
    


Unnamed: 0,player_id,fifa_version,short_name,long_name,player_positions,overall,value_eur,wage_eur,age,height_cm,...,passing,dribbling,defending,physic,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
0,158023,20,L. Messi,Lionel Andrés Messi Cuccittini,"RW, CF, ST",94,95500000.0,560000.0,32,170,...,92.0,96.0,39.0,66.0,6,11,15,14,8,
1,20801,20,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",93,58500000.0,410000.0,34,187,...,82.0,89.0,35.0,78.0,7,11,15,14,11,
2,190871,20,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",92,105500000.0,290000.0,27,175,...,87.0,95.0,32.0,58.0,9,9,15,15,11,
3,183277,20,E. Hazard,Eden Michael Hazard,"LW, CF",91,90000000.0,470000.0,28,175,...,86.0,94.0,35.0,66.0,11,12,6,8,8,
4,192985,20,K. De Bruyne,Kevin De Bruyne,"CAM, CM",91,90000000.0,370000.0,28,181,...,92.0,86.0,61.0,78.0,15,13,5,10,13,
6,177003,20,L. Modrić,Luka Modrić,CM,90,45000000.0,340000.0,33,172,...,89.0,89.0,72.0,66.0,13,9,7,14,9,
8,203376,20,V. van Dijk,Virgil van Dijk,CB,90,78000000.0,200000.0,27,193,...,70.0,71.0,90.0,86.0,13,10,13,11,11,
9,209331,20,M. Salah,Mohamed Salah Ghaly,"RW, ST",90,80500000.0,240000.0,27,175,...,81.0,89.0,45.0,74.0,14,14,9,11,14,
10,138956,20,G. Chiellini,Giorgio Chiellini,CB,89,24500000.0,210000.0,34,187,...,58.0,60.0,90.0,82.0,3,3,2,4,3,
11,153079,20,S. Agüero,Sergio Leonel Agüero del Castillo,ST,89,60000000.0,300000.0,31,173,...,77.0,88.0,33.0,74.0,13,15,6,11,14,
