# 6.4.1 Machine Learning Data Cleaning: 

## This script contains the following:

### 01. Importing Libraries and Data
### 02. Data Cleaning
### 03. Aggregating Win/Loss/Draws
### 04. Aggregating Goals For/Against

### 01. Importing Libraries and Data

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
%matplotlib inline

In [3]:
# Creating path variable: base folder that the data sub-folders are joined onto below.
# This is an absolute Windows path on one specific machine - edit it before running elsewhere.
path = r'C:\Users\widne\Documents\CareerFoundry Exercises\Data_Immersion\Achievement 6\06-2024 International Football Matches'

In [4]:
# Loading the cleaned results dataframe from the prepared-data folder.
# NOTE: unpickling can run arbitrary code, so only load pickle files from a trusted source.
results_file = os.path.join(path, '02 Data', 'Prepared Data', 'results_cleaned.pkl')
df = pd.read_pickle(results_file)

In [5]:
# Checking dataframe
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,United Kindgom,False,1872,11,0.0,0.0,Low scoring,draw,draw
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,United Kingdom,False,1873,3,6.0,2.0,High scoring,England,Scotland
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,United Kindgom,False,1874,3,3.0,1.0,Medium scoring,Scotland,England
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,United Kingdom,False,1875,3,4.0,0.0,Medium scoring,draw,draw
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,3.0,3.0,Medium scoring,Scotland,England


### 02. Data Cleaning 

In [6]:
# Creating a winning_team column that names the winner of each match.
# A match is labelled 'draw' only when both scores are present and equal.
# Any comparison against a missing score (NaN) is False, so such rows used to
# fall through to the 'draw' default and were later counted as draws; they are
# now left as missing (None) and therefore drop out of every win/loss/draw count.
# NOTE(review): the scores are stored as floats, so missing values may be present - confirm.
win_conditions = [
    df['home_score'] > df['away_score'],
    df['home_score'] < df['away_score'],
    df['home_score'] == df['away_score'],
]
win_choices = [
    df['home_team'],
    df['away_team'],
    np.full(len(df), 'draw', dtype=object),  # object dtype so all choices share one dtype
]

df['winning_team'] = np.select(win_conditions, win_choices, default=None)

In [7]:
# Creating a losing_team column that names the loser of each match.
# A match is labelled 'draw' only when both scores are present and equal.
# Any comparison against a missing score (NaN) is False, so such rows used to
# fall through to the 'draw' default; they are now left as missing (None) so
# that a match without a result is never reported as a draw.
# NOTE(review): the scores are stored as floats, so missing values may be present - confirm.
loss_conditions = [
    df['home_score'] < df['away_score'],
    df['home_score'] > df['away_score'],
    df['home_score'] == df['away_score'],
]
loss_choices = [
    df['home_team'],
    df['away_team'],
    np.full(len(df), 'draw', dtype=object),  # object dtype so all choices share one dtype
]

df['losing_team'] = np.select(loss_conditions, loss_choices, default=None)

In [8]:
# Checking new columns
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,United Kindgom,False,1872,11,0.0,0.0,Low scoring,draw,draw
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,United Kingdom,False,1873,3,6.0,2.0,High scoring,England,Scotland
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,United Kindgom,False,1874,3,3.0,1.0,Medium scoring,Scotland,England
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,United Kingdom,False,1875,3,4.0,0.0,Medium scoring,draw,draw
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,3.0,3.0,Medium scoring,Scotland,England


### 03. Aggregating Win/Loss/Draws

In [9]:
# Keeping only the rows in which the home side is the recorded winner
df_home_wins = df.loc[df['home_team'].eq(df['winning_team'])]

In [10]:
# Checking dataframe
df_home_wins.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,United Kingdom,False,1873,3,6.0,2.0,High scoring,England,Scotland
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,United Kindgom,False,1874,3,3.0,1.0,Medium scoring,Scotland,England
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,3.0,3.0,Medium scoring,Scotland,England
5,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,4.0,4.0,Medium scoring,Scotland,Wales
8,1878-03-02,Scotland,England,7.0,2.0,Friendly,Glasgow,United Kindgom,False,1878,3,9.0,5.0,High scoring,Scotland,England


In [11]:
# Counting the home-win rows per home team (one row per team, count kept in 'winning_team')
df_home_wins_count = (
    df_home_wins
    .groupby('home_team', as_index=False)[['winning_team']]
    .count()
)

In [12]:
# Giving the grouping key and the count descriptive names
df_home_wins_count = df_home_wins_count.rename(
    columns={'home_team': 'team', 'winning_team': 'home_wins'}
)

In [13]:
# Checking dataframe
df_home_wins_count.head()

Unnamed: 0,team,home_wins
0,Abkhazia,11
1,Afghanistan,18
2,Albania,73
3,Alderney,1
4,Algeria,191


In [14]:
# Keeping only the rows in which the away side is the recorded winner
df_away_wins = df.loc[df['away_team'].eq(df['winning_team'])]

In [15]:
# Checking dataframe
df_away_wins.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
6,1877-03-03,England,Scotland,1.0,3.0,Friendly,London,United Kingdom,False,1877,3,4.0,2.0,Medium scoring,Scotland,England
7,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Wrexham,United Kingdom,False,1877,3,2.0,2.0,Low scoring,Scotland,Wales
12,1879-04-07,Wales,Scotland,0.0,3.0,Friendly,Wrexham,United Kingdom,False,1879,4,3.0,3.0,Medium scoring,Scotland,Wales
14,1880-03-15,Wales,England,2.0,3.0,Friendly,Wrexham,United Kingdom,False,1880,3,5.0,1.0,High scoring,England,Wales
16,1881-02-26,England,Wales,0.0,1.0,Friendly,Blackburn,United Kingdom,False,1881,2,1.0,1.0,Low scoring,Wales,England


In [16]:
# Counting the away-win rows per away team (one row per team, count kept in 'winning_team')
df_away_wins_count = (
    df_away_wins
    .groupby('away_team', as_index=False)[['winning_team']]
    .count()
)

In [17]:
# Giving the grouping key and the count descriptive names
df_away_wins_count = df_away_wins_count.rename(
    columns={'away_team': 'team', 'winning_team': 'away_wins'}
)

In [18]:
df_away_wins_count.head()

Unnamed: 0,team,away_wins
0,Abkhazia,3
1,Afghanistan,17
2,Albania,27
3,Alderney,4
4,Algeria,73


In [19]:
# Keeping only the rows in which the home side is the recorded loser
df_home_losses = df.loc[df['home_team'].eq(df['losing_team'])]

In [20]:
# Checking dataframe
df_home_losses.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
6,1877-03-03,England,Scotland,1.0,3.0,Friendly,London,United Kingdom,False,1877,3,4.0,2.0,Medium scoring,Scotland,England
7,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Wrexham,United Kingdom,False,1877,3,2.0,2.0,Low scoring,Scotland,Wales
12,1879-04-07,Wales,Scotland,0.0,3.0,Friendly,Wrexham,United Kingdom,False,1879,4,3.0,3.0,Medium scoring,Scotland,Wales
14,1880-03-15,Wales,England,2.0,3.0,Friendly,Wrexham,United Kingdom,False,1880,3,5.0,1.0,High scoring,England,Wales
16,1881-02-26,England,Wales,0.0,1.0,Friendly,Blackburn,United Kingdom,False,1881,2,1.0,1.0,Low scoring,Wales,England


In [21]:
# Counting the home-loss rows per home team (one row per team, count kept in 'losing_team')
df_home_losses_count = (
    df_home_losses
    .groupby('home_team', as_index=False)[['losing_team']]
    .count()
)

In [22]:
# Giving the grouping key and the count descriptive names
df_home_losses_count = df_home_losses_count.rename(
    columns={'home_team': 'team', 'losing_team': 'home_losses'}
)

In [23]:
# Checking dataframe
df_home_losses_count.head()

Unnamed: 0,team,home_losses
0,Abkhazia,3
1,Afghanistan,17
2,Albania,78
3,Alderney,46
4,Algeria,60


In [24]:
# Keeping only the rows in which the away side is the recorded loser
df_away_losses = df.loc[df['away_team'].eq(df['losing_team'])]

In [25]:
# Checking dataframe
df_away_losses.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,United Kingdom,False,1873,3,6.0,2.0,High scoring,England,Scotland
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,United Kindgom,False,1874,3,3.0,1.0,Medium scoring,Scotland,England
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,3.0,3.0,Medium scoring,Scotland,England
5,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,4.0,4.0,Medium scoring,Scotland,Wales
8,1878-03-02,Scotland,England,7.0,2.0,Friendly,Glasgow,United Kindgom,False,1878,3,9.0,5.0,High scoring,Scotland,England


In [26]:
# Counting the away-loss rows per away team (one row per team, count kept in 'losing_team')
df_away_losses_count = (
    df_away_losses
    .groupby('away_team', as_index=False)[['losing_team']]
    .count()
)

In [27]:
# Giving the grouping key and the count descriptive names
df_away_losses_count = df_away_losses_count.rename(
    columns={'away_team': 'team', 'losing_team': 'away_losses'}
)

In [28]:
# Checking dataframe
df_away_losses_count.head()

Unnamed: 0,team,away_losses
0,Abkhazia,2
1,Afghanistan,52
2,Albania,115
3,Alderney,82
4,Algeria,103


In [29]:
# Keeping only the rows whose winning_team is the 'draw' marker
df_draws = df.loc[df['winning_team'].eq('draw')]

In [30]:
# Checking dataframe
df_draws.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,United Kindgom,False,1872,11,0.0,0.0,Low scoring,draw,draw
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,United Kingdom,False,1875,3,4.0,0.0,Medium scoring,draw,draw
28,1883-03-17,Northern Ireland,Wales,1.0,1.0,Friendly,Belfast,Ireland,False,1883,3,2.0,0.0,Low scoring,draw,draw
36,1885-03-14,England,Wales,1.0,1.0,British Home Championship,Blackburn,United Kingdom,False,1885,3,2.0,0.0,Low scoring,draw,draw
38,1885-03-21,England,Scotland,1.0,1.0,British Home Championship,London,United Kingdom,False,1885,3,2.0,0.0,Low scoring,draw,draw


In [31]:
# Counting the drawn rows per home team (one row per team, count kept in 'winning_team')
df_home_draws = (
    df_draws
    .groupby('home_team', as_index=False)[['winning_team']]
    .count()
)

In [32]:
# Giving the grouping key and the count descriptive names
df_home_draws = df_home_draws.rename(
    columns={'home_team': 'team', 'winning_team': 'home_draws'}
)

In [33]:
# Checking dataframe
df_home_draws.head()

Unnamed: 0,team,home_draws
0,Abkhazia,8
1,Afghanistan,10
2,Albania,46
3,Alderney,1
4,Algeria,82


In [34]:
# Counting the drawn rows per away team (one row per team, count kept in 'winning_team')
df_away_draws = (
    df_draws
    .groupby('away_team', as_index=False)[['winning_team']]
    .count()
)

In [35]:
# Giving the grouping key and the count descriptive names
df_away_draws = df_away_draws.rename(
    columns={'away_team': 'team', 'winning_team': 'away_draws'}
)

In [36]:
# Checking dataframe
df_away_draws.head()

Unnamed: 0,team,away_draws
0,Abkhazia,5
1,Afghanistan,21
2,Albania,34
3,Alderney,1
4,Algeria,75


In [37]:
# Combining home-win and away-win counts on 'team'; the outer join keeps teams found in only one table
df_wins = pd.merge(df_home_wins_count, df_away_wins_count, on='team', how='outer')

In [38]:
# Checking dataframe
df_wins.head()

Unnamed: 0,team,home_wins,away_wins
0,Abkhazia,11.0,3.0
1,Afghanistan,18.0,17.0
2,Albania,73.0,27.0
3,Alderney,1.0,4.0
4,Algeria,191.0,73.0


In [39]:
# Checking for missing values
df_wins.isnull().sum()

team          0
home_wins     7
away_wins    15
dtype: int64

In [40]:
# Treating a missing count (team absent from one side of the outer merge) as zero wins
df_wins = df_wins.fillna(value={'home_wins': 0, 'away_wins': 0})
# Confirming that no missing values remain
df_wins.isna().sum()

team         0
home_wins    0
away_wins    0
dtype: int64

In [41]:
# Deriving total_wins as the sum of the home and away win counts
df_wins['total_wins'] = df_wins['home_wins'].add(df_wins['away_wins'])

In [42]:
df_wins.head()

Unnamed: 0,team,home_wins,away_wins,total_wins
0,Abkhazia,11.0,3.0,14.0
1,Afghanistan,18.0,17.0,35.0
2,Albania,73.0,27.0,100.0
3,Alderney,1.0,4.0,5.0
4,Algeria,191.0,73.0,264.0


In [43]:
# Combining home-loss and away-loss counts on 'team'; the outer join keeps teams found in only one table
df_losses = pd.merge(df_home_losses_count, df_away_losses_count, on='team', how='outer')

In [44]:
# Checking dataframe
df_losses.head()

Unnamed: 0,team,home_losses,away_losses
0,Abkhazia,3.0,2.0
1,Afghanistan,17.0,52.0
2,Albania,78.0,115.0
3,Alderney,46.0,82.0
4,Algeria,60.0,103.0


In [45]:
# Checking for missing values
df_losses.isnull().sum()

team            0
home_losses    20
away_losses    17
dtype: int64

In [46]:
# Treating a missing count (team absent from one side of the outer merge) as zero losses
df_losses = df_losses.fillna(value={'home_losses': 0, 'away_losses': 0})
# Confirming that no missing values remain
df_losses.isna().sum()

team           0
home_losses    0
away_losses    0
dtype: int64

In [47]:
# Deriving total_losses as the sum of the home and away loss counts
df_losses['total_losses'] = df_losses['home_losses'].add(df_losses['away_losses'])

In [48]:
# Checking dataframe
df_losses.head()

Unnamed: 0,team,home_losses,away_losses,total_losses
0,Abkhazia,3.0,2.0,5.0
1,Afghanistan,17.0,52.0,69.0
2,Albania,78.0,115.0,193.0
3,Alderney,46.0,82.0,128.0
4,Algeria,60.0,103.0,163.0


In [50]:
# Combining home-draw and away-draw counts on 'team'; the outer join keeps teams found in only one table
df_team_draws = pd.merge(df_home_draws, df_away_draws, on='team', how='outer')

In [51]:
# Checking dataframe
df_team_draws.head()

Unnamed: 0,team,home_draws,away_draws
0,Abkhazia,8.0,5.0
1,Afghanistan,10.0,21.0
2,Albania,46.0,34.0
3,Alderney,1.0,1.0
4,Algeria,82.0,75.0


In [52]:
# Checking for missing values
df_team_draws.isnull().sum()

team           0
home_draws    13
away_draws    23
dtype: int64

In [54]:
# Treating a missing count (team absent from one side of the outer merge) as zero draws
df_team_draws = df_team_draws.fillna(value={'home_draws': 0, 'away_draws': 0})
# Confirming that no missing values remain
df_team_draws.isna().sum()

team          0
home_draws    0
away_draws    0
dtype: int64

In [55]:
# Deriving total_draws as the sum of the home and away draw counts
df_team_draws['total_draws'] = df_team_draws['home_draws'].add(df_team_draws['away_draws'])

In [56]:
# Checking dataframe
df_team_draws.head()

Unnamed: 0,team,home_draws,away_draws,total_draws
0,Abkhazia,8.0,5.0,13.0
1,Afghanistan,10.0,21.0,31.0
2,Albania,46.0,34.0,80.0
3,Alderney,1.0,1.0,2.0
4,Algeria,82.0,75.0,157.0


In [57]:
# Starting the team results dataframe by outer-joining the wins and losses tables on 'team'
# (the draws table is merged in by a later cell)
df_team_results = pd.merge(df_wins, df_losses, on='team', how='outer')

In [58]:
# Checking for missing values
df_team_results.isnull().sum()

team             0
home_wins       26
away_wins       26
total_wins      26
home_losses      4
away_losses      4
total_losses     4
dtype: int64

In [59]:
# Filling every remaining gap with 0 (a team missing from one table has none of that result)
df_team_results = df_team_results.fillna(value=0)
# Confirming that no missing values remain
df_team_results.isna().sum()

team            0
home_wins       0
away_wins       0
total_wins      0
home_losses     0
away_losses     0
total_losses    0
dtype: int64

In [60]:
# Adding the draw counts to the team results with another outer join on 'team'
df_team_results = pd.merge(df_team_results, df_team_draws, on='team', how='outer')

In [61]:
# Checking for missing values
df_team_results.isnull().sum()

team             0
home_wins        2
away_wins        2
total_wins       2
home_losses      2
away_losses      2
total_losses     2
home_draws      39
away_draws      39
total_draws     39
dtype: int64

In [62]:
# Filling every remaining gap with 0 (a team missing from one table has none of that result)
df_team_results = df_team_results.fillna(value=0)
# Confirming that no missing values remain
df_team_results.isna().sum()

team            0
home_wins       0
away_wins       0
total_wins      0
home_losses     0
away_losses     0
total_losses    0
home_draws      0
away_draws      0
total_draws     0
dtype: int64

In [63]:
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0


In [64]:
# Deriving games played per team: wins + losses + draws at home and away, then the two combined
df_team_results['home_games_played'] = (
    df_team_results['home_wins']
    .add(df_team_results['home_losses'])
    .add(df_team_results['home_draws'])
)
df_team_results['away_games_played'] = (
    df_team_results['away_wins']
    .add(df_team_results['away_losses'])
    .add(df_team_results['away_draws'])
)
df_team_results['total_games_played'] = (
    df_team_results['home_games_played'].add(df_team_results['away_games_played'])
)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,home_games_played,away_games_played,total_games_played
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,22.0,10.0,32.0
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,45.0,90.0,135.0
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,197.0,176.0,373.0
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,48.0,87.0,135.0
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,333.0,251.0,584.0


In [65]:
# Deriving the share of home matches each team wins (home_win_pct) and does not lose (home_unbeaten_pct).
# Both are stored as plain ratios (wins / games), not multiplied by 100.
df_team_results['home_win_pct'] = (
    df_team_results['home_wins'].div(df_team_results['home_games_played'])
)
df_team_results['home_unbeaten_pct'] = (
    df_team_results['home_wins']
    .add(df_team_results['home_draws'])
    .div(df_team_results['home_games_played'])
)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,home_games_played,away_games_played,total_games_played,home_win_pct,home_unbeaten_pct
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,22.0,10.0,32.0,0.5,0.863636
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,45.0,90.0,135.0,0.4,0.622222
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,197.0,176.0,373.0,0.370558,0.604061
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,48.0,87.0,135.0,0.020833,0.041667
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,333.0,251.0,584.0,0.573574,0.81982


In [66]:
# Deriving the share of away matches each team wins (away_win_pct) and does not lose (away_unbeaten_pct).
# Both are stored as plain ratios (wins / games), not multiplied by 100.
df_team_results['away_win_pct'] = (
    df_team_results['away_wins'].div(df_team_results['away_games_played'])
)
df_team_results['away_unbeaten_pct'] = (
    df_team_results['away_wins']
    .add(df_team_results['away_draws'])
    .div(df_team_results['away_games_played'])
)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,home_games_played,away_games_played,total_games_played,home_win_pct,home_unbeaten_pct,away_win_pct,away_unbeaten_pct
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,22.0,10.0,32.0,0.5,0.863636,0.3,0.8
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,45.0,90.0,135.0,0.4,0.622222,0.188889,0.422222
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,197.0,176.0,373.0,0.370558,0.604061,0.153409,0.346591
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,48.0,87.0,135.0,0.020833,0.041667,0.045977,0.057471
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,333.0,251.0,584.0,0.573574,0.81982,0.290837,0.589641


In [67]:
# Deriving each team's overall win ratio (win_pct) and unbeaten ratio (unbeaten_pct) across all matches.
# Both are stored as plain ratios (wins / games), not multiplied by 100.
df_team_results['win_pct'] = (
    df_team_results['total_wins'].div(df_team_results['total_games_played'])
)
df_team_results['unbeaten_pct'] = (
    df_team_results['total_wins']
    .add(df_team_results['total_draws'])
    .div(df_team_results['total_games_played'])
)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,home_games_played,away_games_played,total_games_played,home_win_pct,home_unbeaten_pct,away_win_pct,away_unbeaten_pct,win_pct,unbeaten_pct
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,22.0,10.0,32.0,0.5,0.863636,0.3,0.8,0.4375,0.84375
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,45.0,90.0,135.0,0.4,0.622222,0.188889,0.422222,0.259259,0.488889
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,197.0,176.0,373.0,0.370558,0.604061,0.153409,0.346591,0.268097,0.482574
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,48.0,87.0,135.0,0.020833,0.041667,0.045977,0.057471,0.037037,0.051852
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,333.0,251.0,584.0,0.573574,0.81982,0.290837,0.589641,0.452055,0.72089


In [68]:
# Keeping only 'Friendly' tournament rows in which the home side is the recorded winner
df_friendly_home_wins = df.loc[
    df['home_team'].eq(df['winning_team']) & df['tournament'].eq('Friendly')
]

In [69]:
# Checking dataframe
df_friendly_home_wins.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,United Kingdom,False,1873,3,6.0,2.0,High scoring,England,Scotland
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,United Kindgom,False,1874,3,3.0,1.0,Medium scoring,Scotland,England
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,3.0,3.0,Medium scoring,Scotland,England
5,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,4.0,4.0,Medium scoring,Scotland,Wales
8,1878-03-02,Scotland,England,7.0,2.0,Friendly,Glasgow,United Kindgom,False,1878,3,9.0,5.0,High scoring,Scotland,England


In [70]:
# Counting the friendly home-win rows per home team (count kept in 'winning_team')
df_friendly_home_wins_count = (
    df_friendly_home_wins
    .groupby('home_team', as_index=False)[['winning_team']]
    .count()
)

In [71]:
# Giving the grouping key and the count descriptive names, then previewing the result
df_friendly_home_wins_count = df_friendly_home_wins_count.rename(
    columns={'home_team': 'team', 'winning_team': 'home_friendly_wins'}
)
df_friendly_home_wins_count.head()

Unnamed: 0,team,home_friendly_wins
0,Abkhazia,2
1,Afghanistan,5
2,Albania,33
3,Algeria,67
4,Andalusia,7


In [72]:
# Keeping only 'Friendly' tournament rows in which the away side is the recorded winner
df_friendly_away_wins = df.loc[
    df['away_team'].eq(df['winning_team']) & df['tournament'].eq('Friendly')
]

In [73]:
# Checking dataframe
df_friendly_away_wins.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
6,1877-03-03,England,Scotland,1.0,3.0,Friendly,London,United Kingdom,False,1877,3,4.0,2.0,Medium scoring,Scotland,England
7,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Wrexham,United Kingdom,False,1877,3,2.0,2.0,Low scoring,Scotland,Wales
12,1879-04-07,Wales,Scotland,0.0,3.0,Friendly,Wrexham,United Kingdom,False,1879,4,3.0,3.0,Medium scoring,Scotland,Wales
14,1880-03-15,Wales,England,2.0,3.0,Friendly,Wrexham,United Kingdom,False,1880,3,5.0,1.0,High scoring,England,Wales
16,1881-02-26,England,Wales,0.0,1.0,Friendly,Blackburn,United Kingdom,False,1881,2,1.0,1.0,Low scoring,Wales,England


In [74]:
# Counting the friendly away-win rows per away team (count kept in 'winning_team')
df_friendly_away_wins_count = (
    df_friendly_away_wins
    .groupby('away_team', as_index=False)[['winning_team']]
    .count()
)

In [75]:
# Giving the grouping key and the count descriptive names, then previewing the result
df_friendly_away_wins_count = df_friendly_away_wins_count.rename(
    columns={'away_team': 'team', 'winning_team': 'away_friendly_wins'}
)
df_friendly_away_wins_count.head()

Unnamed: 0,team,away_friendly_wins
0,Afghanistan,3
1,Albania,8
2,Algeria,26
3,Andalusia,1
4,Angola,15


In [101]:
# Keeping only 'Friendly' tournament rows in which the home side is the recorded loser
df_friendly_home_losses = df.loc[
    df['home_team'].eq(df['losing_team']) & df['tournament'].eq('Friendly')
]

In [102]:
# Checking dataframe
df_friendly_home_losses.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
6,1877-03-03,England,Scotland,1.0,3.0,Friendly,London,United Kingdom,False,1877,3,4.0,2.0,Medium scoring,Scotland,England
7,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Wrexham,United Kingdom,False,1877,3,2.0,2.0,Low scoring,Scotland,Wales
12,1879-04-07,Wales,Scotland,0.0,3.0,Friendly,Wrexham,United Kingdom,False,1879,4,3.0,3.0,Medium scoring,Scotland,Wales
14,1880-03-15,Wales,England,2.0,3.0,Friendly,Wrexham,United Kingdom,False,1880,3,5.0,1.0,High scoring,England,Wales
16,1881-02-26,England,Wales,0.0,1.0,Friendly,Blackburn,United Kingdom,False,1881,2,1.0,1.0,Low scoring,Wales,England


In [103]:
# Counting the friendly home-loss rows per home team (count kept in 'losing_team')
df_friendly_home_losses_count = (
    df_friendly_home_losses
    .groupby('home_team', as_index=False)[['losing_team']]
    .count()
)

In [104]:
# Giving the grouping key and the count descriptive names, then previewing the result
df_friendly_home_losses_count = df_friendly_home_losses_count.rename(
    columns={'home_team': 'team', 'losing_team': 'home_friendly_losses'}
)
df_friendly_home_losses_count.head()

Unnamed: 0,team,home_friendly_losses
0,Abkhazia,1
1,Afghanistan,3
2,Albania,16
3,Algeria,35
4,Andalusia,1


In [105]:
# Keeping only 'Friendly' tournament rows in which the away side is the recorded loser
df_friendly_away_losses = df.loc[
    df['away_team'].eq(df['losing_team']) & df['tournament'].eq('Friendly')
]

In [106]:
# Checking dataframe
df_friendly_away_losses.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,United Kingdom,False,1873,3,6.0,2.0,High scoring,England,Scotland
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,United Kindgom,False,1874,3,3.0,1.0,Medium scoring,Scotland,England
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,3.0,3.0,Medium scoring,Scotland,England
5,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,4.0,4.0,Medium scoring,Scotland,Wales
8,1878-03-02,Scotland,England,7.0,2.0,Friendly,Glasgow,United Kindgom,False,1878,3,9.0,5.0,High scoring,Scotland,England


In [107]:
# Counting the friendly away-loss rows per away team (count kept in 'losing_team')
df_friendly_away_losses_count = (
    df_friendly_away_losses
    .groupby('away_team', as_index=False)[['losing_team']]
    .count()
)

In [110]:
# Giving the grouping key and the count descriptive names, then previewing the result
df_friendly_away_losses_count = df_friendly_away_losses_count.rename(
    columns={'away_team': 'team', 'losing_team': 'away_friendly_losses'}
)
df_friendly_away_losses_count.head()

Unnamed: 0,team,away_friendly_losses
0,Abkhazia,1
1,Afghanistan,14
2,Albania,27
3,Alderney,2
4,Algeria,34


In [84]:
# Keeping only 'Friendly' tournament rows whose winning_team is the 'draw' marker
df_friendly_draws = df.loc[
    df['winning_team'].eq('draw') & df['tournament'].eq('Friendly')
]

In [85]:
# Checking dataframe
df_friendly_draws.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,United Kindgom,False,1872,11,0.0,0.0,Low scoring,draw,draw
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,United Kingdom,False,1875,3,4.0,0.0,Medium scoring,draw,draw
28,1883-03-17,Northern Ireland,Wales,1.0,1.0,Friendly,Belfast,Ireland,False,1883,3,2.0,0.0,Low scoring,draw,draw
173,1905-04-09,Hungary,Austria,0.0,0.0,Friendly,Budapest,Hungary,False,1905,4,0.0,0.0,Low scoring,draw,draw
185,1906-04-01,Hungary,Czechoslovakia,1.0,1.0,Friendly,Budapest,Hungary,False,1906,4,2.0,0.0,Low scoring,draw,draw


In [86]:
# Counting the drawn friendly rows per home team (count kept in 'winning_team')
df_home_friendly_draws = (
    df_friendly_draws
    .groupby('home_team', as_index=False)[['winning_team']]
    .count()
)

In [87]:
# Giving the grouping key and the count descriptive names, then previewing the result
df_home_friendly_draws = df_home_friendly_draws.rename(
    columns={'home_team': 'team', 'winning_team': 'home_friendly_draws'}
)
df_home_friendly_draws.head()

Unnamed: 0,team,home_friendly_draws
0,Abkhazia,2
1,Afghanistan,2
2,Albania,17
3,Alderney,1
4,Algeria,38


In [88]:
# Counting the drawn friendly rows per away team (count kept in 'winning_team')
df_away_friendly_draws = (
    df_friendly_draws
    .groupby('away_team', as_index=False)[['winning_team']]
    .count()
)

In [89]:
# Giving the grouping key and the count descriptive names, then previewing the result
df_away_friendly_draws = df_away_friendly_draws.rename(
    columns={'away_team': 'team', 'winning_team': 'away_friendly_draws'}
)
df_away_friendly_draws.head()

Unnamed: 0,team,away_friendly_draws
0,Afghanistan,6
1,Albania,13
2,Algeria,26
3,Ambazonia,1
4,Andorra,4


In [90]:
# Combining friendly home-win and away-win counts on 'team'; the outer join keeps teams found in only one table
df_friendly_wins = pd.merge(
    df_friendly_home_wins_count, df_friendly_away_wins_count, on='team', how='outer'
)

In [91]:
df_friendly_wins.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins
0,Abkhazia,2.0,
1,Afghanistan,5.0,3.0
2,Albania,33.0,8.0
3,Algeria,67.0,26.0
4,Andalusia,7.0,1.0


In [92]:
# Checking for null values
df_friendly_wins.isnull().sum()

team                   0
home_friendly_wins    13
away_friendly_wins    35
dtype: int64

In [93]:
# Filling every gap with 0 (a team missing from one side of the merge has no wins of that kind)
df_friendly_wins = df_friendly_wins.fillna(value=0)
# Confirming that no missing values remain
df_friendly_wins.isna().sum()

team                  0
home_friendly_wins    0
away_friendly_wins    0
dtype: int64

In [94]:
# Deriving total_friendly_wins as the sum of the home and away friendly win counts
df_friendly_wins['total_friendly_wins'] = (
    df_friendly_wins['home_friendly_wins'].add(df_friendly_wins['away_friendly_wins'])
)

In [95]:
df_friendly_wins.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins,total_friendly_wins
0,Abkhazia,2.0,0.0,2.0
1,Afghanistan,5.0,3.0,8.0
2,Albania,33.0,8.0,41.0
3,Algeria,67.0,26.0,93.0
4,Andalusia,7.0,1.0,8.0


In [111]:
# Combining friendly home-loss and away-loss counts on 'team'; the outer join keeps teams found in only one table
df_friendly_losses = pd.merge(
    df_friendly_home_losses_count, df_friendly_away_losses_count, on='team', how='outer'
)

In [114]:
df_friendly_losses.head()

Unnamed: 0,team,home_friendly_losses,away_friendly_losses
0,Abkhazia,1.0,1.0
1,Afghanistan,3.0,14.0
2,Albania,16.0,27.0
3,Algeria,35.0,34.0
4,Andalusia,1.0,0.0


In [112]:
# Checking for null values
df_friendly_losses.isnull().sum()

team                     0
home_friendly_losses    26
away_friendly_losses    13
dtype: int64

In [113]:
# Filling every gap with 0 (a team missing from one side of the merge has no losses of that kind)
df_friendly_losses = df_friendly_losses.fillna(value=0)
# Confirming that no missing values remain
df_friendly_losses.isna().sum()

team                    0
home_friendly_losses    0
away_friendly_losses    0
dtype: int64

In [115]:
# Deriving total_friendly_losses as the sum of the home and away friendly loss counts
df_friendly_losses['total_friendly_losses'] = (
    df_friendly_losses['home_friendly_losses'].add(df_friendly_losses['away_friendly_losses'])
)

In [116]:
df_friendly_losses.head()

Unnamed: 0,team,home_friendly_losses,away_friendly_losses,total_friendly_losses
0,Abkhazia,1.0,1.0,2.0
1,Afghanistan,3.0,14.0,17.0
2,Albania,16.0,27.0,43.0
3,Algeria,35.0,34.0,69.0
4,Andalusia,1.0,0.0,1.0


In [117]:
# Combining friendly home-draw and away-draw counts on 'team'; the outer join keeps teams found in only one table
df_friendly_team_draws = pd.merge(
    df_home_friendly_draws, df_away_friendly_draws, on='team', how='outer'
)

In [118]:
df_friendly_team_draws.head()

Unnamed: 0,team,home_friendly_draws,away_friendly_draws
0,Abkhazia,2.0,
1,Afghanistan,2.0,6.0
2,Albania,17.0,13.0
3,Alderney,1.0,
4,Algeria,38.0,26.0


In [119]:
# Checking for missing values
df_friendly_team_draws.isnull().sum()

team                    0
home_friendly_draws    17
away_friendly_draws    20
dtype: int64

In [120]:
# Filling every gap with 0 (a team missing from one side of the merge has no draws of that kind)
df_friendly_team_draws = df_friendly_team_draws.fillna(value=0)
# Confirming that no missing values remain
df_friendly_team_draws.isna().sum()

team                   0
home_friendly_draws    0
away_friendly_draws    0
dtype: int64

In [121]:
# Deriving total_friendly_draws as the sum of the home and away friendly draw counts
df_friendly_team_draws['total_friendly_draws'] = (
    df_friendly_team_draws['home_friendly_draws'].add(df_friendly_team_draws['away_friendly_draws'])
)

In [122]:
df_friendly_team_draws.head()

Unnamed: 0,team,home_friendly_draws,away_friendly_draws,total_friendly_draws
0,Abkhazia,2.0,0.0,2.0
1,Afghanistan,2.0,6.0,8.0
2,Albania,17.0,13.0,30.0
3,Alderney,1.0,0.0,1.0
4,Algeria,38.0,26.0,64.0


In [123]:
# Starting the friendly results dataframe by outer-joining friendly wins and losses on 'team'
# (the friendly draws table is merged in by a later cell)
df_friendly_results = pd.merge(df_friendly_wins, df_friendly_losses, on='team', how='outer')

In [124]:
df_friendly_results.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins,total_friendly_wins,home_friendly_losses,away_friendly_losses,total_friendly_losses
0,Abkhazia,2.0,0.0,2.0,1.0,1.0,2.0
1,Afghanistan,5.0,3.0,8.0,3.0,14.0,17.0
2,Albania,33.0,8.0,41.0,16.0,27.0,43.0
3,Algeria,67.0,26.0,93.0,35.0,34.0,69.0
4,Andalusia,7.0,1.0,8.0,1.0,0.0,1.0


In [125]:
# Checking for null values
df_friendly_results.isnull().sum()

team                      0
home_friendly_wins       21
away_friendly_wins       21
total_friendly_wins      21
home_friendly_losses     13
away_friendly_losses     13
total_friendly_losses    13
dtype: int64

In [126]:
# Filling every remaining gap with 0 (a team missing from one table has none of that result)
df_friendly_results = df_friendly_results.fillna(value=0)
# Confirming that no missing values remain
df_friendly_results.isna().sum()

team                     0
home_friendly_wins       0
away_friendly_wins       0
total_friendly_wins      0
home_friendly_losses     0
away_friendly_losses     0
total_friendly_losses    0
dtype: int64

In [127]:
# Adding the friendly draws columns to the friendly results (outer join on team)
df_friendly_results = pd.merge(df_friendly_results, df_friendly_team_draws, on='team', how='outer')

In [128]:
df_friendly_results.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins,total_friendly_wins,home_friendly_losses,away_friendly_losses,total_friendly_losses,home_friendly_draws,away_friendly_draws,total_friendly_draws
0,Abkhazia,2.0,0.0,2.0,1.0,1.0,2.0,2.0,0.0,2.0
1,Afghanistan,5.0,3.0,8.0,3.0,14.0,17.0,2.0,6.0,8.0
2,Albania,33.0,8.0,41.0,16.0,27.0,43.0,17.0,13.0,30.0
3,Algeria,67.0,26.0,93.0,35.0,34.0,69.0,38.0,26.0,64.0
4,Andalusia,7.0,1.0,8.0,1.0,0.0,1.0,4.0,0.0,4.0


In [129]:
# Checking for null values
df_friendly_results.isnull().sum()

team                      0
home_friendly_wins        4
away_friendly_wins        4
total_friendly_wins       4
home_friendly_losses      4
away_friendly_losses      4
total_friendly_losses     4
home_friendly_draws      38
away_friendly_draws      38
total_friendly_draws     38
dtype: int64

In [130]:
# Replacing the nulls introduced by the outer merge with the draws table with 0,
# then re-counting nulls per column as a check
df_friendly_results.fillna(value=0, inplace=True)
df_friendly_results.isna().sum()

team                     0
home_friendly_wins       0
away_friendly_wins       0
total_friendly_wins      0
home_friendly_losses     0
away_friendly_losses     0
total_friendly_losses    0
home_friendly_draws      0
away_friendly_draws      0
total_friendly_draws     0
dtype: int64

In [131]:
# Calculating home, away, and total friendly games played by each team
# (games played = wins + losses + draws; total = home + away)
df_friendly_results = df_friendly_results.assign(
    home_friendly_games_played=lambda d: (
        d['home_friendly_wins'] + d['home_friendly_losses'] + d['home_friendly_draws']
    ),
    away_friendly_games_played=lambda d: (
        d['away_friendly_wins'] + d['away_friendly_losses'] + d['away_friendly_draws']
    ),
    total_friendly_games_played=lambda d: (
        d['home_friendly_games_played'] + d['away_friendly_games_played']
    ),
)

In [132]:
df_friendly_results.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins,total_friendly_wins,home_friendly_losses,away_friendly_losses,total_friendly_losses,home_friendly_draws,away_friendly_draws,total_friendly_draws,home_friendly_games_played,away_friendly_games_played,total_friendly_games_played
0,Abkhazia,2.0,0.0,2.0,1.0,1.0,2.0,2.0,0.0,2.0,5.0,1.0,6.0
1,Afghanistan,5.0,3.0,8.0,3.0,14.0,17.0,2.0,6.0,8.0,10.0,23.0,33.0
2,Albania,33.0,8.0,41.0,16.0,27.0,43.0,17.0,13.0,30.0,66.0,48.0,114.0
3,Algeria,67.0,26.0,93.0,35.0,34.0,69.0,38.0,26.0,64.0,140.0,86.0,226.0
4,Andalusia,7.0,1.0,8.0,1.0,0.0,1.0,4.0,0.0,4.0,12.0,1.0,13.0


In [136]:
# Calculating the win and unbeaten percent for home teams in friendlies
# win pct = wins / games played; unbeaten pct = (wins + draws) / games played
df_friendly_results['home_friendly_win_pct'] = (
    df_friendly_results['home_friendly_wins']
    .div(df_friendly_results['home_friendly_games_played'])
)
df_friendly_results['home_friendly_unbeaten_pct'] = (
    df_friendly_results['home_friendly_wins']
    .add(df_friendly_results['home_friendly_draws'])
    .div(df_friendly_results['home_friendly_games_played'])
)
df_friendly_results.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins,total_friendly_wins,home_friendly_losses,away_friendly_losses,total_friendly_losses,home_friendly_draws,away_friendly_draws,total_friendly_draws,home_friendly_games_played,away_friendly_games_played,total_friendly_games_played,home_friendly_win_pct,home_friendly_unbeaten_pct
0,Abkhazia,2.0,0.0,2.0,1.0,1.0,2.0,2.0,0.0,2.0,5.0,1.0,6.0,0.4,0.8
1,Afghanistan,5.0,3.0,8.0,3.0,14.0,17.0,2.0,6.0,8.0,10.0,23.0,33.0,0.5,0.7
2,Albania,33.0,8.0,41.0,16.0,27.0,43.0,17.0,13.0,30.0,66.0,48.0,114.0,0.5,0.757576
3,Algeria,67.0,26.0,93.0,35.0,34.0,69.0,38.0,26.0,64.0,140.0,86.0,226.0,0.478571,0.75
4,Andalusia,7.0,1.0,8.0,1.0,0.0,1.0,4.0,0.0,4.0,12.0,1.0,13.0,0.583333,0.916667


In [137]:
# Calculating the win and unbeaten percent for away teams in friendlies
# win pct = wins / games played; unbeaten pct = (wins + draws) / games played
df_friendly_results = df_friendly_results.assign(
    away_friendly_win_pct=lambda d: (
        d['away_friendly_wins'] / d['away_friendly_games_played']
    ),
    away_friendly_unbeaten_pct=lambda d: (
        (d['away_friendly_wins'] + d['away_friendly_draws']) / d['away_friendly_games_played']
    ),
)
df_friendly_results.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins,total_friendly_wins,home_friendly_losses,away_friendly_losses,total_friendly_losses,home_friendly_draws,away_friendly_draws,total_friendly_draws,home_friendly_games_played,away_friendly_games_played,total_friendly_games_played,home_friendly_win_pct,home_friendly_unbeaten_pct,away_friendly_win_pct,away_friendly_unbeaten_pct
0,Abkhazia,2.0,0.0,2.0,1.0,1.0,2.0,2.0,0.0,2.0,5.0,1.0,6.0,0.4,0.8,0.0,0.0
1,Afghanistan,5.0,3.0,8.0,3.0,14.0,17.0,2.0,6.0,8.0,10.0,23.0,33.0,0.5,0.7,0.130435,0.391304
2,Albania,33.0,8.0,41.0,16.0,27.0,43.0,17.0,13.0,30.0,66.0,48.0,114.0,0.5,0.757576,0.166667,0.4375
3,Algeria,67.0,26.0,93.0,35.0,34.0,69.0,38.0,26.0,64.0,140.0,86.0,226.0,0.478571,0.75,0.302326,0.604651
4,Andalusia,7.0,1.0,8.0,1.0,0.0,1.0,4.0,0.0,4.0,12.0,1.0,13.0,0.583333,0.916667,1.0,1.0


In [138]:
# Calculating total (home + away) win and unbeaten percent for friendly matches
# win pct = wins / games played; unbeaten pct = (wins + draws) / games played
df_friendly_results['friendly_win_pct'] = (
    df_friendly_results['total_friendly_wins']
    .div(df_friendly_results['total_friendly_games_played'])
)
df_friendly_results['friendly_unbeaten_pct'] = (
    df_friendly_results['total_friendly_wins']
    .add(df_friendly_results['total_friendly_draws'])
    .div(df_friendly_results['total_friendly_games_played'])
)
df_friendly_results.head()

Unnamed: 0,team,home_friendly_wins,away_friendly_wins,total_friendly_wins,home_friendly_losses,away_friendly_losses,total_friendly_losses,home_friendly_draws,away_friendly_draws,total_friendly_draws,home_friendly_games_played,away_friendly_games_played,total_friendly_games_played,home_friendly_win_pct,home_friendly_unbeaten_pct,away_friendly_win_pct,away_friendly_unbeaten_pct,friendly_win_pct,friendly_unbeaten_pct
0,Abkhazia,2.0,0.0,2.0,1.0,1.0,2.0,2.0,0.0,2.0,5.0,1.0,6.0,0.4,0.8,0.0,0.0,0.333333,0.666667
1,Afghanistan,5.0,3.0,8.0,3.0,14.0,17.0,2.0,6.0,8.0,10.0,23.0,33.0,0.5,0.7,0.130435,0.391304,0.242424,0.484848
2,Albania,33.0,8.0,41.0,16.0,27.0,43.0,17.0,13.0,30.0,66.0,48.0,114.0,0.5,0.757576,0.166667,0.4375,0.359649,0.622807
3,Algeria,67.0,26.0,93.0,35.0,34.0,69.0,38.0,26.0,64.0,140.0,86.0,226.0,0.478571,0.75,0.302326,0.604651,0.411504,0.69469
4,Andalusia,7.0,1.0,8.0,1.0,0.0,1.0,4.0,0.0,4.0,12.0,1.0,13.0,0.583333,0.916667,1.0,1.0,0.615385,0.923077


In [139]:
# Merging the friendly results columns into the team results (outer join on team)
df_team_results = pd.merge(df_team_results, df_friendly_results, on='team', how='outer')

In [140]:
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,total_friendly_draws,home_friendly_games_played,away_friendly_games_played,total_friendly_games_played,home_friendly_win_pct,home_friendly_unbeaten_pct,away_friendly_win_pct,away_friendly_unbeaten_pct,friendly_win_pct,friendly_unbeaten_pct
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,2.0,5.0,1.0,6.0,0.4,0.8,0.0,0.0,0.333333,0.666667
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,8.0,10.0,23.0,33.0,0.5,0.7,0.130435,0.391304,0.242424,0.484848
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,30.0,66.0,48.0,114.0,0.5,0.757576,0.166667,0.4375,0.359649,0.622807
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,1.0,1.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,0.333333
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,64.0,140.0,86.0,226.0,0.478571,0.75,0.302326,0.604651,0.411504,0.69469


In [141]:
# Checking for missing values
df_team_results.isnull().sum()

team                            0
home_wins                       0
away_wins                       0
total_wins                      0
home_losses                     0
away_losses                     0
total_losses                    0
home_draws                      0
away_draws                      0
total_draws                     0
home_games_played               0
away_games_played               0
total_games_played              0
home_win_pct                   10
home_unbeaten_pct              10
away_win_pct                   15
away_unbeaten_pct              15
win_pct                         0
unbeaten_pct                    0
home_friendly_wins             48
away_friendly_wins             48
total_friendly_wins            48
home_friendly_losses           48
away_friendly_losses           48
total_friendly_losses          48
home_friendly_draws            48
away_friendly_draws            48
total_friendly_draws           48
home_friendly_games_played     48
away_friendly_

In [142]:
# Replacing every remaining null in the team results with 0,
# then re-counting nulls per column as a check
df_team_results.fillna(value=0, inplace=True)
df_team_results.isna().sum()

team                           0
home_wins                      0
away_wins                      0
total_wins                     0
home_losses                    0
away_losses                    0
total_losses                   0
home_draws                     0
away_draws                     0
total_draws                    0
home_games_played              0
away_games_played              0
total_games_played             0
home_win_pct                   0
home_unbeaten_pct              0
away_win_pct                   0
away_unbeaten_pct              0
win_pct                        0
unbeaten_pct                   0
home_friendly_wins             0
away_friendly_wins             0
total_friendly_wins            0
home_friendly_losses           0
away_friendly_losses           0
total_friendly_losses          0
home_friendly_draws            0
away_friendly_draws            0
total_friendly_draws           0
home_friendly_games_played     0
away_friendly_games_played     0
total_frie

In [143]:
# Calculating the wins, losses, draws, and total games played in tournaments:
# each tournament count is the overall total minus the friendly total
for stat_name in ('wins', 'losses', 'draws', 'games_played'):
    df_team_results[f'tournament_{stat_name}'] = (
        df_team_results[f'total_{stat_name}']
        .sub(df_team_results[f'total_friendly_{stat_name}'])
    )
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,home_friendly_win_pct,home_friendly_unbeaten_pct,away_friendly_win_pct,away_friendly_unbeaten_pct,friendly_win_pct,friendly_unbeaten_pct,tournament_wins,tournament_losses,tournament_draws,tournament_games_played
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,0.4,0.8,0.0,0.0,0.333333,0.666667,12.0,3.0,11.0,26.0
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,0.5,0.7,0.130435,0.391304,0.242424,0.484848,27.0,52.0,23.0,102.0
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,0.5,0.757576,0.166667,0.4375,0.359649,0.622807,59.0,150.0,50.0,259.0
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.333333,5.0,126.0,1.0,132.0
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,0.478571,0.75,0.302326,0.604651,0.411504,0.69469,171.0,94.0,93.0,358.0


In [144]:
# Calculating the win and unbeaten percentage for teams in tournaments
# win pct = wins / games played; unbeaten pct = (wins + draws) / games played
df_team_results['tournament_win_pct'] = (
    df_team_results['tournament_wins']
    .div(df_team_results['tournament_games_played'])
)
df_team_results['tournament_unbeaten_pct'] = (
    df_team_results['tournament_wins']
    .add(df_team_results['tournament_draws'])
    .div(df_team_results['tournament_games_played'])
)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,away_friendly_win_pct,away_friendly_unbeaten_pct,friendly_win_pct,friendly_unbeaten_pct,tournament_wins,tournament_losses,tournament_draws,tournament_games_played,tournament_win_pct,tournament_unbeaten_pct
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,0.0,0.0,0.333333,0.666667,12.0,3.0,11.0,26.0,0.461538,0.884615
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,0.130435,0.391304,0.242424,0.484848,27.0,52.0,23.0,102.0,0.264706,0.490196
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,0.166667,0.4375,0.359649,0.622807,59.0,150.0,50.0,259.0,0.227799,0.420849
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.333333,5.0,126.0,1.0,132.0,0.037879,0.045455
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,0.302326,0.604651,0.411504,0.69469,171.0,94.0,93.0,358.0,0.477654,0.73743


### 04. Aggregating Goals For/Against

In [160]:
# Aggregating, per home team, the sum and mean of goals scored (home_score)
# and goals conceded (away_score)
df_home_goals = (
    df.groupby(by='home_team', as_index=False)[['home_score', 'away_score']]
    .aggregate(['sum', 'mean'])
)

In [161]:
df_home_goals.head()

Unnamed: 0_level_0,home_team,home_score,home_score,away_score,away_score
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
0,Abkhazia,39.0,1.772727,13.0,0.590909
1,Afghanistan,55.0,1.222222,69.0,1.533333
2,Albania,224.0,1.137056,220.0,1.116751
3,Alderney,20.0,0.416667,199.0,4.145833
4,Algeria,626.0,1.87988,281.0,0.843844


In [162]:
# Flattening the two-level column header: keep only the inner level
# (the aggregation names) so the columns can be renamed in a later cell
df_home_goals.columns = df_home_goals.columns.get_level_values(1)

Unnamed: 0,index,Unnamed: 2,sum,mean,sum.1,mean.1
0,0,Abkhazia,39.0,1.772727,13.0,0.590909
1,1,Afghanistan,55.0,1.222222,69.0,1.533333
2,2,Albania,224.0,1.137056,220.0,1.116751
3,3,Alderney,20.0,0.416667,199.0,4.145833
4,4,Algeria,626.0,1.879880,281.0,0.843844
...,...,...,...,...,...,...
323,323,Zambia,625.0,1.917178,282.0,0.865031
324,324,Zanzibar,63.0,1.086207,100.0,1.724138
325,325,Zimbabwe,317.0,1.625641,188.0,0.964103
326,326,Åland,3.0,0.750000,7.0,1.750000


In [164]:
df_home_goals.head()

Unnamed: 0,Unnamed: 1,sum,mean,sum.1,mean.1
0,Abkhazia,39.0,1.772727,13.0,0.590909
1,Afghanistan,55.0,1.222222,69.0,1.533333
2,Albania,224.0,1.137056,220.0,1.116751
3,Alderney,20.0,0.416667,199.0,4.145833
4,Algeria,626.0,1.87988,281.0,0.843844


In [167]:
# Renaming column headers positionally: team, then sum/mean of goals for,
# then sum/mean of goals against
df_home_goals = df_home_goals.set_axis(
    [
        'team',
        'home_goals_for',
        'average_home_goals_for',
        'home_goals_against',
        'average_home_goals_against',
    ],
    axis='columns',
)
df_home_goals.head()

Unnamed: 0,team,home_goals_for,average_home_goals_for,home_goals_against,average_home_goals_against
0,Abkhazia,39.0,1.772727,13.0,0.590909
1,Afghanistan,55.0,1.222222,69.0,1.533333
2,Albania,224.0,1.137056,220.0,1.116751
3,Alderney,20.0,0.416667,199.0,4.145833
4,Algeria,626.0,1.87988,281.0,0.843844


In [168]:
# Calculating the total and average goal differences for home teams
# (goals for minus goals against)
df_home_goals['home_goal_difference'] = (
    df_home_goals['home_goals_for'].sub(df_home_goals['home_goals_against'])
)
df_home_goals['average_home_goal_difference'] = (
    df_home_goals['average_home_goals_for'].sub(df_home_goals['average_home_goals_against'])
)
df_home_goals.head()

Unnamed: 0,team,home_goals_for,average_home_goals_for,home_goals_against,average_home_goals_against,home_goal_difference,average_home_goal_difference
0,Abkhazia,39.0,1.772727,13.0,0.590909,26.0,1.181818
1,Afghanistan,55.0,1.222222,69.0,1.533333,-14.0,-0.311111
2,Albania,224.0,1.137056,220.0,1.116751,4.0,0.020305
3,Alderney,20.0,0.416667,199.0,4.145833,-179.0,-3.729167
4,Algeria,626.0,1.87988,281.0,0.843844,345.0,1.036036


In [169]:
# Aggregating, per away team, the sum and mean of goals scored (away_score)
# and goals conceded (home_score)
df_away_goals = (
    df.groupby(by='away_team', as_index=False)[['away_score', 'home_score']]
    .aggregate(['sum', 'mean'])
)

In [170]:
df_away_goals.head()

Unnamed: 0_level_0,away_team,away_score,away_score,home_score,home_score
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
0,Abkhazia,12.0,1.2,13.0,1.3
1,Afghanistan,83.0,0.922222,208.0,2.311111
2,Albania,127.0,0.721591,345.0,1.960227
3,Alderney,53.0,0.609195,421.0,4.83908
4,Algeria,252.0,1.003984,312.0,1.243028


In [171]:
# Flattening the two-level column header: keep only the inner level
# (the aggregation names) so the columns can be renamed in a later cell
df_away_goals.columns = df_away_goals.columns.get_level_values(1)

In [172]:
df_away_goals.head()

Unnamed: 0,Unnamed: 1,sum,mean,sum.1,mean.1
0,Abkhazia,12.0,1.2,13.0,1.3
1,Afghanistan,83.0,0.922222,208.0,2.311111
2,Albania,127.0,0.721591,345.0,1.960227
3,Alderney,53.0,0.609195,421.0,4.83908
4,Algeria,252.0,1.003984,312.0,1.243028


In [173]:
# Renaming column headers positionally: team, then sum/mean of goals for,
# then sum/mean of goals against
df_away_goals = df_away_goals.set_axis(
    [
        'team',
        'away_goals_for',
        'average_away_goals_for',
        'away_goals_against',
        'average_away_goals_against',
    ],
    axis='columns',
)
df_away_goals.head()

Unnamed: 0,team,away_goals_for,average_away_goals_for,away_goals_against,average_away_goals_against
0,Abkhazia,12.0,1.2,13.0,1.3
1,Afghanistan,83.0,0.922222,208.0,2.311111
2,Albania,127.0,0.721591,345.0,1.960227
3,Alderney,53.0,0.609195,421.0,4.83908
4,Algeria,252.0,1.003984,312.0,1.243028


In [174]:
# Calculating the total and average goal differences for away teams
# (goals for minus goals against)
df_away_goals['away_goal_difference'] = (
    df_away_goals['away_goals_for'].sub(df_away_goals['away_goals_against'])
)
df_away_goals['average_away_goal_difference'] = (
    df_away_goals['average_away_goals_for'].sub(df_away_goals['average_away_goals_against'])
)
df_away_goals.head()

Unnamed: 0,team,away_goals_for,average_away_goals_for,away_goals_against,average_away_goals_against,away_goal_difference,average_away_goal_difference
0,Abkhazia,12.0,1.2,13.0,1.3,-1.0,-0.1
1,Afghanistan,83.0,0.922222,208.0,2.311111,-125.0,-1.388889
2,Albania,127.0,0.721591,345.0,1.960227,-218.0,-1.238636
3,Alderney,53.0,0.609195,421.0,4.83908,-368.0,-4.229885
4,Algeria,252.0,1.003984,312.0,1.243028,-60.0,-0.239044


In [175]:
# Combining home and away goal statistics into one row per team
# (outer join, so teams present on only one side are kept)
df_goals = pd.merge(df_home_goals, df_away_goals, on='team', how='outer')
df_goals.head()

Unnamed: 0,team,home_goals_for,average_home_goals_for,home_goals_against,average_home_goals_against,home_goal_difference,average_home_goal_difference,away_goals_for,average_away_goals_for,away_goals_against,average_away_goals_against,away_goal_difference,average_away_goal_difference
0,Abkhazia,39.0,1.772727,13.0,0.590909,26.0,1.181818,12.0,1.2,13.0,1.3,-1.0,-0.1
1,Afghanistan,55.0,1.222222,69.0,1.533333,-14.0,-0.311111,83.0,0.922222,208.0,2.311111,-125.0,-1.388889
2,Albania,224.0,1.137056,220.0,1.116751,4.0,0.020305,127.0,0.721591,345.0,1.960227,-218.0,-1.238636
3,Alderney,20.0,0.416667,199.0,4.145833,-179.0,-3.729167,53.0,0.609195,421.0,4.83908,-368.0,-4.229885
4,Algeria,626.0,1.87988,281.0,0.843844,345.0,1.036036,252.0,1.003984,312.0,1.243028,-60.0,-0.239044


In [176]:
# Checking for missing values
df_goals.isnull().sum()

team                             0
home_goals_for                  10
average_home_goals_for          10
home_goals_against              10
average_home_goals_against      10
home_goal_difference            10
average_home_goal_difference    10
away_goals_for                  15
average_away_goals_for          15
away_goals_against              15
average_away_goals_against      15
away_goal_difference            15
average_away_goal_difference    15
dtype: int64

In [177]:
# Replacing null values (teams missing from the home or the away side of the
# outer merge) with 0, then re-counting nulls per column as a check
df_goals.fillna(value=0, inplace=True)
df_goals.isna().sum()

team                            0
home_goals_for                  0
average_home_goals_for          0
home_goals_against              0
average_home_goals_against      0
home_goal_difference            0
average_home_goal_difference    0
away_goals_for                  0
average_away_goals_for          0
away_goals_against              0
average_away_goals_against      0
away_goal_difference            0
average_away_goal_difference    0
dtype: int64

In [178]:
# Adding the goals columns to the team results dataframe (outer join on team)
df_team_results = pd.merge(df_team_results, df_goals, on='team', how='outer')
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,home_goals_against,average_home_goals_against,home_goal_difference,average_home_goal_difference,away_goals_for,average_away_goals_for,away_goals_against,average_away_goals_against,away_goal_difference,average_away_goal_difference
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,13.0,0.590909,26.0,1.181818,12.0,1.2,13.0,1.3,-1.0,-0.1
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,69.0,1.533333,-14.0,-0.311111,83.0,0.922222,208.0,2.311111,-125.0,-1.388889
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,220.0,1.116751,4.0,0.020305,127.0,0.721591,345.0,1.960227,-218.0,-1.238636
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,199.0,4.145833,-179.0,-3.729167,53.0,0.609195,421.0,4.83908,-368.0,-4.229885
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,281.0,0.843844,345.0,1.036036,252.0,1.003984,312.0,1.243028,-60.0,-0.239044


In [179]:
# Checking for missing values
df_team_results.isnull().sum()

team                             0
home_wins                        0
away_wins                        0
total_wins                       0
home_losses                      0
away_losses                      0
total_losses                     0
home_draws                       0
away_draws                       0
total_draws                      0
home_games_played                0
away_games_played                0
total_games_played               0
home_win_pct                     0
home_unbeaten_pct                0
away_win_pct                     0
away_unbeaten_pct                0
win_pct                          0
unbeaten_pct                     0
home_friendly_wins               0
away_friendly_wins               0
total_friendly_wins              0
home_friendly_losses             0
away_friendly_losses             0
total_friendly_losses            0
home_friendly_draws              0
away_friendly_draws              0
total_friendly_draws             0
home_friendly_games_

In [181]:
# Replacing any null in the team results with 0,
# then re-counting nulls per column as a check
df_team_results.fillna(value=0, inplace=True)
df_team_results.isna().sum()

team                            0
home_wins                       0
away_wins                       0
total_wins                      0
home_losses                     0
away_losses                     0
total_losses                    0
home_draws                      0
away_draws                      0
total_draws                     0
home_games_played               0
away_games_played               0
total_games_played              0
home_win_pct                    0
home_unbeaten_pct               0
away_win_pct                    0
away_unbeaten_pct               0
win_pct                         0
unbeaten_pct                    0
home_friendly_wins              0
away_friendly_wins              0
total_friendly_wins             0
home_friendly_losses            0
away_friendly_losses            0
total_friendly_losses           0
home_friendly_draws             0
away_friendly_draws             0
total_friendly_draws            0
home_friendly_games_played      0
away_friendly_

In [183]:
# Calculating total and average goals for, goals against, and goal difference
# Totals are home + away; averages divide the total by total_games_played
df_team_results = df_team_results.assign(
    goals_for=lambda d: d['home_goals_for'] + d['away_goals_for'],
    goals_against=lambda d: d['home_goals_against'] + d['away_goals_against'],
    average_goals_for=lambda d: d['goals_for'] / d['total_games_played'],
    average_goals_against=lambda d: d['goals_against'] / d['total_games_played'],
    goal_difference=lambda d: d['home_goal_difference'] + d['away_goal_difference'],
    average_goal_difference=lambda d: d['goal_difference'] / d['total_games_played'],
)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,away_goals_against,average_away_goals_against,away_goal_difference,average_away_goal_difference,goals_for,average_goals_for,goal_difference,average_goal_difference,goals_against,average_goals_against
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,13.0,1.3,-1.0,-0.1,51.0,1.59375,25.0,0.78125,26.0,0.8125
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,208.0,2.311111,-125.0,-1.388889,138.0,1.022222,-139.0,-1.02963,277.0,2.051852
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,345.0,1.960227,-218.0,-1.238636,351.0,0.941019,-214.0,-0.573727,565.0,1.514745
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,421.0,4.83908,-368.0,-4.229885,73.0,0.540741,-547.0,-4.051852,620.0,4.592593
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,312.0,1.243028,-60.0,-0.239044,878.0,1.503425,285.0,0.488014,593.0,1.015411


In [184]:
# Filtering to keep only the matches whose tournament is 'Friendly'
df_frendlies = df[df['tournament'].eq('Friendly')]
df_frendlies.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,total_goals,goal_diff,scoring_type,winning_team,losing_team
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,United Kindgom,False,1872,11,0.0,0.0,Low scoring,draw,draw
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,United Kingdom,False,1873,3,6.0,2.0,High scoring,England,Scotland
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,United Kindgom,False,1874,3,3.0,1.0,Medium scoring,Scotland,England
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,United Kingdom,False,1875,3,4.0,0.0,Medium scoring,draw,draw
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,United Kindgom,False,1876,3,3.0,3.0,Medium scoring,Scotland,England


In [190]:
# Aggregating, per home team in friendlies, the sum and mean of goals scored
# (home_score) and goals conceded (away_score)
df_friendly_home_goals = (
    df_frendlies.groupby(by='home_team', as_index=False)[['home_score', 'away_score']]
    .aggregate(['sum', 'mean'])
)
df_friendly_home_goals.head()

Unnamed: 0_level_0,home_team,home_score,home_score,away_score,away_score
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
0,Abkhazia,4.0,0.8,3.0,0.6
1,Afghanistan,11.0,1.1,11.0,1.1
2,Albania,90.0,1.363636,53.0,0.80303
3,Alderney,1.0,1.0,1.0,1.0
4,Algeria,223.0,1.592857,136.0,0.971429


In [191]:
# Renaming columns: first flatten the two-level header to its inner level,
# then assign the final names positionally
df_friendly_home_goals.columns = df_friendly_home_goals.columns.get_level_values(1)
df_friendly_home_goals = df_friendly_home_goals.set_axis(
    [
        'team',
        'friendly_home_goals_for',
        'average_friendly_home_goals_for',
        'friendly_home_goals_against',
        'average_friendly_home_goals_against',
    ],
    axis='columns',
)
df_friendly_home_goals.head()

Unnamed: 0,team,friendly_home_goals_for,average_friendly_home_goals_for,friendly_home_goals_against,average_friendly_home_goals_against
0,Abkhazia,4.0,0.8,3.0,0.6
1,Afghanistan,11.0,1.1,11.0,1.1
2,Albania,90.0,1.363636,53.0,0.80303
3,Alderney,1.0,1.0,1.0,1.0
4,Algeria,223.0,1.592857,136.0,0.971429


In [192]:
# Calculating total and average home goal difference for friendlies
# (goals for minus goals against)
df_friendly_home_goals['friendly_home_goal_difference'] = (
    df_friendly_home_goals['friendly_home_goals_for']
    .sub(df_friendly_home_goals['friendly_home_goals_against'])
)
df_friendly_home_goals['average_friendly_home_goal_difference'] = (
    df_friendly_home_goals['average_friendly_home_goals_for']
    .sub(df_friendly_home_goals['average_friendly_home_goals_against'])
)
df_friendly_home_goals.head()

Unnamed: 0,team,friendly_home_goals_for,average_friendly_home_goals_for,friendly_home_goals_against,average_friendly_home_goals_against,friendly_home_goal_difference,average_friendly_home_goal_difference
0,Abkhazia,4.0,0.8,3.0,0.6,1.0,0.2
1,Afghanistan,11.0,1.1,11.0,1.1,0.0,0.0
2,Albania,90.0,1.363636,53.0,0.80303,37.0,0.560606
3,Alderney,1.0,1.0,1.0,1.0,0.0,0.0
4,Algeria,223.0,1.592857,136.0,0.971429,87.0,0.621429


In [193]:
# Aggregating, per away team in friendlies, the sum and mean of goals scored
# (away_score) and goals conceded (home_score)
df_friendly_away_goals = (
    df_frendlies.groupby(by='away_team', as_index=False)[['away_score', 'home_score']]
    .aggregate(['sum', 'mean'])
)
df_friendly_away_goals.head()

Unnamed: 0_level_0,away_team,away_score,away_score,home_score,home_score
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
0,Abkhazia,0.0,0.0,3.0,3.0
1,Afghanistan,18.0,0.782609,38.0,1.652174
2,Albania,45.0,0.9375,87.0,1.8125
3,Alderney,2.0,1.0,11.0,5.5
4,Algeria,74.0,0.860465,103.0,1.197674


In [194]:
# Renaming columns: first flatten the two-level header to its inner level,
# then assign the final names positionally
df_friendly_away_goals.columns = df_friendly_away_goals.columns.get_level_values(1)
df_friendly_away_goals = df_friendly_away_goals.set_axis(
    [
        'team',
        'friendly_away_goals_for',
        'average_friendly_away_goals_for',
        'friendly_away_goals_against',
        'average_friendly_away_goals_against',
    ],
    axis='columns',
)
df_friendly_away_goals.head()

Unnamed: 0,team,friendly_away_goals_for,average_friendly_away_goals_for,friendly_away_goals_against,average_friendly_away_goals_against
0,Abkhazia,0.0,0.0,3.0,3.0
1,Afghanistan,18.0,0.782609,38.0,1.652174
2,Albania,45.0,0.9375,87.0,1.8125
3,Alderney,2.0,1.0,11.0,5.5
4,Algeria,74.0,0.860465,103.0,1.197674


In [195]:
# Calculating total and average away goal difference for friendlies
# (goals for minus goals against)
df_friendly_away_goals['friendly_away_goal_difference'] = (
    df_friendly_away_goals['friendly_away_goals_for']
    .sub(df_friendly_away_goals['friendly_away_goals_against'])
)
df_friendly_away_goals['average_friendly_away_goal_difference'] = (
    df_friendly_away_goals['average_friendly_away_goals_for']
    .sub(df_friendly_away_goals['average_friendly_away_goals_against'])
)
df_friendly_away_goals.head()

Unnamed: 0,team,friendly_away_goals_for,average_friendly_away_goals_for,friendly_away_goals_against,average_friendly_away_goals_against,friendly_away_goal_difference,average_friendly_away_goal_difference
0,Abkhazia,0.0,0.0,3.0,3.0,-3.0,-3.0
1,Afghanistan,18.0,0.782609,38.0,1.652174,-20.0,-0.869565
2,Albania,45.0,0.9375,87.0,1.8125,-42.0,-0.875
3,Alderney,2.0,1.0,11.0,5.5,-9.0,-4.5
4,Algeria,74.0,0.860465,103.0,1.197674,-29.0,-0.337209


In [197]:
# Combining home and away friendly goal statistics into one row per team
# (outer join, so teams present on only one side are kept)
df_friendly_goals = pd.merge(df_friendly_home_goals, df_friendly_away_goals, on='team', how='outer')
df_friendly_goals.head()

Unnamed: 0,team,friendly_home_goals_for,average_friendly_home_goals_for,friendly_home_goals_against,average_friendly_home_goals_against,friendly_home_goal_difference,average_friendly_home_goal_difference,friendly_away_goals_for,average_friendly_away_goals_for,friendly_away_goals_against,average_friendly_away_goals_against,friendly_away_goal_difference,average_friendly_away_goal_difference
0,Abkhazia,4.0,0.8,3.0,0.6,1.0,0.2,0.0,0.0,3.0,3.0,-3.0,-3.0
1,Afghanistan,11.0,1.1,11.0,1.1,0.0,0.0,18.0,0.782609,38.0,1.652174,-20.0,-0.869565
2,Albania,90.0,1.363636,53.0,0.80303,37.0,0.560606,45.0,0.9375,87.0,1.8125,-42.0,-0.875
3,Alderney,1.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,11.0,5.5,-9.0,-4.5
4,Algeria,223.0,1.592857,136.0,0.971429,87.0,0.621429,74.0,0.860465,103.0,1.197674,-29.0,-0.337209


In [198]:
# Checking for missing values
df_friendly_goals.isnull().sum()

team                                      0
friendly_home_goals_for                  14
average_friendly_home_goals_for          14
friendly_home_goals_against              14
average_friendly_home_goals_against      14
friendly_home_goal_difference            14
average_friendly_home_goal_difference    14
friendly_away_goals_for                  16
average_friendly_away_goals_for          16
friendly_away_goals_against              16
average_friendly_away_goals_against      16
friendly_away_goal_difference            16
average_friendly_away_goal_difference    16
dtype: int64

In [199]:
# Replacing null values (teams missing from the home or the away side of the
# outer merge) with 0, then re-counting nulls per column as a check
df_friendly_goals.fillna(value=0, inplace=True)
df_friendly_goals.isna().sum()

team                                     0
friendly_home_goals_for                  0
average_friendly_home_goals_for          0
friendly_home_goals_against              0
average_friendly_home_goals_against      0
friendly_home_goal_difference            0
average_friendly_home_goal_difference    0
friendly_away_goals_for                  0
average_friendly_away_goals_for          0
friendly_away_goals_against              0
average_friendly_away_goals_against      0
friendly_away_goal_difference            0
average_friendly_away_goal_difference    0
dtype: int64

In [200]:
# Joining the friendly goal stats onto the team results by team name
# (outer join keeps teams that appear in only one of the two dataframes)
df_team_results = pd.merge(
    df_team_results,
    df_friendly_goals,
    on='team',
    how='outer',
)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,friendly_home_goals_against,average_friendly_home_goals_against,friendly_home_goal_difference,average_friendly_home_goal_difference,friendly_away_goals_for,average_friendly_away_goals_for,friendly_away_goals_against,average_friendly_away_goals_against,friendly_away_goal_difference,average_friendly_away_goal_difference
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,3.0,0.6,1.0,0.2,0.0,0.0,3.0,3.0,-3.0,-3.0
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,11.0,1.1,0.0,0.0,18.0,0.782609,38.0,1.652174,-20.0,-0.869565
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,53.0,0.80303,37.0,0.560606,45.0,0.9375,87.0,1.8125,-42.0,-0.875
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,1.0,1.0,0.0,0.0,2.0,1.0,11.0,5.5,-9.0,-4.5
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,136.0,0.971429,87.0,0.621429,74.0,0.860465,103.0,1.197674,-29.0,-0.337209


In [201]:
# Counting missing values per column after the merge
df_team_results.isna().sum()

team                                      0
home_wins                                 0
away_wins                                 0
total_wins                                0
home_losses                               0
                                         ..
average_friendly_away_goals_for          48
friendly_away_goals_against              48
average_friendly_away_goals_against      48
friendly_away_goal_difference            48
average_friendly_away_goal_difference    48
Length: 73, dtype: int64

In [203]:
# Filling every missing value with 0 (in place), then re-counting nulls to confirm none remain
df_team_results.fillna(value=0, inplace=True)
df_team_results.isna().sum()

team                                     0
home_wins                                0
away_wins                                0
total_wins                               0
home_losses                              0
                                        ..
average_friendly_away_goals_for          0
friendly_away_goals_against              0
average_friendly_away_goals_against      0
friendly_away_goal_difference            0
average_friendly_away_goal_difference    0
Length: 73, dtype: int64

In [6]:
# Importing team results dataframe
team_results_file = os.path.join(path, '02 Data', 'Prepared Data', 'team_results.pkl')
df_team_results = pd.read_pickle(team_results_file)
df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,friendly_home_goals_against,average_friendly_home_goals_against,friendly_home_goal_difference,average_friendly_home_goal_difference,friendly_away_goals_for,average_friendly_away_goals_for,friendly_away_goals_against,average_friendly_away_goals_against,friendly_away_goal_difference,average_friendly_away_goal_difference
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,3.0,0.6,1.0,0.2,0.0,0.0,3.0,3.0,-3.0,-3.0
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,11.0,1.1,0.0,0.0,18.0,0.782609,38.0,1.652174,-20.0,-0.869565
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,53.0,0.80303,37.0,0.560606,45.0,0.9375,87.0,1.8125,-42.0,-0.875
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,1.0,1.0,0.0,0.0,2.0,1.0,11.0,5.5,-9.0,-4.5
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,136.0,0.971429,87.0,0.621429,74.0,0.860465,103.0,1.197674,-29.0,-0.337209


In [7]:
# Listing the column names of the team results dataframe
df_team_results.columns

Index(['team', 'home_wins', 'away_wins', 'total_wins', 'home_losses',
       'away_losses', 'total_losses', 'home_draws', 'away_draws',
       'total_draws', 'home_games_played', 'away_games_played',
       'total_games_played', 'home_win_pct', 'home_unbeaten_pct',
       'away_win_pct', 'away_unbeaten_pct', 'win_pct', 'unbeaten_pct',
       'home_friendly_wins', 'away_friendly_wins', 'total_friendly_wins',
       'home_friendly_losses', 'away_friendly_losses', 'total_friendly_losses',
       'home_friendly_draws', 'away_friendly_draws', 'total_friendly_draws',
       'home_friendly_games_played', 'away_friendly_games_played',
       'total_friendly_games_played', 'home_friendly_win_pct',
       'home_friendly_unbeaten_pct', 'away_friendly_win_pct',
       'away_friendly_unbeaten_pct', 'friendly_win_pct',
       'friendly_unbeaten_pct', 'tournament_wins', 'tournament_losses',
       'tournament_draws', 'tournament_games_played', 'tournament_win_pct',
       'tournament_unbeaten_pct', '

In [8]:
# Calculating total and average goals and goal difference for friendlies
# Where total_friendly_games_played is 0, a plain division yields 0/0 = NaN.
# Those averages are set to 0 instead, matching the fill-with-0 treatment this
# notebook already applies to missing friendly stats.
friendly_games = df_team_results['total_friendly_games_played']
has_friendlies = friendly_games != 0

df_team_results['friendly_goals_for'] = df_team_results['friendly_home_goals_for'] + df_team_results['friendly_away_goals_for']
df_team_results['average_friendly_goals_for'] = (df_team_results['friendly_goals_for'] / friendly_games).where(has_friendlies, 0)

df_team_results['friendly_goals_against'] = df_team_results['friendly_home_goals_against'] + df_team_results['friendly_away_goals_against']
df_team_results['average_friendly_goals_against'] = (df_team_results['friendly_goals_against'] / friendly_games).where(has_friendlies, 0)

df_team_results['friendly_goal_difference'] = df_team_results['friendly_home_goal_difference'] + df_team_results['friendly_away_goal_difference']
# NOTE: this column name puts 'friendly' before 'average', unlike the other
# average_* columns; it is kept unchanged so existing consumers keep working.
df_team_results['friendly_average_goal_difference'] = (df_team_results['friendly_goal_difference'] / friendly_games).where(has_friendlies, 0)

df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,friendly_away_goals_against,average_friendly_away_goals_against,friendly_away_goal_difference,average_friendly_away_goal_difference,friendly_goals_for,average_friendly_goals_for,friendly_goals_against,average_friendly_goals_against,friendly_goal_difference,friendly_average_goal_difference
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,3.0,3.0,-3.0,-3.0,4.0,0.666667,6.0,1.0,-2.0,-0.333333
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,38.0,1.652174,-20.0,-0.869565,29.0,0.878788,49.0,1.484848,-20.0,-0.606061
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,87.0,1.8125,-42.0,-0.875,135.0,1.184211,140.0,1.22807,-5.0,-0.04386
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,11.0,5.5,-9.0,-4.5,3.0,1.0,12.0,4.0,-9.0,-3.0
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,103.0,1.197674,-29.0,-0.337209,297.0,1.314159,239.0,1.057522,58.0,0.256637


In [12]:
# Calculating tournament goals for/against and goal difference
# Tournament totals are the overall totals minus the friendly totals.
# Where tournament_games_played is 0, a plain division yields 0/0 = NaN.
# Those averages are set to 0 instead, matching the fill-with-0 treatment this
# notebook already applies to missing stats.
tournament_games = df_team_results['tournament_games_played']
has_tournament_games = tournament_games != 0

df_team_results['tournament_goals_for'] = df_team_results['goals_for'] - df_team_results['friendly_goals_for']
df_team_results['average_tournament_goals_for'] = (df_team_results['tournament_goals_for'] / tournament_games).where(has_tournament_games, 0)

df_team_results['tournament_goals_against'] = df_team_results['goals_against'] - df_team_results['friendly_goals_against']
df_team_results['average_tournament_goals_against'] = (df_team_results['tournament_goals_against'] / tournament_games).where(has_tournament_games, 0)

df_team_results['tournament_goal_difference'] = df_team_results['tournament_goals_for'] - df_team_results['tournament_goals_against']
df_team_results['average_tournament_goal_difference'] = (df_team_results['tournament_goal_difference'] / tournament_games).where(has_tournament_games, 0)

df_team_results.head()

Unnamed: 0,team,home_wins,away_wins,total_wins,home_losses,away_losses,total_losses,home_draws,away_draws,total_draws,...,friendly_goals_against,average_friendly_goals_against,friendly_goal_difference,friendly_average_goal_difference,tournament_goals_for,average_tournament_goals_for,tournament_goals_against,average_tournament_goals_against,tournament_goal_difference,average_tournament_goal_difference
0,Abkhazia,11.0,3.0,14.0,3.0,2.0,5.0,8.0,5.0,13.0,...,6.0,1.0,-2.0,-0.333333,47.0,1.807692,20.0,0.769231,27.0,1.038462
1,Afghanistan,18.0,17.0,35.0,17.0,52.0,69.0,10.0,21.0,31.0,...,49.0,1.484848,-20.0,-0.606061,109.0,1.068627,228.0,2.235294,-119.0,-1.166667
2,Albania,73.0,27.0,100.0,78.0,115.0,193.0,46.0,34.0,80.0,...,140.0,1.22807,-5.0,-0.04386,216.0,0.833977,425.0,1.640927,-209.0,-0.80695
3,Alderney,1.0,4.0,5.0,46.0,82.0,128.0,1.0,1.0,2.0,...,12.0,4.0,-9.0,-3.0,70.0,0.530303,608.0,4.606061,-538.0,-4.075758
4,Algeria,191.0,73.0,264.0,60.0,103.0,163.0,82.0,75.0,157.0,...,239.0,1.057522,58.0,0.256637,581.0,1.622905,354.0,0.988827,227.0,0.634078


### 05. Saving Dataframes

In [13]:
# Writing both dataframes back to the prepared-data folder as pickles
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')
df.to_pickle(os.path.join(prepared_dir, 'results_cleaned.pkl'))
df_team_results.to_pickle(os.path.join(prepared_dir, 'team_results.pkl'))