# International data - World Cup 2018 predictions

Ported from Excel - see the original workbook [here](models/World%20cup%202018%20CALC.xlsx).

In [1]:
import pandas as pd

In [24]:
# Teams treated as playing at home; used further down to set the
# HomeAdv1 / HomeAdv2 flags on each fixture.
HOME_TEAMS = ["Russia"]
HOME_TEAMS

['Russia']

In [5]:
# Load the raw fixture list and reduce it to one row per match with the
# columns Date, Time, Team1 and Team2.
fixtures = pd.read_csv("../data/raw/whs/whs_fix/whs_fix_wcm_2018.csv")
fixtures.columns = ["Date", "Time", "ignore_1", "Team1", "ignore_2", "Team2", "ignore_3"]
fixtures = fixtures.drop(columns=["ignore_1", "ignore_2", "ignore_3"])
# Carry the last seen date down to the rows where it is missing
# (presumably the raw file states the date once per match day - confirm).
# Assigning the result back avoids the in-place call on a column accessor,
# which does not update the frame under pandas Copy-on-Write, and avoids
# the deprecated fillna(method=...) argument.
fixtures["Date"] = fixtures["Date"].ffill()
# Rows without a Team1 value are not matches; drop them. The original row
# labels are kept, so the index is not contiguous afterwards.
fixtures = fixtures.dropna(axis="index", subset=["Team1"])
fixtures.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64 entries, 1 to 88
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    64 non-null     object
 1   Time    64 non-null     object
 2   Team1   64 non-null     object
 3   Team2   64 non-null     object
dtypes: object(4)
memory usage: 2.5+ KB


In [26]:
# Flag each side of a fixture: 1 when that team is listed in HOME_TEAMS, else 0.
fixtures["HomeAdv1"] = fixtures["Team1"].isin(HOME_TEAMS).astype("int64")
fixtures["HomeAdv2"] = fixtures["Team2"].isin(HOME_TEAMS).astype("int64")

fixtures.head(5)

Unnamed: 0,Date,Time,Team1,Team2,HomeAdv1,HomeAdv2
1,"Thursday, Jun 14 2018",16:00,Russia,Saudi Arabia,1,0
3,"Friday, Jun 15 2018",13:00,Egypt,Uruguay,0,0
4,"Friday, Jun 15 2018",16:00,Morocco,Iran,0,0
5,"Friday, Jun 15 2018",19:00,Portugal,Spain,0,0
7,"Saturday, Jun 16 2018",11:00,France,Australia,0,0


In [27]:
# Elo ratings table: label the nine raw columns, then discard the
# placeholder ("ignore_*") ones, leaving Team, EloRank, EloRating, FIFARank.
elo = pd.read_csv("../data/raw/wkp/wkp_elo/World_Football_Elo_Ratings.csv")
elo.columns = [
    "Team", "EloRank", "ignore_1", "ignore_2", "ignore_3",
    "EloRating", "FIFARank", "ignore_4", "ignore_5",
]
elo = elo.drop(columns=[name for name in elo.columns if name.startswith("ignore_")])
elo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Team       100 non-null    object
 1   EloRank    100 non-null    int64 
 2   EloRating  100 non-null    int64 
 3   FIFARank   100 non-null    object
dtypes: int64(2), object(2)
memory usage: 3.2+ KB


In [28]:
# Preview the first five rows of the cleaned Elo table.
elo.head(5)

Unnamed: 0,Team,EloRank,EloRating,FIFARank
0,Brazil,1,2131,2
1,Germany,2,2092,1
2,Spain,3,2049,8
3,France,4,1987,7
4,Argentina,5,1985,5


In [32]:
# Qualifying-goals table: keep only the team name and its rank, with the
# rank column renamed to QualifyGoalsRank.
qualifying = (
    pd.read_csv("../data/raw/fif/Qualifying_goals.csv")
    .loc[:, ["Team", "Rank"]]
    .rename(columns={"Rank": "QualifyGoalsRank"})
)
qualifying.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Team              32 non-null     object
 1   QualifyGoalsRank  32 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 640.0+ bytes


In [33]:
# Preview the first five rows of the qualifying-goals ranking.
qualifying.head(5)

Unnamed: 0,Team,QualifyGoalsRank
0,Russia,16
1,Saudi Arabia,6
2,Egypt,26
3,Uruguay,11
4,Portugal,5


In [63]:
# Attach the Elo and qualifying-goals figures for both sides of every fixture.
# Each lookup table has its value columns suffixed with the side number ("1" or
# "2") before merging, so no merge suffixes are needed; the lookup's own "Team"
# key is dropped after every join. Inner joins: a fixture is kept only if both
# of its teams appear in both lookup tables.
data = (
    fixtures
    .merge(
        elo.rename(columns=lambda col: col if col == "Team" else col + "1"),
        how="inner", left_on="Team1", right_on="Team",
    )
    .drop(columns="Team")
    .merge(
        elo.rename(columns=lambda col: col if col == "Team" else col + "2"),
        how="inner", left_on="Team2", right_on="Team",
    )
    .drop(columns="Team")
    .merge(
        qualifying.rename(columns={"QualifyGoalsRank": "QualifyGoalsRank1"}),
        how="inner", left_on="Team1", right_on="Team",
    )
    .drop(columns="Team")
    .merge(
        qualifying.rename(columns={"QualifyGoalsRank": "QualifyGoalsRank2"}),
        how="inner", left_on="Team2", right_on="Team",
    )
    .drop(columns="Team")
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64 entries, 0 to 63
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Date               64 non-null     object
 1   Time               64 non-null     object
 2   Team1              64 non-null     object
 3   Team2              64 non-null     object
 4   HomeAdv1           64 non-null     int64 
 5   HomeAdv2           64 non-null     int64 
 6   EloRank1           64 non-null     int64 
 7   EloRating1         64 non-null     int64 
 8   FIFARank1          64 non-null     object
 9   EloRank2           64 non-null     int64 
 10  EloRating2         64 non-null     int64 
 11  FIFARank2          64 non-null     object
 12  QualifyGoalsRank1  64 non-null     int64 
 13  QualifyGoalsRank2  64 non-null     int64 
dtypes: int64(8), object(6)
memory usage: 7.5+ KB


In [58]:
# Model parameters used by the prediction cell below.
# GOAL_WEIGHT scales (Team1 win expectancy - 0.5) into a raw goal difference
# before rounding.
# GOAL_BOOST is the cut-off on the two teams' average qualifying-goals rank:
# at or below it, the fixture's ApplyGoalBoost flag is set to 1.
GOAL_WEIGHT = 4.
GOAL_BOOST = 19.
GOAL_WEIGHT, GOAL_BOOST

(4.0, 19.0)

In [68]:
# Score prediction for every fixture; appends 14 derived columns to `data`.

# Rating gap from Team1's point of view. A side flagged as playing at home is
# credited 100 rating points (NOTE(review): looks like the usual World Football
# Elo home bonus - confirm against the Excel model).
data["EloRatingDiff"] = data["EloRating1"] - data["EloRating2"]
data["EloRatingDiffWithHomeAdv"] = (
    data["EloRatingDiff"] + (100 * data["HomeAdv1"]) - (100 * data["HomeAdv2"])
)

# Team1 win expectancy = 1 / (10 ** (-diff / 400) + 1).
# "WinExpectency1Square" holds the denominator; the column name is kept as-is.
data["WinExpectency1Square"] = (10 ** ((-data["EloRatingDiffWithHomeAdv"]) / 400)) + 1
data["WinExpectency1"] = data["WinExpectency1Square"] ** -1

# Whole-goal margin: positive favours Team1, negative favours Team2, 0 is a draw.
data["RawGoalDiff"] = (GOAL_WEIGHT * (data["WinExpectency1"] - 0.5)).round(0)
data["RawGoalDiffAbs"] = data["RawGoalDiff"].abs()
data["EitherWins"] = (data["RawGoalDiffAbs"] > 0).astype("int64")

# Both sides get one extra goal when their average qualifying-goals rank is
# at or below GOAL_BOOST.
data["QualifyGoalsRankAvg"] = (data["QualifyGoalsRank1"] + data["QualifyGoalsRank2"]) / 2
data["ApplyGoalBoost"] = (data["QualifyGoalsRankAvg"] <= GOAL_BOOST).astype("int64")

# Each side starts from the boost; the predicted margin is added only to the
# favoured side. A negative RawGoalDiff therefore raises Goals2, not Goals1,
# so the predicted winner always agrees with the sign of RawGoalDiff.
data["Goals1"] = data["ApplyGoalBoost"] + data["RawGoalDiff"].clip(lower=0)
data["Goals2"] = data["ApplyGoalBoost"] + (-data["RawGoalDiff"]).clip(lower=0)

data["GoalDiff"] = data["Goals1"] - data["Goals2"]
data["GoalDiffAbs"] = data["GoalDiff"].abs()
data["GoalTotal"] = data["Goals1"] + data["Goals2"]

# Show only the 14 columns created in this cell.
data.iloc[:, -14:].head(5)

Unnamed: 0,EloRatingDiff,EloRatingDiffWithHomeAdv,WinExpectency1Square,WinExpectency1,RawGoalDiff,RawGoalDiffAbs,EitherWins,QualifyGoalsRankAvg,ApplyGoalBoost,Goals1,Goals2,GoalDiff,GoalDiffAbs,GoalTotal
0,88,188,1.338844,0.746913,1.0,1.0,1,11.0,1,2.0,1,1.0,1.0,3.0
1,294,294,1.184077,0.84454,1.0,1.0,1,8.5,1,2.0,1,1.0,1.0,3.0
2,42,142,1.44157,0.693688,1.0,1.0,1,21.0,0,1.0,0,1.0,1.0,1.0
3,-46,-46,2.303167,0.434185,-0.0,0.0,0,16.0,1,1.0,1,0.0,0.0,2.0
4,-168,-68,2.479108,0.403371,-0.0,0.0,0,22.0,0,0.0,0,0.0,0.0,0.0


In [69]:
data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Date,64,25.0,"Monday, Jun 25 2018",4.0,,,,,,,
Time,64,8.0,19:00,25.0,,,,,,,
Team1,64,32.0,France,5.0,,,,,,,
Team2,64,32.0,England,5.0,,,,,,,
HomeAdv1,64,,,,0.046875,0.213042,0.0,0.0,0.0,0.0,1.0
HomeAdv2,64,,,,0.03125,0.175368,0.0,0.0,0.0,0.0,1.0
EloRank1,64,,,,19.1562,15.9187,1.0,6.0,16.5,27.0,63.0
EloRating1,64,,,,1864.58,137.747,1597.0,1751.0,1855.0,1967.0,2131.0
FIFARank1,64,32.0,7,5.0,,,,,,,
EloRank2,64,,,,22.5625,16.9742,1.0,8.0,17.0,40.0,63.0


## TODO

(parity with Excel)
* Evaluate vs historical data
* Turn model into class/function
* Input actual results
* Compare predictions to actual

(enhancements)
* Tune hyperparameters
* Output predictions