Setup

In [903]:
pip install elo



In [904]:
pip install elosports



In [905]:
import pandas as pd
import numpy as np
import elo
from scipy import stats
from elosports.elo import Elo
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn import preprocessing
pd.set_option('display.max_columns', None)

In [906]:
train = pd.read_csv('matches_train_set.csv')
test_orig = pd.read_csv('matches_submission.csv')
philly_union_orig = pd.read_csv('philly_union_submission.csv')

In [907]:
train.winner.unique()
train[train.winner == 'yet unknown']
train = train.drop(train[train.winner == 'yet unknown'].index)

Elo Ratings Calculation

inspired by 
https://github.com/danielguerreros/InternationalELo/blob/main/calculate_elo.ipynb

In [908]:
train = train[['date_time_utc','season','squad_id_a','squad_id_b','score_a','score_b','winner']].sort_values(by='date_time_utc')

In [909]:
def expected_result(loc, aw):
  dr = loc-aw
  we=(1/(10**(-dr/400)+1))
  return [np.round(we,3),1-np.round(we,3)]

def actual_result(loc, aw):
  if loc < aw:
    wa = 1
    wl = 0
  elif loc > aw:
    wa = 0
    wl = 1
  elif loc == aw:
    wa = 0.5
    wl = 0.5
  return [wl, wa]

def calculate_elo(elo_l, elo_v, local_goals, away_goals):
  k = 20
  wl, wv = actual_result(local_goals, away_goals)
  wel, wev = expected_result(elo_l, elo_v)

  elo_ln = elo_l + k*(wl-wel)
  elo_vn = elo_v + k*(wv-wev)

  return elo_ln, elo_vn

In [910]:
current_elo={}
for idx,row in train.iterrows():
    
    local=row['squad_id_a']
    away=row['squad_id_b']
    local_goals=row['score_a']
    away_goals=row['score_b']
    

    if local not in current_elo.keys():
        current_elo[local]=1300
    
    if away not in current_elo.keys():
        current_elo[away]=1300
    
    elo_l=current_elo[local]
    elo_v=current_elo[away]
    elo_ln,elo_vn=calculate_elo(elo_l,elo_v,local_goals,away_goals)

    current_elo[local]=elo_ln
    current_elo[away]=elo_vn
    
    train.loc[idx,'Elo_h_after']=elo_ln
    train.loc[idx,'Elo_a_after']=elo_vn 
    train.loc[idx,'Elo_h_before']=elo_l
    train.loc[idx,'Elo_a_before']=elo_v

In [911]:
elos=train[['date_time_utc','squad_id_a','Elo_h_after']].rename(columns={'home_team':'Team','Elo_h_after':'Elo'}).append(train[['date_time_utc','squad_id_b','Elo_a_after']].rename(columns={'away_team':'Team','Elo_a_after':'Elo'}))

In [912]:
elo_lasthome = elos.groupby('squad_id_a').last()
elo_lasthome = elo_lasthome.reset_index()[['squad_id_a','date_time_utc','Elo']]
elo_lasthome.columns = ['squad_id', 'date', 'Elo']

In [913]:
elo_lastaway = elos.groupby('squad_id_b').last()
elo_lastaway = elo_lastaway.reset_index()[['squad_id_b','date_time_utc','Elo']]
elo_lastaway.columns = ['squad_id', 'date', 'Elo']

In [914]:
elos_final = elo_lasthome.append(elo_lastaway).sort_values(by=['squad_id','date'])
elos_final = elos_final.iloc[1::2,:]

In [915]:
train = train[['season','squad_id_a','squad_id_b','winner']]
train['squad_id_a_elo'] = 0
train['squad_id_b_elo'] = 0
train = pd.merge(train, elos_final, left_on='squad_id_a', right_on='squad_id')[['season','squad_id_a','squad_id_b','winner','date','Elo']]
train.columns=['season','squad_id_a','squad_id_b','winner','date','Elo_a']
train= pd.merge(train, elos_final, left_on='squad_id_b', right_on='squad_id')[['season','squad_id_a','squad_id_b','winner','date_x','Elo_a','Elo']]
train.columns=['season','squad_id_a','squad_id_b','winner','date','Elo_a','Elo_b']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Model Fitting

In [916]:
train_X = train[['season','squad_id_a','squad_id_b','Elo_a','Elo_b']]
train_y = train.winner
test = test_orig[['season','squad_id_a','squad_id_b','winner']]
philly_union = philly_union_orig[['season','squad_id_a','squad_id_b']]

In [917]:
test = pd.merge(test, elos_final, left_on='squad_id_a', right_on='squad_id', how='left')[['season','squad_id_a','squad_id_b','date','Elo','winner']]
test.columns=['season','squad_id_a','squad_id_b','date','Elo_a','winner']
test= pd.merge(test, elos_final, left_on='squad_id_b', right_on='squad_id', how='left')[['season','squad_id_a','squad_id_b','date_x','Elo_a','Elo','winner']]
test.columns=['season','squad_id_a','squad_id_b','date','Elo_a','Elo_b','winner']
test.drop('date', axis=1, inplace=True)
test

Unnamed: 0,season,squad_id_a,squad_id_b,Elo_a,Elo_b,winner
0,2022,46024eeb,529ba333,1370.48,1307.86,team_A
1,2022,ca460650,529ba333,1270.10,1307.86,draw
2,2022,4acb0537,529ba333,1343.22,1307.86,draw
3,2022,69a0fb10,529ba333,1330.70,1307.86,draw
4,2022,0d885416,f7d86a43,1192.44,1287.80,draw
...,...,...,...,...,...,...
108,2022,46024eeb,eb57545a,1370.48,,team_A
109,2022,3c079def,eb57545a,1413.16,,team_A
110,2022,1ebc1a5b,eb57545a,1329.20,,team_A
111,2022,44117292,eb57545a,1285.92,,team_A


In [918]:
philly_union = pd.merge(philly_union, elos_final, left_on='squad_id_a', right_on='squad_id', how='left')[['season','squad_id_a','squad_id_b','date','Elo']]
philly_union.columns=['season','squad_id_a','squad_id_b','date','Elo_a']
philly_union= pd.merge(philly_union, elos_final, left_on='squad_id_b', right_on='squad_id', how='left')[['season','squad_id_a','squad_id_b','date_x','Elo_a','Elo']]
philly_union.columns=['season','squad_id_a','squad_id_b','date','Elo_a','Elo_b']
philly_union.drop('date', axis=1, inplace=True)
philly_union

Unnamed: 0,season,squad_id_a,squad_id_b,Elo_a,Elo_b
0,2022,46024eeb,99ea75a6,1370.48,1330.4
1,2022,46024eeb,ca460650,1370.48,1270.1
2,2022,46024eeb,eb57545a,1370.48,
3,2022,46024eeb,529ba333,1370.48,1307.86
4,2022,46024eeb,fc22273c,1370.48,1267.92
5,2022,46024eeb,69a0fb10,1370.48,1330.7
6,2022,46024eeb,cb8b86a2,1370.48,1258.54
7,2022,46024eeb,e9ea41b2,1370.48,1102.78
8,2022,46024eeb,64e81410,1370.48,1355.28
9,2022,46024eeb,44117292,1370.48,1285.92


In [919]:
le = preprocessing.LabelEncoder()
teams = test.squad_id_a
teams = teams.append(train.squad_id_a)
unique_teams = teams.unique()
le.fit(unique_teams)

LabelEncoder()

In [920]:
train_X.squad_id_a = le.transform(train_X.squad_id_a)
train_X.squad_id_b = le.transform(train_X.squad_id_b)
test.squad_id_a = le.transform(test.squad_id_a)
test.squad_id_b = le.transform(test.squad_id_b)
philly_union.squad_id_a = le.transform(philly_union.squad_id_a)
philly_union.squad_id_b = le.transform(philly_union.squad_id_b)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [921]:
condition1 = test['Elo_a'].isna()
condition2 = test['Elo_b'].isna()
condition3 = philly_union['Elo_a'].isna()
condition4 = philly_union['Elo_b'].isna()

#new starting Elo for a new team for this new season in the test dataset (Charlotte FC's first season, so they have no previous data)
test.loc[condition1,'Elo_a'] = 1300.00
test.loc[condition2,'Elo_b'] = 1300.00
philly_union.loc[condition3, 'Elo_a'] = 1300.00
philly_union.loc[condition4, 'Elo_b'] = 1300.00

In [922]:
def get_winner(row):
  eloLeague = Elo(k = 20)
  eloLeague.addPlayer(row.squad_id_a, row.Elo_a)
  eloLeague.addPlayer(row.squad_id_b, row.Elo_b)
  rdiff = np.abs(row.Elo_a-row.Elo_b)
  avgrating = (row.Elo_a+row.Elo_b)/2
  draw_prob = -rdiff/32.49 + np.exp((avgrating-2254.7)/208.49)+23.87
  elements = [0, 1]
  probabilities = [1-draw_prob, draw_prob]
  is_tie = np.random.choice(elements, 1, probabilities)[0]
  if is_tie == 1: return "draw"
  else:
    win_prob = eloLeague.expectResult(eloLeague.ratingDict[row.squad_id_a],eloLeague.ratingDict[row.squad_id_b])
    if win_prob > 0.50: return "team_A"
    else: return "team_B"

Background domain inspiration:

https://web.archive.org/web/20160920160857/http://kirill-kryukov.com/chess/kcec/draw_rate.html

https://web.archive.org/web/20160806071058/http://chess-db.com/public/research/draw_rate.html

In [923]:
test['pred_winner'] = test.apply(lambda row: get_winner(row),axis=1)

Predictions for Philly

In [924]:
philly_union['pred_winner'] = philly_union.apply(lambda row: get_winner(row),axis=1)

In [925]:
philly_union

Unnamed: 0,season,squad_id_a,squad_id_b,Elo_a,Elo_b,pred_winner
0,2022,8,16,1370.48,1330.4,team_A
1,2022,8,19,1370.48,1270.1,draw
2,2022,8,24,1370.48,1300.0,draw
3,2022,8,11,1370.48,1307.86,draw
4,2022,8,27,1370.48,1267.92,draw
5,2022,8,14,1370.48,1330.7,draw
6,2022,8,20,1370.48,1258.54,draw
7,2022,8,23,1370.48,1102.78,draw
8,2022,8,13,1370.48,1355.28,team_A
9,2022,8,7,1370.48,1285.92,team_A


First 20 Predictions

In [926]:
test.head(20)

Unnamed: 0,season,squad_id_a,squad_id_b,Elo_a,Elo_b,winner,pred_winner
0,2022,8,11,1370.48,1307.86,team_A,team_A
1,2022,19,11,1270.1,1307.86,draw,team_B
2,2022,10,11,1343.22,1307.86,draw,team_A
3,2022,14,11,1330.7,1307.86,draw,team_A
4,2022,0,25,1192.44,1287.8,draw,team_B
5,2022,10,25,1343.22,1287.8,team_A,team_A
6,2022,5,25,1413.16,1287.8,team_B,team_A
7,2022,13,25,1355.28,1287.8,team_A,team_A
8,2022,6,25,1373.96,1287.8,draw,draw
9,2022,21,25,1357.1,1287.8,draw,draw


F1 Score

In [927]:
print("F1 Score: " + str(f1_score(test.winner, test.pred_winner, average='macro')))

F1 Score: 0.3569062674753732


Files Output

In [934]:
philly_union.to_csv('philly_union_predictions.csv')
test.to_csv('submission_predictions.csv')