In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
import xgboost

## load data

Data sources:
- Historical match results
- FIFA ranking
- GDP
- Bookie odds
- 2018 match

http://www.football-data.co.uk/data.php

In [6]:
# Raw inputs, all read from ../data. Date columns are not parsed here (the
# parse_dates options remain disabled); they are converted later where needed.

# Historical international match results.
matches = pd.read_csv("../data/results.csv")  # parse_dates=['date'])

# FIFA rankings, monthly since Aug 1993.
rankings = pd.read_csv("../data/fifa_ranking.csv")  # parse_dates=['rank_date'])

# 2018 World Cup table (one row per team).
matches_2018 = pd.read_csv("../data/World Cup 2018 Dataset.csv")

# GDP data; the first two spreadsheet rows are skipped.
gdp = pd.read_excel("../data/Download-GDPPC-USD-countries.xls", skiprows=[0, 1])

# Odds data; '-' in the file is read as a missing value.
odds = pd.read_csv("../data/historical_odds.csv", na_values='-')

# Shape of every loaded frame, in loading order.
tuple(frame.shape for frame in (matches, rankings, matches_2018, gdp, odds))

((39008, 9), (57793, 16), (33, 20), (220, 49), (3192, 6))

In [7]:
# Preview the match results.
# `neutral`: the match is played in a third country.
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [8]:
# Preview the 2018 World Cup table (one row per team, with its group and opponents).
matches_2018.head()

Unnamed: 0,Team,Group,Previous appearances,Previous titles,Previous  finals,Previous  semifinals,Current FIFA rank,First match against,Match index,history with first opponent  W-L,history with  first opponent  goals,Second match  against,Match index.1,history with  second opponent  W-L,history with  second opponent  goals,Third match  against,Match index.2,history with  third opponent  W-L,history with  third opponent  goals,Unnamed: 19
0,Russia,A,10.0,0.0,0.0,1.0,65.0,Saudi Arabia,1.0,-1.0,-2.0,Egypt,17.0,,,Uruguay,33.0,0.0,0.0,
1,Saudi Arabia,A,4.0,0.0,0.0,0.0,63.0,Russia,1.0,1.0,2.0,Uruguay,18.0,1.0,1.0,Egypt,34.0,-5.0,-5.0,
2,Egypt,A,2.0,0.0,0.0,0.0,31.0,Uruguay,2.0,-1.0,-2.0,Russia,17.0,,,Saudi Arabia,34.0,5.0,5.0,
3,Uruguay,A,12.0,2.0,2.0,5.0,21.0,Egypt,2.0,1.0,2.0,Saudi Arabia,18.0,-1.0,-1.0,Russia,33.0,0.0,0.0,
4,Porugal,B,6.0,0.0,0.0,2.0,3.0,Spain,3.0,-12.0,-31.0,Morocco,19.0,-1.0,-2.0,Iran,35.0,2.0,5.0,


In [9]:
# Last five rows of the ranking table.
rankings.tail(5)

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,cur_year_avg,cur_year_avg_weighted,last_year_avg,last_year_avg_weighted,two_year_ago_avg,two_year_ago_weighted,three_year_ago_avg,three_year_ago_weighted,confederation,rank_date
57788,206,Anguilla,AIA,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONCACAF,2018-06-07
57789,206,Bahamas,BAH,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONCACAF,2018-06-07
57790,206,Eritrea,ERI,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CAF,2018-06-07
57791,206,Somalia,SOM,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CAF,2018-06-07
57792,206,Tonga,TGA,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OFC,2018-06-07


In [10]:
# Some cleaning is needed: country names are spelled differently across the files.
print(matches.home_team.nunique(), rankings.country_full.nunique(), matches_2018.Team.nunique())
# Ranking names that never occur as a home team in the match history.
set(rankings.country_full).difference(matches.home_team)

(241, 216, 32)


{'Bosnia and Herzegovina',
 'Brunei Darussalam',
 'Cape Verde Islands',
 'China PR',
 'Chinese Taipei',
 "C\xc3\xb4te d'Ivoire",
 'FYR Macedonia',
 'IR Iran',
 'Kyrgyz Republic',
 'Myanmar',
 'Netherlands Antilles',
 'RCS',
 'Republic of Ireland',
 'Serbia and Montenegro',
 'St Kitts and Nevis',
 'St Lucia',
 'St Vincent and the Grenadines',
 'S\xc3\xa3o Tom\xc3\xa9 e Pr\xc3\xadncipe',
 'Timor-Leste',
 'US Virgin Islands',
 'Zaire'}

In [11]:
# Harmonise team names so the three tables can be joined on them.
rankings = rankings.replace({'IR Iran': 'Iran',
                             'China PR': 'China'})

# Drop rows that are entirely empty, then correct misspelt / shortened team names.
matches_2018 = (
    matches_2018
    .dropna(how='all')
    .replace({'Columbia': 'Colombia',
              'Costarica': 'Costa Rica',
              'IRAN': 'Iran',
              'Porugal': 'Portugal',
              'Korea': 'Korea Republic'})
)

In [12]:
# Both sets should be empty once every 2018 team name matches the other tables.
print(set(matches_2018.Team).difference(matches.home_team))
print(set(matches_2018.Team).difference(rankings.country_full))

set([])
set([])


In [13]:
# Rename GDP countries to the team names used in the match data.
gdp = gdp.replace({'Russian Federation': 'Russia',
                   'United Kingdom': 'England',
                   'Congo': 'Congo DR',
                   'Republic of Korea': 'Korea Republic',
                   'Republic of Moldova': 'Moldova',
                   'Iran (Islamic Republic of)': 'Iran'})

# Stack the remaining columns into rows: one row per (Country, original column label),
# with the label stored as `year` and the value as `gdppc`.
gdp = (
    gdp.drop('CountryID', axis=1)
       .set_index('Country')
       .stack()
       .reset_index()
       .rename(columns={'level_1': 'year', 0: 'gdppc'})
)

# Same spelling fix for the odds table.
odds = odds.replace({'South Korea': 'Korea Republic'})

In [14]:
# Index the 2018 table by team name so rows can be addressed with .loc[team].
matches_2018 = matches_2018.set_index('Team')

## Feature engineering
What to predict: the match outcome (home win / draw / away win, i.e. the sign of the goal difference) between two teams, given the teams' historical data
- Team level features:
    - weighted avg ranking in past x years 
    - home advantage - host or not
    - home advantage - same continent as hosting country or not
    - confederation
    - history with the opponent
    - Outcome of previous 3 matches
    - GDP per capita
    - 

### create outcome variable: 
- 1: home team won 
- 0: draw 
- -1: away team won

In [15]:
# Outcome from the home team's point of view: goal difference clipped to
# 1 (home win), 0 (draw) or -1 (away win).
matches['result'] = (matches.home_score - matches.away_score).clip(lower=-1, upper=1)

In [16]:
# Class balance of the outcome variable.
matches.result.value_counts()

 1    18969
-1    10991
 0     9048
Name: result, dtype: int64

### features from historical matches

In [17]:
# Remove friendly matches and use only data starting from 1996, indexed by match date.
matches = matches.set_index('date')
matches.index = pd.to_datetime(matches.index)
matches = matches[matches.tournament != 'Friendly']
# .loc makes the row slice by date explicit. .copy() detaches the result from the
# unfiltered frame, so adding columns to `matches` afterwards does not raise
# SettingWithCopyWarning or depend on view-vs-copy behaviour.
matches = matches.loc['1996':].copy()

In [18]:
# Rows and columns left after filtering.
matches.shape

(12414, 9)

#### home advantage

In [19]:
# True when the match is played in the home team's own country.
matches['home_advantage'] = matches.home_team.eq(matches.country)

#### historical records with this opponent
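For each home-away pair, calculate the win/draw counts and rates.  
All matches kept by the filter above (non-friendly, 1996 onwards) are used, regardless of when they were played. __NOTE:__ This introduces data leakage, because the stats attached to a match are computed from the whole period, including that match and later ones. Should be fixed later if time permits.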

In [20]:
def get_win_count(x):
    """Return how many entries of `x` equal 1 (a home win in the `result` encoding)."""
    is_home_win = (x == 1)
    return is_home_win.sum()

def get_draw_count(x):
    """Return how many entries of `x` equal 0 (a draw in the `result` encoding)."""
    is_draw = (x == 0)
    return is_draw.sum()

# Head-to-head totals for every ordered (home_team, away_team) pair.
# Each statistic is computed and named explicitly. Relabelling the output of
# .agg({...}) by position depends on the dict / column ordering, which differs between
# Python and pandas versions and can silently swap the home and away goal totals.
pair_groups = matches.groupby(['home_team', 'away_team'])
historical_stats_w_oppo = pd.DataFrame({
    'total_home_score': pair_groups['home_score'].sum(),
    'total_away_score': pair_groups['away_score'].sum(),
    'total_count': pair_groups['result'].count(),
    'home_win_count': pair_groups['result'].agg(get_win_count),
    'home_draw_count': pair_groups['result'].agg(get_draw_count),
}).reset_index()

# Fix the column order explicitly: a later cell relabels these columns by position.
historical_stats_w_oppo = historical_stats_w_oppo[['home_team', 'away_team', 'total_away_score', 'total_home_score',
                                                   'total_count', 'home_win_count', 'home_draw_count']]

historical_stats_w_oppo.head()

Unnamed: 0,home_team,away_team,total_away_score,total_home_score,total_count,home_win_count,home_draw_count
0,Afghanistan,Bangladesh,0,4,1,1,0
1,Afghanistan,Bhutan,3,6,3,2,0
2,Afghanistan,Cambodia,1,5,2,2,0
3,Afghanistan,India,4,0,1,0,0
4,Afghanistan,Japan,6,0,1,0,0


In [21]:
# matches[(matches.home_team == 'Australia') & (matches.away_team == 'France')]

In [22]:
# Spot check: head-to-head row with Australia as home team and France as away team.
historical_stats_w_oppo[(historical_stats_w_oppo.home_team == 'Australia') & (historical_stats_w_oppo.away_team == 'France')]

Unnamed: 0,home_team,away_team,total_away_score,total_home_score,total_count,home_win_count,home_draw_count
279,Australia,France,0,1,1,1,0


In [23]:
# NOTE(review): the next cell starts with this same copy, so this cell is redundant.
hist_stats_w_oppo_agg = historical_stats_w_oppo.copy()



In [24]:
hist_stats_w_oppo_agg = historical_stats_w_oppo.copy()

# Put every pair into one canonical orientation so that the A-vs-B and B-vs-A rows can
# be summed together. Rows whose home_team sorts before away_team are flipped, so the
# alphabetically later team always ends up in the home_team column.
# Done with boolean-mask assignments instead of a row-by-row .loc loop.
flip = hist_stats_w_oppo_agg.home_team < hist_stats_w_oppo_agg.away_team

# After a flip the "home" wins are the original away side's wins:
# matches that were neither home wins nor draws.
away_win_count = (hist_stats_w_oppo_agg.total_count
                  - hist_stats_w_oppo_agg.home_win_count
                  - hist_stats_w_oppo_agg.home_draw_count)

# .values strips the labels so the swapped columns are assigned by position.
hist_stats_w_oppo_agg.loc[flip, ['home_team', 'away_team']] = \
    hist_stats_w_oppo_agg.loc[flip, ['away_team', 'home_team']].values
hist_stats_w_oppo_agg.loc[flip, ['total_home_score', 'total_away_score']] = \
    hist_stats_w_oppo_agg.loc[flip, ['total_away_score', 'total_home_score']].values
hist_stats_w_oppo_agg.loc[flip, 'home_win_count'] = away_win_count[flip]
# home_draw_count and total_count are symmetric and stay unchanged.

# then consolidate same home/away pairs
hist_stats_w_oppo_agg = hist_stats_w_oppo_agg.groupby(['home_team', 'away_team']).sum().reset_index()

In [25]:
# hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == 'France') & (hist_stats_w_oppo_agg.away_team == 'Australia')]

In [26]:
# hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == 'Australia') & (hist_stats_w_oppo_agg.away_team == 'France')]

In [27]:
# Mirror every consolidated pair (home and away roles swapped) and append it, so a
# join on (home_team, away_team) finds the stats whichever side is listed as home.
hist_stats_w_oppo_agg2 = hist_stats_w_oppo_agg.rename(columns={
    'home_team': 'away_team',
    'away_team': 'home_team',
    'total_home_score': 'total_away_score',
    'total_away_score': 'total_home_score',
})
# Wins of the new home side = matches that were neither draws nor wins of the old one.
hist_stats_w_oppo_agg2['home_win_count'] = (hist_stats_w_oppo_agg2['total_count']
                                            - hist_stats_w_oppo_agg2['home_draw_count']
                                            - hist_stats_w_oppo_agg2['home_win_count'])

hist_stats_w_oppo_agg = pd.concat([hist_stats_w_oppo_agg, hist_stats_w_oppo_agg2], axis=0)

# Rates per pair; the 1.0 factor forces float division.
hist_stats_w_oppo_agg['home_win_rate'] = hist_stats_w_oppo_agg['home_win_count'] * 1.0 / hist_stats_w_oppo_agg['total_count']
hist_stats_w_oppo_agg['home_draw_rate'] = hist_stats_w_oppo_agg['home_draw_count'] * 1.0 / hist_stats_w_oppo_agg['total_count']

In [28]:
# Aggregate goal difference of the pair, from the home_team column's point of view.
hist_stats_w_oppo_agg['total_score_diff'] = hist_stats_w_oppo_agg['total_home_score'].sub(hist_stats_w_oppo_agg['total_away_score'])

In [29]:
# Preview the consolidated head-to-head table.
hist_stats_w_oppo_agg.head()

Unnamed: 0,away_team,home_draw_count,home_team,home_win_count,total_away_score,total_count,total_home_score,home_win_rate,home_draw_rate,total_score_diff
0,Albania,0,Andorra,0,3,1,0,0.0,0.0,-3
1,Algeria,3,Angola,1,6,5,6,0.2,0.6,0
2,Anguilla,0,Antigua and Barbuda,1,3,1,5,1.0,0.0,2
3,Albania,1,Armenia,1,6,4,5,0.25,0.25,-1
4,Andorra,0,Armenia,6,2,6,18,1.0,0.0,16


In [30]:
# Attach the head-to-head stats to every match; matches without stats keep NaN (left join).
X = matches.merge(hist_stats_w_oppo_agg, how='left', on=['home_team', 'away_team'])

### ranking features
use ranking difference between home and away team

In [31]:
# ranking is monthly. join with matches on year-month
# rank_date is a string here (it was read without parse_dates); dropping its last
# three characters leaves the year-month part.
rankings['year_month'] = rankings.rank_date.str[:-3]

# X does not carry the match dates, so the key is built from matches.index and assigned
# by position. That is only valid while X still has exactly one row per match.
assert len(X) == len(matches), "X is no longer row-aligned with matches"
# DatetimeIndex.strftime replaces pd.datetime.strftime: the pd.datetime alias has been
# removed from pandas.
X['year_month'] = matches.index.strftime('%Y-%m')

In [32]:
# Inner-join the ranking of the home team for the month of the match ...
X = rankings.merge(X,
                   left_on=['country_full', 'year_month'],
                   right_on=['home_team', 'year_month'])

# ... then that of the away team. The ranking columns now exist on both sides, so the
# newly joined (left) ones get '_away' and the ones already in X get '_home'.
X = rankings.merge(X,
                   left_on=['country_full', 'year_month'],
                   right_on=['away_team', 'year_month'],
                   suffixes=('_away', '_home'))

In [33]:
# Difference features: for every ranking metric, home value minus away value.
for metric in ['rank',
               'total_points', 'previous_points', 'rank_change',
               'cur_year_avg', 'cur_year_avg_weighted',
               'last_year_avg', 'last_year_avg_weighted',
               'two_year_ago_avg', 'two_year_ago_weighted',
               'three_year_ago_avg', 'three_year_ago_weighted']:
    X[metric + '_diff'] = X[metric + '_home'] - X[metric + '_away']

### GDP features

In [34]:
# Integer year taken from the 'YYYY-MM' key (everything but the last three characters).
X['year'] = X.year_month.str[:-3].astype('int64')

In [35]:
# GDP of the home team for the match year (left join keeps matches without GDP data).
X = X.merge(gdp, how='left', left_on=['home_team', 'year'], right_on=['Country', 'year'])
# Same for the away team. The GDP columns already present from the first join get
# '_home', the newly joined ones '_away'.
X = X.merge(gdp, how='left', left_on=['away_team', 'year'], right_on=['Country', 'year'],
            suffixes=('_home', '_away'))

### Odds features

In [36]:
# Odds are merged as-is to see whether they help; preview the table first.
odds.head()

Unnamed: 0,away,cross,home,ones,twos,year
0,Argentina,3.29,Germany,2.36,3.35,2014
1,Netherlands,3.71,Brazil,2.16,3.36,2014
2,Argentina,3.23,Netherlands,3.07,2.54,2014
3,Germany,3.21,Brazil,2.65,2.93,2014
4,Costa Rica,4.16,Netherlands,1.53,7.15,2014


In [37]:
# Append a mirrored copy of the odds (home/away swapped, and with them the 'ones' and
# 'twos' columns) so the join works whichever team X lists as home.
odds2 = odds.rename(columns={'away': 'home', 'home': 'away',
                             'ones': 'twos', 'twos': 'ones'})
odds = pd.concat([odds, odds2], axis=0)

In [38]:
# Left-join the odds on (home, away, year); matches without odds keep NaN.
X = X.merge(odds, how='left',
            left_on=['home_team', 'away_team', 'year'],
            right_on=['home', 'away', 'year'])

In [39]:
# All columns available after the joins.
X.columns

Index([                   u'rank_away',            u'country_full_away',
                  u'country_abrv_away',            u'total_points_away',
               u'previous_points_away',             u'rank_change_away',
                  u'cur_year_avg_away',   u'cur_year_avg_weighted_away',
                 u'last_year_avg_away',  u'last_year_avg_weighted_away',
              u'two_year_ago_avg_away',   u'two_year_ago_weighted_away',
            u'three_year_ago_avg_away', u'three_year_ago_weighted_away',
                 u'confederation_away',               u'rank_date_away',
                         u'year_month',                    u'rank_home',
                  u'country_full_home',            u'country_abrv_home',
                  u'total_points_home',         u'previous_points_home',
                   u'rank_change_home',            u'cur_year_avg_home',
         u'cur_year_avg_weighted_home',           u'last_year_avg_home',
        u'last_year_avg_weighted_home',        u'tw

## Model

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [60]:
# Model inputs: the ranking-difference features followed by the two venue flags.
# Currently disabled candidates: rank_change_diff, gdppc_home / gdppc_away,
# confederation_home / confederation_away, the odds columns (cross, ones, twos) and the
# head-to-head stats (total_away_score, total_home_score, total_score_diff,
# home_win_rate, home_draw_rate).
features = [
    'rank_diff',
    u'total_points_diff',
    u'previous_points_diff',
    u'cur_year_avg_diff',
    u'cur_year_avg_weighted_diff',
    u'last_year_avg_diff',
    u'last_year_avg_weighted_diff',
    u'two_year_ago_avg_diff',
    u'two_year_ago_weighted_diff',
    u'three_year_ago_avg_diff',
    u'three_year_ago_weighted_diff',
    u'neutral',
    'home_advantage',
]
features

['rank_diff',
 u'total_points_diff',
 u'previous_points_diff',
 u'cur_year_avg_diff',
 u'cur_year_avg_weighted_diff',
 u'last_year_avg_diff',
 u'last_year_avg_weighted_diff',
 u'two_year_ago_avg_diff',
 u'two_year_ago_weighted_diff',
 u'three_year_ago_avg_diff',
 u'three_year_ago_weighted_diff',
 u'neutral',
 'home_advantage']

In [63]:
# Target and model matrix: missing values become 0, and get_dummies one-hot encodes any
# non-numeric feature column.
y = X['result']
XX = pd.get_dummies(X.fillna(0)[features])
XX.shape

(10596, 13)

In [64]:
# Hold out a test set (default split size). random_state fixes the shuffle so the split,
# and every score computed from it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(XX, y, random_state=42)
X_train.shape, X_test.shape

((7947, 13), (2649, 13))

In [65]:
# Class balance of the held-out test set.
y_test.value_counts()

 1    1307
-1     731
 0     611
Name: result, dtype: int64

In [66]:
# Baseline random forest. random_state pins the forest's internal randomness so the
# accuracy shown below is reproducible.
rf_model = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_prob = rf_model.predict_proba(X_test)  # class probabilities, one column per class
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.60815402038505095

In [67]:
# let's do some hyperparameter tuning: 5-fold cross-validated accuracy over tree depth
# and forest size
param_grid = [{"max_depth": [5, 9, 11], "n_estimators": [200, 400, 600]}]

# random_state on the base estimator makes the grid scores comparable between runs.
rf_model = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=42), param_grid, cv=5, scoring='accuracy')

rf_model.fit(X_train, y_train)
# Estimator chosen by the search; used for all predictions below.
best_model = rf_model.best_estimator_

In [68]:
# Cross-validation score (mean, +/- two standard deviations) of every grid point.
print("Grid scores on development set:")
scores = rf_model.cv_results_
means, stds = scores['mean_test_score'], scores['std_test_score']
for idx, params in enumerate(scores['params']):
    print("%0.3f (+/-%0.03f) for %r"% (means[idx], stds[idx] * 2, params))

print("Best parameters set found on development set:")
print(rf_model.best_params_)

# Evaluate the selected model on the held-out test set.
y_pred = best_model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred,))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

Grid scores on development set:
0.618 (+/-0.015) for {'n_estimators': 200, 'max_depth': 5}
0.618 (+/-0.016) for {'n_estimators': 400, 'max_depth': 5}
0.617 (+/-0.015) for {'n_estimators': 600, 'max_depth': 5}
0.635 (+/-0.008) for {'n_estimators': 200, 'max_depth': 9}
0.635 (+/-0.012) for {'n_estimators': 400, 'max_depth': 9}
0.636 (+/-0.008) for {'n_estimators': 600, 'max_depth': 9}
0.638 (+/-0.011) for {'n_estimators': 200, 'max_depth': 11}
0.640 (+/-0.009) for {'n_estimators': 400, 'max_depth': 11}
0.640 (+/-0.009) for {'n_estimators': 600, 'max_depth': 11}
Best parameters set found on development set:
{'n_estimators': 600, 'max_depth': 11}
('Accuracy: ', 0.61381653454133633)
Confusion matrix:
[[ 504   18  209]
 [ 250   31  330]
 [ 202   14 1091]]


In [69]:
# How often each class is predicted on the test set.
np.unique(y_pred, return_counts=True)

(array([-1,  0,  1]), array([ 956,   63, 1630]))

### Try xgboost

In [49]:
import xgboost as xgb

In [50]:
# dtrain = xgb.DMatrix(train, label=label)


## Simulations

### group rounds
- win - 3 points
- draw - 1 point
- loss - 0 points

In [70]:
# prepare features for the 2018 matches - get from latest ranking
def get_features(country1, country2):
    """Build the one-row model input for a 2018 fixture between two teams.

    The ranking-difference features are computed (home minus away) from the last row
    that `rankings` holds for each team. The venue flags are derived from whether
    either side is Russia, and the head-to-head stats are joined in case `features`
    uses them.

    Parameters
    ----------
    country1 : str
        Team treated as the home side; must match a `rankings.country_full` value.
    country2 : str
        Team treated as the away side; must match a `rankings.country_full` value.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are exactly `features`, in that order. Missing
        values are filled with 0.

    Raises
    ------
    IndexError
        If either team has no row in `rankings`.
    """
    # Only the '*_diff' entries of `features` come from the ranking table. The previous
    # positional slice `features[:12]` also picked up 'neutral', which was turned into
    # the non-existent ranking column 'ne'.
    diff_feats = [name for name in features if name.endswith('_diff')]
    feats = [name[:-len('_diff')] for name in diff_feats]

    # assumes `rankings` is ordered by rank_date, so the last row is the latest - TODO confirm
    country1_feats = rankings[rankings.country_full == country1].iloc[-1][feats]
    country2_feats = rankings[rankings.country_full == country2].iloc[-1][feats]

    final_feats = country1_feats - country2_feats
    final_feats.index = [i + '_diff' for i in final_feats.index]
    final_feats = pd.DataFrame(final_feats).T
    final_feats['home_team'] = country1
    final_feats['away_team'] = country2

    # add home information: the flags are derived from Russia being the home side, the
    # away side, or neither
    final_feats['home_advantage'] = (final_feats.home_team == 'Russia')
    final_feats['neutral'] = (final_feats.home_team != 'Russia') & (final_feats.away_team != 'Russia')

    # Head-to-head stats; pairs without any recorded match get NaN, filled with 0 below.
    final_feats = pd.merge(final_feats, hist_stats_w_oppo_agg, on=['home_team', 'away_team'], how='left')

    final_feats = final_feats.fillna(0)
    return final_feats[features]

In [71]:
# Sanity check: the feature row for one fixture.
get_features('Brazil', 'Serbia')#.values.reshape(1, -1).shape

Unnamed: 0,rank_diff,total_points_diff,previous_points_diff,cur_year_avg_diff,cur_year_avg_weighted_diff,last_year_avg_diff,last_year_avg_weighted_diff,two_year_ago_avg_diff,two_year_ago_weighted_diff,three_year_ago_avg_diff,three_year_ago_weighted_diff,neutral,home_advantage
0,-32,679.82,652,141.83,141.83,675.24,337.62,254.97,76.49,619.4,123.88,True,False


In [72]:
# Class probabilities for that fixture. Columns follow best_model.classes_
# (presumably [-1, 0, 1] = away win, draw, home win - confirm).
best_model.predict_proba(get_features('Brazil', 'Serbia'))

array([[ 0.09040994,  0.23173606,  0.677854  ]])

In [73]:
# Last ranking rows for one team; get_features() reads the final row.
rankings[rankings.country_full == 'Switzerland'].tail()

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,cur_year_avg,cur_year_avg_weighted,last_year_avg,last_year_avg_weighted,two_year_ago_avg,two_year_ago_weighted,three_year_ago_avg,three_year_ago_weighted,confederation,rank_date,year_month
56745,8,Switzerland,SUI,1190.44,1190,0,637.56,637.56,602.17,301.08,490.21,147.06,523.66,104.73,UEFA,2018-02-15,2018-02
56956,8,Switzerland,SUI,1196.61,1190,0,637.56,637.56,602.17,301.08,490.21,147.06,554.54,110.91,UEFA,2018-03-15,2018-03
57165,6,Switzerland,SUI,1179.21,1197,2,594.33,594.33,716.99,358.49,390.85,117.26,545.66,109.13,UEFA,2018-04-12,2018-04
57376,6,Switzerland,SUI,1179.21,1179,0,594.33,594.33,716.99,358.49,390.85,117.26,545.66,109.13,UEFA,2018-05-17,2018-05
57587,6,Switzerland,SUI,1198.72,1179,0,578.59,578.59,808.58,404.29,338.09,101.43,572.09,114.42,UEFA,2018-06-07,2018-06


In [74]:
# start from latest points as of 22 Jun

# matches_2018['points'] = []

In [75]:
from itertools import combinations

# NOTE(review): `opponents` is not used by the simulation below; kept in case other
# cells rely on it.
opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']

# Accumulators for expected points and summed win probability. They start as floats
# because fractional values are added to them.
matches_2018['points'] = 0.0
matches_2018['total_prob'] = 0.0

# Sorted so the groups are simulated and printed in a stable order; iterating a set
# gives an arbitrary order.
for group in sorted(matches_2018['Group'].dropna().unique()):
    print('___Group {}:___'.format(group))
    group_teams = matches_2018[matches_2018.Group == group].index
    # Every pair of teams in the group plays once; the first of the pair is "home".
    for home, away in combinations(group_teams, 2):
        print("{} vs. {}: ".format(home, away))
        feats = get_features(home, away)
        pred_prob = best_model.predict_proba(feats.values.reshape(1, -1))
        # Column order follows best_model.classes_; this assumes [-1, 0, 1], i.e.
        # away win, draw, home win.
        p_away, p_draw, p_home = pred_prob[0, 0], pred_prob[0, 1], pred_prob[0, 2]

        matches_2018.loc[home, 'total_prob'] += p_home
        matches_2018.loc[away, 'total_prob'] += p_away
        # calculate the expected score for each team: 3 points per win, 1 per draw
        matches_2018.loc[home, 'points'] += (3 * p_home + 1 * p_draw)
        matches_2018.loc[away, 'points'] += (3 * p_away + 1 * p_draw)
        print(pred_prob)


___Group A:___
Russia vs. Saudi Arabia: 
[[ 0.18269101  0.1786084   0.63870059]]
Russia vs. Egypt: 
[[ 0.23196089  0.51102603  0.25701308]]
Russia vs. Uruguay: 
[[ 0.6839496   0.22994288  0.08610752]]
Saudi Arabia vs. Egypt: 
[[ 0.44665682  0.35462456  0.19871862]]
Saudi Arabia vs. Uruguay: 
[[ 0.76927905  0.16298515  0.06773579]]
Egypt vs. Uruguay: 
[[ 0.57088928  0.25937874  0.16973198]]
___Group C:___
France vs. Australia: 
[[ 0.15781242  0.17644324  0.66574434]]
France vs. Peru: 
[[ 0.24884901  0.22742135  0.52372965]]
France vs. Denmark: 
[[ 0.38464743  0.25196318  0.36338939]]
Australia vs. Peru: 
[[ 0.39408278  0.37529387  0.23062335]]
Australia vs. Denmark: 
[[ 0.44566831  0.33954389  0.2147878 ]]
Peru vs. Denmark: 
[[ 0.33856005  0.31547164  0.34596831]]
___Group B:___
Portugal vs. Spain: 
[[ 0.18144506  0.14611699  0.67243795]]
Portugal vs. Morocco: 
[[ 0.10699265  0.52976339  0.36324396]]
Portugal vs. Iran: 
[[ 0.15193437  0.37270668  0.47535895]]
Spain vs. Morocco: 
[[ 0.19

In [76]:
# The two best teams of every group, ranked by expected points and then by summed win
# probability (both descending).
next_round = matches_2018.groupby('Group').apply(
    lambda grp: grp.sort_values(by=['points', 'total_prob'], ascending=False).head(2))


# Flatten to one row per qualifier, keeping only the group and team name.
next_round = next_round.drop('Group', axis=1).reset_index().loc[:, ['Group', 'Team']]

In [77]:
# Row order for the bracket. next_round holds two rows per group (winner, then
# runner-up), so positions (0, 3) pair the first group's winner with the second group's
# runner-up, and so on. Consecutive rows play each other.
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

next_round = next_round.loc[pairing].set_index('Team')

In [78]:
finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']

# Play the bracket round by round. Consecutive rows of next_round meet each other and
# the winners, in the same order, form the next round.
for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = len(next_round) // 2
    winners = []

    for i in range(iterations):
        home = next_round.index[i*2]
        away = next_round.index[i*2+1]
        print("{} vs. {}: ".format(home, away))
        feats = get_features(home, away)
        # Column order follows best_model.classes_; this assumes [-1, 0, 1], i.e.
        # away win, draw, home win.
        pred_prob = best_model.predict_proba(feats.values.reshape(1, -1))
        result = best_model.predict(feats.values.reshape(1, -1))[0]

        if result == 1: # home win
            print("  {} wins ({})".format(home, pred_prob[0, 2]))
            winners.append(home)
        elif result == 0: # draw
            # A knockout match needs a winner: the side with the higher win
            # probability advances.
            winner = home if pred_prob[0, 2] > pred_prob[0, 0] else away
            # Report the draw probability; the old message printed the home team's
            # name in its place.
            print("  Draw ({}) - {} advances".format(pred_prob[0, 1], winner))
            winners.append(winner)
        else: # away win
            print("  {} wins ({})".format(away, pred_prob[0, 0]))
            winners.append(away)

    next_round = next_round.loc[winners]
    print("\n")

___Starting of the round_of_16___
Uruguay vs. Spain: 
  Draw (Uruguay)
France vs. Iceland: 
  France wins (0.669893799304)
Brazil vs. Mexico: 
  Mexico wins (0.521008154384)
Belgium vs. Senegal: 
  Belgium wins (0.519185208141)
Russia vs. Portugal: 
  Portugal wins (0.751185645771)
Denmark vs. Argentina: 
  Argentina wins (0.46263252079)
Costa Rica vs. Germany: 
  Germany wins (0.76369786469)
England vs. Poland: 
  Poland wins (0.380727550755)


___Starting of the quarterfinal___
Uruguay vs. France: 
  France wins (0.374918416244)
Mexico vs. Belgium: 
  Belgium wins (0.476121036484)
Portugal vs. Argentina: 
  Portugal wins (0.451408619403)
Germany vs. Poland: 
  Germany wins (0.522529093798)


___Starting of the semifinal___
France vs. Belgium: 
  Draw (France)
Portugal vs. Germany: 
  Germany wins (0.462918249886)


___Starting of the final___
France vs. Germany: 
  Germany wins (0.529659975492)




## Further improvements:
- How to deal with draw? Take the model prediction mean/sd and run simulations?

- Other useful data/features:
    - weather
    - time of match
    - player data
    - twitter sentiment