In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## load data

Data sources:
- Historical match results
- FIFA ranking
- GDP
- Bookie odds
- 2018 match

In [2]:
# historical match results
matches = pd.read_csv("../data/results_w_2018fifa.csv", parse_dates=['date'], dayfirst=True)

# monthly ranking since Aug 1993
rankings = pd.read_csv("../data/fifa_ranking.csv", )#parse_dates=['rank_date'], dayfirst=True)

# 2018 match
matches_2018 = pd.read_csv("../data/matches_20180626.csv")

# GDP data
gdp = pd.read_excel("../data/Download-GDPPC-USD-countries.xls", skiprows=[0, 1])

# odds data
odds = pd.read_csv("../data/historical_odds.csv", na_values='-')

matches.shape, rankings.shape, matches_2018.shape, gdp.shape, odds.shape

((39044, 9), (57793, 16), (32, 21), (220, 49), (3192, 6))

In [3]:
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30 00:00:00,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08 00:00:00,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07 00:00:00,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06 00:00:00,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04 00:00:00,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [4]:
# some cleaning needs to be done: some country names in different files are differnt
print(matches.home_team.nunique(), rankings.country_full.nunique(), matches_2018.Team.nunique())
set(rankings.country_full) - set(matches.home_team) 

(241, 216, 32)


{'Bosnia and Herzegovina',
 'Brunei Darussalam',
 'Cape Verde Islands',
 'China PR',
 'Chinese Taipei',
 "C\xc3\xb4te d'Ivoire",
 'FYR Macedonia',
 'IR Iran',
 'Kyrgyz Republic',
 'Myanmar',
 'Netherlands Antilles',
 'RCS',
 'Republic of Ireland',
 'Serbia and Montenegro',
 'St Kitts and Nevis',
 'St Lucia',
 'St Vincent and the Grenadines',
 'S\xc3\xa3o Tom\xc3\xa9 e Pr\xc3\xadncipe',
 'Timor-Leste',
 'US Virgin Islands',
 'Zaire'}

In [5]:
# clean up data
rankings = rankings.replace({'IR Iran': 'Iran', 
                             'China PR': 'China'})
matches_2018.dropna(how='all', inplace=True)
matches_2018 = matches_2018.replace({'Columbia': 'Colombia',
                                     'Costarica': 'Costa Rica',
                                     'IRAN': 'Iran',
                                     'Porugal': 'Portugal',
                                     'Korea': 'Korea Republic'})

In [6]:
print(set(matches_2018.Team) - set(matches.home_team) )
print(set(matches_2018.Team) - set(rankings.country_full))

set([])
set([])


In [7]:
gdp = gdp.replace({'Russian Federation': 'Russia',
                   'United Kingdom': 'England',
                   'Congo': 'Congo DR',
                   'Republic of Korea': 'Korea Republic',
                   'Republic of Moldova': 'Moldova',
                   'Iran (Islamic Republic of)': 'Iran'})

gdp = gdp.drop('CountryID', axis=1).set_index('Country').stack()

gdp = gdp.reset_index().rename(columns={'level_1': 'year', 0: 'gdppc'})

odds = odds.replace({'South Korea': 'Korea Republic'})

In [8]:
matches_2018.set_index('Team', inplace=True)

In [9]:
matches_2018.head()

Unnamed: 0_level_0,Group,Previous appearances,Previous titles,Previous  finals,Previous  semifinals,Current FIFA rank,First match against,Match index,history with first opponent  W-L,history with  first opponent  goals,Second match  against,Match index.1,history with  second opponent  W-L,history with  second opponent  goals,Third match  against,Match index.2,history with  third opponent  W-L,history with  third opponent  goals,points,GD
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Russia,A,10,0,0,1,65,Saudi Arabia,1,-1.0,-2.0,Egypt,17,,,Uruguay,33,0.0,0.0,6,4
Saudi Arabia,A,4,0,0,0,63,Russia,1,1.0,2.0,Uruguay,18,1.0,1.0,Egypt,34,-5.0,-5.0,3,-5
Egypt,A,2,0,0,0,31,Uruguay,2,-1.0,-2.0,Russia,17,,,Saudi Arabia,34,5.0,5.0,0,-4
Uruguay,A,12,2,2,5,21,Egypt,2,1.0,2.0,Saudi Arabia,18,-1.0,-1.0,Russia,33,0.0,0.0,9,5
Portugal,B,6,0,0,2,3,Spain,3,-12.0,-31.0,Morocco,19,-1.0,-2.0,Iran,35,2.0,5.0,5,1


## Feature engineering
- Team level features:
    - weighted avg ranking in past x years 
    - home advantage - host or not
    - home advantage - same continent as hosting country or not
    - confederation
    - history with the opponent
    - Outcome of previous 3 matches
    - GDP per capita

### create outcome variable: 
- 1: home team won 
- 0: draw 
- -1: away team won

In [10]:
matches['result'] = matches.home_score - matches.away_score
matches.result = matches.result.clip(upper=1, lower=-1)

In [11]:
matches.result.value_counts()

 1    18986
-1    11003
 0     9055
Name: result, dtype: int64

### features from historical matches

In [12]:
# remove friendly matches and use only data starting from 1996
matches = matches.set_index('date')
# matches.index = pd.to_datetime(matches.index)
matches = matches[matches.tournament != 'Friendly']
matches = matches['1996':]

In [13]:
matches.shape

(12450, 9)

In [14]:
matches.head()

Unnamed: 0_level_0,home_team,away_team,home_score,away_score,tournament,city,country,neutral,result
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1996-01-10,Canada,Honduras,3,1,Gold Cup,Anaheim,USA,True,1
1996-01-10,Trinidad and Tobago,El Salvador,2,3,Gold Cup,Anaheim,USA,True,-1
1996-01-11,Mexico,St. Vincent and the Grenadines,5,0,Gold Cup,San Diego,USA,True,1
1996-01-12,Brazil,Canada,4,1,Gold Cup,Los Angeles,USA,True,1
1996-01-13,South Africa,Cameroon,3,0,African Cup of Nations,Johannesburg,South Africa,False,1


#### home advantage

In [15]:
matches['home_advantage'] = matches.country == matches.home_team

#### historical records with this opponent
for each home-away pair, calculate the win/draw count and rate    
All historical data is used. Time is not considered here. __NOTE:__ This introduces data leakage because stats are calculated using all data.. should fix later if have time

In [16]:
def get_win_count(x): 
    return (x==1).sum()

def get_draw_count(x): 
    return (x==0).sum()

historical_stats_w_oppo = matches.groupby(['home_team', 'away_team']).agg({'home_score': 'sum', 'away_score': 'sum', 
                                                 'result': ['count', get_win_count, get_draw_count]}).reset_index()
historical_stats_w_oppo.columns = ['home_team', 'away_team', 'total_away_score', 'total_home_score', 
                                   'total_count', 'home_win_count', 'home_draw_count']

# historical_stats_w_oppo.columns = ['_'.join(c) for c in historical_stats_w_oppo.columns]

historical_stats_w_oppo.head()

Unnamed: 0,home_team,away_team,total_away_score,total_home_score,total_count,home_win_count,home_draw_count
0,Afghanistan,Bangladesh,0,4,1,1,0
1,Afghanistan,Bhutan,3,6,3,2,0
2,Afghanistan,Cambodia,1,5,2,2,0
3,Afghanistan,India,4,0,1,0,0
4,Afghanistan,Japan,6,0,1,0,0


In [17]:
# matches[(matches.home_team == 'Australia') & (matches.away_team == 'France')]

In [18]:
historical_stats_w_oppo[(historical_stats_w_oppo.home_team == 'Australia') & (historical_stats_w_oppo.away_team == 'France')]

Unnamed: 0,home_team,away_team,total_away_score,total_home_score,total_count,home_win_count,home_draw_count
281,Australia,France,0,1,1,1,0


In [19]:
hist_stats_w_oppo_agg = historical_stats_w_oppo.copy()

# flip home team, away team so that they are in alphabetical order 
for i in hist_stats_w_oppo_agg.index:
    if hist_stats_w_oppo_agg.loc[i, 'home_team'] < hist_stats_w_oppo_agg.loc[i, 'away_team']: 
        hist_stats_w_oppo_agg.loc[i, 'home_team'], hist_stats_w_oppo_agg.loc[i, 'away_team'] = hist_stats_w_oppo_agg.loc[i, 'away_team'], hist_stats_w_oppo_agg.loc[i, 'home_team']
        hist_stats_w_oppo_agg.loc[i, 'total_home_score'], hist_stats_w_oppo_agg.loc[i, 'total_away_score'] = hist_stats_w_oppo_agg.loc[i, 'total_away_score'], hist_stats_w_oppo_agg.loc[i, 'total_home_score']
        hist_stats_w_oppo_agg.loc[i, 'home_win_count'] = hist_stats_w_oppo_agg.loc[i, 'total_count'] \
                - hist_stats_w_oppo_agg.loc[i, 'home_win_count'] - hist_stats_w_oppo_agg.loc[i, 'home_draw_count']

# then consolidate same home/away pairs
hist_stats_w_oppo_agg = hist_stats_w_oppo_agg.groupby(['home_team', 'away_team']).sum().reset_index()

In [20]:
# hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == 'France') & (hist_stats_w_oppo_agg.away_team == 'Australia')]

In [21]:
# hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == 'Australia') & (hist_stats_w_oppo_agg.away_team == 'France')]

In [22]:
# now duplicate and flip home away for easier joining later
hist_stats_w_oppo_agg2 = hist_stats_w_oppo_agg.copy()
hist_stats_w_oppo_agg2.columns = ['away_team', 'home_team', 'total_home_score', 'total_away_score', 
                                  'total_count', 'home_win_count', 'home_draw_count']
hist_stats_w_oppo_agg2.home_win_count = hist_stats_w_oppo_agg2.total_count - hist_stats_w_oppo_agg2.home_draw_count\
                                        - hist_stats_w_oppo_agg2.home_win_count

hist_stats_w_oppo_agg = pd.concat([hist_stats_w_oppo_agg, hist_stats_w_oppo_agg2], axis=0)

hist_stats_w_oppo_agg['home_win_rate'] = hist_stats_w_oppo_agg.home_win_count * 1.0 / hist_stats_w_oppo_agg.total_count
hist_stats_w_oppo_agg['home_draw_rate'] = hist_stats_w_oppo_agg.home_draw_count * 1.0 / hist_stats_w_oppo_agg.total_count

In [23]:
hist_stats_w_oppo_agg['total_score_diff'] = hist_stats_w_oppo_agg.total_home_score - hist_stats_w_oppo_agg.total_away_score

In [24]:
hist_stats_w_oppo_agg.head()

Unnamed: 0,away_team,home_draw_count,home_team,home_win_count,total_away_score,total_count,total_home_score,home_win_rate,home_draw_rate,total_score_diff
0,Albania,0,Andorra,0,3,1,0,0.0,0.0,-3
1,Algeria,3,Angola,1,6,5,6,0.2,0.6,0
2,Anguilla,0,Antigua and Barbuda,1,3,1,5,1.0,0.0,2
3,Albania,1,Armenia,1,6,4,5,0.25,0.25,-1
4,Andorra,0,Armenia,6,2,6,18,1.0,0.0,16


In [25]:
X = pd.merge(matches, hist_stats_w_oppo_agg, on=['home_team', 'away_team'], how='left')

### ranking features
use ranking difference between home and away team

In [26]:
# ranking data is missing for some months. resample rankings so that it is monthly. use previous month's data to fill if missing
rankings.rank_date = pd.to_datetime(rankings.rank_date)

rankings.set_index("rank_date", inplace=True)

rankings = rankings.groupby('country_full').resample('1M').ffill().reset_index(level=-1)

In [27]:
# ranking is monthly. join with matches on year-month
rankings['year_month'] = rankings.rank_date.apply(lambda x: pd.datetime.strftime(x, '%Y-%m'))
X['year_month'] = matches.index.map(lambda x: pd.datetime.strftime(x, '%Y-%m'))

In [28]:
X = pd.merge(rankings, X, 
             left_on=['country_full', 'year_month'],
             right_on=['home_team', 'year_month'])

X = pd.merge(rankings, X,
             left_on=['country_full', 'year_month'],
             right_on=['away_team', 'year_month'],
             suffixes=('_away', '_home'))

In [29]:
# create diff features
for c in [u'rank_away', 
       u'total_points_away', u'previous_points_away', u'rank_change_away',
       u'cur_year_avg_away', u'cur_year_avg_weighted_away',
       u'last_year_avg_away', u'last_year_avg_weighted_away',
       u'two_year_ago_avg_away', u'two_year_ago_weighted_away',
       u'three_year_ago_avg_away', u'three_year_ago_weighted_away']:
    feat_name = c[:-5]
    X[feat_name + '_diff'] = X[feat_name + '_home'] - X[c]

### GDP features

In [30]:
X['year'] = X.year_month.apply(lambda x: int(x[:-3]))

In [31]:
X = pd.merge(X, gdp, left_on=['home_team', 'year'], right_on=['Country', 'year'], how='left')
X = pd.merge(X, gdp, left_on=['away_team', 'year'], right_on=['Country', 'year'], how='left',
             suffixes=('_home', '_away'))

### Odds features

In [32]:
# just merge with odds data and see...
odds.head()

Unnamed: 0,away,cross,home,ones,twos,year
0,Argentina,3.29,Germany,2.36,3.35,2014
1,Netherlands,3.71,Brazil,2.16,3.36,2014
2,Argentina,3.23,Netherlands,3.07,2.54,2014
3,Germany,3.21,Brazil,2.65,2.93,2014
4,Costa Rica,4.16,Netherlands,1.53,7.15,2014


In [33]:
# flip home and away for easy joining
odds2 = odds.copy()
odds2.columns = ['home', 'cross', 'away', 'twos', 'ones', 'year']
odds = pd.concat([odds, odds2], axis=0)

In [34]:
X = pd.merge(X, odds, left_on=['home_team', 'away_team', 'year'],
                  right_on=['home', 'away', 'year'], how='left')

In [35]:
X.columns

Index([              u'rank_date_away',                    u'rank_away',
                  u'country_full_away',            u'country_abrv_away',
                  u'total_points_away',         u'previous_points_away',
                   u'rank_change_away',            u'cur_year_avg_away',
         u'cur_year_avg_weighted_away',           u'last_year_avg_away',
        u'last_year_avg_weighted_away',        u'two_year_ago_avg_away',
         u'two_year_ago_weighted_away',      u'three_year_ago_avg_away',
       u'three_year_ago_weighted_away',           u'confederation_away',
                         u'year_month',               u'rank_date_home',
                          u'rank_home',            u'country_full_home',
                  u'country_abrv_home',            u'total_points_home',
               u'previous_points_home',             u'rank_change_home',
                  u'cur_year_avg_home',   u'cur_year_avg_weighted_home',
                 u'last_year_avg_home',  u'last_yea

odds and gdp features do not appear to be significant in prediction, so they are removed from training

## Model

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [37]:
features = ['rank_diff',
 u'total_points_diff',
 u'previous_points_diff',
#  u'rank_change_diff',
 u'cur_year_avg_diff',
 u'cur_year_avg_weighted_diff',
 u'last_year_avg_diff',
 u'last_year_avg_weighted_diff',
 u'two_year_ago_avg_diff',
 u'two_year_ago_weighted_diff',
 u'three_year_ago_avg_diff',
 u'three_year_ago_weighted_diff',
 u'neutral', 
#                                                      'gdppc_home', 'gdppc_away', 
#                                                      'confederation_home', 'confederation_away',
                                                     'home_advantage',
#                                                      'cross', 'ones', 'twos', 
#                                                      'total_away_score', 'total_home_score', 
'total_score_diff',
'home_win_rate', 'home_draw_rate'
           ]
features

['rank_diff',
 u'total_points_diff',
 u'previous_points_diff',
 u'cur_year_avg_diff',
 u'cur_year_avg_weighted_diff',
 u'last_year_avg_diff',
 u'last_year_avg_weighted_diff',
 u'two_year_ago_avg_diff',
 u'two_year_ago_weighted_diff',
 u'three_year_ago_avg_diff',
 u'three_year_ago_weighted_diff',
 u'neutral',
 'home_advantage',
 'total_score_diff',
 'home_win_rate',
 'home_draw_rate']

In [38]:
# XX = X.fillna(0)
y = X.result
# XX = pd.get_dummies(XX[features])
XX = X[features]
XX.shape

(11269, 16)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(XX, y)
X_train.shape, X_test.shape

((8451, 16), (2818, 16))

In [40]:
y_test.value_counts()

 1    1398
-1     794
 0     626
Name: result, dtype: int64

In [41]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=6)
rf_model.fit(X_train, y_train)
y_pred_prob = rf_model.predict_proba(X_test)
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.73598296664300922

In [42]:
# let's do some hyperparameter tuning
param_grid = [{"max_depth": [5, 7, 9], "n_estimators": [200, 400, 600]}]

rf_model = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid, cv=5, scoring='accuracy')

rf_model.fit(X_train, y_train)
best_model = rf_model.best_estimator_

print("Grid scores on development set:")
means = rf_model.cv_results_['mean_test_score']
stds = rf_model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rf_model.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
    
print("Best parameters set found on development set:")
print(rf_model.best_params_)

y_pred = best_model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred,))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

Grid scores on development set:
0.729 (+/-0.019) for {'n_estimators': 200, 'max_depth': 5}
0.727 (+/-0.018) for {'n_estimators': 400, 'max_depth': 5}
0.728 (+/-0.017) for {'n_estimators': 600, 'max_depth': 5}
0.734 (+/-0.012) for {'n_estimators': 200, 'max_depth': 7}
0.734 (+/-0.014) for {'n_estimators': 400, 'max_depth': 7}
0.735 (+/-0.014) for {'n_estimators': 600, 'max_depth': 7}
0.736 (+/-0.011) for {'n_estimators': 200, 'max_depth': 9}
0.738 (+/-0.018) for {'n_estimators': 400, 'max_depth': 9}
0.738 (+/-0.018) for {'n_estimators': 600, 'max_depth': 9}
Best parameters set found on development set:
{'n_estimators': 400, 'max_depth': 9}
('Accuracy: ', 0.74769339957416603)
Confusion matrix:
[[ 562   71  161]
 [ 124  277  225]
 [  74   56 1268]]


In [43]:
np.unique(y_pred, return_counts=True)

(array([-1,  0,  1], dtype=int64), array([ 760,  404, 1654], dtype=int64))

## Simulations

### group rounds
- win - 3 points
- draw - 1 point
- lose - 0 point

In [118]:
# best_model = rf_model

In [44]:
# prepare features for the 2018 matches - get from latest ranking
def get_features(country1, country2):
    feats = [c[:-5] for c in features[:12]]
    country1_feats = rankings[rankings.country_full == country1].iloc[-1, :][feats]
    country2_feats = rankings[rankings.country_full == country2].iloc[-1, :][feats]
#     print(country1_feats, country2_feats)
    final_feats = country1_feats - country2_feats
    final_feats.index = [i + '_diff' for i in final_feats.index]
    final_feats = pd.DataFrame(final_feats).T
    final_feats['home_team'] = country1
    final_feats['away_team'] = country2
    # add home information
    final_feats['home_advantage'] = (final_feats.home_team == 'Russia')
    final_feats['neutral'] = (final_feats.home_team != 'Russia') & (final_feats.away_team != 'Russia')
    final_feats = pd.merge(final_feats, hist_stats_w_oppo_agg, on=['home_team', 'away_team'], how='left')

    
#     final_feats['total_home_score'] = hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == country1) 
#                                                 & (hist_stats_w_oppo_agg.away_team == country2)].total_home_score.values[0]
#     final_feats['home_win_rate'] = hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == country1) 
#                                                 & (hist_stats_w_oppo_agg.away_team == country2)].home_win_rate.values[0]
#     final_feats['total_away_score'] = hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == country1) 
#                                                 & (hist_stats_w_oppo_agg.away_team == country2)].total_away_score.values[0]
#     final_feats['home_draw_rate'] = hist_stats_w_oppo_agg[(hist_stats_w_oppo_agg.home_team == country1) 
#                                                 & (hist_stats_w_oppo_agg.away_team == country2)].home_draw_rate.values[0]
    
        
    final_feats[['total_away_score','total_home_score','total_score_diff']] = final_feats[['total_away_score','total_home_score','total_score_diff']].fillna(0)
    final_feats[['home_win_rate']] = final_feats[['home_win_rate']].fillna(0.4)
    final_feats[['home_draw_rate']] = final_feats[['home_draw_rate']].fillna(0.2)
    
    return final_feats[features]

In [45]:
get_features('Brazil', 'Serbia')#.values.reshape(1, -1).shape

Unnamed: 0,rank_diff,total_points_diff,previous_points_diff,cur_year_avg_diff,cur_year_avg_weighted_diff,last_year_avg_diff,last_year_avg_weighted_diff,two_year_ago_avg_diff,two_year_ago_weighted_diff,three_year_ago_avg_diff,three_year_ago_weighted_diff,neutral,home_advantage,total_score_diff,home_win_rate,home_draw_rate
0,-32,679.82,652,141.83,141.83,675.24,337.62,254.97,76.49,619.4,123.88,True,False,0.0,0.4,0.2


In [47]:
# start from latest points as of 22 Jun
# matches_2018['points'] = 0
current_results = pd.read_csv("../data/current_results.csv")
# for i, row in current_results.dropna().iterrows():
#     if row.home_score > row.away_score:
#         matches_2018.loc[row.home_team, 'points'] += 3
#     elif row.home_score == row.away_score:
#         matches_2018.loc[row.home_team, 'points'] += 1
#         matches_2018.loc[row.away_team, 'points'] += 1
#     else:
#         matches_2018.loc[row.away_team, 'points'] += 3

In [48]:
matches_2018['total_prob'] = 0

for i, row in current_results[current_results.home_score.isnull()].iterrows():
    print("Group: {}".format(row.group))
    home = row.home_team
    away = row.away_team
    print("{} vs. {}: ".format(home, away))
    feats = get_features(home, away)
    pred_prob = best_model.predict_proba(feats.values.reshape(1, -1))
#         result = rf_model.predict(feats.values.reshape(1, -1))
    matches_2018.loc[home, 'total_prob'] += pred_prob[0, 2]
    matches_2018.loc[away, 'total_prob'] += pred_prob[0, 0]
    # calculate the expected score for each team
    matches_2018.loc[home, 'points'] += (3 * pred_prob[0, 2] + 1 * pred_prob[0, 1])
    matches_2018.loc[away, 'points'] += (3 * pred_prob[0, 0] + 1 * pred_prob[0, 1])
    print(pred_prob)
        
#         if result == 1: # home win
#             matches_2018.loc[home, 'points'] += 3
#             print("  {} wins ({})".format(home, pred_prob[0, 2]))
#         elif result == 0: # draw
#             matches_2018.loc[home, 'points'] += 1
#             matches_2018.loc[away, 'points'] += 1
#             print("  Draw ({})".format(home, pred_prob[0, 1]))
#         else: # away win 
#             matches_2018.loc[away, 'points'] += 3
#             print("  {} wins ({})".format(away, pred_prob[0, 0]))


Group: C
France vs. Denmark: 
[[ 0.48889618  0.05812602  0.4529778 ]]
Group: C
Australia vs. Peru: 
[[ 0.39761638  0.24431632  0.35806731]]
Group: E
Brazil vs. Serbia: 
[[ 0.27045402  0.25031821  0.47922776]]
Group: E
Switzerland vs. Costa Rica: 
[[ 0.36512069  0.22174839  0.41313092]]
Group: D
Argentina vs. Nigeria: 
[[ 0.02027861  0.01629831  0.96342308]]
Group: D
Iceland vs. Croatia: 
[[ 0.43140344  0.22693961  0.34165694]]
Group: G
Belgium vs. England: 
[[ 0.0747793   0.91001498  0.01520572]]
Group: G
Panama vs. Tunisia: 
[[ 0.49986505  0.22204769  0.27808726]]
Group: F
Germany vs. Korea Republic: 
[[ 0.10528522  0.01967887  0.87503591]]
Group: F
Mexico vs. Sweden: 
[[ 0.33002493  0.20861619  0.46135888]]
Group: H
Poland vs. Japan: 
[[ 0.87497615  0.06249508  0.06252877]]
Group: H
Senegal vs. Colombia: 
[[ 0.30542098  0.19676953  0.49780949]]


In [49]:
next_round = matches_2018.groupby('Group').apply(lambda x: 
                                    x.sort_values(by=['points', 'total_prob'], ascending=False).iloc[:2, :])


next_round = next_round.drop('Group', axis=1).reset_index()[['Group', 'Team']]

In [50]:
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

next_round = next_round.loc[pairing]
next_round.set_index('Team', inplace=True)

In [51]:
finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']

for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round) / 2)
    winners = []

    for i in range(iterations):
        home = next_round.index[i*2]
        away = next_round.index[i*2+1]
        print("{} vs. {}: ".format(home, away))
        feats = get_features(home, away)
        pred_prob = best_model.predict_proba(feats.values.reshape(1, -1))
        result = best_model.predict(feats.values.reshape(1, -1))
        
        if result == 1: # home win
            print("  {} wins ({})".format(home, pred_prob[0, 2]))
            winners.append(home)
        elif result == 0: # draw
            print(" Draw ({})".format(pred_prob[0, 1]))
            if pred_prob[0, 2] > pred_prob[0, 0]:
                winners.append(home)
                print(home)
            else:
                winners.append(away)
                print(away)
        else: # away win 
            print("  {} wins ({})".format(away, pred_prob[0, 0]))
            winners.append(away)
                
    next_round = next_round.loc[winners]
    print("\n")

___Starting of the round_of_16___
Uruguay vs. Spain: 
  Spain wins (0.900514222965)
France vs. Argentina: 
  France wins (0.497120633964)
Brazil vs. Germany: 
  Brazil wins (0.541622931968)
England vs. Senegal: 
  England wins (0.47003233773)
Russia vs. Portugal: 
  Portugal wins (0.661327643831)
Denmark vs. Croatia: 
 Draw (0.413831580825)
Denmark
Switzerland vs. Mexico: 
  Switzerland wins (0.490335581012)
Belgium vs. Japan: 
 Draw (0.590095851519)
Japan


___Starting of the quarterfinal___
Spain vs. France: 
  France wins (0.364736667069)
Brazil vs. England: 
  Brazil wins (0.945230891881)
Portugal vs. Denmark: 
  Portugal wins (0.492289756325)
Switzerland vs. Japan: 
  Switzerland wins (0.466096719336)


___Starting of the semifinal___
France vs. Brazil: 
  France wins (0.605572337952)
Portugal vs. Switzerland: 
  Switzerland wins (0.510262223087)


___Starting of the final___
France vs. Switzerland: 
  France wins (0.4510103148)




## use all data to train model

In [53]:
best_model = RandomForestClassifier(n_estimators=400, max_depth=7)
best_model.fit(XX, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=400, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [54]:
# Reset to latest points as of 22 Jun
matches_2018['points'] = 0
current_results = pd.read_csv("../data/current_results.csv")
for i, row in current_results.dropna().iterrows():
    if row.home_score > row.away_score:
        matches_2018.loc[row.home_team, 'points'] += 3
    elif row.home_score == row.away_score:
        matches_2018.loc[row.home_team, 'points'] += 1
        matches_2018.loc[row.away_team, 'points'] += 1
    else:
        matches_2018.loc[row.away_team, 'points'] += 3

In [55]:
matches_2018['total_prob'] = 0

for i, row in current_results[current_results.home_score.isnull()].iterrows():
    print("Group: {}".format(row.group))
    home = row.home_team
    away = row.away_team
    print("{} vs. {}: ".format(home, away))
    feats = get_features(home, away)
    pred_prob = best_model.predict_proba(feats.values.reshape(1, -1))
#         result = rf_model.predict(feats.values.reshape(1, -1))
    matches_2018.loc[home, 'total_prob'] += pred_prob[0, 2]
    matches_2018.loc[away, 'total_prob'] += pred_prob[0, 0]
    # calculate the expected score for each team
    matches_2018.loc[home, 'points'] += (3 * pred_prob[0, 2] + 1 * pred_prob[0, 1])
    matches_2018.loc[away, 'points'] += (3 * pred_prob[0, 0] + 1 * pred_prob[0, 1])
    print(pred_prob)
        
#         if result == 1: # home win
#             matches_2018.loc[home, 'points'] += 3
#             print("  {} wins ({})".format(home, pred_prob[0, 2]))
#         elif result == 0: # draw
#             matches_2018.loc[home, 'points'] += 1
#             matches_2018.loc[away, 'points'] += 1
#             print("  Draw ({})".format(home, pred_prob[0, 1]))
#         else: # away win 
#             matches_2018.loc[away, 'points'] += 3
#             print("  {} wins ({})".format(away, pred_prob[0, 0]))


Group: C
France vs. Denmark: 
[[ 0.36883757  0.08330065  0.54786178]]
Group: C
Australia vs. Peru: 
[[ 0.38832396  0.20815866  0.40351738]]
Group: E
Brazil vs. Serbia: 
[[ 0.29047672  0.25970585  0.44981743]]
Group: E
Switzerland vs. Costa Rica: 
[[ 0.30395023  0.23733151  0.45871825]]
Group: D
Argentina vs. Nigeria: 
[[ 0.02922528  0.01122532  0.9595494 ]]
Group: D
Iceland vs. Croatia: 
[[ 0.50511791  0.22797267  0.26690943]]
Group: G
Belgium vs. England: 
[[ 0.08506794  0.88630779  0.02862427]]
Group: G
Panama vs. Tunisia: 
[[ 0.43408828  0.22186805  0.34404367]]
Group: F
Germany vs. Korea Republic: 
[[ 0.07612984  0.02525847  0.89861169]]
Group: F
Mexico vs. Sweden: 
[[ 0.33025763  0.20284717  0.4668952 ]]
Group: H
Poland vs. Japan: 
[[ 0.80800022  0.06762963  0.12437015]]
Group: H
Senegal vs. Colombia: 
[[ 0.29306545  0.19086544  0.5160691 ]]


In [56]:
next_round = matches_2018.groupby('Group').apply(lambda x: 
                                    x.sort_values(by=['points', 'total_prob'], ascending=False).iloc[:2, :])


next_round = next_round.drop('Group', axis=1).reset_index()[['Group', 'Team']]

In [57]:
# pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]
pairing = [1,3,4,7,8,11,12,15,0,2,5,6,9,10,13,14] #this is a hack to keep the grouping same as real world


next_round = next_round.loc[pairing]
next_round.set_index('Team', inplace=True)

In [58]:
finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']

for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round) / 2)
    winners = []

    for i in range(iterations):
        home = next_round.index[i*2]
        away = next_round.index[i*2+1]
        print("{} vs. {}: ".format(home, away))
        feats = get_features(home, away)
        pred_prob = best_model.predict_proba(feats.values.reshape(1, -1))
        result = best_model.predict(feats.values.reshape(1, -1))
        
        if result == 1: # home win
            print("  {} wins ({})".format(home, pred_prob[0, 2]))
            winners.append(home)
        elif result == 0: # draw
            print("  Draw ({})".format(pred_prob[0, 1]))
            if pred_prob[0, 2] > pred_prob[0, 0]:
                winners.append(home)
                print(home)
            else:
                winners.append(away)
                print(away)
        else: # away win 
            print("  {} wins ({})".format(away, pred_prob[0, 0]))
            winners.append(away)
                
    next_round = next_round.loc[winners]
    print("\n")

___Starting of the round_of_16___
Russia vs. Spain: 
  Spain wins (0.929768246174)
France vs. Iceland: 
  France wins (0.624626332816)
Switzerland vs. Germany: 
  Germany wins (0.418418047455)
England vs. Senegal: 
  England wins (0.487870661483)
Uruguay vs. Portugal: 
  Uruguay wins (0.418729757209)
Denmark vs. Croatia: 
  Draw (0.339273052369)
Denmark
Brazil vs. Mexico: 
  Brazil wins (0.498896935175)
Belgium vs. Japan: 
  Draw (0.529899631427)
Japan


___Starting of the quarterfinal___
Spain vs. France: 
  Draw (0.357205771397)
Spain
Germany vs. England: 
  England wins (0.474212490718)
Uruguay vs. Denmark: 
  Denmark wins (0.86048557036)
Brazil vs. Japan: 
  Brazil wins (0.466414874576)


___Starting of the semifinal___
Spain vs. England: 
  Draw (0.885346997554)
England
Denmark vs. Brazil: 
  Brazil wins (0.915359582965)


___Starting of the final___
England vs. Brazil: 
  Brazil wins (0.883419212862)




## Further improvements:
- How to deal with draw? 
- Run simulations for many rounds: 
    - Use the model prediction mean/sd
    - or Use the probability
    - Add some random disturbance

- Other useful data/features:
    - weather
    - time of match
    - player data
    - twitter sentiment
    - odds: http://www.football-data.co.uk/data.php