In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import random

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

In [3]:
def fetch_worldcup_raw_data(url):
    return pd.read_csv(url)

In [4]:
matches = fetch_worldcup_raw_data("https://raw.githubusercontent.com/jfjelstul/worldcup/master/data-csv/matches.csv")
matches.head()

Unnamed: 0,key_id,tournament_id,tournament_name,match_id,match_name,stage_name,group_name,group_stage,knockout_stage,replayed,replay,match_date,match_time,stadium_id,stadium_name,city_name,country_name,home_team_id,home_team_name,home_team_code,away_team_id,away_team_name,away_team_code,score,home_team_score,away_team_score,home_team_score_margin,away_team_score_margin,extra_time,penalty_shootout,score_penalties,home_team_score_penalties,away_team_score_penalties,result,home_team_win,away_team_win,draw
0,1,WC-1930,1930 FIFA World Cup,M-1930-01,France v Mexico,group stage,Group 1,1,0,0,0,1930-07-13,15:00,S-193,Estadio Pocitos,Montevideo,Uruguay,T-28,France,FRA,T-44,Mexico,MEX,4–1,4,1,3,-3,0,0,0-0,0,0,home team win,1,0,0
1,2,WC-1930,1930 FIFA World Cup,M-1930-02,United States v Belgium,group stage,Group 4,1,0,0,0,1930-07-13,15:00,S-192,Estadio Gran Parque Central,Montevideo,Uruguay,T-80,United States,USA,T-06,Belgium,BEL,3–0,3,0,3,-3,0,0,0-0,0,0,home team win,1,0,0
2,3,WC-1930,1930 FIFA World Cup,M-1930-03,Yugoslavia v Brazil,group stage,Group 2,1,0,0,0,1930-07-14,12:45,S-192,Estadio Gran Parque Central,Montevideo,Uruguay,T-84,Yugoslavia,YUG,T-09,Brazil,BRA,2–1,2,1,1,-1,0,0,0-0,0,0,home team win,1,0,0
3,4,WC-1930,1930 FIFA World Cup,M-1930-04,Romania v Peru,group stage,Group 3,1,0,0,0,1930-07-14,14:50,S-193,Estadio Pocitos,Montevideo,Uruguay,T-59,Romania,ROU,T-54,Peru,PER,3–1,3,1,2,-2,0,0,0-0,0,0,home team win,1,0,0
4,5,WC-1930,1930 FIFA World Cup,M-1930-05,Argentina v France,group stage,Group 1,1,0,0,0,1930-07-15,16:00,S-192,Estadio Gran Parque Central,Montevideo,Uruguay,T-03,Argentina,ARG,T-28,France,FRA,1–0,1,0,1,-1,0,0,0-0,0,0,home team win,1,0,0


In [5]:
def nan_check():
    total = matches.isnull().sum().sort_values(ascending=False)
    percent = (np.round(matches.isnull().sum()/matches.isnull().count()*100, 1)).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', '%'])
    return missing_data

nan_check()

Unnamed: 0,Total,%
key_id,0,0.0
home_team_code,0,0.0
away_team_name,0,0.0
away_team_code,0,0.0
score,0,0.0
home_team_score,0,0.0
away_team_score,0,0.0
home_team_score_margin,0,0.0
away_team_score_margin,0,0.0
extra_time,0,0.0


In [6]:
# drop unimportant columns
matches_clean = matches.drop(['key_id', 'tournament_id', 'tournament_name', 'stage_name', 'replay', \
              'stadium_id', 'home_team_id', 'away_team_id', 'home_team_code', 'away_team_code', \
              'score', 'score_penalties', \
              'result', 'match_id', 'match_name', 'match_time'], axis=1)
matches_clean.head(1)

Unnamed: 0,group_name,group_stage,knockout_stage,replayed,match_date,stadium_name,city_name,country_name,home_team_name,away_team_name,home_team_score,away_team_score,home_team_score_margin,away_team_score_margin,extra_time,penalty_shootout,home_team_score_penalties,away_team_score_penalties,home_team_win,away_team_win,draw
0,Group 1,1,0,0,1930-07-13,Estadio Pocitos,Montevideo,Uruguay,France,Mexico,4,1,3,-3,0,0,0,0,1,0,0


In [7]:
def split_train_test(data):
    date = '2022-01-01'
    data['match_date'] = pd.to_datetime(data['match_date'])
    train_set = data[data['match_date'] < date].drop('match_date', axis=1)
    test_set = data[data['match_date'] >= date].drop('match_date', axis=1)
    return train_set, test_set

train_set, test_set = split_train_test(matches_clean)

In [8]:
data_train = train_set.copy().drop(['home_team_score', 'away_team_score'], axis=1)
data_test = test_set.copy().drop(['home_team_score', 'away_team_score'], axis=1)
data_train_labels = train_set[['home_team_score', 'away_team_score']]
data_test_labels = test_set[['home_team_score', 'away_team_score']]
data_train.head(1)

Unnamed: 0,group_name,group_stage,knockout_stage,replayed,stadium_name,city_name,country_name,home_team_name,away_team_name,home_team_score_margin,away_team_score_margin,extra_time,penalty_shootout,home_team_score_penalties,away_team_score_penalties,home_team_win,away_team_win,draw
0,Group 1,1,0,0,Estadio Pocitos,Montevideo,Uruguay,France,Mexico,3,-3,0,0,0,0,1,0,0


In [11]:
cat_attribs = ['stadium_name', 'city_name', 'country_name', 'home_team_name', 'away_team_name', 'group_name']
num_attribs = [column for column in data_train.columns if column not in cat_attribs]
print(num_attribs)

pipeline = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
    ('num', StandardScaler(), num_attribs)
])

['group_stage', 'knockout_stage', 'replayed', 'home_team_score_margin', 'away_team_score_margin', 'extra_time', 'penalty_shootout', 'home_team_score_penalties', 'away_team_score_penalties', 'home_team_win', 'away_team_win', 'draw']


In [12]:
x_train = pipeline.fit_transform(data_train).toarray()
y_train = data_train_labels.to_numpy()

x_test = pipeline.transform(data_test).toarray()
y_test = data_test_labels.to_numpy()

In [13]:
x_train.shape

(900, 552)

In [14]:
x_test.shape

(64, 552)

Train

In [15]:
model = LogisticRegression(random_state=1, max_iter=1000)
multi_target = MultiOutputClassifier(model, n_jobs=2)

In [16]:
p = np.random.permutation(len(x_train))
x_train = x_train[p]
y_train = y_train[p]
y_train_pred = multi_target.fit(x_train, y_train).predict(x_train)

In [17]:
(y_train==y_train_pred).mean()

0.8322222222222222

In [18]:
# p = np.random.permutation(len(x_test))
# x_test = x_test[p]
# y_test = y_test[p]
y_test_pred = multi_target.predict(x_test)

In [19]:
(y_test==y_test_pred).mean()

0.421875