In [1]:
"""
    Ranking:
        1.1 Hierarchical Clustering (ranking_prediction.ipynb)
        1.2 Map labels to future matches as home and away of past season
            1.2.1 If team is new, label it as normal (which is a not promoted team)
        1.3 Compute difference in ranking (NO because of no sequential)
        1.4 Future work: study the week development of the league to see if there is a point where
            it can be started to predict the actual behavior of the team as the promotion label
    Prediction:
        2.0 Drop features --> computation differences (visualization.ipynb)
        2.1 Features TEST
            2.1.0 First match as median of past season (already started) TODO
            2.1.2 Median of current season (home and away games are definetely a factor)
            2.1.2.1 Median at home/away <-- Just this
            2.1.2.2 Past average of last n games (test with 3, 5, etc)
            2.1.2.3 Average of last n games at home/away
"""

'\n    Ranking:\n        1.1 Hierarchical Clustering (ranking_prediction.ipynb)\n        1.2 Map labels to future matches as home and away of past season\n            1.2.1 If team is new, label it as normal (which is a not promoted team)\n        1.3 Compute difference in ranking (NO because of no sequential)\n        1.4 Future work: study the week development of the league to see if there is a point where\n            it can be started to predict the actual behavior of the team as the promotion label\n    Prediction:\n        2.0 Drop features --> computation differences (visualization.ipynb)\n        2.1 Features TEST\n            2.1.0 First match as median of past season (already started) TODO\n            2.1.2 Median of current season (home and away games are definetely a factor)\n            2.1.2.1 Median at home/away <-- Just this\n            2.1.2.2 Past average of last n games (test with 3, 5, etc)\n            2.1.2.3 Average of last n games at home/away\n'

In [1]:
import numpy as np
import pandas as pd
import playstyle
import ranking

In [2]:
"""
    Everything to create the clusters
"""
from sklearn import preprocessing
from scipy.cluster.hierarchy import linkage, cophenet, fcluster
from scipy.spatial.distance import pdist

# Returns Z, coph_matrix and best cophence score from HC
def HierarchicalClustering(data, label):
    methods = ["single","complete","average","centroid","ward"]

    # Pass the dataset into pdist to get your proximity matrix for calculating CPCC
    proximity_matrix = pdist(data)

    best_coph = -1
    best_method = None

    for method in methods:
        Z = linkage(data, method)
        coph, coph_matrix = cophenet(Z, proximity_matrix)
        if coph > best_coph:
            best_coph = coph
            best_method = method
            best_matrix = coph_matrix
        Z = linkage(data, best_method)
        coph_matrix = cophenet(Z)
    return Z, coph_matrix, best_coph

#Returns a dictionary with the clusters in the form
#"id": "season" : point -> Attr object
def dct_clusters(dct_clusters, Z, coph_matrix, dendo_label, criterion='distance', real=None):
    if real != None:
        clusters = real[2]
    else:
        clusters = fcluster(Z, t=coph_matrix[0], criterion=criterion)
    for i in range(0, len(dendo_label)):
        point = ranking.Attr(dendo_label[i], clusters[i])
        if point.value2 not in dct_clusters:
            dct_clusters[point.value2] = {}
            dct_clusters[point.value2][point.value3] = point
        else:
            dct_clusters[point.value2][point.value3] = point
    return dct_clusters, clusters

# Return the cluster of an specific type
def get_cluster_of_type(all_dct, country, league, seasons, target_col, clean_type, real=None):
    # Data initialization
    data, column_names = ranking.concat_data(country, league, seasons, target_col, clean_type)
    all_data, all_season, all_names, all_target = ranking.get_all_data(data, len(target_col))
    # Use Agglomerative
    all_data = preprocessing.StandardScaler().fit_transform(all_data)
    dendo_label = ranking.label_team_season(all_names, all_season)
    Z, coph_matrix, coph = HierarchicalClustering(all_data, dendo_label)
    if real == None:
        all_dct, all_lst = dct_clusters(all_dct, Z, coph_matrix, dendo_label)
    else:
        all_dct, all_lst = dct_clusters(all_dct, Z, coph_matrix, dendo_label, real=all_target)
    return all_dct

#Returns three clusters as: overall, home and away performance
def get_clusters(country, league, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    seasons = sorted(seasons, reverse=True)
    # Targets
    target_col = ["rank", "points", "description"]
    # Gets the dictionary
    overall_dct = dict()
    home_dct = dict()
    away_dct = dict()
    real_dct = dict()
    for season in seasons:
        season = [season]
        real_dct = get_cluster_of_type(overall_dct, country, league, season, target_col, clean_type=None, real=True)
        overall_dct = get_cluster_of_type(overall_dct, country, league, season, target_col, clean_type=None)
        home_dct = get_cluster_of_type(home_dct, country, league, season, target_col, clean_type='home')
        away_dct = get_cluster_of_type(away_dct, country, league, season, target_col, clean_type='away')
    return (overall_dct, home_dct, away_dct, real_dct)

In [3]:
"""
    Creates the statistics dataset
"""
def get_statistics(country, league, curr_week, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season-1, start_season - year_window, -1))
    seasons = sorted(seasons, reverse=True)
    # Data initialization
    data = []
    # Current week
    df, target = playstyle.df_season(country, league, season, curr_week, drop_goals=False, drop_fixture=False)
    tup = (season, df, target)
    data.append(tup)
    # Past years
    for season in seasons:
        df, target = playstyle.df_season(country, league, season, 38, drop_goals=False)
        tup = (season, df, target)
        data.append(tup)
    all_data, all_target = playstyle.get_all(data)
    return all_data, all_target

In [4]:
"""
    Function that maps the dataset with the clusters
"""
def max_appearance_rank(dct, season):
    dct_rank = dict()
    season = str(int(season))
    for team in dct:
        if season in dct[team]:
            rank = dct[team][season].value
            if rank not in dct_rank:
                dct_rank[rank] = 1
            else:
                dct_rank[rank] += 1
    return max(dct_rank.keys(), key=dct_rank.get)

def get_rank(df, dct, team):
    rank = []
    for index, row in df.iterrows():
        season = row["season"]
        season = season - 1
        if str(row[team]) in dct:
            if str(season) in dct[str(row[team])]:
                rank.append(dct[str(row[team])][str(season)].value)
            else:
                max_appear = max_appearance_rank(dct, season)
                rank.append(max_appear)
        else:
            max_appear = max_appearance_rank(dct, season)
            rank.append(max_appear)
    return rank

def get_data(dct, statistics):
    statistics["home_team.overall_rank"] = get_rank(statistics, dct[0], "home_team.id")
    statistics["away_team.overall_rank"] = get_rank(statistics, dct[0], "away_team.id")
    statistics["home_team.ranking"] = get_rank(statistics, dct[1], "home_team.id")
    statistics["away_team.ranking"] = get_rank(statistics, dct[2], "away_team.id")
    statistics["home_team.real_rank"] = get_rank(statistics, dct[3], "home_team.id")
    statistics["away_team.real_rank"] = get_rank(statistics, dct[3], "away_team.id")
    return statistics, statistics["home_team.overall_rank"].values.reshape(-1, 1), statistics["away_team.overall_rank"].values.reshape(-1, 1), statistics["home_team.ranking"].values.reshape(-1, 1), statistics["away_team.ranking"].values.reshape(-1, 1),statistics["home_team.real_rank"].values.reshape(-1, 1), statistics["away_team.real_rank"].values.reshape(-1, 1)

In [5]:
def get_median(df, method=None, season=None, week=None):
    # GET median for team_home.stats_home. team_home.stats_away. team_away.stats_home. team_away.stats_away.
    # Filter df according to method
    if method == '1':
        print("Current and past season median")
        df = df[(df["season"].isin([season, season-1]))]
    elif method == '2':
        print("Current and all past seasons median")
        df = df[(df["season"] <= season)]
    elif method == '3':
        print("Current season only median")
        # First and second week zero/one home/away games
        if week > 3:
            df = df[(df["season"] == season)]
        else:
            df = df[(df["season"].isin([season, season-1]))]
    elif method == '4':
        print("Last 5 games median")
        if week > 5:
            weeks = [w for w in range(week-1, week-6, -1)]
            df = df[(df["season"] == season) & (df["week"].isin(weeks))]
        else:
            df = df[(df["season"].isin([season, season-1]))]
        
    # Median home
    extra_columns = df.filter(["goals_home"]).columns
    columns_home = df.filter(regex='^stats_home').columns
    columns_home = columns_home.append(extra_columns)
    median_home = df.groupby(['home_team.id'], as_index=True)[columns_home].median()
    # Median away
    extra_columns = df.filter(["goals_away"]).columns
    columns_away = df.filter(regex='^stats_away').columns
    columns_away = columns_away.append(extra_columns)
    median_away = df.groupby(['away_team.id'], as_index=True)[columns_away].median()
    if method == '5':
        print("Current season only mean")
        # First and second week zero/one home/away games
        if week > 3:
            df = df[(df["season"] == season)]
        else:
            df = df[(df["season"].isin([season, season-1]))]
        mean_home = df.groupby(['home_team.id'], as_index=True)[columns_home].mean()
        mean_away = df.groupby(['away_team.id'], as_index=True)[columns_away].mean()
        return mean_home, mean_away
    return median_home, median_away

In [6]:
def create_test_set(df, teams, season, week, pezzali=True, method=None, extras=None):
    columns_home = ["goals_home", "stats_home.c_red", "stats_home.s_total", "stats_home.s_off_g", "stats_home.s_on_g", "stats_home.s_in", 
                   "stats_home.saves", "stats_home.s_blocked", "stats_home.c_yellow", "stats_home.s_out"]
    columns_away = ["goals_away", "stats_away.c_red", "stats_away.s_total", "stats_away.s_off_g", "stats_away.s_on_g", "stats_away.s_in", 
                   "stats_away.saves", "stats_away.s_blocked", "stats_away.c_yellow", "stats_away.s_out"]
    columns = ["season", "week", "id"]
    columns = columns_home + columns_away + columns
    columns_pezzali = columns
    
    if pezzali == False or method == '5' or extras == '4' or extras == '3':
        extra_columns = df.filter(["goals_home"]).columns
        columns_home = df.filter(regex='^stats_home').columns
        columns_home = columns_home.append(extra_columns)
        extra_columns = df.filter(["goals_away"]).columns
        columns_away = df.filter(regex='^stats_away').columns
        columns_away = columns_away.append(extra_columns)
        columns = columns_home.append(columns_away)
        
    
    test_set = pd.DataFrame(columns=columns)
    home_teams = [match[0] for match in teams]
    away_teams = [match[1] for match in teams]
    seasons = [season for i in range(0, len(teams))]
    weeks = [week for i in range(0, len(teams))]
    test_set["home_team.id"] = home_teams
    test_set["away_team.id"] = away_teams
    test_set["season"] = seasons
    test_set["week"] = weeks
    
    columns_home = test_set.filter(columns_home).columns
    columns_away = test_set.filter(columns_away).columns
    i_th = test_set.columns.get_loc("home_team.id")
    i_ta = test_set.columns.get_loc("away_team.id")
    
    median_home, median_away = get_median(df, method, season, week)

    for i in range(0, len(test_set)):
        for index, row in median_home.iterrows():
            if test_set.iloc[i,i_th] == index:
                for c in columns_home:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    for i in range(0, len(test_set)):
        for index, row in median_away.iterrows():
            if test_set.iloc[i,i_ta] == index:
                for c in columns_away:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    # For non existing teams (this means ascending teams filled with median of column)
    test_set = test_set.apply(lambda x: x.fillna(x.median()),axis=0)
    if method == '5':
        test_set = linear_reg(test_set, columns)
        if pezzali == True and extras == '1':
            return test_set[['id','week','season','away_team.id', 'home_team.id', 'goals_home', 'stats_home.s_off_g', 'stats_home.s_on_g', 'stats_home.s_in', 'stats_home.saves', 'stats_home.s_blocked', 'stats_home.c_yellow', 'stats_home.s_out', 'goals_away', 'stats_away.s_off_g', 'stats_away.s_on_g', 'stats_away.s_in', 'stats_away.saves', 'stats_away.s_blocked', 'stats_away.c_yellow', 'stats_away.s_out', 'stats_home.s_total', 'stats_away.s_total', 'stats_home.c_red']]
    return test_set

In [7]:
# For the linear regression model, loads pretrains and returns the prediction
def get_regression(row, column):
    global country
    # Load model
    filename = 'model/' + country + '/' + column + '.sav'
    model = pickle.load(open(filename, 'rb'))
    row = row.drop(column)
    row = row.drop("home_team.id")
    row = row.drop("away_team.id")
    row = row.drop("season")
    row = row.drop("week")  
    pred = np.array(row).reshape(1, -1)
    return model.predict(pred)

In [8]:
# Returns the test set for all columns calculating linear regression
def linear_reg(test_set, columns):
    for i in range(0, len(test_set)):
        for c in columns:
            try:
                i_c = test_set.columns.get_loc(c)
                test_set.iloc[i,i_c] = get_regression(test_set.iloc[i], c)
            except:
                print(i, c)
                raise
    return test_set

In [42]:
"""
    A proper evaluation: we cannot test week 10 2018 with week 20 2018
    The sequence of events must be maintained
"""
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import RidgeClassifierCV

def subtraining_trainset(df, season, week, up_to_season):
    df_sub = df[(df["season"] == season) & (df["week"] < week)]
    target = []
    past_seasons = [s for s in range(up_to_season, season)]
    
    df_past = df[(df["season"].isin(past_seasons))]
    
    df_sub = df_sub.append(df_past)
    
    for index, row in df_sub.iterrows():
        target.append(playstyle.get_status(row))
    
    return df_sub, target

In [10]:
# Data found to be not relevant to models
# If not using Anova
def remove_before_models(df, extras, if_anova=False):
    if extras != '1':
        if not if_anova or if_anova == 'both':
            try:
                df = df.drop(columns=['stats_home.c_yellow', 'stats_away.c_yellow', 'stats_home.p_accurate','stats_away.p_accurate','stats_away.p_total','stats_away.fouls','stats_home.p_total','stats_home.fouls','stats_home.corners','stats_home.offside','stats_away.corners','stats_away.offside'], axis=1)
            except:
                print("NO")
        try:
            df = df.drop(columns=['index'], axis=1)
        except:
            print("")
        try:
            df = df.drop(columns=['stats_home','stats_away'], axis=1)
        except:
            print("")
    return df

In [11]:
"""
    This function returns a training set with the given method
"""
def train_method(dct, train_set, s, w, seasons, method, extras=None, if_anova=False):
    print("Train Method")
    statistics_subset = None
    target = None
    if method == '1':
        print("Current and past season median")
    elif method == '2':
        print("Current and all past seasons median")
    elif method == '3':
        print("Current season only median")
    elif method == '4':
        print("Last 5 games median")
    elif method == '5':
        print("Current and past season linear regression")
    elif method == '6':
        print("Current and all past seasons linear regression")
    elif method == '7':
        print("Current season only linear regression")
    elif method == '8':
        print("Plain train set: all games from past seasons")
        statistics_subset, target = subtraining_trainset(train_set, s, w, seasons)
   
    # If something must be removed
    statistics_removed = remove_before_models(statistics_subset, extras, if_anova)
    n = 10
    if extras == '1':
        print("Applies pezzali to train set")
        statistics_subset = playstyle.pezzali_data(statistics_subset, is_train=True, is_prediction=True)
        statistics_subset,_,_,_,_,_,_ = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['id','home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'season', 'week'])
    elif extras == '2':
        print("Applies pca to train set")
        statistics_subset,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_removed)
        statistics_subset = statistics_subset.drop(columns=['id','home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'goals_away','goals_home','season','week'])
        pca = PCA(n_components=n)
        statistics_subset = pca.fit_transform(statistics_subset)
        statistics_subset = np.append(statistics_subset, ho, axis=1)
        statistics_subset = np.append(statistics_subset, ao, axis=1)
        statistics_subset = np.append(statistics_subset, hr, axis=1)
        statistics_subset = np.append(statistics_subset, ar, axis=1)
        statistics_subset = np.append(statistics_subset, rh, axis=1)
        statistics_subset = np.append(statistics_subset, ra, axis=1)
    elif extras == '3':
        print("Adds pezzali and then pca to train set")
        statistics_subset = playstyle.pezzali_data(statistics_removed, is_train=True, both=True, is_prediction=True)
        statistics_subset,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['id','home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'goals_away','goals_home','season','week'])
        pca = PCA(n_components=n)
        statistics_subset = pca.fit_transform(statistics_subset)
        statistics_subset = np.append(statistics_subset, ho, axis=1)
        statistics_subset = np.append(statistics_subset, ao, axis=1)
        statistics_subset = np.append(statistics_subset, hr, axis=1)
        statistics_subset = np.append(statistics_subset, ar, axis=1)
        statistics_subset = np.append(statistics_subset, rh, axis=1)
        statistics_subset = np.append(statistics_subset, ra, axis=1)
    elif extras == '4':
        print("Adds pezzali to plain train set")
        statistics_subset = playstyle.pezzali_data(statistics_removed, is_train=True, both=True, is_prediction=True)
        statistics_subset,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_removed)
        statistics_subset = statistics_subset.drop(columns=['id','home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'goals_away','goals_home','season','week'])
    elif extras == '0':
        print("Plain train set with clusters")
        statistics_subset,_,_,_,_,_,_ = get_data(dct, statistics_removed)
        statistics_subset = statistics_subset.drop(columns=['id','home_team.id', 'away_team.id','home_team.name', 'away_team.name','goals_away','goals_home','season','week'])
    return statistics_subset, target

In [12]:
def test_extras(dct, statistics_test, extras='0', if_anova=False):
    # If something must be removed
    statistics_removed = remove_before_models(statistics_test, extras, if_anova)
    n = 10
    if extras == '1':
        print("Applies pezzali to test set")
        # Get test pezalli and mappings
        statistics_test = playstyle.pezzali_data(statistics_test, is_train=False, is_prediction=True)
        statistics_test,_,_,_,_,_,_ = get_data(dct, statistics_test)
        try:
            statistics_test = statistics_test.drop(columns=['id','home_team.id', 'away_team.id', 'season','week'])
        except:
            print("")
    elif extras == '2':
        print("Applies pca to test set")
        statistics_test,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_test)
        pca = PCA(n_components=n)
        statistics_test = pca.fit_transform(statistics_removed)
        statistics_test = np.append(statistics_test, ho, axis=1)
        statistics_test = np.append(statistics_test, ao, axis=1)
        statistics_test = np.append(statistics_test, hr, axis=1)
        statistics_test = np.append(statistics_test, ar, axis=1)
        statistics_test = np.append(statistics_test, rh, axis=1)
        statistics_test = np.append(statistics_test, ra, axis=1)
    elif extras == '3':
        print("Adds pezzali and then pca to test set")
        statistics_removed = playstyle.pezzali_data(statistics_removed, is_train=False, both=True, is_prediction=True)
        statistics_removed = statistics_removed.drop(columns=['id','home_team.id', 'away_team.id', 'goals_away','goals_home','season','week'])
        statistics_test,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_test)
        pca = PCA(n_components=n)
        statistics_test = pca.fit_transform(statistics_removed)
        statistics_test = np.append(statistics_test, ho, axis=1)
        statistics_test = np.append(statistics_test, ao, axis=1)
        statistics_test = np.append(statistics_test, hr, axis=1)
        statistics_test = np.append(statistics_test, ar, axis=1)
        statistics_test = np.append(statistics_test, rh, axis=1)
        statistics_test = np.append(statistics_test, ra, axis=1)
    elif extras == '4':
        print("Adds pezzali to plain test set")
        statistics_removed = playstyle.pezzali_data(statistics_removed, is_train=False, both=True, is_prediction=True)
        statistics_removed = statistics_removed.drop(columns=['id','home_team.id', 'away_team.id', 'goals_away','goals_home','season','week'])
        statistics_test,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_test)
        statistics_test = np.append(statistics_removed, ho, axis=1)
        statistics_test = np.append(statistics_test, ao, axis=1)
        statistics_test = np.append(statistics_test, hr, axis=1)
        statistics_test = np.append(statistics_test, ar, axis=1)
        statistics_test = np.append(statistics_test, rh, axis=1)
        statistics_test = np.append(statistics_test, ra, axis=1)
    elif extras == '0':
        print("Plain test set with clusters")
        statistics_test,_,_,_,_,_,_ = get_data(dct, statistics_removed)
        statistics_test = statistics_test.drop(columns=['home_team.id', 'away_team.id','goals_away','goals_home','season','week'])
    return statistics_test

In [90]:
"""
    Returns the y_predicted given a majority voting
    - nb, rforest, rfores_pezzali
"""

from sklearn.preprocessing import FunctionTransformer

def get_prediction(clf_name, if_anova, train_set, target, test_set, k_best=10):
    # 1) anova filter, take n best ranked features
    anova_filter = SelectKBest(f_regression, k=k_best)
    classifiers = dict(nb_ovo=OneVsOneClassifier(GaussianNB()),
                       rforest=OneVsOneClassifier(RandomForestClassifier(random_state=0, criterion="entropy", max_features="log2", min_samples_leaf=2)),
                       svc_ovo_p=OneVsOneClassifier(SVC(kernel='linear', probability=True)),
                       rforest_p=OneVsOneClassifier(RandomForestClassifier(random_state=0, criterion="entropy", max_features="log2", min_samples_leaf=2)))
    clf = classifiers[clf_name]
    if if_anova:
        anova_model = make_pipeline(anova_filter, clf)
    else:
        anova_model = clf
    anova_model.fit(train_set, target)
    Y = anova_model.predict(test_set)
    prob = anova_model.decision_function(test_set)
    return Y, prob

# Future search for weights average
def get_majority(votes, probability):
    majority = []
    votes = np.transpose(votes)
    for i in range(0, len(votes)):
        win = 0
        lose = 0
        draw = 0
        for j in range(0, len(votes[0])):
            if votes[i][j] == 0:
                draw = draw + 1
            if votes[i][j] == 1:
                win = win + 1
            if votes[i][j] == -1:
                lose = lose + 1
        if draw > (win+lose):
            majority.append(0)
        elif lose > (win+draw):
            majority.append(-1)
        elif win > (draw+lose):
            majority.append(1)
        elif win == draw and draw == lose:
            majority.append(0)
        elif draw == win:
            majority.append(1)
        elif draw == lose:
            majority.append(-1)
        else:
            majority.append(0)
    final_prob = []
    for i in range(0, len(probability)):
        if majority[i] == 0:
            look_index = 1
        elif majority[i] == 1:
            look_index = 2
        else:
            look_index = 0
        max_p = 0
        max_row = []
        mean_prob = np.mean(probability[i], axis=0)
        for p in probability[i]:
            if p[look_index] > max_p:
                max_p = p[look_index]
                max_row = p
        mean_prob[look_index] = max_p
        final_prob.append(list(mean_prob))
    return majority, final_prob

def vote_classifier(train_set, target, test_set, train_pezzali, test_pezzali, classifiers=["nb_ovo", "rforest", "rforest_p"], k_best=10):
    dct_votes = {
        "nb_ovo":{
            "predict":[],
            "proba":[]
        },
        "rforest":{
            "predict":[],
            "proba":[]
        },
        "rforest_p":{
            "predict":[],
            "proba":[]
        },
        "svc_ovo_p":{
            "predict":[],
            "proba":[]
        }
    }
    st = preprocessing.StandardScaler().fit_transform(train_set)
    sp = preprocessing.StandardScaler().fit_transform(test_set)
    stp = preprocessing.StandardScaler().fit_transform(train_pezzali)
    spp = preprocessing.StandardScaler().fit_transform(test_pezzali)
    # Plain
    dct_votes["nb_ovo"]["predict"], prob = get_prediction("nb_ovo", True, st, target, sp)
    dct_votes["nb_ovo"]["proba"] = prob
    dct_votes["rforest"]["predict"], prob = get_prediction("rforest", True, st, target, sp)
    dct_votes["rforest"]["proba"] = prob
    # Pezzali
    dct_votes["rforest_p"]["predict"], prob = get_prediction("rforest_p", False, stp, target, spp)
    dct_votes["rforest_p"]["proba"] = prob
    dct_votes["svc_ovo_p"]["predict"], prob = get_prediction("svc_ovo_p", False, stp, target, spp)
    dct_votes["svc_ovo_p"]["proba"] = prob
    # Get Vote
    Y = []
    P = [[] for i in range(0, len(test_set))]
    for clf in classifiers:
        Y.append(dct_votes[clf]["predict"])
        for i in range(0, len(test_set)):
            P[i].append(dct_votes[clf]["proba"][i])
        
    vY, vP = get_majority(Y, P)
    return vY, vP

def vote_predict(df, clusters_dct, teams, season, week, up_to_season=2017, classifiers=["nb_ovo", "rforest", "rforest_p"]):
    # Create training and test for plain test
    statistics_train, target = train_method(clusters_dct, df, season, week, up_to_season, '8', '0', 'both')
    statistics_test = create_test_set(df, teams, season, week, False, '1', '0')
    statistics_test = test_extras(clusters_dct, statistics_test, '0', 'both')
    statistics_train = FunctionTransformer(np.log1p).fit_transform(statistics_train)
    statistics_test = FunctionTransformer(np.log1p).fit_transform(statistics_test)
    
    # Create training and test for pezzali test
    pezzali_train, _ = train_method(clusters_dct, df, season, week, up_to_season, '8', '1', False)
    pezzali_test = create_test_set(df, teams, season, week, True, '3', '1')
    pezzali_test = test_extras(clusters_dct, pezzali_test, '1', False)
    
    # Changes to vote classifier
    Y, prob = vote_classifier(statistics_train, target, statistics_test, pezzali_train, pezzali_test, classifiers)
    return Y, prob

In [64]:
def predict(df, clusters_dct, teams, season, week, test_m='1', extras='1', up_to_season=2017, classifier='rforest', if_anova=False, k_best=10):
    """
        teams: the list of pair of teams' ids as [[10, 20], [30, 20]]
    """
        
    classifiers = dict(nb_ovo=OneVsOneClassifier(GaussianNB()),
                       svc_ovo=OneVsOneClassifier(SVC(kernel='linear', probability=True)),
                       ridge=OneVsOneClassifier(RidgeClassifierCV(class_weight='balanced')), 
                       rforest=OneVsOneClassifier(RandomForestClassifier(random_state=0, criterion="entropy", max_features="log2", min_samples_leaf=2)))
    
    if extras == '2' or extras == '0':
        p = False
    else:
        p = True
        
    # Divide the training set to predict
    statistics_train, target = train_method(clusters_dct, df, season, week, up_to_season, method='8', extras=extras, if_anova=if_anova)
    # Creates the test set
    statistics_test = create_test_set(df, teams, season, week, p, test_m, extras)
    statistics_test = test_extras(clusters_dct, statistics_test, extras, if_anova)
    # Scale the training and test set
    st = preprocessing.StandardScaler().fit_transform(statistics_train)
    sp = preprocessing.StandardScaler().fit_transform(statistics_test)
    # Makes the prediction given a classifier
    # 1) anova filter, take n best ranked features
    if if_anova == True or if_anova == 'both':
        anova_filter = SelectKBest(f_regression, k=k_best)
        clf = make_pipeline(anova_filter, classifiers[classifier])
    else:
        clf = classifiers[classifier]
    clf.fit(st, target)
    Y = clf.predict(sp)
    prob = clf.decision_function(sp)
    return Y, prob

In [15]:
def maps_results(teams, results, dct, season, prob, print_prob=False):
    for i in range(0, len(results)):
        home_team = dct[0][str(teams[i][0])][str(season)].value2 + " " + dct[0][str(teams[i][0])][str(season)].name
        away_team = dct[0][str(teams[i][1])][str(season)].value2 + " " + dct[0][str(teams[i][1])][str(season)].name
        print("home_team= %s, away_team= %s, result= %s" %(home_team, away_team, str(results[i])))
        if print_prob:
            print(prob[i])

In [43]:
def test_list(df, season, week):
    df_test = df[(df["season"] == season) & (df["week"] == week)]
    teams = []
    target = []
    fixtures = []
    for index, row in df_test.iterrows():
        teams.append([row["home_team.id"], row["away_team.id"]])
        target.append(playstyle.get_status(row))
        fixtures.append(row["id"])
    return teams, target, fixtures

In [17]:
teams_gb = [[66, 41], [44, 49], [36, 60], [63, 46], [40, 48], [33, 42], [34, 45], [62, 50], [47, 51], [39, 52]]
teams_es = [[542, 529], [531, 536], [543, 797], [538, 548], [545, 724], [715, 539], [727, 530], [541, 726], [532, 546], [533, 720]]
teams_de = [[163, 173], [167, 182], [160, 168], [169, 162], [159, 161], [174, 172], [192, 157], [170, 164], [188, 165]]
teams_it = [[500, 490], [501, 499], [505, 523], [492, 488], [497, 502], [498, 495], [515, 496], [503, 487], [494, 489], [504, 506]]
teams_fr = [[81, 116], [79, 80], [89, 97], [77, 84], [91, 78], [83, 85], [94, 106], [93, 95], [92, 112], [1063, 82]]
teams_nl = [[194, 205], [202, 204], [415, 193], [201, 417], [206, 207], [426, 210], [197, 198], [208, 209], [195, 200]]
teams_pt = [[762, 224], [234, 212], [221, 231], [222, 211], [217, 242], [214, 225], [226, 215], [216, 227], [228, 218]]
teams_be = [[569, 266], [554, 740], [733, 624], [742, 739], [743, 735], [600, 734], [736, 741], [263, 260], [738, 631]]

In [102]:
ids_es = [605135, 605136, 605137, 605138, 605139, 605140, 605141, 605142, 605143, 605144]
ids_gb = [592201, 592202, 592203, 592204, 592205, 592206, 592207, 592208, 592209, 592210]
ids_it = [608530, 608531, 608532, 608533, 608534, 608535, 608536, 608537, 608538, 608539]
ids_de = [587221, 587222, 587223, 587224, 587225, 587226, 587227, 587228, 587229]
ids_fr = [571552, 571553, 571554, 571555, 571556, 571557, 571558, 571559, 571560, 571561]
ids_nl = [573218, 573219, 573220, 573221, 573222, 573223, 573224, 573225, 573226]
ids_pt = [601066, 601067, 601068, 601069, 601070, 601071, 601072, 601073, 601074]
ids_be = [581232, 581233, 581234, 581235, 581236, 581237, 581238, 581239, 581240]

In [103]:
leagues_dct = {
    'ES': {
        'country' : 'ES',
        'league':'140',
        'curr_week': 7,
        'predict_week': 8,
        'start_odds': 6,
        'best_clf': ["rforest", "nb_ovo", "rforest_p", "svc_ovo_p"],
        'matches': teams_es,
        'matches_ids': ids_es
    },
    'GB': {
        'country' : 'GB',
        'league':'39',
        'curr_week': 6,
        'predict_week': 7,
        'start_odds': 1,
        'best_clf': ["nb_ovo", "rforest_p", "svc_ovo_p"],
        'matches': teams_gb,
        'matches_ids': ids_gb
    },
    'IT': {
        'country' : 'IT',
        'league':'135',
        'curr_week': 5,
        'predict_week': 6,
        'start_odds': 1,
        'best_clf': ["rforest", "nb_ovo", "rforest_p"],
        'matches': teams_it,
        'matches_ids': ids_it
    },
    'DE': {
        'country' : 'DE',
        'league':'78',
        'curr_week': 5,
        'predict_week': 6,
        'start_odds': 1,
        'best_clf': ["svc_ovo_p"],
        'matches': teams_de,
        'matches_ids': ids_de
    },
    'FR': {
        'country' : 'FR',
        'league':'61',
        'curr_week': 7,
        'predict_week': 8,
        'start_odds': 7,
        'best_clf': ["nb_ovo", "rforest_p", "rforest"],
        'matches': teams_fr,
        'matches_ids': ids_fr
    },
    'NL': {
        'country' : 'NL',
        'league':'88',
        'curr_week': 6,
        'predict_week': 7,
        'start_odds': 4,
        'best_clf': ["nb_ovo", "rforest_p", "rforest"],
        'matches': teams_nl,
        'matches_ids': ids_nl
    },
    'PT': {
        'country' : 'PT',
        'league':'94',
        'curr_week': 5,
        'predict_week': 6,
        'start_odds': 4,
        'best_clf': ["nb_ovo", "rforest_p", "svc_ovo_p"],
        'matches': teams_pt,
        'matches_ids': ids_pt
    },
    'BE': {
        'country' : 'BE',
        'league':'144',
        'curr_week': 10,
        'predict_week': 11,
        'start_odds': 9,
        'best_clf': ["rforest"],
        'matches': teams_be,
        'matches_ids': ids_be
    }
}

In [101]:
def league_statistics(data, dct, leagues_dct, country_id, curr_week, week, season=2020, year_window=4, teams=None, fixtures_ids=None):
    elem = leagues_dct[country_id]
    clf = elem['best_clf']
    # Dataframe for the match (test from 2017 to 2019)
    statistics_to_test, _ = subtraining_trainset(data, season, week, year_window)
    # Get teams
    if not teams:
        teams, real_results, fixtures = test_list(data, season, week)
    results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=clf)
    predictions = pd.DataFrame()
    home_team = [i[0] for i in teams]
    away_team = [i[1] for i in teams]
    arr_week = [week for i in range(0, len(results))]
    arr_lose = []
    arr_draw = []
    arr_win = []
    for i in range(0, len(prob)):
        arr_lose.append(prob[i][0])
        arr_draw.append(prob[i][1])
        arr_win.append(prob[i][2])
    if not teams:
        predictions["id"] = fixtures
    else:
        predictions["id"] = fixtures_ids
    predictions["home_team.id"] = home_team
    predictions["away_team.id"] = away_team
    predictions["week"] = arr_week
    predictions["predict_lose"] = arr_lose
    predictions["predict_draw"] = arr_draw
    predictions["predict_win"] = arr_win
    if not teams:
        predictions["real"] = real_results
    predictions["predict"] = results
    return statistics_to_test, predictions

In [97]:
def get_odds(leagues_dct, season=2020):
    csv_file = '../' + 'Odds' + '/' + str(season) + '_odds'+ '.csv'
    odds_df = pd.read_csv(csv_file)
    df = pd.DataFrame()
    for k, v in leagues_dct.items():
        print(k)
        elem = leagues_dct[k]
        country = elem['country']
        league = elem['league']
        curr_week = elem['curr_week']
        start_odds = elem['start_odds']
        predict_week = elem['predict_week']
        # Dictionary for the clusters (clusters from 2016 - 2019)
        dct = get_clusters(country, league, season, year_window=5)
        # Dataframe for the match (test from 2017 to 2019)
        statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)
        for w in range(start_odds-1, predict_week-1):
            stats, predictions = league_statistics(statistics_to_test, dct, leagues_dct, k, w, w+1, season)
            df = df.append(predictions)
    result = pd.merge(df, odds_df, how='inner', on=['id'])
    return result, df

In [108]:
def get_test_odds(leagues_dct, season=2020):
    csv_file = '../' + 'Odds' + '/' + str(season) + '_odds'+ '.csv'
    odds_df = pd.read_csv(csv_file)
    df = pd.DataFrame()
    new_odds = pd.DataFrame()
    for k, v in leagues_dct.items():
        print(k)
        elem = leagues_dct[k]
        country = elem['country']
        league = elem['league']
        curr_week = elem['curr_week']
        teams = elem['matches']
        matches_ids = elem['matches_ids']
        predict_week = elem['predict_week']
        # Dictionary for the clusters (clusters from 2016 - 2019)
        dct = get_clusters(country, league, season, year_window=5)
        # Dataframe for the match (test from 2017 to 2019)
        statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)
        stats, predictions = league_statistics(statistics_to_test, dct, leagues_dct, k, curr_week, predict_week, season, teams=teams, fixtures_ids=matches_ids)
        df = df.append(predictions)
        new_odds = new_odds.append(odds_df[(odds_df['league']==league) & (odds_df['week']==predict_week)])
    result = pd.merge(df, odds_df, how='inner', on=['id'])
    return result, df

In [91]:
odds, df = get_odds(leagues_dct)

ES
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


GB
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  corr /= X_norms


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms


IT
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


DE
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


FR
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


NL
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms


Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


PT
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
BE
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


In [109]:
odds_test, df_s = get_test_odds(leagues_dct)

ES
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  res_values = method(rvalues)


GB
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms
  res_values = method(rvalues)


IT
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  res_values = method(rvalues)


DE
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  res_values = method(rvalues)


FR
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  res_values = method(rvalues)


NL
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms
  res_values = method(rvalues)


PT
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set


  res_values = method(rvalues)


BE
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  res_values = method(rvalues)


In [110]:
odds_test.head()

Unnamed: 0,id,home_team.id,away_team.id,week_x,predict_lose,predict_draw,predict_win,predict,league,season,bookmaker,winner_home,winner_away,winner_draw,double_home_away,double_home_draw,double_away_draw,week_y
0,605135,542,529,8,2.296468,1.074388,-0.051669,-1,140,2020,8,6.5,1.44,4.5,1.18,2.63,1.11,8.0
1,605136,531,536,8,0.805671,1.058271,2.276429,1,140,2020,8,2.88,2.63,3.0,1.36,1.44,1.4,8.0
2,605137,543,797,8,-0.238466,1.091237,2.313058,1,140,2020,8,1.5,6.5,4.0,1.22,1.11,2.5,8.0
3,605138,538,548,8,0.963633,1.276102,1.044164,0,140,2020,8,3.0,2.38,3.2,1.36,1.53,1.36,8.0
4,605139,545,724,8,1.213535,1.049107,2.187189,1,140,2020,8,2.4,3.25,3.0,1.36,1.33,1.53,8.0


In [114]:
# csv to save
csv_file = '../' + 'Odds' + '/' + str(season) + '_' + '10_29_'+ 'odds_test'+ '.csv'
print(csv_file)

../Odds/2020_10_29_odds_test.csv


In [115]:
odds_test.to_csv(csv_file, index=False)

In [92]:
odds

Unnamed: 0,id,home_team.id,away_team.id,week_x,predict_lose,predict_draw,predict_win,real,predict,league,season,bookmaker,winner_home,winner_away,winner_draw,double_home_away,double_home_draw,double_away_draw,week_y
0,605124.0,533,532,6,-0.230920,1.042431,2.308306,1,1,140,2020,8,1.60,5.25,4.10,1.22,1.17,2.25,6.0
1,605117.0,543,548,6,0.029372,1.048234,2.271473,-1,1,140,2020,8,2.60,2.63,3.40,1.30,1.44,1.50,6.0
2,605119.0,545,727,6,1.182168,1.240975,1.048746,0,0,140,2020,8,2.70,2.90,3.00,1.36,1.40,1.44,6.0
3,605115.0,542,797,6,0.673380,2.285670,0.538878,-1,0,140,2020,8,1.75,5.50,3.30,1.33,1.14,2.00,6.0
4,605116.0,531,539,6,1.218447,1.058653,2.199194,1,1,140,2020,8,2.00,4.00,3.30,1.33,1.22,1.73,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,581227.0,741,743,10,0.843750,-0.068924,2.177790,0,1,144,2020,8,1.70,4.75,3.75,1.25,1.18,2.10,10.0
228,581229.0,734,554,10,0.843750,-0.068924,2.177790,-1,1,144,2020,8,3.75,1.91,3.60,1.29,1.80,1.25,10.0
229,581230.0,735,733,10,-0.211501,1.007736,2.210434,1,1,144,2020,8,3.75,1.83,4.00,1.22,1.83,1.25,10.0
230,581226.0,624,600,10,0.843750,-0.068924,2.177790,1,1,144,2020,8,1.57,5.50,4.00,1.22,1.14,2.25,10.0


In [93]:
# csv to save
csv_file = '../' + 'Odds' + '/' + str(season) + '_odds_ds'+ '.csv'
print(csv_file)

../Odds/2020_odds_ds.csv


In [94]:
odds.to_csv(csv_file, index=False)

In [79]:
odds, df = get_odds(leagues_dct)

ES
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[-0.23092031  1.04243102  2.23295921]
[-0.23092031  1.04243102  2.30830608]
[0.0293719  1.04823354 1.96557915]
[0.0293719  1.04823354 2.27147284]
[0.30914491 1.02015832 1.6793235 ]
[0.30914491 1.02015832 2.19398615]
[1.18216801 0.79478271 1.04874616]
[1.18216801 1.24097518 1.04874616]
[0.67338042 1.86981473 0.53887809]
[0.67338042 2.28567006 0.53887809]
[-0.22873422  1.05556242  2.22825194]
[-0.22873422  1.05556242  2.30389886]
[1.2184468  1.05865295 0.77967856]
[1.2184468  1.05865295 2.19919379]
[0.8552892  1.35809587 0.79164327]
[0.8552892  1.35809587 2.19107281]
[0.60275041 1.30414943 1.07594859]
[1.85556867 1.30414943 1.07594859]
[0.88223632 1.03667797 1.06240855]
[1.9225193  1.03667797 1.06240855]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[1.18057974 1.64220616 0.23973494]
[2.1837086  1.64220616 0.23973494]
[0.55403331 0.52037067 1.96305843]
[0.55403331 0.52037067 2.29351852]
[1.7205698  0.50937014 0.78836081]
[2.24472403 0.50937014 0.78836081]
[-0.23468138  1.03655226  2.23837566]
[-0.23468138  1.03655226  2.31890826]
[-0.22250073  1.04697402  2.20773351]
[-0.22250073  1.04697402  2.2180662 ]
[0.96818031 1.0297772  1.034634  ]
[0.96818031 1.23049976 1.034634  ]
[1.18367527 0.79000157 1.05829607]
[1.18367527 0.79000157 2.18471426]
[0.95794969 1.07377828 1.03462014]
[0.95794969 1.26088163 1.03462014]
[1.18999261 0.83318979 1.0440711 ]
[1.18999261 1.26220042 1.0440711 ]
[-0.24109091  1.02623945  2.24288844]
[-0.24109091  1.02623945  2.31933105]
GB
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


[-0.25143199  1.10821267  2.24580391]
[-0.25143199  1.10821267  2.30232177]
[ 2.08194522  1.01002994 -0.06781927]
[ 2.309094    1.01002994 -0.06781927]
[0.48713697 1.05306432 1.51753073]
[0.48713697 1.05306432 2.27678786]
[0.50937725 1.05290632 1.49365606]
[0.50937725 1.05290632 2.25586288]
[1.30773217 1.12709647 0.67714937]
[2.28289233 1.12709647 0.67714937]
[0.48954861 1.07730631 1.49881337]
[0.48954861 1.07730631 2.24886838]
[ 2.07929351  1.05786319 -0.08177874]
[ 2.31444258  1.05786319 -0.08177874]
[1.29016777 1.10929838 0.68928219]
[2.24615045 1.10929838 0.68928219]
[-0.24834587  1.07901251  2.24634118]
[-0.24834587  1.07901251  2.29975347]
[-0.24866675  1.45318048  1.90615632]
[-0.24866675  1.45318048  2.31636372]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median


  return np.nanmean(a, axis, out=out, keepdims=keepdims)



Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  corr /= X_norms


[1.30880711 1.10880465 0.68046203]
[2.28013843 1.10880465 0.68046203]
[1.19034621 1.7949937  0.01061098]
[1.19034621 2.21596472 0.01061098]
[-0.24889231  1.12967268  2.23531795]
[-0.24889231  1.12967268  2.28371252]
[-0.23129614  1.02823657  2.23658221]
[-0.23129614  1.02823657  2.3062337 ]
[0.25312711 1.76696655 1.03357016]
[0.25312711 2.25722421 1.03357016]
[ 2.04545756  0.9357879  -0.0448511 ]
[ 2.30080027  0.9357879  -0.0448511 ]
[1.35475879 1.06325375 0.64844878]
[2.2909391  1.06325375 0.64844878]
[-0.24633833  1.09816629  2.23117056]
[-0.24633833  1.09816629  2.25395837]
[0.48664001 1.4751722  1.01403465]
[0.48664001 2.24801322 1.01403465]
[0.10397415 0.73497049 2.23181426]
[0.10397415 0.73497049 2.30768269]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezz

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


[-0.24403657  1.06998918  2.24369283]
[-0.24403657  1.06998918  2.29962166]
[0.92338051 1.03347514 1.045868  ]
[0.92338051 2.23995101 1.045868  ]
[ 2.08466526  1.04263242 -0.08914075]
[ 2.30373585  1.04263242 -0.08914075]
[ 1.65004409  1.42886867 -0.01076901]
[ 2.19934416  1.42886867 -0.01076901]
[ 1.69385947  1.31055366 -0.07071271]
[ 2.29193028  1.31055366 -0.07071271]
[0.97615883 0.75207851 1.36047474]
[0.97615883 0.75207851 2.21150156]
[-0.22923321  1.09535944  2.21653502]
[-0.22923321  1.09535944  2.25685622]
[0.11018215 0.72340672 2.23052439]
[0.11018215 0.72340672 2.28799803]
[-0.24969305  1.08775505  2.24642429]
[-0.24969305  1.08775505  2.29870574]
[-0.25262106  1.09024025  2.25087307]
[-0.25262106  1.09024025  2.31015922]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only m

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


[1.34259023 1.08018938 0.65925888]
[2.23622244 1.08018938 0.65925888]
[ 1.70433696  1.35099749 -0.06825008]
[ 2.3067929   1.35099749 -0.06825008]
[1.31399259 1.10083912 0.67813416]
[2.30037167 1.10083912 0.67813416]
[-0.25184873  1.09978831  2.24837368]
[-0.25184873  1.09978831  2.31189195]
[-0.24645151  1.07499562  2.24483319]
[-0.24645151  1.07499562  2.29863314]
[ 2.06179231  1.07596367 -0.07068158]
[ 2.2680991   1.07596367 -0.07068158]
[0.50192836 1.06477358 1.5093161 ]
[0.50192836 1.06477358 2.27890535]
[-0.24562346  1.07835775  2.24481545]
[-0.24562346  1.07835775  2.31423123]
[-0.24426654  1.50944044  1.87607196]
[-0.24426654  1.50944044  2.2615321 ]
[1.35163598 0.99392921 0.65033346]
[2.30756512 0.99392921 0.65033346]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms


[0.96472757 1.44784506 0.68807614]
[0.96472757 2.0904522  0.68807614]
[ 1.28916187  1.79065871 -0.02628769]
[ 1.28916187  2.16217946 -0.02628769]
[0.5082686  1.39936891 1.01065249]
[0.5082686  2.29094306 1.01065249]
[0.48426492 0.72018316 1.87492906]
[0.48426492 0.72018316 2.29772778]
[ 2.04946013  1.07490546 -0.04618054]
[ 2.29684167  1.07490546 -0.04618054]
[1.31638269 0.99847098 0.66795484]
[2.28995324 0.99847098 0.66795484]
[0.10351469 0.71537951 2.23778132]
[0.10351469 0.71537951 2.31357079]
[0.1266274  1.42152627 1.40540678]
[0.1266274  1.42152627 2.21884794]
[-0.24022213  1.05854753  2.24239016]
[-0.24022213  1.05854753  2.29944772]
[-0.22048648  1.77920922  1.47020472]
[-0.22048648  2.26998661  1.47020472]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezz

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms


[0.45044899 0.71520501 1.8982096 ]
[0.45044899 0.71520501 2.30543094]
[-0.23825472  1.46403556  1.81449376]
[-0.23825472  1.46403556  2.21808072]
[1.31587032 1.05392432 0.67200631]
[1.31587032 2.09228547 0.67200631]
[1.27390445 1.04051691 0.70625724]
[2.10236324 1.04051691 0.70625724]
[ 2.06455597  0.99465299 -0.07547109]
[ 2.31922344  0.99465299 -0.07547109]
[ 1.70069784  1.45473151 -0.06388348]
[ 2.30459859  1.45473151 -0.06388348]
[0.13407363 1.80386091 1.11165573]
[0.13407363 2.23838825 1.11165573]
[-0.22783675  1.43969132  1.8860467 ]
[-0.22783675  1.43969132  2.30342482]
[0.50595476 1.06620586 1.50306999]
[0.50595476 1.06620586 2.24127766]
[-0.25392795  1.10266238  2.25162214]
[-0.25392795  1.10266238  2.31530824]
IT
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
[-0.20060955  1.35609943  1.86238818]
[-0.20060955  1.35609943  2.21792677]
[1.24390351 1.34829507 0.41916805]
[1.95893462 1.34829507 0.41916805]
[-0.21092483  0.97717618  2.21592416]
[-0.21092483  0.97717618  2.22155664]
[1.92862791 0.64789076 0.41332262]
[1.97588126 0.64789076 0.41332262]
[-0.21743234  0.98314628  2.21973029]
[-0.21743234  0.98314628  2.2213958 ]
[1.18986587 0.64960402 1.14882546]
[1.88842933 0.64960402 1.14882546]
[0.13736945 1.3567343  1.51495093]
[0.13736945 1.3567343  2.21942689]
[1.54355406 1.39643649 0.08276962]
[1.92744769 1.39643649 0.08276962]
[1.22685912 1.3616579  0.41871829]
[1.93936915 1.3616579  0.41871829]
[1.21301184 1.02273324 0.76221806]
[1.94042912 1.02273324 0.76221806]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from pas

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[0.12858637 0.60044883 2.21855631]
[0.12858637 0.60044883 2.22135125]
[-0.20304894  1.77107538  1.50537161]
[-0.20304894  2.14002184  1.50537161]
[1.92224044 1.00723323 0.0712253 ]
[1.9750681  1.00723323 0.0712253 ]
[0.13855397 1.32967505 1.52334095]
[0.13855397 1.32967505 2.2154507 ]
[1.9233994  0.98626757 0.08119912]
[1.97129831 0.98626757 0.08119912]
[-0.1988849   1.00348951  2.19819143]
[-0.1988849   1.00348951  2.20582367]
[0.86594743 0.64239915 1.47046735]
[0.86594743 0.64239915 2.20931002]
[0.48987633 2.08850092 0.48029885]
[0.48987633 2.12338659 0.48029885]
[-0.21460761  0.97041216  2.21918692]
[-0.21460761  0.97041216  2.22115089]
[0.13882489 0.98851542 1.86303311]
[0.13882489 0.98851542 2.2172028 ]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[0.14918512 1.3597906  1.50890187]
[0.14918512 1.3597906  2.21015028]
[-0.21828983  1.00657115  2.21720577]
[-0.21828983  1.00657115  2.22185061]
[0.48246743 0.29975309 2.19349754]
[0.48246743 0.29975309 2.20614282]
[0.51177542 1.37141865 1.13482538]
[0.51177542 2.11749047 1.13482538]
[0.1348688  0.62573974 2.20640355]
[0.1348688  0.62573974 2.21631953]
[1.54264927 1.01391075 0.4456202 ]
[1.92853033 1.01391075 0.4456202 ]
[0.48191561 0.68074469 1.85326147]
[0.48191561 0.68074469 2.22109195]
[1.96229189 1.015677   0.02523165]
[1.98322254 1.015677   0.02523165]
[1.21593189 1.40981372 0.40475978]
[1.94329184 1.40981372 0.40475978]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[0.14208962 0.61365398 2.20244685]
[0.14208962 0.61365398 2.21656547]
[-0.2089128   1.02659115  2.20450915]
[-0.2089128   1.02659115  2.21302534]
[-0.20428461  1.03080327  2.19922756]
[-0.20428461  1.03080327  2.21773412]
[1.19050033 0.25594998 1.50361855]
[1.19050033 0.25594998 2.21020177]
[0.85417227 0.97433823 1.15835655]
[0.85417227 2.0807443  1.15835655]
[1.58267137 1.37858431 0.05560069]
[1.97313982 1.37858431 0.05560069]
[0.48716511 1.0096478  1.50442962]
[0.48716511 1.0096478  2.20248182]
[0.47622842 1.020121   1.51951076]
[0.47622842 1.020121   2.21575724]
[0.8325005  0.26237495 1.85897901]
[0.8325005  0.26237495 2.22080309]
[0.83941273 0.62895876 1.50312462]
[0.83941273 0.62895876 2.20349414]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[0.13179709 1.39026273 1.51642016]
[0.13179709 1.39026273 2.21167258]
[1.95575492 0.98731793 0.04963527]
[1.99266504 0.98731793 0.04963527]
[0.1407324  0.64338789 2.19853338]
[0.1407324  0.64338789 2.22101628]
[-0.20756346  0.99901813  2.20684079]
[-0.20756346  0.99901813  2.22098676]
[1.57354917 1.35938031 0.0820381 ]
[1.97216535 1.35938031 0.0820381 ]
[-0.20828047  1.00025787  2.20589983]
[-0.20828047  1.00025787  2.21702104]
[0.49000395 0.30199666 2.18575009]
[0.49000395 0.30199666 2.19777052]
[1.2077102  0.60751377 1.159986  ]
[1.94139194 0.60751377 1.159986  ]
[1.94127972 0.99411955 0.07079651]
[1.98392555 0.99411955 0.07079651]
[0.49351927 1.38033762 1.16063181]
[0.49351927 2.11778623 1.16063181]
DE
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[-0.32155997  1.19590619  2.32093575]
[-0.32155997  1.19590619  2.32093575]
[-0.22510075  2.2358817   0.91529061]
[-0.22510075  2.2358817   0.91529061]
[ 2.27777744  1.24002088 -0.2944479 ]
[ 2.27777744  1.24002088 -0.2944479 ]
[-0.29861526  1.23207451  2.28772904]
[-0.29861526  1.23207451  2.28772904]
[-0.14083007  2.24814304  0.77144773]
[-0.14083007  2.24814304  0.77144773]
[ 2.24880231  1.24100064 -0.28246961]
[ 2.24880231  1.24100064 -0.28246961]
[ 2.31939715  1.25385774 -0.32103898]
[ 2.31939715  1.25385774 -0.32103898]
[-0.31142599  1.23766568  2.30715117]
[-0.31142599  1.23766568  2.30715117]
[ 1.11716823  2.24366155 -0.25507305]
[ 1.11716823  2.24366155 -0.25507305]
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[ 2.29882795  1.23337991 -0.30554447]
[ 2.29882795  1.23337991 -0.30554447]
[-0.31464721  1.22450069  2.31220385]
[-0.31464721  1.22450069  2.31220385]
[ 2.26572783  1.25716246 -0.29320504]
[ 2.26572783  1.25716246 -0.29320504]
[ 2.32092899  1.25243433 -0.3222195 ]
[ 2.32092899  1.25243433 -0.3222195 ]
[ 2.19999312  1.24567475 -0.27046994]
[ 2.19999312  1.24567475 -0.27046994]
[-0.30978479  1.2410235   2.30445863]
[-0.30978479  1.2410235   2.30445863]
[-0.30598154  1.20209637  2.30202548]
[-0.30598154  1.20209637  2.30202548]
[-0.31867346  1.24722898  2.3165547 ]
[-0.31867346  1.24722898  2.3165547 ]
[ 2.17169702  1.2225648  -0.25146385]
[ 2.17169702  1.2225648  -0.25146385]
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[-0.30278281  1.21900218  2.29627728]
[-0.30278281  1.21900218  2.29627728]
[-0.13121123  2.25104194  0.76466327]
[-0.13121123  2.25104194  0.76466327]
[ 2.27388106  1.2303892  -0.29084202]
[ 2.27388106  1.2303892  -0.29084202]
[-0.29747444  1.21978378  2.28804421]
[-0.29747444  1.21978378  2.28804421]
[-0.31674209  1.2244669   2.31484466]
[-0.31674209  1.2244669   2.31484466]
[-0.30737099  1.23643836  2.30127884]
[-0.30737099  1.23643836  2.30127884]
[ 2.319687    1.26085012 -0.32143934]
[ 2.319687    1.26085012 -0.32143934]
[-0.29884899  1.23972344  2.28642029]
[-0.29884899  1.23972344  2.28642029]
[ 2.29824824  1.24863668 -0.30653013]
[ 2.29824824  1.24863668 -0.30653013]
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[ 2.27717433  1.22888747 -0.29231765]
[ 2.27717433  1.22888747 -0.29231765]
[ 2.21623125  1.23846606 -0.2711466 ]
[ 2.21623125  1.23846606 -0.2711466 ]
[ 2.29604466  1.24685729 -0.30507013]
[ 2.29604466  1.24685729 -0.30507013]
[-0.31935795  1.23146531  2.3178864 ]
[-0.31935795  1.23146531  2.3178864 ]
[-0.19854723  2.24910957  0.80082335]
[-0.19854723  2.24910957  0.80082335]
[ 1.15439591  2.24640676 -0.26237364]
[ 1.15439591  2.24640676 -0.26237364]
[-0.25640237  1.22195767  2.1908841 ]
[-0.25640237  1.22195767  2.1908841 ]
[ 0.96729842  2.24503576 -0.24241607]
[ 0.96729842  2.24503576 -0.24241607]
[-0.30331346  1.23410561  2.29521405]
[-0.30331346  1.23410561  2.29521405]
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[ 2.29867172  1.24019147 -0.30600098]
[ 2.29867172  1.24019147 -0.30600098]
[-0.31526044  1.2496195   2.31177512]
[-0.31526044  1.2496195   2.31177512]
[ 2.29900487  1.2516848  -0.30727671]
[ 2.29900487  1.2516848  -0.30727671]
[ 2.18296907  1.24755176 -0.26800815]
[ 2.18296907  1.24755176 -0.26800815]
[-0.31478143  1.21003295  2.31283841]
[-0.31478143  1.21003295  2.31283841]
[ 2.30037933  1.24329367 -0.30732657]
[ 2.30037933  1.24329367 -0.30732657]
[-0.31931259  1.21252909  2.31819215]
[-0.31931259  1.21252909  2.31819215]
[ 2.30184532  1.24707462 -0.30855089]
[ 2.30184532  1.24707462 -0.30855089]
[-0.21757409  2.23092289  0.90903154]
[-0.21757409  2.23092289  0.90903154]
FR
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


[1.55130236 1.39647859 0.07877092]
[1.91466544 1.39647859 0.07877092]
[1.22478759 0.98547586 0.78576157]
[1.92939454 0.98547586 0.78576157]
[1.20028187 1.73896287 0.09234538]
[1.20028187 2.12807264 0.09234538]
[0.52179483 1.031319   1.47451957]
[0.52179483 1.031319   2.20784702]
[0.14647104 1.73794147 1.16817688]
[0.14647104 2.08820525 1.16817688]
[-0.20754595  0.95275996  2.21632007]
[-0.20754595  0.95275996  2.22078409]
[0.14504441 1.35310782 1.51408467]
[0.14504441 1.35310782 2.221589  ]
[0.82880924 0.30052075 1.84931786]
[0.82880924 0.30052075 2.19831159]
[-0.20609696  1.01011899  2.20477106]
[-0.20609696  1.01011899  2.21367981]
[-0.21577769  0.99124695  2.21707032]
[-0.21577769  0.99124695  2.22153903]
NL
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezza

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


[0.8484696  0.98665466 1.15391167]
[0.8484696  0.98665466 2.21998678]
[0.13148041 1.3838309  1.51320928]
[0.13148041 1.3838309  2.21808261]
[0.13791152 1.04430239 1.84405173]
[0.13791152 1.04430239 2.21687532]
[1.20985766 1.01566996 0.78269311]
[1.96247021 1.01566996 0.78269311]
[-0.21788183  0.99158151  2.21889936]
[-0.21788183  0.99158151  2.22217238]
[1.21302379 0.97479975 0.79197738]
[1.93017251 0.97479975 0.79197738]
[1.88152314 1.06390797 0.07616161]
[1.90080387 1.06390797 0.07616161]
[-0.21692064  0.97989848  2.21959262]
[-0.21692064  0.97989848  2.22159982]
[0.4900339  0.25324823 2.2046722 ]
[0.4900339  0.25324823 2.21043211]
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms


[0.13698152 1.71146966 1.18212929]
[0.13698152 2.10572426 1.18212929]
[0.83201768 0.57880517 1.52877348]
[0.83201768 0.57880517 2.21950201]
[1.87624862 0.63563985 0.46749098]
[1.91626554 0.63563985 0.46749098]
[1.55274026 1.38482891 0.08538232]
[1.94886845 1.38482891 0.08538232]
[1.57407896 0.65004946 0.76432131]
[1.99752349 0.65004946 0.76432131]
[-0.21151664  0.98743522  2.2135933 ]
[-0.21151664  0.98743522  2.22012118]
[-0.19931807  1.37468372  1.85763999]
[-0.19931807  1.37468372  2.21027442]
[0.82455225 0.62869295 1.51498938]
[0.82455225 0.62869295 2.22199859]
[0.13008268 1.35824985 1.52931785]
[0.13008268 1.35824985 2.21923137]
Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


[0.13632644 1.32595393 1.52629661]
[0.13632644 1.32595393 2.21681717]
[-0.21138814  0.99874665  2.21158894]
[-0.21138814  0.99874665  2.22202278]
[1.23803174 0.97350901 0.76661633]
[1.98186883 0.97350901 0.76661633]
[-0.21170515  1.00963471  2.20938099]
[-0.21170515  1.00963471  2.22117541]
[1.96951573 0.98904614 0.03813298]
[1.9947345  0.98904614 0.03813298]
[-0.21240524  0.97778828  2.21670657]
[-0.21240524  0.97778828  2.22092326]
[1.93737721 1.02845726 0.04165895]
[1.96584393 1.02845726 0.04165895]
[0.14864306 0.98299799 1.85426368]
[0.14864306 0.98299799 2.19877417]
[-0.21260321  0.96272661  2.21855995]
[-0.21260321  0.96272661  2.22160112]
PT
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
[-0.22270963  1.0641374   2.19779441]
[-0.22270963  1.0641374   2.21634092]
[0.10155518 1.43057918 1.54994447]
[0.10155518 1.43057918 2.29672556]
[ 2.07655466  1.06029445 -0.06079955]
[ 2.29288237  1.06029445 -0.06079955]
[-0.22541145  1.43735906  1.85793329]
[-0.22541145  1.43735906  2.22098727]
[2.05990828 0.69782196 0.30283147]
[2.29605719 0.69782196 0.30283147]
[0.1807354  1.79053783 1.04421927]
[0.1807354  2.23311082 1.04421927]
[0.48388847 1.05521514 1.51653911]
[0.48388847 1.05521514 2.26836247]
[-0.24416563  1.09434525  2.24033257]
[-0.24416563  1.09434525  2.30839062]
[1.20171366 1.0634815  0.69281324]
[1.20171366 2.26125387 0.69281324]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
[-0.2430126   1.4376119   1.90293267]
[-0.2430126  1.4376119  2.3043777]
[ 2.06836082  1.05673456 -0.05357449]
[ 2.291076    1.05673456 -0.05357449]
[ 1.70067179  1.44354358 -0.06118239]
[ 2.28609452  1.44354358 -0.06118239]
[1.34110011 1.04067473 0.66824746]
[2.29589116 1.04067473 0.66824746]
[-0.2460798   1.02303578  2.25205789]
[-0.2460798   1.02303578  2.31670182]
[0.48640413 1.81268557 0.69042993]
[0.48640413 2.2407893  0.69042993]
[2.07166665 0.70891411 0.28955697]
[2.30574351 0.70891411 0.28955697]
[-0.23834035  1.03401662  2.24405614]
[-0.23834035  1.03401662  2.30507656]
[-0.22403363  1.06133443  2.22172975]
[-0.22403363  1.06133443  2.2718663 ]
BE
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current seaso

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


[ 0.84147186 -0.08111573  2.18375554]
[ 0.84147186 -0.08111573  2.18375554]
[-0.19970124  0.92553863  2.21351755]
[-0.19970124  0.92553863  2.21351755]
[ 0.84147186 -0.08111573  2.18375554]
[ 0.84147186 -0.08111573  2.18375554]
[ 0.84147186 -0.08111573  2.18375554]
[ 0.84147186 -0.08111573  2.18375554]
[1.88366878 0.94357909 0.1417441 ]
[1.88366878 0.94357909 0.1417441 ]
[ 0.84147186 -0.08111573  2.18375554]
[ 0.84147186 -0.08111573  2.18375554]
[-0.19838173  0.97978287  2.20181956]
[-0.19838173  0.97978287  2.20181956]
[ 0.84147186 -0.08111573  2.18375554]
[ 0.84147186 -0.08111573  2.18375554]
[-0.20542518  0.94724203  2.21403312]
[-0.20542518  0.94724203  2.21403312]
Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[-0.2142107   0.99928906  2.21430162]
[-0.2142107   0.99928906  2.21430162]
[-0.20572295  0.9951428   2.20644128]
[-0.20572295  0.9951428   2.20644128]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[-0.21150091  1.00773637  2.2104336 ]
[-0.21150091  1.00773637  2.2104336 ]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]
[ 0.84375016 -0.06892393  2.17778978]


In [81]:
odds

Unnamed: 0,id,home_team.id,away_team.id,predict_lose,predict_draw,predict_win,real,predict,league,season,bookmaker,winner_home,winner_away,winner_draw,double_home_away,double_home_draw,double_away_draw,week
0,605124.0,533,532,-0.230920,1.042431,2.308306,1,1,140,2020,8,1.60,5.25,4.10,1.22,1.17,2.25,6.0
1,605117.0,543,548,0.029372,1.048234,2.271473,-1,1,140,2020,8,2.60,2.63,3.40,1.30,1.44,1.50,6.0
2,605119.0,545,727,1.182168,1.240975,1.048746,0,0,140,2020,8,2.70,2.90,3.00,1.36,1.40,1.44,6.0
3,605115.0,542,797,0.673380,2.285670,0.538878,-1,0,140,2020,8,1.75,5.50,3.30,1.33,1.14,2.00,6.0
4,605116.0,531,539,1.218447,1.058653,2.199194,1,1,140,2020,8,2.00,4.00,3.30,1.33,1.22,1.73,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,581227.0,741,743,0.843750,-0.068924,2.177790,0,1,144,2020,8,1.70,4.75,3.75,1.25,1.18,2.10,10.0
228,581229.0,734,554,0.843750,-0.068924,2.177790,-1,1,144,2020,8,3.75,1.91,3.60,1.29,1.80,1.25,10.0
229,581230.0,735,733,-0.211501,1.007736,2.210434,1,1,144,2020,8,3.75,1.83,4.00,1.22,1.83,1.25,10.0
230,581226.0,624,600,0.843750,-0.068924,2.177790,1,1,144,2020,8,1.57,5.50,4.00,1.22,1.14,2.25,10.0


In [21]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['IT']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)

results, prob = predict(statistics_to_test, dct, teams, season, week, test_m='3', extras='1', if_anova=False, classifier='svc_ovo')
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 500 Bologna - 2020, away_team= 490 Cagliari - 2020, result= 1
home_team= 501 Crotone - 2020, away_team= 499 Atalanta - 2020, result= -1
home_team= 505 Inter - 2020, away_team= 523 Parma - 2020, result= 1
home_team= 492 Napoli - 2020, away_team= 488 Sassuolo - 2020, result= 1
home_team= 497 AS Roma - 2020, away_team= 502 Fiorentina - 2020, result= 1
home_team= 498 Sampdoria - 2020, away_team= 495 Genoa - 2020, result= 1
home_team= 515 Spezia - 2020, away_team= 496 Juventus - 2020, result= 1
home_team= 503 Torino - 2020, away_team= 487 Lazio - 2020, result= 1
home_team= 494 Udinese - 2020, away_team= 489 AC Milan - 2020, result= -1
home_team= 504 Verona - 2020, away_team= 506 Benevento - 2020, result= 0


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [21]:
statistics_to_test.head()

Unnamed: 0,id,season,week,goals_home,goals_away,stats_home.s_on_g,stats_home.s_off_g,stats_home.s_in,stats_home.s_out,stats_home.s_total,...,stats_away.saves,stats_away.p_total,stats_away.p_accurate,stats_away.p_percentage,stats_home,stats_away,home_team.id,away_team.id,home_team.name,away_team.name
0,608481.0,2020,1,1,0,5.0,6.0,16.0,5.0,8.0,...,5.0,445.0,369.0,0.829213,0.0,0.0,502,503,Fiorentina,Torino
1,608502.0,2020,3,1,2,5.0,13.0,22.0,4.0,15.0,...,4.0,298.0,210.0,0.704698,0.0,0.0,502,498,Fiorentina,Sampdoria
2,608523.0,2020,5,3,2,3.0,2.0,5.0,0.0,4.0,...,0.0,417.0,354.0,0.848921,0.0,0.0,502,494,Fiorentina,Udinese
3,608512.0,2020,4,1,2,6.0,7.0,19.0,6.0,16.0,...,4.0,361.0,275.0,0.761773,0.0,0.0,505,489,Inter,AC Milan
4,608493.0,2020,2,4,3,8.0,7.0,21.0,6.0,11.0,...,5.0,328.0,253.0,0.771341,0.0,0.0,505,502,Inter,Fiorentina


In [25]:
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["rforest", "nb_ovo", "rforest_p"])
maps_results(teams, results, dct, season, prob, print_prob=True)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


home_team= 500 Bologna - 2020, away_team= 490 Cagliari - 2020, result= 1
[-0.21790273597999205, 0.9780199414359316, 2.220657276995305]
home_team= 501 Crotone - 2020, away_team= 499 Atalanta - 2020, result= -1
[1.989507667473769, 0.991869918699187, 0.018124507486209612]
home_team= 505 Inter - 2020, away_team= 523 Parma - 2020, result= 1
[-0.22156415833735937, 0.9998818282049302, 2.2215774476581043]
home_team= 492 Napoli - 2020, away_team= 488 Sassuolo - 2020, result= 1
[-0.21300046462348787, 0.9578609377892418, 2.218974624073867]
home_team= 497 AS Roma - 2020, away_team= 502 Fiorentina - 2020, result= 1
[-0.21520800260037373, 0.9665409646554681, 2.2197010420072054]
home_team= 498 Sampdoria - 2020, away_team= 495 Genoa - 2020, result= 1
[-0.21894303363074813, 0.9879002034479066, 2.220402785620177]
home_team= 515 Spezia - 2020, away_team= 496 Juventus - 2020, result= 0
[-0.1940090560780216, 2.088355381349012, 1.1692778825909824]
home_team= 503 Torino - 2020, away_team= 487 Lazio - 2020, r

In [125]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['GB']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)

results, prob = predict(statistics_to_test, dct, teams, season, week, test_m='1', extras='0', if_anova=False, classifier='nb_ovo')
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
home_team= 66 ston Villa - 2020, away_team= 41 outhampton - 2020, result= -1
home_team= 44 urnley - 2020, away_team= 49 helsea - 2020, result= -1
home_team= 36 ulham - 2020, away_team= 60 est Brom - 2020, result= 1
home_team= 63 eeds - 2020, away_team= 46 eicester - 2020, result= 1
home_team= 40 iverpool - 2020, away_team= 48 est Ham - 2020, result= 1
home_team= 33 anchester United - 2020, away_team= 42 rsenal - 2020, result= 1
home_team= 34 ewcastle - 2020, away_team= 45 verton - 2020, result= -1
home_team= 62 heffield Utd - 2020, away_team= 50 anchester City - 2020, result= -1
home_team= 47 ottenham - 2020, away_team= 51 righton - 2020, result= 1
home_team= 39 olves - 2020, away_team= 52 rystal Palace - 2020, result= 1


In [127]:
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["nb_ovo", "rforest_p", "svc_ovo_p"])
maps_results(teams, results, dct, season, prob, print_prob=False)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  corr /= X_norms


home_team= 66 ston Villa - 2020, away_team= 41 outhampton - 2020, result= 1
home_team= 44 urnley - 2020, away_team= 49 helsea - 2020, result= -1
home_team= 36 ulham - 2020, away_team= 60 est Brom - 2020, result= 0
home_team= 63 eeds - 2020, away_team= 46 eicester - 2020, result= 1
home_team= 40 iverpool - 2020, away_team= 48 est Ham - 2020, result= 1
home_team= 33 anchester United - 2020, away_team= 42 rsenal - 2020, result= 1
home_team= 34 ewcastle - 2020, away_team= 45 verton - 2020, result= 0
home_team= 62 heffield Utd - 2020, away_team= 50 anchester City - 2020, result= -1
home_team= 47 ottenham - 2020, away_team= 51 righton - 2020, result= 1
home_team= 39 olves - 2020, away_team= 52 rystal Palace - 2020, result= 1


In [126]:
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["nb_ovo", "rforest_p", "svc_ovo_p"])
maps_results(teams, results, dct, season, prob, print_prob=True)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  corr /= X_norms


home_team= 66 ston Villa - 2020, away_team= 41 outhampton - 2020, result= 1
[array([ 1.93576476, -0.09628976,  1.13068885]), array([-0.21823819,  0.97765684,  2.22102426]), array([-0.31876755,  1.26744642,  2.31562687])]
home_team= 44 urnley - 2020, away_team= 49 helsea - 2020, result= -1
[array([1.98356622, 1.00427665, 0.0124692 ]), array([1.96629213, 0.98919529, 0.04246655]), array([ 2.31830095,  1.12613246, -0.3187026 ])]
home_team= 36 ulham - 2020, away_team= 60 est Brom - 2020, result= 0
[array([-0.22221614,  1.06601262,  2.21224865]), array([0.82721713, 2.07866981, 0.14471891]), array([ 2.30382684,  1.29258714, -0.31529354])]
home_team= 63 eeds - 2020, away_team= 46 eicester - 2020, result= 1
[array([ 0.84088104, -0.12248364,  2.19969297]), array([0.82652092, 1.00919006, 1.17127566]), array([-0.27447682,  1.21891344,  2.24445026])]
home_team= 40 iverpool - 2020, away_team= 48 est Ham - 2020, result= 1
[array([-0.21922826,  0.99976836,  2.21925541]), array([-0.21921995,  0.9913645

In [18]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['PT']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["nb_ovo", "rforest_p", "svc_ovo_p"])
maps_results(teams, results, dct, season, prob, print_prob=False)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 762 GIL Vicente - 2020, away_team= 224 Guimaraes - 2020, result= 1
home_team= 234 Pacos Ferreira - 2020, away_team= 212 FC Porto - 2020, result= -1
home_team= 221 Belenenses - 2020, away_team= 231 Farense - 2020, result= 0
home_team= 222 Boavista - 2020, away_team= 211 Benfica - 2020, result= -1
home_team= 217 SC Braga - 2020, away_team= 242 Famalicao - 2020, result= 0
home_team= 214 Maritimo - 2020, away_team= 225 Nacional - 2020, result= 1
home_team= 226 Rio Ave - 2020, away_team= 215 Moreirense - 2020, result= 1
home_team= 216 Portimonense - 2020, away_team= 227 Santa Clara - 2020, result= 1
home_team= 228 Sporting CP - 2020, away_team= 218 Tondela - 2020, result= 1


In [20]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['NL']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["nb_ovo", "rforest_p", "rforest"])
maps_results(teams, results, dct, season, prob, print_prob=True)

Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  corr /= X_norms


home_team= 194 Ajax - 2020, away_team= 205 Fortuna Sittard - 2020, result= 1
[array([-0.21828508,  0.9695958 ,  2.22213706]), array([-0.21934074,  0.9828722 ,  2.22141392]), array([-0.21312658,  0.9452182 ,  2.22108729])]
home_team= 202 Groningen - 2020, away_team= 204 VVV Venlo - 2020, result= 0
[array([-0.20469305,  2.14024651,  1.15458947]), array([1.84847658, 0.99375874, 0.15339609]), array([-0.20202219,  2.09696862,  1.17671013])]
home_team= 415 Twente - 2020, away_team= 193 PEC Zwolle - 2020, result= 0
[array([1.93147019, 1.06045849, 0.0119657 ]), array([-0.19905816,  1.03667603,  2.19202054]), array([-0.20594016,  2.08877476,  1.18541997])]
home_team= 201 AZ Alkmaar - 2020, away_team= 417 Waalwijk - 2020, result= 1
[array([-0.21285348,  0.94582171,  2.22075091]), array([-0.19415061,  0.98759635,  2.19636105]), array([-0.20162317,  0.90497062,  2.21955151])]
home_team= 206 Heracles - 2020, away_team= 207 Utrecht - 2020, result= -1
[array([1.88978565, 1.0053732 , 0.10774037]), arr

In [130]:
"""
    Everything to retrieve the data
"""
global country
country = 'ES'
league = '140'
curr_week = 6
season = 2020

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)

In [51]:
teams = [[724, 533]]
week = 7
results, prob = predict(statistics_to_test, dct, teams, season, week, test_m='3', extras='1', if_anova='both', classifier='svc_ovo')
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 724 Cadiz - 2020, away_team= 533 Villarreal - 2020, result= 0
[-0.23942296  2.23757704  1.0213474 ]


In [71]:
results, prob = vote_predict(statistics_to_test, dct, teams, season, week)
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 724 Cadiz - 2020, away_team= 533 Villarreal - 2020, result= 1
[[0.8144931939275888, -0.08645158627564854, 2.2053776849270585], [0.8100840419610681, -0.07913367435703167, 2.2068562618195835], [-0.21294982587385528, 1.0964507086738522, 2.1921957645287686]]
