In [1]:
"""
    Ranking:
        1.1 Hierarchical Clustering (ranking_prediction.ipynb)
        1.2 Map labels to future matches as home and away of past season
            1.2.1 If team is new, label it as normal (which is a not promoted team)
        1.3 Compute difference in ranking (NO because of no sequential)
        1.4 Future work: study the week development of the league to see if there is a point where
            it can be started to predict the actual behavior of the team as the promotion label
    Prediction:
        2.0 Drop features --> computation differences (visualization.ipynb)
        2.1 Features TEST
            2.1.0 First match as median of past season (already started) TODO
            2.1.2 Median of current season (home and away games are definetely a factor)
            2.1.2.1 Median at home/away <-- Just this
            2.1.2.2 Past average of last n games (test with 3, 5, etc)
            2.1.2.3 Average of last n games at home/away
"""

'\n    Ranking:\n        1.1 Hierarchical Clustering (ranking_prediction.ipynb)\n        1.2 Map labels to future matches as home and away of past season\n            1.2.1 If team is new, label it as normal (which is a not promoted team)\n        1.3 Compute difference in ranking (NO because of no sequential)\n        1.4 Future work: study the week development of the league to see if there is a point where\n            it can be started to predict the actual behavior of the team as the promotion label\n    Prediction:\n        2.0 Drop features --> computation differences (visualization.ipynb)\n        2.1 Features TEST\n            2.1.0 First match as median of past season (already started) TODO\n            2.1.2 Median of current season (home and away games are definetely a factor)\n            2.1.2.1 Median at home/away <-- Just this\n            2.1.2.2 Past average of last n games (test with 3, 5, etc)\n            2.1.2.3 Average of last n games at home/away\n'

In [18]:
import numpy as np
import pandas as pd
import playstyle
import ranking

In [19]:
"""
    Everything to create the clusters
"""
from sklearn import preprocessing
from scipy.cluster.hierarchy import linkage, cophenet, fcluster
from scipy.spatial.distance import pdist

# Returns Z, coph_matrix and best cophence score from HC
def HierarchicalClustering(data, label):
    methods = ["single","complete","average","centroid","ward"]

    # Pass the dataset into pdist to get your proximity matrix for calculating CPCC
    proximity_matrix = pdist(data)

    best_coph = -1
    best_method = None

    for method in methods:
        Z = linkage(data, method)
        coph, coph_matrix = cophenet(Z, proximity_matrix)
        if coph > best_coph:
            best_coph = coph
            best_method = method
            best_matrix = coph_matrix
        Z = linkage(data, best_method)
        coph_matrix = cophenet(Z)
    return Z, coph_matrix, best_coph

#Returns a dictionary with the clusters in the form
#"id": "season" : point -> Attr object
def dct_clusters(dct_clusters, Z, coph_matrix, dendo_label, criterion='distance', real=None):
    if real != None:
        clusters = real[2]
    else:
        clusters = fcluster(Z, t=coph_matrix[0], criterion=criterion)
    for i in range(0, len(dendo_label)):
        point = ranking.Attr(dendo_label[i], clusters[i])
        if point.value2 not in dct_clusters:
            dct_clusters[point.value2] = {}
            dct_clusters[point.value2][point.value3] = point
        else:
            dct_clusters[point.value2][point.value3] = point
    return dct_clusters, clusters

# Return the cluster of an specific type
def get_cluster_of_type(all_dct, country, league, seasons, target_col, clean_type, real=None):
    # Data initialization
    data, column_names = ranking.concat_data(country, league, seasons, target_col, clean_type)
    all_data, all_season, all_names, all_target = ranking.get_all_data(data, len(target_col))
    # Use Agglomerative
    all_data = preprocessing.StandardScaler().fit_transform(all_data)
    dendo_label = ranking.label_team_season(all_names, all_season)
    Z, coph_matrix, coph = HierarchicalClustering(all_data, dendo_label)
    if real == None:
        all_dct, all_lst = dct_clusters(all_dct, Z, coph_matrix, dendo_label)
    else:
        all_dct, all_lst = dct_clusters(all_dct, Z, coph_matrix, dendo_label, real=all_target)
    return all_dct

#Returns three clusters as: overall, home and away performance
def get_clusters(country, league, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    seasons = sorted(seasons, reverse=True)
    # Targets
    target_col = ["rank", "points", "description"]
    # Gets the dictionary
    overall_dct = dict()
    home_dct = dict()
    away_dct = dict()
    real_dct = dict()
    for season in seasons:
        season = [season]
        real_dct = get_cluster_of_type(overall_dct, country, league, season, target_col, clean_type=None, real=True)
        overall_dct = get_cluster_of_type(overall_dct, country, league, season, target_col, clean_type=None)
        home_dct = get_cluster_of_type(home_dct, country, league, season, target_col, clean_type='home')
        away_dct = get_cluster_of_type(away_dct, country, league, season, target_col, clean_type='away')
    return (overall_dct, home_dct, away_dct, real_dct)

In [20]:
"""
    Creates the statistics dataset
"""
def get_statistics(country, league, curr_week, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season-1, start_season - year_window, -1))
    seasons = sorted(seasons, reverse=True)
    # Data initialization
    data = []
    # Current week
    df, target = playstyle.df_season(country, league, season, curr_week, drop_goals=False)
    tup = (season, df, target)
    data.append(tup)
    # Past years
    for season in seasons:
        df, target = playstyle.df_season(country, league, season, 38, drop_goals=False)
        tup = (season, df, target)
        data.append(tup)
    all_data, all_target = playstyle.get_all(data)
    return all_data, all_target

In [21]:
"""
    Function that maps the dataset with the clusters
"""
def max_appearance_rank(dct, season):
    dct_rank = dict()
    season = str(int(season))
    for team in dct:
        if season in dct[team]:
            rank = dct[team][season].value
            if rank not in dct_rank:
                dct_rank[rank] = 1
            else:
                dct_rank[rank] += 1
    return max(dct_rank.keys(), key=dct_rank.get)

def get_rank(df, dct, team):
    rank = []
    for index, row in df.iterrows():
        season = row["season"]
        season = season - 1
        if str(row[team]) in dct:
            if str(season) in dct[str(row[team])]:
                rank.append(dct[str(row[team])][str(season)].value)
            else:
                max_appear = max_appearance_rank(dct, season)
                rank.append(max_appear)
        else:
            max_appear = max_appearance_rank(dct, season)
            rank.append(max_appear)
    return rank

def get_data(dct, statistics):
    statistics["home_team.overall_rank"] = get_rank(statistics, dct[0], "home_team.id")
    statistics["away_team.overall_rank"] = get_rank(statistics, dct[0], "away_team.id")
    statistics["home_team.ranking"] = get_rank(statistics, dct[1], "home_team.id")
    statistics["away_team.ranking"] = get_rank(statistics, dct[2], "away_team.id")
    statistics["home_team.real_rank"] = get_rank(statistics, dct[3], "home_team.id")
    statistics["away_team.real_rank"] = get_rank(statistics, dct[3], "away_team.id")
    return statistics, statistics["home_team.overall_rank"].values.reshape(-1, 1), statistics["away_team.overall_rank"].values.reshape(-1, 1), statistics["home_team.ranking"].values.reshape(-1, 1), statistics["away_team.ranking"].values.reshape(-1, 1),statistics["home_team.real_rank"].values.reshape(-1, 1), statistics["away_team.real_rank"].values.reshape(-1, 1)

In [22]:
def get_median(df, method=None, season=None, week=None):
    # GET median for team_home.stats_home. team_home.stats_away. team_away.stats_home. team_away.stats_away.
    # Filter df according to method
    if method == '1':
        print("Current and past season median")
        df = df[(df["season"].isin([season, season-1]))]
    elif method == '2':
        print("Current and all past seasons median")
        df = df[(df["season"] <= season)]
    elif method == '3':
        print("Current season only median")
        # First and second week zero/one home/away games
        if week > 3:
            df = df[(df["season"] == season)]
        else:
            df = df[(df["season"].isin([season, season-1]))]
    elif method == '4':
        print("Last 5 games median")
        if week > 5:
            weeks = [w for w in range(week-1, week-6, -1)]
            df = df[(df["season"] == season) & (df["week"].isin(weeks))]
        else:
            df = df[(df["season"].isin([season, season-1]))]
        
    # Median home
    extra_columns = df.filter(["goals_home"]).columns
    columns_home = df.filter(regex='^stats_home').columns
    columns_home = columns_home.append(extra_columns)
    median_home = df.groupby(['home_team.id'], as_index=True)[columns_home].median()
    # Median away
    extra_columns = df.filter(["goals_away"]).columns
    columns_away = df.filter(regex='^stats_away').columns
    columns_away = columns_away.append(extra_columns)
    median_away = df.groupby(['away_team.id'], as_index=True)[columns_away].median()
    if method == '5':
        print("Current season only mean")
        # First and second week zero/one home/away games
        if week > 3:
            df = df[(df["season"] == season)]
        else:
            df = df[(df["season"].isin([season, season-1]))]
        mean_home = df.groupby(['home_team.id'], as_index=True)[columns_home].mean()
        mean_away = df.groupby(['away_team.id'], as_index=True)[columns_away].mean()
        return mean_home, mean_away
    return median_home, median_away

In [23]:
def create_test_set(df, teams, season, week, pezzali=True, method=None, extras=None):
    columns_home = ["goals_home", "stats_home.c_red", "stats_home.s_total", "stats_home.s_off_g", "stats_home.s_on_g", "stats_home.s_in", 
                   "stats_home.saves", "stats_home.s_blocked", "stats_home.c_yellow", "stats_home.s_out"]
    columns_away = ["goals_away", "stats_away.c_red", "stats_away.s_total", "stats_away.s_off_g", "stats_away.s_on_g", "stats_away.s_in", 
                   "stats_away.saves", "stats_away.s_blocked", "stats_away.c_yellow", "stats_away.s_out"]
    columns = ["season", "week"]
    columns = columns_home + columns_away + columns
    columns_pezzali = columns
    
    if pezzali == False or method == '5' or extras == '4' or extras == '3':
        extra_columns = df.filter(["goals_home"]).columns
        columns_home = df.filter(regex='^stats_home').columns
        columns_home = columns_home.append(extra_columns)
        extra_columns = df.filter(["goals_away"]).columns
        columns_away = df.filter(regex='^stats_away').columns
        columns_away = columns_away.append(extra_columns)
        columns = columns_home.append(columns_away)
        
    
    test_set = pd.DataFrame(columns=columns)
    home_teams = [match[0] for match in teams]
    away_teams = [match[1] for match in teams]
    seasons = [season for i in range(0, len(teams))]
    weeks = [week for i in range(0, len(teams))]
    test_set["home_team.id"] = home_teams
    test_set["away_team.id"] = away_teams
    test_set["season"] = seasons
    test_set["week"] = weeks
    
    columns_home = test_set.filter(columns_home).columns
    columns_away = test_set.filter(columns_away).columns
    i_th = test_set.columns.get_loc("home_team.id")
    i_ta = test_set.columns.get_loc("away_team.id")
    
    median_home, median_away = get_median(df, method, season, week)

    for i in range(0, len(test_set)):
        for index, row in median_home.iterrows():
            if test_set.iloc[i,i_th] == index:
                for c in columns_home:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    for i in range(0, len(test_set)):
        for index, row in median_away.iterrows():
            if test_set.iloc[i,i_ta] == index:
                for c in columns_away:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    # For non existing teams (this means ascending teams filled with median of column)
    test_set = test_set.apply(lambda x: x.fillna(x.median()),axis=0)
    if method == '5':
        test_set = linear_reg(test_set, columns)
        if pezzali == True and extras == '1':
            return test_set[['week','season','away_team.id', 'home_team.id', 'goals_home', 'stats_home.s_off_g', 'stats_home.s_on_g', 'stats_home.s_in', 'stats_home.saves', 'stats_home.s_blocked', 'stats_home.c_yellow', 'stats_home.s_out', 'goals_away', 'stats_away.s_off_g', 'stats_away.s_on_g', 'stats_away.s_in', 'stats_away.saves', 'stats_away.s_blocked', 'stats_away.c_yellow', 'stats_away.s_out', 'stats_home.s_total', 'stats_away.s_total', 'stats_home.c_red']]
    return test_set

In [24]:
# For the linear regression model, loads pretrains and returns the prediction
def get_regression(row, column):
    global country
    # Load model
    filename = 'model/' + country + '/' + column + '.sav'
    model = pickle.load(open(filename, 'rb'))
    row = row.drop(column)
    row = row.drop("home_team.id")
    row = row.drop("away_team.id")
    row = row.drop("season")
    row = row.drop("week")  
    pred = np.array(row).reshape(1, -1)
    return model.predict(pred)

In [25]:
# Returns the test set for all columns calculating linear regression
def linear_reg(test_set, columns):
    for i in range(0, len(test_set)):
        for c in columns:
            try:
                i_c = test_set.columns.get_loc(c)
                test_set.iloc[i,i_c] = get_regression(test_set.iloc[i], c)
            except:
                print(i, c)
                raise
    return test_set

In [26]:
"""
    A proper evaluation: we cannot test week 10 2018 with week 20 2018
    The sequence of events must be maintained
"""
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import RidgeClassifierCV

def subtraining_trainset(df, season, week, up_to_season):
    df_sub = df[(df["season"] == season) & (df["week"] < week)]
    target = []
    past_seasons = [s for s in range(up_to_season, season)]
    
    df_past = df[(df["season"].isin(past_seasons))]
    
    df_sub = df_sub.append(df_past)
    
    for index, row in df_sub.iterrows():
        target.append(playstyle.get_status(row))
    
    return df_sub, target

In [27]:
# Data found to be not relevant to models
# If not using Anova
def remove_before_models(df, extras, if_anova=False):
    if extras != '1':
        if not if_anova or if_anova == 'both':
            try:
                df = df.drop(columns=['stats_home.c_yellow', 'stats_away.c_yellow', 'stats_home.p_accurate','stats_away.p_accurate','stats_away.p_total','stats_away.fouls','stats_home.p_total','stats_home.fouls','stats_home.corners','stats_home.offside','stats_away.corners','stats_away.offside'], axis=1)
            except:
                print("NO")
        try:
            df = df.drop(columns=['index'], axis=1)
        except:
            print("")
        try:
            df = df.drop(columns=['stats_home','stats_away'], axis=1)
        except:
            print("")
    return df

In [28]:
"""
    This function returns a training set with the given method
"""
def train_method(dct, train_set, s, w, seasons, method, extras=None, if_anova=False):
    print("Train Method")
    statistics_subset = None
    target = None
    if method == '1':
        print("Current and past season median")
    elif method == '2':
        print("Current and all past seasons median")
    elif method == '3':
        print("Current season only median")
    elif method == '4':
        print("Last 5 games median")
    elif method == '5':
        print("Current and past season linear regression")
    elif method == '6':
        print("Current and all past seasons linear regression")
    elif method == '7':
        print("Current season only linear regression")
    elif method == '8':
        print("Plain train set: all games from past seasons")
        statistics_subset, target = subtraining_trainset(train_set, s, w, seasons)
   
    # If something must be removed
    statistics_removed = remove_before_models(statistics_subset, extras, if_anova)
    n = 10
    if extras == '1':
        print("Applies pezzali to train set")
        statistics_subset = playstyle.pezzali_data(statistics_subset, is_train=True)
        statistics_subset,_,_,_,_,_,_ = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'season', 'week'])
    elif extras == '2':
        print("Applies pca to train set")
        statistics_subset,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_removed)
        statistics_subset = statistics_subset.drop(columns=['home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'goals_away','goals_home','season','week'])
        pca = PCA(n_components=n)
        statistics_subset = pca.fit_transform(statistics_subset)
        statistics_subset = np.append(statistics_subset, ho, axis=1)
        statistics_subset = np.append(statistics_subset, ao, axis=1)
        statistics_subset = np.append(statistics_subset, hr, axis=1)
        statistics_subset = np.append(statistics_subset, ar, axis=1)
        statistics_subset = np.append(statistics_subset, rh, axis=1)
        statistics_subset = np.append(statistics_subset, ra, axis=1)
    elif extras == '3':
        print("Adds pezzali and then pca to train set")
        statistics_subset = playstyle.pezzali_data(statistics_removed, is_train=True, both=True)
        statistics_subset,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'goals_away','goals_home','season','week'])
        pca = PCA(n_components=n)
        statistics_subset = pca.fit_transform(statistics_subset)
        statistics_subset = np.append(statistics_subset, ho, axis=1)
        statistics_subset = np.append(statistics_subset, ao, axis=1)
        statistics_subset = np.append(statistics_subset, hr, axis=1)
        statistics_subset = np.append(statistics_subset, ar, axis=1)
        statistics_subset = np.append(statistics_subset, rh, axis=1)
        statistics_subset = np.append(statistics_subset, ra, axis=1)
    elif extras == '4':
        print("Adds pezzali to plain train set")
        statistics_subset = playstyle.pezzali_data(statistics_removed, is_train=True, both=True)
        statistics_subset,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_removed)
        statistics_subset = statistics_subset.drop(columns=['home_team.id', 'away_team.id', 'home_team.name', 'away_team.name', 'goals_away','goals_home','season','week'])
    elif extras == '0':
        print("Plain train set with clusters")
        statistics_subset,_,_,_,_,_,_ = get_data(dct, statistics_removed)
        statistics_subset = statistics_subset.drop(columns=['home_team.id', 'away_team.id','home_team.name', 'away_team.name','goals_away','goals_home','season','week'])
    return statistics_subset, target

In [29]:
def test_extras(dct, statistics_test, extras='0', if_anova=False):
    # If something must be removed
    statistics_removed = remove_before_models(statistics_test, extras, if_anova)
    n = 10
    if extras == '1':
        print("Applies pezzali to test set")
        # Get test pezalli and mappings
        statistics_test = playstyle.pezzali_data(statistics_test, is_train=False)
        statistics_test,_,_,_,_,_,_ = get_data(dct, statistics_test)
        try:
            statistics_test = statistics_test.drop(columns=['home_team.id', 'away_team.id', 'season','week'])
        except:
            print("")
    elif extras == '2':
        print("Applies pca to test set")
        statistics_test,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_test)
        pca = PCA(n_components=n)
        statistics_test = pca.fit_transform(statistics_removed)
        statistics_test = np.append(statistics_test, ho, axis=1)
        statistics_test = np.append(statistics_test, ao, axis=1)
        statistics_test = np.append(statistics_test, hr, axis=1)
        statistics_test = np.append(statistics_test, ar, axis=1)
        statistics_test = np.append(statistics_test, rh, axis=1)
        statistics_test = np.append(statistics_test, ra, axis=1)
    elif extras == '3':
        print("Adds pezzali and then pca to test set")
        statistics_removed = playstyle.pezzali_data(statistics_removed, is_train=False, both=True)
        statistics_removed = statistics_removed.drop(columns=['home_team.id', 'away_team.id', 'goals_away','goals_home','season','week'])
        statistics_test,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_test)
        pca = PCA(n_components=n)
        statistics_test = pca.fit_transform(statistics_removed)
        statistics_test = np.append(statistics_test, ho, axis=1)
        statistics_test = np.append(statistics_test, ao, axis=1)
        statistics_test = np.append(statistics_test, hr, axis=1)
        statistics_test = np.append(statistics_test, ar, axis=1)
        statistics_test = np.append(statistics_test, rh, axis=1)
        statistics_test = np.append(statistics_test, ra, axis=1)
    elif extras == '4':
        print("Adds pezzali to plain test set")
        statistics_removed = playstyle.pezzali_data(statistics_removed, is_train=False, both=True)
        statistics_removed = statistics_removed.drop(columns=['home_team.id', 'away_team.id', 'goals_away','goals_home','season','week'])
        statistics_test,ho,ao,hr,ar,rh,ra = get_data(dct, statistics_test)
        statistics_test = np.append(statistics_removed, ho, axis=1)
        statistics_test = np.append(statistics_test, ao, axis=1)
        statistics_test = np.append(statistics_test, hr, axis=1)
        statistics_test = np.append(statistics_test, ar, axis=1)
        statistics_test = np.append(statistics_test, rh, axis=1)
        statistics_test = np.append(statistics_test, ra, axis=1)
    elif extras == '0':
        print("Plain test set with clusters")
        statistics_test,_,_,_,_,_,_ = get_data(dct, statistics_removed)
        statistics_test = statistics_test.drop(columns=['home_team.id', 'away_team.id','goals_away','goals_home','season','week'])
    return statistics_test

In [30]:
"""
    Returns the y_predicted given a majority voting
    - nb, rforest, rfores_pezzali
"""

from sklearn.preprocessing import FunctionTransformer

def get_prediction(clf_name, if_anova, train_set, target, test_set, k_best=10):
    # 1) anova filter, take n best ranked features
    anova_filter = SelectKBest(f_regression, k=k_best)
    classifiers = dict(nb_ovo=OneVsOneClassifier(GaussianNB()),
                       rforest=OneVsOneClassifier(RandomForestClassifier(random_state=0, criterion="entropy", max_features="log2", min_samples_leaf=2)),
                       svc_ovo_p=OneVsOneClassifier(SVC(kernel='linear', probability=True)),
                       rforest_p=OneVsOneClassifier(RandomForestClassifier(random_state=0, criterion="entropy", max_features="log2", min_samples_leaf=2)))
    clf = classifiers[clf_name]
    if if_anova:
        anova_model = make_pipeline(anova_filter, clf)
    else:
        anova_model = clf
    anova_model.fit(train_set, target)
    Y = anova_model.predict(test_set)
    prob = anova_model.decision_function(test_set)
    return Y, prob

def get_majority(votes):
    majority = []
    votes = np.transpose(votes)
    for i in range(0, len(votes)):
        win = 0
        lose = 0
        draw = 0
        for j in range(0, len(votes[0])):
            if votes[i][j] == 0:
                draw = draw + 1
            if votes[i][j] == 1:
                win = win + 1
            if votes[i][j] == -1:
                lose = lose + 1
        if draw > (win+lose):
            majority.append(0)
        elif lose > (win+draw):
            majority.append(-1)
        elif win > (draw+lose):
            majority.append(1)
        elif win == draw and draw == lose:
            majority.append(0)
        elif draw == win:
            majority.append(1)
        elif draw == lose:
            majority.append(-1)
        else:
            majority.append(0)
    return majority

def vote_classifier(train_set, target, test_set, train_pezzali, test_pezzali, classifiers=["nb_ovo", "rforest", "rforest_p"], k_best=10):
    dct_votes = {
        "nb_ovo":{
            "predict":[],
            "proba":[]
        },
        "rforest":{
            "predict":[],
            "proba":[]
        },
        "rforest_p":{
            "predict":[],
            "proba":[]
        },
        "svc_ovo_p":{
            "predict":[],
            "proba":[]
        }
    }
    st = preprocessing.StandardScaler().fit_transform(train_set)
    sp = preprocessing.StandardScaler().fit_transform(test_set)
    stp = preprocessing.StandardScaler().fit_transform(train_pezzali)
    spp = preprocessing.StandardScaler().fit_transform(test_pezzali)
    # Plain
    dct_votes["nb_ovo"]["predict"], prob = get_prediction("nb_ovo", True, st, target, sp)
    dct_votes["nb_ovo"]["proba"] = prob
    dct_votes["rforest"]["predict"], prob = get_prediction("rforest", True, st, target, sp)
    dct_votes["rforest"]["proba"] = prob
    # Pezzali
    dct_votes["rforest_p"]["predict"], prob = get_prediction("rforest_p", False, stp, target, spp)
    dct_votes["rforest_p"]["proba"] = prob
    dct_votes["svc_ovo_p"]["predict"], prob = get_prediction("svc_ovo_p", False, stp, target, spp)
    dct_votes["svc_ovo_p"]["proba"] = prob
    # Get Vote
    Y = []
    P = [[] for i in range(0, len(test_set))]
    for clf in classifiers:
        Y.append(dct_votes[clf]["predict"])
        for i in range(0, len(test_set)):
            P[i].append(dct_votes[clf]["proba"][i])
        
    vY = get_majority(Y)
    return vY, P

def vote_predict(df, clusters_dct, teams, season, week, up_to_season=2017, classifiers=["nb_ovo", "rforest", "rforest_p"]):
    # Create training and test for plain test
    statistics_train, target = train_method(clusters_dct, df, season, week, up_to_season, '8', '0', 'both')
    statistics_test = create_test_set(df, teams, season, week, False, '1', '0')
    statistics_test = test_extras(clusters_dct, statistics_test, '0', 'both')
    statistics_train = FunctionTransformer(np.log1p).fit_transform(statistics_train)
    statistics_test = FunctionTransformer(np.log1p).fit_transform(statistics_test)
    
    # Create training and test for pezzali test
    pezzali_train, _ = train_method(clusters_dct, df, season, week, up_to_season, '8', '1', False)
    pezzali_test = create_test_set(df, teams, season, week, True, '3', '1')
    pezzali_test = test_extras(clusters_dct, pezzali_test, '1', False)
    
    # Changes to vote classifier
    Y, prob = vote_classifier(statistics_train, target, statistics_test, pezzali_train, pezzali_test, classifiers)
    return Y, prob

In [31]:
def predict(df, clusters_dct, teams, season, week, test_m='1', extras='1', up_to_season=2017, classifier='rforest', if_anova=False, k_best=10):
    """
        teams: the list of pair of teams' ids as [[10, 20], [30, 20]]
    """
        
    classifiers = dict(nb_ovo=OneVsOneClassifier(GaussianNB()),
                       svc_ovo=OneVsOneClassifier(SVC(kernel='linear', probability=True)),
                       ridge=OneVsOneClassifier(RidgeClassifierCV(class_weight='balanced')), 
                       rforest=OneVsOneClassifier(RandomForestClassifier(random_state=0, criterion="entropy", max_features="log2", min_samples_leaf=2)))
    
    if extras == '2' or extras == '0':
        p = False
    else:
        p = True
        
    # Divide the training set to predict
    statistics_train, target = train_method(clusters_dct, df, season, week, up_to_season, method='8', extras=extras, if_anova=if_anova)
    # Creates the test set
    statistics_test = create_test_set(df, teams, season, week, p, test_m, extras)
    statistics_test = test_extras(clusters_dct, statistics_test, extras, if_anova)
    # Scale the training and test set
    st = preprocessing.StandardScaler().fit_transform(statistics_train)
    sp = preprocessing.StandardScaler().fit_transform(statistics_test)
    # Makes the prediction given a classifier
    # 1) anova filter, take n best ranked features
    if if_anova == True or if_anova == 'both':
        anova_filter = SelectKBest(f_regression, k=k_best)
        clf = make_pipeline(anova_filter, classifiers[classifier])
    else:
        clf = classifiers[classifier]
    clf.fit(st, target)
    Y = clf.predict(sp)
    prob = clf.decision_function(sp)
    return Y, prob

In [32]:
def maps_results(teams, results, dct, season, prob, print_prob=False):
    for i in range(0, len(results)):
        home_team = dct[0][str(teams[i][0])][str(season)].value2 + " " + dct[0][str(teams[i][0])][str(season)].name
        away_team = dct[0][str(teams[i][1])][str(season)].value2 + " " + dct[0][str(teams[i][1])][str(season)].name
        print("home_team= %s, away_team= %s, result= %s" %(home_team, away_team, str(results[i])))
        if print_prob:
            print(prob[i])

In [33]:
teams_gb = [[66, 34], [51, 41], [44, 45], [49, 63], [40, 39], [50, 36], [62, 46],
            [47, 42], [60, 52], [48, 33]]
teams_es = [[542, 548], [531, 538], [530, 720], [545, 532], [724, 529], [715, 726],
            [539, 546], [727, 543], [536, 541], [533, 797]]
teams_de = [[157, 173], [167, 170], [160, 163], [169, 165], [159, 182], [174, 168],
            [192, 161], [162, 172], [188, 164]]
teams_it = [[501, 492], [502, 495], [505, 500], [496, 503], [523, 506], [497, 488], 
            [498, 489], [515, 487], [494, 499], [504, 490]]
teams_fr = [[79, 91], [89, 1063], [78, 106], [77, 97], [112, 80], [82, 85], [93, 84],
            [92, 81], [83, 95], [94, 116]]
teams_nl = [[193, 200], [417, 204], [201, 202], [426, 208], [194, 415], [207, 198], 
            [209, 206], [205, 195], [210, 197]]
teams_pt = [[242, 228], [225, 227], [221, 217], [211, 234], [224, 216], [215, 762],
            [212, 218], [231, 214], [226, 222]]
teams_be = [[569, 735], [733, 266], [742, 740], [624, 631], [600, 554], [736, 734],
            [263, 739], [260, 741], [738, 743]]

In [34]:
ids_gb = [592241, 592242, 592243, 592244, 592245, 592246, 592247, 592248, 592249, 592250]
ids_es = [605175, 605176, 605177, 605178, 605179, 605180, 605181, 605182, 605183, 605184]
ids_de = [587257, 587258, 587259, 587260, 587261, 587262, 587263, 587264, 587265]
ids_it = [608625, 608626, 608627, 608628, 608629, 608630, 608631, 608632, 608633, 608634]
ids_fr = [571592, 571593, 571594, 571595, 571596, 571597, 571598, 571599, 571600, 571601]
ids_nl = [573254, 573255, 573256, 573257, 573258, 573259, 573260, 573261, 573262]
ids_pt = [601093, 601094, 601095, 601096, 601097, 601098, 601099, 601100, 601101]
ids_be = [581268, 581269, 581270, 581271, 581272, 581273, 581274, 581275, 581276]

In [35]:
leagues_dct = {
    'ES': {
        'country' : 'ES',
        'league':'140',
        'curr_week': 9,
        'predict_week': 10,
        'start_odds': 6,
        'best_clf': ["rforest", "nb_ovo", "rforest_p", "svc_ovo_p"],
        'matches': teams_es,
        'matches_ids': ids_es
    },
    'GB': {
        'country' : 'GB',
        'league':'39',
        'curr_week': 8,
        'predict_week': 9,
        'start_odds': 1,
        'best_clf': ["nb_ovo", "rforest_p", "svc_ovo_p"],
        'matches': teams_gb,
        'matches_ids': ids_gb
    },
    'IT': {
        'country' : 'IT',
        'league':'135',
        'curr_week': 7,
        'predict_week': 8,
        'start_odds': 1,
        #'best_clf': ["rforest", "nb_ovo", "rforest_p"],
        'best_clf': ["svc_ovo_p"],
        'matches': teams_it,
        'matches_ids': ids_it
    },
    'DE': {
        'country' : 'DE',
        'league':'78',
        'curr_week': 7,
        'predict_week': 8,
        'start_odds': 1,
        'best_clf': ["rforest"],
        'matches': teams_de,
        'matches_ids': ids_de
    },
    'FR': {
        'country' : 'FR',
        'league':'61',
        'curr_week': 9,
        'predict_week': 10,
        'start_odds': 7,
        'best_clf': ["nb_ovo", "rforest_p", "rforest"],
        'matches': teams_fr,
        'matches_ids': ids_fr
    },
    'NL': {
        'country' : 'NL',
        'league':'88',
        'curr_week': 8,
        'predict_week': 9,
        'start_odds': 4,
        'best_clf': ["nb_ovo", "rforest_p", "rforest"],
        'matches': teams_nl,
        'matches_ids': ids_nl
    },
    'PT': {
        'country' : 'PT',
        'league':'94',
        'curr_week': 7,
        'predict_week': 8,
        'start_odds': 4,
        'best_clf': ["nb_ovo", "rforest_p", "svc_ovo_p"],
        'matches': teams_pt,
        'matches_ids': ids_pt
    },
    'BE': {
        'country' : 'BE',
        'league':'144',
        'curr_week': 12,
        'predict_week': 13,
        'start_odds': 9,
        'best_clf': ["rforest"],
        'matches': teams_be,
        'matches_ids': ids_be
    }
}

In [42]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['ES']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)

results, prob = predict(statistics_to_test, dct, teams, season, week, test_m='3', extras='1', if_anova=False, classifier='svc_ovo')
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 542 Alaves - 2020, away_team= 532 Valencia - 2020, result= 0
home_team= 531 Athletic Club - 2020, away_team= 543 Real Betis - 2020, result= 1
home_team= 530 Atletico Madrid - 2020, away_team= 529 Barcelona - 2020, result= 1
home_team= 545 Eibar - 2020, away_team= 546 Getafe - 2020, result= 1
home_team= 724 Cadiz - 2020, away_team= 548 Real Sociedad - 2020, result= -1
home_team= 715 Granada CF - 2020, away_team= 720 Valladolid - 2020, result= 1
home_team= 539 Levante - 2020, away_team= 797 Elche - 2020, result= -1
home_team= 727 Osasuna - 2020, away_team= 726 Huesca - 2020, result= 1
home_team= 536 Sevilla - 2020, away_team= 538 Celta Vigo - 2020, result= -1
home_team= 533 Villarreal - 2020, away_team= 541 Real Madrid - 2020, result= -1


In [43]:
clf = elem['best_clf']
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=clf)
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 542 Alaves - 2020, away_team= 532 Valencia - 2020, result= 0
home_team= 531 Athletic Club - 2020, away_team= 543 Real Betis - 2020, result= 1
home_team= 530 Atletico Madrid - 2020, away_team= 529 Barcelona - 2020, result= 0
home_team= 545 Eibar - 2020, away_team= 546 Getafe - 2020, result= 1
home_team= 724 Cadiz - 2020, away_team= 548 Real Sociedad - 2020, result= 1
home_team= 715 Granada CF - 2020, away_team= 720 Valladolid - 2020, result= 1
home_team= 539 Levante - 2020, away_team= 797 Elche - 2020, result= 1
home_team= 727 Osasuna - 2020, away_team= 726 Huesca - 2020, result= 1
home_team= 536 Sevilla - 2020, away_team= 538 Celta Vigo - 2020, result= 0
home_team= 533 Villarreal - 2020, 

In [27]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['ES']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)

results, prob = predict(statistics_to_test, dct, teams, season, week, test_m='1', extras='0', if_anova=False, classifier='nb_ovo')
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
home_team= 530 Atletico Madrid - 2020, away_team= 724 Cadiz - 2020, result= 0
home_team= 529 Barcelona - 2020, away_team= 543 Real Betis - 2020, result= 1
home_team= 546 Getafe - 2020, away_team= 533 Villarreal - 2020, result= 0
home_team= 726 Huesca - 2020, away_team= 545 Eibar - 2020, result= 1
home_team= 539 Levante - 2020, away_team= 542 Alaves - 2020, result= 1
home_team= 548 Real Sociedad - 2020, away_team= 715 Granada CF - 2020, result= 1
home_team= 536 Sevilla - 2020, away_team= 727 Osasuna - 2020, result= 1
home_team= 532 Valencia - 2020, away_team= 541 Real Madrid - 2020, result= -1
home_team= 720 Valladolid - 2020, away_team= 531 Athletic Club - 2020, result= 1
home_team= 797 Elche - 2020, away_team= 538 Celta Vigo - 2020, result= 0


In [28]:
clf = elem['best_clf']
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=clf)
maps_results(teams, results, dct, season, prob, print_prob=False)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 530 Atletico Madrid - 2020, away_team= 724 Cadiz - 2020, result= 0
home_team= 529 Barcelona - 2020, away_team= 543 Real Betis - 2020, result= 1
home_team= 546 Getafe - 2020, away_team= 533 Villarreal - 2020, result= 0
home_team= 726 Huesca - 2020, away_team= 545 Eibar - 2020, result= 0
home_team= 539 Levante - 2020, away_team= 542 Alaves - 2020, result= -1
home_team= 548 Real Sociedad - 2020, away_team= 715 Granada CF - 2020, result= 1
home_team= 536 Sevilla - 2020, away_team= 727 Osasuna - 2020, result= 1
home_team= 532 Valencia - 2020, away_team= 541 Real Madrid - 2020, result= 1
home_team= 720 Valladolid - 2020, away_team= 531 Athletic Club - 2020, result= 1
home_team= 797 Elche - 2020

In [126]:
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["nb_ovo", "rforest_p", "svc_ovo_p"])
maps_results(teams, results, dct, season, prob, print_prob=True)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  corr /= X_norms


home_team= 66 ston Villa - 2020, away_team= 41 outhampton - 2020, result= 1
[array([ 1.93576476, -0.09628976,  1.13068885]), array([-0.21823819,  0.97765684,  2.22102426]), array([-0.31876755,  1.26744642,  2.31562687])]
home_team= 44 urnley - 2020, away_team= 49 helsea - 2020, result= -1
[array([1.98356622, 1.00427665, 0.0124692 ]), array([1.96629213, 0.98919529, 0.04246655]), array([ 2.31830095,  1.12613246, -0.3187026 ])]
home_team= 36 ulham - 2020, away_team= 60 est Brom - 2020, result= 0
[array([-0.22221614,  1.06601262,  2.21224865]), array([0.82721713, 2.07866981, 0.14471891]), array([ 2.30382684,  1.29258714, -0.31529354])]
home_team= 63 eeds - 2020, away_team= 46 eicester - 2020, result= 1
[array([ 0.84088104, -0.12248364,  2.19969297]), array([0.82652092, 1.00919006, 1.17127566]), array([-0.27447682,  1.21891344,  2.24445026])]
home_team= 40 iverpool - 2020, away_team= 48 est Ham - 2020, result= 1
[array([-0.21922826,  0.99976836,  2.21925541]), array([-0.21921995,  0.9913645

In [18]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['PT']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["nb_ovo", "rforest_p", "svc_ovo_p"])
maps_results(teams, results, dct, season, prob, print_prob=False)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 762 GIL Vicente - 2020, away_team= 224 Guimaraes - 2020, result= 1
home_team= 234 Pacos Ferreira - 2020, away_team= 212 FC Porto - 2020, result= -1
home_team= 221 Belenenses - 2020, away_team= 231 Farense - 2020, result= 0
home_team= 222 Boavista - 2020, away_team= 211 Benfica - 2020, result= -1
home_team= 217 SC Braga - 2020, away_team= 242 Famalicao - 2020, result= 0
home_team= 214 Maritimo - 2020, away_team= 225 Nacional - 2020, result= 1
home_team= 226 Rio Ave - 2020, away_team= 215 Moreirense - 2020, result= 1
home_team= 216 Portimonense - 2020, away_team= 227 Santa Clara - 2020, result= 1
home_team= 228 Sporting CP - 2020, away_team= 218 Tondela - 2020, result= 1


In [20]:
"""
    Everything to retrieve the data
"""
global country
elem = leagues_dct['NL']
country = elem['country']
league = elem['league']
curr_week = elem['curr_week']
week = elem['predict_week']
season = 2020
teams = elem['matches']

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)
results, prob = vote_predict(statistics_to_test, dct, teams, season, week, classifiers=["nb_ovo", "rforest_p", "rforest"])
maps_results(teams, results, dct, season, prob, print_prob=True)

Train Method
Plain train set: all games from past seasons


Plain train set with clusters
Current and past season median


Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set


  corr /= X_norms
  corr /= X_norms


home_team= 194 Ajax - 2020, away_team= 205 Fortuna Sittard - 2020, result= 1
[array([-0.21828508,  0.9695958 ,  2.22213706]), array([-0.21934074,  0.9828722 ,  2.22141392]), array([-0.21312658,  0.9452182 ,  2.22108729])]
home_team= 202 Groningen - 2020, away_team= 204 VVV Venlo - 2020, result= 0
[array([-0.20469305,  2.14024651,  1.15458947]), array([1.84847658, 0.99375874, 0.15339609]), array([-0.20202219,  2.09696862,  1.17671013])]
home_team= 415 Twente - 2020, away_team= 193 PEC Zwolle - 2020, result= 0
[array([1.93147019, 1.06045849, 0.0119657 ]), array([-0.19905816,  1.03667603,  2.19202054]), array([-0.20594016,  2.08877476,  1.18541997])]
home_team= 201 AZ Alkmaar - 2020, away_team= 417 Waalwijk - 2020, result= 1
[array([-0.21285348,  0.94582171,  2.22075091]), array([-0.19415061,  0.98759635,  2.19636105]), array([-0.20162317,  0.90497062,  2.21955151])]
home_team= 206 Heracles - 2020, away_team= 207 Utrecht - 2020, result= -1
[array([1.88978565, 1.0053732 , 0.10774037]), arr

In [130]:
"""
    Everything to retrieve the data
"""
global country
country = 'ES'
league = '140'
curr_week = 6
season = 2020

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, season, year_window=5)

# Dataframe for the match (test from 2017 to 2019)
statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=4)

In [51]:
teams = [[724, 533]]
week = 7
results, prob = predict(statistics_to_test, dct, teams, season, week, test_m='3', extras='1', if_anova='both', classifier='svc_ovo')
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 724 Cadiz - 2020, away_team= 533 Villarreal - 2020, result= 0
[-0.23942296  2.23757704  1.0213474 ]


In [71]:
results, prob = vote_predict(statistics_to_test, dct, teams, season, week)
maps_results(teams, results, dct, season, prob)

Train Method
Plain train set: all games from past seasons

Plain train set with clusters
Current and past season median

Plain test set with clusters
Train Method
Plain train set: all games from past seasons
Applies pezzali to train set
Current season only median
Applies pezzali to test set
home_team= 724 Cadiz - 2020, away_team= 533 Villarreal - 2020, result= 1
[[0.8144931939275888, -0.08645158627564854, 2.2053776849270585], [0.8100840419610681, -0.07913367435703167, 2.2068562618195835], [-0.21294982587385528, 1.0964507086738522, 2.1921957645287686]]
