In [1]:
"""
    Ranking:
        1.1 Hierarchical Clustering (ranking_prediction.ipynb)
        1.2 Map labels to future matches as home and away of past season
            1.2.1 If team is new, label it as normal (which is a not promoted team)
        1.3 Compute difference in ranking (NO because of no sequential)
        1.4 Future work: study the week development of the league to see if there is a point where
            it can be started to predict the actual behavior of the team as the promotion label
    Prediction:
        2.0 Drop features --> computation differences (visualization.ipynb)
        2.1 Features TEST
            2.1.0 First match as median of past season (already started) TODO
            2.1.2 Median of current season (home and away games are definetely a factor)
            2.1.2.1 Median at home/away <-- Just this
            2.1.2.2 Past average of last n games (test with 3, 5, etc)
            2.1.2.3 Average of last n games at home/away
"""

'\n    Ranking:\n        1.1 Hierarchical Clustering (ranking_prediction.ipynb)\n        1.2 Map labels to future matches as home and away of past season\n            1.2.1 If team is new, label it as normal (which is a not promoted team)\n        1.3 Compute difference in ranking (NO because of no sequential)\n        1.4 Future work: study the week development of the league to see if there is a point where\n            it can be started to predict the actual behavior of the team as the promotion label\n    Prediction:\n        2.0 Drop features --> computation differences (visualization.ipynb)\n        2.1 Features TEST\n            2.1.0 First match as median of past season (already started) TODO\n            2.1.2 Median of current season (home and away games are definetely a factor)\n            2.1.2.1 Median at home/away <-- Just this\n            2.1.2.2 Past average of last n games (test with 3, 5, etc)\n            2.1.2.3 Average of last n games at home/away\n'

In [209]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import playstyle
import ranking

In [210]:
"""
    Everything to create the clusters
"""
from sklearn import preprocessing
from scipy.cluster.hierarchy import linkage, cophenet, fcluster
from scipy.spatial.distance import pdist

# Returns Z, coph_matrix and best cophence score from HC
def HierarchicalClustering(data, label):
    methods = ["single","complete","average","centroid","ward"]

    # Pass the dataset into pdist to get your proximity matrix for calculating CPCC
    proximity_matrix = pdist(data)

    best_coph = -1
    best_method = None

    for method in methods:
        Z = linkage(data, method)
        coph, coph_matrix = cophenet(Z, proximity_matrix)
        if coph > best_coph:
            best_coph = coph
            best_method = method
            best_matrix = coph_matrix
        Z = linkage(data, best_method)
        coph_matrix = cophenet(Z)
    return Z, coph_matrix, best_coph

#Returns a dictionary with the clusters in the form
#"id": "season" : point -> Attr object
def dct_clusters(dct_clusters, Z, coph_matrix, dendo_label, criterion='distance'):
    clusters = fcluster(Z, t=coph_matrix[0], criterion=criterion)
    for i in range(0, len(dendo_label)):
        point = ranking.Attr(dendo_label[i], clusters[i])
        if point.value2 not in dct_clusters:
            dct_clusters[point.value2] = {}
            dct_clusters[point.value2][point.value3] = point
        else:
            dct_clusters[point.value2][point.value3] = point
    return dct_clusters, clusters

# Return the cluster of an specific type
def get_cluster_of_type(all_dct, country, league, seasons, target_col, clean_type):
    # Data initialization
    data, column_names = ranking.concat_data(country, league, seasons, target_col, clean_type)
    all_data, all_season, all_names, all_target = ranking.get_all_data(data, len(target_col))
    # Use Agglomerative
    all_data = preprocessing.StandardScaler().fit_transform(all_data)
    dendo_label = ranking.label_team_season(all_names, all_season)
    Z, coph_matrix, coph = HierarchicalClustering(all_data, dendo_label)
    all_dct, all_lst = dct_clusters(all_dct, Z, coph_matrix, dendo_label)
    return all_dct

#Returns three clusters as: overall, home and away performance
def get_clusters(country, league, curr_week, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    seasons = sorted(seasons, reverse=True)
    # Targets
    target_col = ["rank", "points", "description"]
    # Gets the dictionary
    overall_dct = dict()
    home_dct = dict()
    away_dct = dict()
    for season in seasons:
        season = [season]
        overall_dct = get_cluster_of_type(overall_dct, country, league, season, target_col, clean_type=None)
        home_dct = get_cluster_of_type(home_dct, country, league, season, target_col, clean_type='home')
        away_dct = get_cluster_of_type(away_dct, country, league, season, target_col, clean_type='away')
    return (overall_dct, home_dct, away_dct)

In [221]:
# TODO: Remove pezzali
"""
    Creates the statistics dataset with petzalli score
"""
def get_statistics(country, league, curr_week, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    seasons = sorted(seasons, reverse=True)
    # Data initialization
    data = []
    for season in seasons:
        df, target = playstyle.df_season(country, league, season, curr_week, drop_goals=False)
        tup = (season, df, target)
        data.append(tup)
    all_data, all_target = playstyle.get_all(data)
    pezzali = playstyle.pezzali_data(all_data)
    return pezzali, all_data, all_target

In [222]:
"""
    Function that maps the dataset with the clusters
"""
def max_appearance_rank(dct, season):
    dct_rank = dict()
    season = str(int(season))
    for team in dct:
        if season in dct[team]:
            rank = dct[team][season].value
            if rank not in dct_rank:
                dct_rank[rank] = 1
            else:
                dct_rank[rank] += 1
    return max(dct_rank.keys(), key=dct_rank.get)

def get_rank(df, dct, team):
    rank = []
    for index, row in df.iterrows():
        season = row["season"]
        season = season - 1
        if str(row[team]) in dct:
            if str(season) in dct[str(row[team])]:
                rank.append(dct[str(row[team])][str(season)].value)
            else:
                max_appear = max_appearance_rank(dct, season)
                rank.append(max_appear)
        else:
            max_appear = max_appearance_rank(dct, season)
            rank.append(max_appear)
    return rank

def get_data(dct, statistics):
    statistics["home_team.overall_rank"] = get_rank(statistics, dct[0], "home_team.id")
    statistics["away_team.overall_rank"] = get_rank(statistics, dct[0], "away_team.id")
    statistics["home_team.ranking"] = get_rank(statistics, dct[1], "home_team.id")
    statistics["away_team.ranking"] = get_rank(statistics, dct[2], "away_team.id")
    return statistics, statistics["home_team.overall_rank"].values.reshape(-1, 1), statistics["away_team.overall_rank"].values.reshape(-1, 1), statistics["home_team.ranking"].values.reshape(-1, 1), statistics["away_team.ranking"].values.reshape(-1, 1)

In [223]:
def get_median(df, method=None, season=None, week=None):
    # GET median for team_home.stats_home. team_home.stats_away. team_away.stats_home. team_away.stats_away.
    # Filter df according to method
    if method == '1':
        print("Current and past season median")
        df = df[(df["season"].isin([season, season-1]))]
    elif method == '2':
        print("Current and all past seasons median")
        df = df[(df["season"] <= season)]
    elif method == '3':
        print("Current season only median")
        # First and second week zero/one home/away games
        if week > 3:
            df = df[(df["season"] == season)]
        else:
            df = df[(df["season"].isin([season, season-1]))]
    elif method == '4':
        print("Last 5 games median")
        if week > 5:
            weeks = [w for w in range(week-1, week-6, -1)]
            df = df[(df["season"] == season) & (df["week"].isin(weeks))]
        else:
            df = df[(df["season"].isin([season, season-1]))]
    # Median home
    extra_columns = df.filter(["goals_home"]).columns
    columns_home = df.filter(regex='^stats_home').columns
    columns_home = columns_home.append(extra_columns)
    median_home = df.groupby(['home_team.id'], as_index=True)[columns_home].median()
    # Median away
    extra_columns = df.filter(["goals_away"]).columns
    columns_away = df.filter(regex='^stats_away').columns
    columns_away = columns_away.append(extra_columns)
    median_away = df.groupby(['away_team.id'], as_index=True)[columns_away].median()
    return median_home, median_away

In [224]:
def create_test_set(df, teams, season, week, pezzali=True, method=None):
    columns_home = ["goals_home", "stats_home.s_off_g", "stats_home.s_on_g", "stats_home.s_in", 
                   "stats_home.saves", "stats_home.s_blocked", "stats_home.c_yellow", "stats_home.s_out"]
    columns_away = ["goals_away", "stats_away.s_off_g", "stats_away.s_on_g", "stats_away.s_in", 
                   "stats_away.saves", "stats_away.s_blocked", "stats_away.c_yellow", "stats_away.s_out"]
    columns = ["season", "week"]
    columns = columns_home + columns_away + columns
    
    if pezzali == False:
        columns_home = df.filter(regex='^stats_home').columns
        columns_away = df.filter(regex='^stats_away').columns
        columns = columns_home.append(columns_away)
    
    test_set = pd.DataFrame(columns=columns)
    home_teams = [match[0] for match in teams]
    away_teams = [match[1] for match in teams]
    seasons = [season for i in range(0, len(teams))]
    weeks = [week for i in range(0, len(teams))]
    test_set["home_team.id"] = home_teams
    test_set["away_team.id"] = away_teams
    test_set["season"] = seasons
    test_set["week"] = weeks
    
    columns_home = test_set.filter(columns_home).columns
    columns_away = test_set.filter(columns_away).columns
    i_th = test_set.columns.get_loc("home_team.id")
    i_ta = test_set.columns.get_loc("away_team.id")
    
    median_home, median_away = get_median(df, method, season, week)

    for i in range(0, len(test_set)):
        for index, row in median_home.iterrows():
            if test_set.iloc[i,i_th] == index:
                for c in columns_home:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    for i in range(0, len(test_set)):
        for index, row in median_away.iterrows():
            if test_set.iloc[i,i_ta] == index:
                for c in columns_away:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    # For non existing teams (this means ascending teams fil with median of column)
    test_set = test_set.apply(lambda x: x.fillna(x.median()),axis=0)
    return test_set

In [225]:
"""
    Everything to retrieve the data
"""
country = 'ES'
league = '140'
curr_week = 38
season = 2019

# Dictionary for the clusters (clusters from 2016 - 2019)
dct = get_clusters(country, league, curr_week, season, year_window=4)

# Dataframe for the match (test from 2017 to 2019)
statistics_training, statistics_to_test, target = get_statistics(country, league, curr_week, season, year_window=3)
# Maps previous years data with ranking clusters
statistics_training = get_data(dct, statistics_training)

In [226]:
"""
    A proper evaluation: we cannot test week 10 2018 with week 20 2018
    The sequence of events must be maintained
"""
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

def models_evaluation(train_set, target, test_set, test_target, accuracy_dct, total_tests):
    total_tests = total_tests + len(test_target)
    st = preprocessing.StandardScaler().fit_transform(train_set)
    sp = preprocessing.StandardScaler().fit_transform(test_set)
    classifiers = dict(nb=GaussianNB(), svc=SVC(kernel='linear', probability=True), lreg=linear_model.LogisticRegression(max_iter=1000), rforest=RandomForestClassifier(criterion="entropy", max_features="log2", min_samples_leaf=2))
    for name, clf in classifiers.items():
        clf.fit(st, target)
        Y = clf.predict(sp)
        accuracy_dct[name] += accuracy_score(test_target, Y, normalize=False)
    return accuracy_dct, total_tests
        
def subtraining_testset(df, season, week):
    df_sub = df[(df["season"] == season) & (df["week"] < week)]
    
    if week < 3:
        df_past = df[(df["season"] == season-1)]
        df_sub = df_sub.append(df_past)
    
    return df_sub

def subtraining_trainset(df, season, week, up_to_season):
    df_sub = df[(df["season"] == season) & (df["week"] < week)]
    target = []
    
    past_seasons = [s for s in range(up_to_season, season)]
    
    df_past = df[(df["season"].isin(past_seasons))]
    
    df_sub = df_sub.append(df_past)
    
    for index, row in df_sub.iterrows():
        target.append(playstyle.get_status(row))
    
    return df_sub, target

def test_list(df, season, week):
    df_test = df[(df["season"] == season) & (df["week"] == week)]
    teams = []
    target = []
    for index, row in df_test.iterrows():
        teams.append([row["home_team.id"], row["away_team.id"]])
        target.append(playstyle.get_status(row))
    return teams, target


In [227]:
"""
    This function returns a training set with the given method
"""
def train_method(dct, train_set, s, w, seasons, method, extras=None):
    print("Train Method")
    statistics_subset = None
    target = None
    if method == '1':
        print("Current and past season median")
    elif method == '2':
        print("Current and all past seasons median")
    elif method == '3':
        print("Current season only median")
    elif method == '4':
        print("Last 5 games median")
    elif method == '5':
        print("Current and past season linear regression")
    elif method == '6':
        print("Current and all past seasons linear regression")
    elif method == '7':
        print("Current season only linear regression")
    elif method == '8':
        print("Plain train set: all games from past seasons")
        statistics_subset, target = subtraining_trainset(train_set, s, w, seasons)
    if extras == '1':
        print("Applies pezzali to train set")
        statistics_subset = playstyle.pezzali_data(statistics_subset, is_train=True)
        statistics_subset,_,_,_,_ = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['home_team.name', 'away_team.name'])
    elif extras == '2':
        print("Applies pca to train set")
        statistics_subset,ho,ao,hr,ar = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['home_team.name', 'away_team.name'])
        pca = PCA(n_components=5)
        statistics_subset = pca.fit_transform(statistics_subset)
        np.append(statistics_subset, ho, axis=1)
        np.append(statistics_subset, ao, axis=1)
        np.append(statistics_subset, hr, axis=1)
        np.append(statistics_subset, ar, axis=1)
    elif extras == '3':
        print("Adds pezzali and then pca to train set")
        statistics_subset = playstyle.pezzali_data(statistics_subset, is_train=True, both=True)
        statistics_subset,ho,ao,hr,ar = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['home_team.name', 'away_team.name'])
        pca = PCA(n_components=5)
        statistics_subset = pca.fit_transform(statistics_subset)
        np.append(statistics_subset, ho, axis=1)
        np.append(statistics_subset, ao, axis=1)
        np.append(statistics_subset, hr, axis=1)
        np.append(statistics_subset, ar, axis=1)
    elif extras == '0':
        print("Plain train set with clusters")
        statistics_subset = statistics_subset.drop(columns=['index','goals_away','goals_home'])
        statistics_subset,_,_,_,_ = get_data(dct, statistics_subset)
        statistics_subset = statistics_subset.drop(columns=['home_team.id', 'away_team.id','home_team.name', 'away_team.name'])
    return statistics_subset, target

In [228]:
"""
    This function returns a test set with the given method
"""
def test_method(dct, test_set, s, w, method, extras=None):
    print("Test Method")
    teams, test_target = test_list(test_set, s, w)
    if extras == '2' or extras == '0':
        p = False
    else:
        p = True
    if 1 <= int(method) <= 4:
        statistics_test = create_test_set(test_set, teams, s, w, p, method)
    elif method == '5':
        print("Current and past season linear regression")
    elif method == '6':
        print("Current and all past seasons linear regression")
    elif method == '7':
        print("Current season only linear regression")
    if extras == '1':
        print("Applies pezzali to test set")
        # Get test pezalli and mappings
        statistics_test = playstyle.pezzali_data(statistics_test, is_train=False)
        statistics_test,_,_,_,_ = get_data(dct, statistics_test)
    elif extras == '2':
        print("Applies pca to train set")
        statistics_test,ho,ao,hr,ar = get_data(dct, statistics_test)
        pca = PCA(n_components=5)
        statistics_test = pca.fit_transform(statistics_test)
        np.append(statistics_test, ho, axis=1)
        np.append(statistics_test, ao, axis=1)
        np.append(statistics_test, hr, axis=1)
        np.append(statistics_test, ar, axis=1)
    elif extras == '3':
        print("Adds pezzali and then pca to test set")
        statistics_test = playstyle.pezzali_data(statistics_test, is_train=False, both=True)
        statistics_test,ho,ao,hr,ar = get_data(dct, statistics_test)
        n = 5
        pca = PCA(n_components=n)
        statistics_test = pca.fit_transform(statistics_test)
        np.append(statistics_test, ho, axis=1)
        np.append(statistics_test, ao, axis=1)
        np.append(statistics_test, hr, axis=1)
        np.append(statistics_test, ar, axis=1)
    elif extras == '0':
        print("Plain test set with clusters")
        statistics_test,_,_,_,_ = get_data(dct, statistics_test)
        statistics_test = statistics_test.drop(columns=['home_team.id', 'away_team.id'])
    return statistics_test, test_target

In [229]:
# This test is when testing against current season and previous seasons
def custom_test(clusters_dct, train_m, test_m, df, season, year_window=1, week=39, extras=None):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    seasons.reverse()
    accuracy_dct = {
        "nb":0,
        "lreg":0,
        "svc":0,
        "rforest":0
    }
    total_tests = 0
    for i in range(1, len(seasons)):
        # Test against 38 weeks
        s = seasons[i]
        if s != season:
            up_to_week = 39
        else:
            up_to_week = week
        for w in range(1, up_to_week):
            statistics_training, target = train_method(clusters_dct, df, s, w, seasons[0], train_m, extras)
            statistics_predict, test_target = test_method(clusters_dct, df, s, w, test_m, extras)
            accuracy_dct, total_tests = models_evaluation(statistics_training, target, statistics_predict, test_target, accuracy_dct, total_tests)
    
    for clf in accuracy_dct:
        accuracy_dct[clf] = accuracy_dct[clf]/total_tests
    
    return accuracy_dct

In [241]:
# Dataframe for the match (test from 2017 to 2019)
up_to_season = 2019
year_window = 3 #2019, 2018 +1 because 2017 needs information about 2016
accuracy = custom_test(dct, '8', '2', statistics_to_test, up_to_season, year_window=year_window, extras='3')
print(accuracy)

Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca t

Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca to train set
Test Method
Current and all past seasons median
Adds pezzali and then pca to test set
Train Method
Plain train set: all games from past seasons
Adds pezzali and then pca t

In [24]:
# Dataframe for the match (test from 2017 to 2019)
up_to_season = 2019
year_window = 3 #2019, 2018 +1 because 2017 needs information about 2016
#accuracy = custom_test(dct, statistics_to_test, up_to_season, year_window, pezzali=True)
#print(accuracy)
accuracy = custom_test(dct, '8', '3', statistics_to_test, up_to_season, year_window, extras='2')
print(accuracy)

{'nb': 0.4026315789473684, 'lreg': 0.41710526315789476, 'svc': 0.4355263157894737, 'rforest': 0.3815789473684211}


In [None]:
"""
    TODOs
    Test set: checar mean, checar mean/median en un intervalo de 5 partidos
    Train set: checar todas las diferencias, 
"""

In [10]:
# Testing set to predict
# Creating fake records
teams = [[545, 538], [715, 531], [724, 727], [542, 543], [720, 548], [533, 726], [532, 539]]
season = 2020
week = 1
statistics_test = create_test_set(statistics_to_test, teams, season, week)
# Calculating pezzali score
test_pezzali = playstyle.pezzali_data(statistics_test, is_train=False)
# 1. Predicting with clusters of last season
test_pezzali.head()

Unnamed: 0,home_team.id,away_team.id,diff_pezzali,diff_s_fraction,diff_defensive,stats_away.c_yellow,stats_home.c_yellow,season,week
0,545,538,-0.507937,-0.096591,-3.0,2.0,3.0,2020,1
1,715,531,-0.647727,0.02381,1.0,2.0,2.0,2020,1
2,724,727,-0.236111,0.018519,0.5,3.0,2.0,2020,1
3,542,543,0.236111,0.046154,1.0,3.0,3.0,2020,1
4,720,548,-0.267857,0.0,-1.0,2.0,2.0,2020,1


In [11]:
statistics_predict = get_data(dct, test_pezzali)

In [12]:
statistics_predict

Unnamed: 0,home_team.id,away_team.id,diff_pezzali,diff_s_fraction,diff_defensive,stats_away.c_yellow,stats_home.c_yellow,season,week,home_team.overall_rank,away_team.overall_rank,home_team.ranking,away_team.ranking
0,545,538,-0.507937,-0.096591,-3.0,2.0,3.0,2020,1,3,3,4,2
1,715,531,-0.647727,0.02381,1.0,2.0,2.0,2020,1,3,3,4,2
2,724,727,-0.236111,0.018519,0.5,3.0,2.0,2020,1,3,3,4,2
3,542,543,0.236111,0.046154,1.0,3.0,3.0,2020,1,3,3,4,2
4,720,548,-0.267857,0.0,-1.0,2.0,2.0,2020,1,3,3,4,2
5,533,726,0.174242,0.033333,0.5,2.0,2.0,2020,1,3,3,4,2
6,532,539,0.583333,-0.010989,-1.0,2.0,1.0,2020,1,3,3,4,2


In [14]:
st = remove_before_models(statistics_training)
sp = remove_before_models(statistics_predict, is_train=False)

In [15]:
from scipy import stats
# Downsampling TODO
for c in st.columns:
    shapiro_test = stats.shapiro(st[c])
    print("column = %s \t statistic = %f pvalue = %f" %(c,shapiro_test.statistic, shapiro_test.pvalue))

column = diff_pezzali 	 statistic = 0.972824 pvalue = 0.000000
column = diff_defensive 	 statistic = 0.991345 pvalue = 0.000003
column = stats_away.c_yellow 	 statistic = 0.941131 pvalue = 0.000000
column = stats_home.c_yellow 	 statistic = 0.937945 pvalue = 0.000000
column = home_team.overall_rank 	 statistic = 0.626416 pvalue = 0.000000
column = away_team.overall_rank 	 statistic = 0.626416 pvalue = 0.000000
column = home_team.ranking 	 statistic = 0.756818 pvalue = 0.000000
column = away_team.ranking 	 statistic = 0.878469 pvalue = 0.000000


In [16]:
for c in sp.columns:
    shapiro_test = stats.shapiro(sp[c])
    print("column = %s \t statistic = %f pvalue = %f" %(c,shapiro_test.statistic, shapiro_test.pvalue))

column = diff_pezzali 	 statistic = 0.952753 pvalue = 0.754677
column = diff_defensive 	 statistic = 0.847514 pvalue = 0.116616
column = stats_away.c_yellow 	 statistic = 0.600399 pvalue = 0.000275
column = stats_home.c_yellow 	 statistic = 0.840044 pvalue = 0.099451
column = home_team.overall_rank 	 statistic = 1.000000 pvalue = 1.000000
column = away_team.overall_rank 	 statistic = 1.000000 pvalue = 1.000000
column = home_team.ranking 	 statistic = 1.000000 pvalue = 1.000000
column = away_team.ranking 	 statistic = 1.000000 pvalue = 1.000000




In [17]:
st = preprocessing.StandardScaler().fit_transform(st)
sp = preprocessing.StandardScaler().fit_transform(sp)
sv_p = SVC(kernel='linear', probability=True)
sv_p.fit(st, target)
print(sv_p.predict(sp))

[ 0 -1 -1  1  0  1  1]


In [18]:
# using scaler on nb
from sklearn.naive_bayes import GaussianNB
sv_p = SVC(kernel='linear', probability=True)
sv_p.fit(st, target)
print(sv_p.predict(sp))

[-1 -1  0  1  0  1  1]


In [21]:
from sklearn import linear_model
ols = linear_model.LogisticRegression()
ols.fit(st, target)
print(ols.predict(sp))

[ 0 -1 -1  1  0  1  1]


In [20]:
from sklearn.ensemble import RandomForestClassifier
# At least two matches en each classification per week
premise_clf = RandomForestClassifier(criterion="entropy", max_features="log2", min_samples_leaf=2).fit(st, target)
print(premise_clf.predict(sp))

[ 0 -1  0  1  0  1  1]


In [9]:
def remove_before_models(df, is_train=True, pezzali=True):
    #df = df.drop(columns=["home_team.id", "away_team.id", "season", "week", "diff_s_fraction"], axis=1)
    if pezzali == False:
        df = df.drop(columns=['home_team.id', 'away_team.id', 'season', 'week'])
        if is_train:
            df = df.drop(columns=['home_team.name', 'away_team.name', 'index', 'goals_home', 'goals_away'])
        return df
    df = df.drop(columns=["home_team.id", "away_team.id", "week", "season", "home_team.overall_rank", "away_team.overall_rank", "home_team.ranking", "away_team.ranking"], axis=1)
    if is_train:
        df = df.drop(columns=["home_team.name", "away_team.name"], axis=1)
    return df

In [None]:
def get_median(df, method=None, season=None, week=None):
    # GET median for team_home.stats_home. team_home.stats_away. team_away.stats_home. team_away.stats_away.
    # Filter df according to method
    if method == '1':
        print("Current and past season median")
        df = df[(df["season"].isin([season, season-1]))]
    elif method == '2':
        print("Current and all past seasons median")
        df = df[(df["season"] <= season)]
    elif method == '3':
        print("Current season only median")
        if week > 3:
            df = df[(df["season"] == season)]
        else:
            df = df[(df["season"].isin([season, season-1]))]
    elif method == '4':
        print("Last 5 games median")
        if week > 5:
            weeks = [w for w in range(week-1, week-6, -1)]
            df = df[(df["season"] == season) & (df["week"].isin(weeks))]
        else:
            df = df[(df["season"].isin([season, season-1]))]
    extra_columns = df.filter(["goals_home"]).columns
    columns_home = df.filter(regex='^stats_home').columns
    columns_home = columns_home.append(extra_columns)
    median_home = df.groupby(['home_team.id'], as_index=True)[columns_home].median()
    #median_home = df.groupby(['home_team.id','season'], as_index=True)[columns_home].median()
    try:
        p = median_home.index.values
        b = list(zip(*p))
        median_home.insert(0, column="home_team_id",value = b[0])
    except:
        print("The indexes that are trying to be registered are already created.")
    extra_columns = df.filter(["goals_away"]).columns
    columns_away = df.filter(regex='^stats_away').columns
    columns_away = columns_away.append(extra_columns)
    median_away = df.groupby(['away_team.id'], as_index=True)[columns_away].median()
    #median_away = df.groupby(['away_team.id','season'], as_index=True)[columns_away].median()
    try:
        p = median_away.index.values
        b = list(zip(*p))
        median_away.insert(0, column="away_team_id",value = b[0])
    except:
        print("The indexes that are trying to be registered are already created.")
    return median_home, median_away

In [None]:
# This test is when testing against current season and previous seasons
def custom_test(clusters_dct, train_method, test_method, df, season, year_window=1, week=39, extras=None):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    seasons.reverse()
    accuracy_dct = {
        "nb":0,
        "lreg":0,
        "svc":0,
        "rforest":0
    }
    total_tests = 0
    for i in range(1, len(seasons)):
        # Test against 38 weeks
        s = seasons[i]
        if s != season:
            up_to_week = 39
        else:
            up_to_week = week
        for w in range(1, up_to_week):
            # This is to create the train set
            statistics_subset, target = subtraining_trainset(df, s, w, seasons[0])
            teams, test_target = test_list(df, s, w)
            # Just current season for creating the test set
            statistics_testset = subtraining_testset(df, s, w)
            if pezzali:
                statistics_test = create_test_set(statistics_testset, teams, s, w)
                # Creates pezzali
                statistics_pezzali = playstyle.pezzali_data(statistics_subset, is_train=True)
                # Maps previous years data with ranking clusters
                statistics_training = get_data(dct, statistics_pezzali)
                # Get test pezalli and mappings
                test_pezzali = playstyle.pezzali_data(statistics_test, is_train=False)
                statistics_predict = get_data(dct, test_pezzali)
            else:
                # Plain statistics
                statistics_training = statistics_subset
                statistics_training = get_data(dct, statistics_training)
                # Get test statistics and mappings
                statistics_predict = create_test_set(statistics_testset, teams, s, w, pezzali=pezzali)
                statistics_predict = get_data(dct, statistics_predict)
            accuracy_dct, total_tests = models_evaluation(statistics_training, target, statistics_predict, test_target, accuracy_dct, total_tests, pezzali)
    
    for clf in accuracy_dct:
        accuracy_dct[clf] = accuracy_dct[clf]/total_tests
    
    return accuracy_dct