In [1]:
"""
    Ranking:
        1.1 Hierarchical Clustering (ranking_prediction.ipynb)
        1.2 Map labels to future matches as home and away of past season
            1.2.1 If team is new, label it as normal (which is a not promoted team)
        1.3 Compute difference in ranking (NO because of no sequential)
        1.4 Future work: study the week development of the league to see if there is a point where
            it can be started to predict the actual behavior of the team as the promotion label
    Prediction:
        2.0 Drop features --> computation differences (visualization.ipynb)
        2.1 Features TEST
            2.1.0 First match as median of past season (already started) TODO
            2.1.2 Median of current season (home and away games are definetely a factor)
            2.1.2.1 Median at home/away <-- Just this
            2.1.2.2 Past average of last n games (test with 3, 5, etc)
            2.1.2.3 Average of last n games at home/away
"""

'\n    Ranking:\n        1.1 Hierarchical Clustering (ranking_prediction.ipynb)\n        1.2 Map labels to future matches as home and away of past season\n            1.2.1 If team is new, label it as normal (which is a not promoted team)\n        1.3 Compute difference in ranking (NO because of no sequential)\n        1.4 Future work: study the week development of the league to see if there is a point where\n            it can be started to predict the actual behavior of the team as the promotion label\n    Prediction:\n        2.0 Drop features --> computation differences (visualization.ipynb)\n        2.1 Features TEST\n            2.1.0 First match as median of past season (already started) TODO\n            2.1.2 Median of current season (home and away games are definetely a factor)\n            2.1.2.1 Median at home/away <-- Just this\n            2.1.2.2 Past average of last n games (test with 3, 5, etc)\n            2.1.2.3 Average of last n games at home/away\n'

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import playstyle
import ranking

In [3]:
"""
    Everything to create the clusters
"""
from sklearn import preprocessing
from scipy.cluster.hierarchy import linkage, cophenet, fcluster
from scipy.spatial.distance import pdist

# Returns Z, coph_matrix and best cophence score from HC
def HierarchicalClustering(data, label):
    methods = ["single","complete","average","centroid","ward"]

    # Pass the dataset into pdist to get your proximity matrix for calculating CPCC
    proximity_matrix = pdist(data)

    best_coph = -1
    best_method = None

    for method in methods:
        Z = linkage(data, method)
        coph, coph_matrix = cophenet(Z, proximity_matrix)
        if coph > best_coph:
            best_coph = coph
            best_method = method
            best_matrix = coph_matrix
        Z = linkage(data, best_method)
        coph_matrix = cophenet(Z)
    return Z, coph_matrix, best_coph

#Returns a dictionary with the clusters in the form
#"id": "season" : point -> Attr object
def dct_clusters(Z, coph_matrix, dendo_label, criterion='distance'):
    clusters = fcluster(Z, t=coph_matrix[0], criterion=criterion)
    dct_clusters = dict()
    for i in range(0, len(dendo_label)):
        point = ranking.Attr(dendo_label[i], clusters[i])
        if point.value2 not in dct_clusters:
            dct_clusters[point.value2] = {}
            dct_clusters[point.value2][point.value3] = point
        else:
            dct_clusters[point.value2][point.value3] = point
    return dct_clusters, clusters

# Return the cluster of an specific type
def get_cluster_of_type(country, league, seasons, target_col, clean_type):
    # Data initialization
    data, column_names = ranking.concat_data(country, league, seasons, target_col, clean_type)
    all_data, all_season, all_names, all_target = ranking.get_all_data(data, len(target_col))
    # Use Agglomerative
    all_data = preprocessing.StandardScaler().fit_transform(all_data)
    dendo_label = ranking.label_team_season(all_names, all_season)
    Z, coph_matrix, coph = HierarchicalClustering(all_data, dendo_label)
    all_dct, all_lst = dct_clusters(Z, coph_matrix, dendo_label)
    return all_dct

#Returns three clusters as: overall, home and away performance
def get_clusters(country, league, curr_week, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    # Targets
    target_col = ["rank", "points", "description"]
    # Gets the dictionary
    overall_dct = get_cluster_of_type(country, league, seasons, target_col, clean_type=None)
    home_dct = get_cluster_of_type(country, league, seasons, target_col, clean_type='home')
    away_dct = get_cluster_of_type(country, league, seasons, target_col, clean_type='away')
    return (overall_dct, home_dct, away_dct)

In [4]:
"""
    Creates the statistics dataset with petzalli score
"""
def get_statistics(country, league, curr_week, season, year_window=1):
    # Historicity: multi season directory
    start_season = season
    seasons = list(range(start_season, start_season - year_window, -1))
    # Data initialization
    data = []
    for season in seasons:
        df, target = playstyle.df_season(country, league, season, curr_week, drop_goals=False)
        tup = (season, df, target)
        data.append(tup)
    all_data, all_target = playstyle.get_all(data)
    pezzali = playstyle.pezzali_data(all_data)
    return pezzali, all_data, all_target

In [29]:
"""
    Function that maps the dataset with the clusters
"""

def max_appearance_rank(dct, season):
    dct_rank = dict()
    season = str(season)
    for team in dct:
        rank = dct[team][season].value
        if rank not in dct_rank:
            dct_rank[rank] = 1
        else:
            dct_rank[rank] += 1
    return max(dct_rank.keys(), key=dct_rank.get)

def get_rank(df, dct, season, team):
    rank = []
    max_appear = max_appearance_rank(dct, season)
    for index, row in df.iterrows():
        if str(row[team]) in dct:
            rank.append(dct[str(row[team])][str(season)].value)
        else:
            rank.append(max_appear)
    return rank

def get_data(dct, statistics, season):
    statistics["home_team.overall_rank"] = get_rank(statistics, dct[0], season, "home_team.id")
    statistics["away_team.overall_rank"] = get_rank(statistics, dct[0], season, "away_team.id")
    statistics["home_team.ranking"] = get_rank(statistics, dct[1], season, "home_team.id")
    statistics["away_team.ranking"] = get_rank(statistics, dct[2], season, "away_team.id")
    return statistics

In [19]:
"""
    Everything to retrieve the data
"""
country = 'ES'
league = '140'
curr_week = 38
season = 2019

# Dictionary for the clusters
dct = get_clusters(country, league, curr_week, season)

# Dataframe for the match
statistics_training, statistics_to_test, target = get_statistics(country, league, curr_week, season)

In [32]:
# Maps previous years data with ranking clusters
statistics_training = get_data(dct, statistics_training, season)
statistics_training.head(5)

Unnamed: 0,home_team.id,away_team.id,diff_pezzali,diff_s_fraction,diff_defensive,stats_away.c_yellow,stats_home.c_yellow,season,week,home_team.name,away_team.name,home_team.overall_rank,away_team.overall_rank,home_team.ranking,away_team.ranking
0,533,727,4.651261,-0.178138,8.0,2.0,3.0,2019,22,Villarreal,Osasuna,3,4,4,2
1,533,542,1.455195,-0.097436,-7.0,2.0,2.0,2019,10,Villarreal,Alaves,3,3,4,2
2,533,532,-1.359477,0.233083,-4.0,2.0,0.0,2019,32,Villarreal,Valencia,3,3,4,2
3,533,715,0.174242,0.033333,-1.0,1.0,3.0,2019,1,Villarreal,Granada CF,3,3,4,2
4,533,530,-0.148352,-0.054118,-1.0,1.0,2.0,2019,16,Villarreal,Atletico Madrid,3,1,4,1


In [21]:
def get_median(df):
    # GET median for team_home.stats_home. team_home.stats_away. team_away.stats_home. team_away.stats_away.
    extra_columns = df.filter(["goals_home"]).columns
    columns_home = df.filter(regex='^stats_home').columns
    columns_home = columns_home.append(extra_columns)
    median_home = df.groupby(['home_team.id','season'], as_index=True)[columns_home].median()
    try:
        p = median_home.index.values
        b = list(zip(*p))
        median_home.insert(0, column="home_team_id",value = b[0])
    except:
        print("The indexes that are trying to be registered are already created.")
    extra_columns = df.filter(["goals_away"]).columns
    columns_away = df.filter(regex='^stats_away').columns
    columns_away = columns_away.append(extra_columns)
    median_away = df.groupby(['away_team.id','season'], as_index=True)[columns_away].median()
    try:
        p = median_away.index.values
        b = list(zip(*p))
        median_away.insert(0, column="away_team_id",value = b[0])
    except:
        print("The indexes that are trying to be registered are already created.")
    return median_home, median_away

In [22]:
def create_test_set(df, teams, season):
    median_home, median_away = get_median(df)
    columns_home = ["goals_home", "stats_home.s_off_g", "stats_home.s_on_g", "stats_home.s_in", 
                   "stats_home.saves", "stats_home.s_blocked", "stats_home.c_yellow", "stats_home.s_out"]
    columns_away = ["goals_away", "stats_away.s_off_g", "stats_away.s_on_g", "stats_away.s_in", 
                   "stats_away.saves", "stats_away.s_blocked", "stats_away.c_yellow", "stats_away.s_out"]
    columns = columns_home + columns_away
    
    test_set = pd.DataFrame(columns=columns)
    home_teams = [match[0] for match in teams]
    away_teams = [match[1] for match in teams]
    test_set["home_team.id"] = home_teams
    test_set["away_team.id"] = away_teams
    
    columns_home = test_set.filter(columns_home).columns
    columns_away = test_set.filter(columns_away).columns
    i_th = test_set.columns.get_loc("home_team.id")
    i_ta = test_set.columns.get_loc("away_team.id")

    for i in range(0, len(test_set)):
        for index, row in median_home.iterrows():
            if test_set.iloc[i,i_th] == index[0] and index[1] == season:
                for c in columns_home:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    for i in range(0, len(test_set)):
        for index, row in median_away.iterrows():
            if test_set.iloc[i,i_ta] == row["away_team_id"] and index[1] == season:
                for c in columns_away:
                    try:
                        i_c = test_set.columns.get_loc(c)
                        test_set.iloc[i,i_c] = row[c]
                    except:
                        print(index, c)
    # For non existing teams (this means ascending teams fil with median of column)
    test_set = test_set.apply(lambda x: x.fillna(x.median()),axis=0)
    return test_set

In [23]:
# Testing set to predict
# Creating fake records
teams = [[545, 538], [715, 531], [724, 727], [542, 543], [720, 548], [533, 726], [532, 539]]
season = 2019
statistics_test = create_test_set(statistics_to_test, teams, season)
# Calculating pezzali score
test_pezzali = playstyle.pezzali_data(statistics_test, is_train=False)
# 1. Predicting with clusters of last season
test_pezzali.head()

Unnamed: 0,home_team.id,away_team.id,diff_pezzali,diff_s_fraction,diff_defensive,stats_away.c_yellow,stats_home.c_yellow
0,545,538,-0.507937,-0.096591,-3.0,2.0,3.0
1,715,531,-0.647727,0.02381,1.0,2.0,2.0
2,724,727,-0.236111,0.018519,0.5,3.0,2.0
3,542,543,0.236111,0.046154,1.0,3.0,3.0
4,720,548,-0.267857,0.0,-1.0,2.0,2.0


In [30]:
statistics_predict = get_data(dct, test_pezzali, season)

In [31]:
statistics_predict

Unnamed: 0,home_team.id,away_team.id,diff_pezzali,diff_s_fraction,diff_defensive,stats_away.c_yellow,stats_home.c_yellow,home_team.overall_rank,away_team.overall_rank,home_team.ranking,away_team.ranking
0,545,538,-0.507937,-0.096591,-3.0,2.0,3.0,3,5,4,2
1,715,531,-0.647727,0.02381,1.0,2.0,2.0,3,4,4,2
2,724,727,-0.236111,0.018519,0.5,3.0,2.0,3,4,4,2
3,542,543,0.236111,0.046154,1.0,3.0,3.0,3,3,4,2
4,720,548,-0.267857,0.0,-1.0,2.0,2.0,3,3,3,2
5,533,726,0.174242,0.033333,0.5,2.0,2.0,3,3,4,2
6,532,539,0.583333,-0.010989,-1.0,2.0,1.0,3,3,2,2


In [46]:
def remove_before_models(df, is_train=True):
    df = df.drop(columns=["home_team.id", "away_team.id"], axis=1)
    if is_train:
        df = df.drop(columns=["home_team.name", "away_team.name", "season", "week"], axis=1)
    return df

In [47]:
from sklearn.svm import SVC

st = remove_before_models(statistics_training)
sp = remove_before_models(statistics_predict, is_train=False)

st = preprocessing.StandardScaler().fit_transform(st)
sp = preprocessing.StandardScaler().fit_transform(sp)
sv_p = SVC(kernel='linear', probability=True)
sv_p.fit(st, target)
print(sv_p.predict(sp))

[ 0 -1  0  1  0  1  1]


In [48]:
# using scaler on nb
from sklearn.naive_bayes import GaussianNB
gnb_p = GaussianNB()
gnb_p.fit(st, target)
print(gnb_p.predict(sp))

[-1 -1  0  1  1  1  1]


In [51]:
from sklearn import linear_model
ols = linear_model.LinearRegression()
ols.fit(st, target)
print(ols.predict(sp))

[-0.1656951  -0.96014108 -0.21357563  0.41268564  0.15266476  0.46318532
  1.65561294]


In [52]:
from sklearn.ensemble import RandomForestClassifier
# At least two matches en each classification per week
premise_clf = RandomForestClassifier(criterion="entropy", max_features="log2", min_samples_leaf=2).fit(st, target)
print(premise_clf.predict(sp))

[-1 -1 -1  1  0  1  1]
