In [1]:
#importando bibliotecas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
# from imblearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
import matplotlib.cm as cm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Raise pandas' display limits so that long/wide frames are rendered in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [3]:
# Traditional

# Read the CSV files: one frame per tournament, unpacked in the order the
# paths are listed below.
ger, ita, eng, fra, eur, cup = (
    pd.read_csv(csv_path)
    for csv_path in (
        "analitics/traditional_Germany.csv",
        "analitics/traditional_Italy.csv",
        "analitics/traditional_England.csv",
        "analitics/traditional_France.csv",
        "analitics/traditional_European_Championship.csv",
        "analitics/traditional_World_Cup.csv",
    )
)

In [11]:
# Preview the first rows of the England frame (no merge is performed in this cell).
eng.head()

Unnamed: 0,matchID,team1_ID,team2_ID,tournament,gk_saves_T1,gk_saves_T2,red_card_T1,red_card_T2,yellow_card_T1,yellow_card_T2,assists_T1,assists_T2,shots_T1,shots_T2,shots_on_target_T1,shots_on_target_T2,passes_T1,passes_T2,goals_T1,goals_T2,ball_possession_T1,ball_possession_T2,pass_acc_T1,pass_acc_T2,gk_acc_T1,gk_acc_T2,shot_acc_T1,shot_acc_T2,winner
0,2500089,1646,1659,England,3.0,3.0,0.0,0.0,0.0,0.0,1.0,2.0,9,14.0,4.0,5.0,321,305,1.0,2.0,0.51278,0.48722,0.769784,0.796345,0.6,0.75,0.444444,0.357143,0
1,2500090,1628,1627,England,1.0,4.0,0.0,0.0,2.0,3.0,1.0,0.0,11,6.0,5.0,1.0,476,305,2.0,0.0,0.609475,0.390525,0.883117,0.826558,1.0,0.666667,0.454545,0.166667,1
2,2500091,1673,1609,England,4.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,18,9.0,3.0,4.0,317,419,0.0,1.0,0.430707,0.569293,0.802532,0.836327,0.8,1.0,0.166667,0.444444,0
3,2500092,1612,1651,England,1.0,7.0,0.0,0.0,0.0,0.0,3.0,0.0,21,2.0,11.0,1.0,643,194,4.0,0.0,0.76822,0.23178,0.899301,0.76378,1.0,0.636364,0.52381,0.5,1
4,2500093,1611,1644,England,3.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,7,6.0,1.0,3.0,364,365,1.0,0.0,0.499314,0.500686,0.817978,0.860849,1.0,0.0,0.142857,0.5,1


In [None]:
def rolling_averages(group, cols, new_cols, window=3, sort_col="matchID"):
    """Append trailing rolling means of ``cols`` to ``group``.

    Rows are sorted by ``sort_col``; for every row the mean of the ``window``
    preceding rows is computed. ``closed='left'`` excludes the current row, so
    a row's own values never enter its average. Rows that do not have a full
    window of history end up with NaN and are dropped.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process. ``sort_values`` returns a new frame, so the caller's
        object is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names of the resulting columns, matched positionally with ``cols``.
    window : int, default 3
        Number of preceding rows in each average.
    sort_col : str, default "matchID"
        Column that defines the row order used for the rolling window.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``group`` with ``new_cols`` added and the rows lacking
        a complete window removed.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError(
            f"cols and new_cols must have the same length "
            f"(got {len(cols)} and {len(new_cols)})"
        )

    ordered = group.sort_values(sort_col)
    rolling_stats = ordered[cols].rolling(window, closed='left').mean()
    # Assigning a frame to a list of keys pairs columns positionally.
    ordered[new_cols] = rolling_stats

    return ordered.dropna(subset=new_cols)


# Team-1 match statistics that get a rolling-average counterpart.
cols = [
    "gk_saves_T1",
    "shots_T1",
    "shots_on_target_T1",
    "passes_T1",
    "goals_T1",
    "ball_possession_T1",
    "pass_acc_T1",
]
new_cols = [stat + "_rolling" for stat in cols]

# Compute the rolling averages separately for each team1_ID group.
# NOTE(review): `eng` is reassigned from itself, so re-running this cell
# operates on the already-processed frame rather than on the raw CSV data.
eng = eng.groupby("team1_ID").apply(
    lambda team_matches: rolling_averages(team_matches, cols, new_cols)
)

In [None]:
# Split features and target.
# The target and the identifier columns are removed from the feature matrix.
# (Fixed: the target column is 'winner'; the previous 'winnner' raised KeyError.)
# NOTE(review): X still keeps the in-match statistics (e.g. goals_T1/goals_T2)
# and the 'Unnamed: 0' column shown by eng.head(); presumably these leak the
# outcome or carry no signal -- confirm whether they should be dropped as well.
X = eng.drop(['winner', 'matchID', 'team1_ID', 'team2_ID', 'tournament'], axis=1)
y = eng['winner']
col_names = X.columns
idx = eng['matchID']

# Standardise the features.
# NOTE(review): the scaler is fitted on all rows before the split, so the test
# rows influence the scaling statistics; consider fitting on the training
# split only.
XScaled = StandardScaler().fit_transform(X)

# Train/test split: 30% held out, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(XScaled,
                                                    y.values,
                                                    stratify=y.values,
                                                    test_size=0.3,
                                                    random_state=42)


In [None]:
#Métricas de avaliação
def evaluation_metrics(clf, test_x, test_y, pred_y, ax_1, ax_2):
    """Score predictions and draw the precision-recall and ROC curves.

    Parameters
    ----------
    clf : fitted classifier
        Estimator used to draw the curves from ``test_x``.
    test_x, test_y
        Evaluation features and true labels.
    pred_y
        Labels predicted for ``test_x``; used for the scalar metrics.
    ax_1 : matplotlib Axes
        Axes receiving the precision-recall curve.
    ax_2 : matplotlib Axes
        Axes receiving the ROC curve.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(test_y, pred_y)
    precision = precision_score(test_y, pred_y)
    recall = recall_score(test_y, pred_y)
    f1 = f1_score(test_y, pred_y)

    # plot_precision_recall_curve / plot_roc_curve were removed in
    # scikit-learn 1.2; the Display.from_estimator constructors replace them.
    # Both names are provided by the notebook's `from sklearn.metrics import *`.
    PrecisionRecallDisplay.from_estimator(clf, test_x, test_y, ax=ax_1)
    RocCurveDisplay.from_estimator(clf, test_x, test_y, ax=ax_2)

    return accuracy, precision, recall, f1

In [None]:
def testa_modelos(train_x, train_y, test_x, test_y):
    """Fit the notebook's three classifiers and compare them on the test data.

    The module-level estimators ``rf``, ``logi`` and ``xgb`` are fitted on the
    training data, their metrics are printed, and their precision-recall and
    ROC curves are drawn on a shared two-panel figure.
    """
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

    candidates = (
        ('RandomForest', rf),
        ('LogisticRegression', logi),
        ('XGBClassifier', xgb),
    )

    for name, clf in candidates:
        # without SMOTE
        clf.fit(train_x, train_y)
        y_pred = clf.predict(test_x)

        scores = evaluation_metrics(clf, test_x, test_y, y_pred, ax1, ax2)
        accuracy, precision, recall, f1 = scores
        print(f"{name} sem SMOTE: accuracy: {100*accuracy:.2f}% | precision: {100*precision:.2f}% | recall: {100*recall:.2f}% | f1: {100*f1:.2f}%")

    # Finish both panels: legend first, then title.
    for axis, title in ((ax1, 'Precision-Recall curve'), (ax2, 'ROC AUC curve')):
        axis.legend(loc="right")
        axis.set_title(title)

In [None]:
# Baseline estimators used by testa_modelos.

# Random forest with class weighting, fixed seed, all cores.
rf = RandomForestClassifier(
    n_estimators=300, max_depth=6,
    class_weight='balanced',
    n_jobs=-1, random_state=42,
)

# L2-regularised logistic regression with class weighting.
logi = LogisticRegression(
    solver='liblinear', penalty='l2',
    class_weight='balanced',
    max_iter=100,
)

# Gradient-boosted trees with a binary logistic objective.
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100, learning_rate=0.05,
    max_depth=5, min_child_weight=1, gamma=0,
    subsample=0.8, colsample_bytree=0.8,
    scale_pos_weight=10,
)

In [None]:
# Train and compare the baseline models on the train/test split.
testa_modelos(train_x=X_train, train_y=y_train, test_x=X_test, test_y=y_test)

In [None]:
# Find the best hyperparameters for logistic regression with hyperopt

# Define Parameter Space for Optimization
space = {
    'C': hp.choice('C', [0.001, 0.01, 0.1, 1, 10, 100, 1000]),
    'penalty': hp.choice('penalty', ['l1']),
    # hp.quniform yields floats (100.0, 200.0, ...); max_iter must be an
    # integer, so the sample is cast with scope.int.
    'max_iter': scope.int(hp.quniform('max_iter', 100, 700, 100)),
    'solver': hp.choice('solver', ['liblinear', 'saga']),  # 'lbfgs' and 'sag' were removed because they do not work with l1
    'class_weight': hp.choice('class_weight', ['balanced', None])
}

# Defining a Function to Minimize (Objective Function)
def hyperparameter_tuning(params):
    """Return the negated mean 5-fold F1 of a LogisticRegression built from ``params``."""
    clf = LogisticRegression(**params, random_state=42, n_jobs=-1)
    f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1', error_score='raise').mean()

    # hyperopt minimises the loss, so F1 is negated.
    return {"loss": -f1, "status": STATUS_OK}

# Fine Tune the Model
trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)

# Note: for hp.choice parameters fmin reports the index of the selected
# option, not the option value itself.
print("Best: {}".format(best))

In [None]:
# Find the best hyperparameters for Random Forest with hyperopt

# Define Parameter Space for Optimization
space = {
    "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400, 500]),
    # hp.quniform yields floats (2.0, 4.0, ...); max_depth must be an integer,
    # so the sample is cast with scope.int.
    "max_depth": scope.int(hp.quniform("max_depth", 2, 24, 2)),
    # 'auto' was dropped: for classifiers it was an alias of 'sqrt' and it was
    # removed in scikit-learn 1.3.
    "max_features": hp.choice("max_features", ['sqrt', 'log2']),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "min_samples_leaf": hp.choice("min_samples_leaf", [1, 2, 3, 4, 5]),
    "min_samples_split": hp.choice("min_samples_split", [2, 3, 4, 5, 6]),
    "class_weight": hp.choice("class_weight", ['balanced', 'balanced_subsample', None]),
    "bootstrap": hp.choice("bootstrap", [True, False])
}

# Defining a Function to Minimize (Objective Function)
def hyperparameter_tuning(params):
    """Return the negated mean 5-fold F1 of a RandomForestClassifier built from ``params``."""
    # Fixed seed so that two evaluations of the same params give the same
    # loss, matching the other tuning cells of this notebook.
    model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    f1 = cross_val_score(model, X_train, y_train, cv=5, scoring='f1', error_score='raise').mean()

    # hyperopt minimises the loss, so F1 is negated.
    return {"loss": -f1, "status": STATUS_OK}

# Fine Tune the Model
trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)

# Note: for hp.choice parameters fmin reports the index of the selected
# option, not the option value itself.
print("Best: {}".format(best))

In [None]:
# Find the best hyperparameters for XGBoost with hyperopt

# Define Parameter Space for Optimization
space = {
    'max_depth': hp.choice('max_depth', range(5, 15)),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators': hp.choice('n_estimators', range(50, 400, 10)),
    'gamma': hp.quniform('gamma', 0, 0.30, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'scale_pos_weight': hp.choice('scale_pos_weight', range(4, 25)),
}


# Defining a Function to Minimize (Objective Function)
def hyperparameter_tuning(params):
    """Return the negated mean 5-fold F1 of an XGBClassifier built from ``params``."""
    booster = XGBClassifier(
        **params,
        objective='binary:logistic',
        random_state=42,
        n_jobs=-1,
        verbosity=0,
    )
    fold_scores = cross_val_score(booster, X_train, y_train, cv=5, scoring='f1')

    # hyperopt minimises the loss, so the mean F1 is negated.
    return {"loss": -fold_scores.mean(), "status": STATUS_OK}


# Fine Tune the Model
trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

print(f"Best: {best}")