In [5]:
import os
import warnings
# Suppress every warning category for the rest of the session. This runs before
# the third-party imports below, so warnings raised while importing them are
# hidden as well.
# NOTE(review): a blanket 'ignore' also hides deprecation/runtime warnings that
# may matter — consider narrowing it to specific categories or modules.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, ConfusionMatrixDisplay
from scipy.stats import randint
import joblib
import json
from datetime import datetime
import time

# Output folders (created if they do not exist yet; existing ones are left alone).
MODEL_DIR = "../Treino/models_hierarquico"
RESULTS_DIR = "../Treino/results_hierarquico"
for _out_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [6]:
# Remove pandas' display limits (rows, columns, line width) so frames and
# Series are rendered in full.
# NOTE(review): these options are global — with max_rows=None, displaying the
# whole frame by accident would print every row.
for _display_opt in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_opt, None)

# Load the prepared players table and preview its first rows.
df = pd.read_csv('../Análise/players_prepared.csv')
df.head()

Unnamed: 0,player_positions,overall,potential,age,height_cm,weight_kg,preferred_foot,weak_foot,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,main_position,secondary_position,macro_position,macro_position_secondary,macro_position_enc,main_position_enc,secondary_position_enc,feat_offensive_index,feat_defensive_index,feat_technical_index,feat_speed_index,feat_stamina_strength_ratio,feat_attack_defense_ratio,feat_shooting_passing_ratio,feat_aerial_ability,feat_composure_aggression_ratio,feat_mentality_index,feat_reaction_positioning_avg,feat_vision_passing_combo,feat_bmi,feat_power_index,feat_lightweight_flag,feat_lateral_score,feat_centreback_score,feat_defensive_mid_score,feat_central_mid_score,feat_winger_score,feat_striker_score,feat_attack_minus_defense,feat_pace_minus_shoot
0,"CAM, CM",90,94,22,186,75,1,4,4,80.0,86.0,83.0,90.0,78.0,85.0,66,88,75,90,77,91,73,68,89,91,81,80,83,91,83,86,85,94,80,87,85,82,91,90,74,90,77,79,77,CAM,CM,Midfielder,Midfielder,1,0,3,86.333333,81.5,90.25,80.5,1.175,1.059305,1.036145,80.0,1.058824,88.25,91.0,89.666667,21.678807,86.25,0,80.0,80.333333,89.0,90.0,79.25,83.0,4.833333,-5.5
1,"CM, CDM, RB",89,90,26,182,74,1,4,3,88.0,84.0,84.0,84.0,83.0,85.0,78,80,63,88,78,84,76,69,88,88,84,92,77,89,68,91,82,93,82,91,81,86,86,86,59,84,82,87,86,CM,CDM,Midfielder,Midfielder,1,3,2,84.0,84.0,86.5,88.0,1.134146,1.0,1.0,72.5,1.037037,85.5,87.5,87.333333,22.340297,87.0,0,87.666667,85.0,88.25,86.0,84.5,78.75,0.0,4.0
2,"CDM, RB, CM",89,89,30,177,75,1,4,3,72.0,74.0,89.0,84.0,83.0,79.0,92,68,72,89,69,82,85,79,92,86,75,70,83,89,84,78,77,92,69,83,89,85,78,86,71,86,82,86,85,CDM,RB,Midfielder,Defender,1,2,8,82.333333,81.0,86.25,72.5,1.333333,1.016461,0.831461,74.5,0.966292,83.75,83.5,89.0,23.939481,79.0,0,84.666667,80.0,88.0,87.0,80.25,73.25,1.333333,-1.5
3,"RB, RM",89,90,26,181,73,1,4,4,92.0,79.0,82.0,83.0,82.0,79.0,85,80,71,86,74,82,78,80,73,85,89,95,81,90,78,82,84,95,72,77,77,85,84,78,68,84,82,85,79,RB,RM,Defender,Winger,0,7,9,81.333333,80.5,83.0,92.0,1.319444,1.010352,0.963415,77.5,1.090909,82.75,87.0,79.0,22.282592,83.25,0,91.666667,80.666667,86.0,82.666667,88.0,76.5,0.833333,13.0
4,CM,87,87,28,175,68,1,4,3,80.0,78.0,84.0,86.0,81.0,76.0,79,77,60,89,80,84,79,67,87,87,82,78,91,91,88,80,75,89,66,79,87,85,84,85,73,88,80,85,80,CM,,Midfielder,,1,3,7,82.666667,78.5,86.75,80.0,1.348485,1.053079,0.928571,67.5,1.011494,85.5,87.5,87.0,22.204082,77.5,1,82.0,78.666667,87.0,87.333333,81.25,72.0,4.166667,2.0


In [7]:
# Show the dtype of every column, to tell numeric columns apart from object ones.
df.dtypes

player_positions                    object
overall                              int64
potential                            int64
age                                  int64
height_cm                            int64
weight_kg                            int64
preferred_foot                       int64
weak_foot                            int64
skill_moves                          int64
pace                               float64
shooting                           float64
passing                            float64
dribbling                          float64
defending                          float64
physic                             float64
attacking_crossing                   int64
attacking_finishing                  int64
attacking_heading_accuracy           int64
attacking_short_passing              int64
attacking_volleys                    int64
skill_dribbling                      int64
skill_curve                          int64
skill_fk_accuracy                    int64
skill_long_

In [9]:
# Target columns and feature columns.
# `targets` holds the encoded label columns; none of them may stay in X.
targets = ['main_position_enc', 'macro_position_enc', 'secondary_position_enc']

# Columns excluded from the feature matrix: the string position columns plus
# the encoded targets (each name listed once).
drop_cols = ['player_positions', 'main_position', 'secondary_position',
             'macro_position', 'macro_position_secondary'] + targets

# Hold-out fraction and seed used for the split below.
TEST_SIZE = 0.20
RANDOM_STATE = 42

# Keep numeric columns only, then remove everything named in drop_cols.
# errors='ignore' already skips names that are absent (the string columns are
# gone after select_dtypes), so no separate membership filter is needed.
X = df.select_dtypes(include=[np.number]).drop(columns=drop_cols, errors='ignore')

# Guard against label leakage should the lists above be edited later.
leaked = sorted(set(targets) & set(X.columns))
assert not leaked, f"target columns leaked into X: {leaked}"

# Targets: coarse (macro) position and fine-grained (main) position.
y_macro = df['macro_position_enc']
y_main = df['main_position_enc']

# Split stratified on the macro position; X, y_macro and y_main are split
# together so their rows stay aligned.
X_train, X_test, y_macro_train, y_macro_test, y_main_train, y_main_test = train_test_split(
    X, y_macro, y_main,
    test_size=TEST_SIZE,
    stratify=y_macro,
    random_state=RANDOM_STATE
)

# Every row must land in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

print("Shapes:")
print("X_train", X_train.shape, "X_test", X_test.shape)
print("y_macro_train dist:\n", y_macro_train.value_counts())
print("y_main_train dist (sample):\n", y_main_train.value_counts().head())

Shapes:
X_train (13074, 66) X_test (3269, 66)
y_macro_train dist:
 macro_position_enc
0    4893
1    3827
3    2327
2    2027
Name: count, dtype: int64
y_main_train dist (sample):
 main_position_enc
1     2667
10    2027
3     1789
7     1132
2     1131
Name: count, dtype: int64
