In [None]:
import datetime
import getpass
import logging
import os
import sys
import time
import warnings

# Runtime setup that should take effect before the heavier libraries are imported.
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
warnings.filterwarnings("ignore")
sys.path.append('C:/Users/dohyu/Desktop/Github/side_proj_fifa')

import cupy as cp
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as stats
import seaborn as sns
import sqlalchemy
import xgboost as xgb

from sqlalchemy import create_engine, text

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score

mpl.rc('font', family = 'Malgun Gothic') # Korean-capable font (Malgun Gothic)
mpl.rc('axes', unicode_minus = False) # keep the minus sign from rendering as a broken glyph

In [None]:
def db_conn(_username, _password, _host, _port, _database):
    """Create a SQLAlchemy engine for a MySQL database using the PyMySQL driver.

    Parameters
    ----------
    _username, _password : str
        Credentials placed into the connection URL.
    _host : str
        Database server address.
    _port : int
        Database server port.
    _database : str
        Name of the schema to connect to.

    Returns
    -------
    The object returned by ``create_engine`` for the assembled URL.
    """
    # Collect the URL components first, then let SQLAlchemy assemble them.
    url_parts = {
        'drivername': "mysql+pymysql",
        'username': _username,
        'password': _password,
        'host': _host,
        'port': _port,
        'database': _database,
    }
    connection_url = sqlalchemy.engine.URL.create(**url_parts)

    return create_engine(connection_url)

In [None]:
# Connection settings are read from the environment so that no secret lives in the
# notebook. Non-secret settings fall back to the previous local defaults; the password
# falls back to an interactive prompt when FCO_DB_PASSWORD is not set.
# NOTE(review): the password that used to be written here in plain text should be rotated.
engine = db_conn(
    os.environ.get('FCO_DB_USER', 'root'),
    os.environ.get('FCO_DB_PASSWORD') or getpass.getpass('MySQL password: '),
    os.environ.get('FCO_DB_HOST', '127.0.0.1'),
    int(os.environ.get('FCO_DB_PORT', '3306')),
    os.environ.get('FCO_DB_NAME', 'side_proj_fco'),
)
df_query = 'SELECT * FROM match_user'

with engine.connect() as connection:
    # Wrapping the raw SQL in text() hands the connection an executable construct,
    # so the read does not depend on how a plain string is treated.
    df1 = pd.read_sql(text(df_query), con = connection)

df1 = df1[df1['match_endtype'] == 0].reset_index(drop = True) # keep only matches that ended normally

# Statistics that come as a '<prefix>_try' / '<prefix>_suc' column pair.
stat_prefixes = [
    'match_total_pass',
    'match_total_pass_short',
    'match_total_pass_long',
    'match_total_pass_through',
    'match_total_shoot_inpenalty',
    'match_total_shoot_outpenalty',
]

# Rate columns: suc / try. A zero denominator yields NaN (0/0) or +/-inf (x/0);
# both are mapped to 0 so that the later scaling step only sees finite values.
for prefix in stat_prefixes:
    df1[f'{prefix}_rate'] = (
        (df1[f'{prefix}_suc'] / df1[f'{prefix}_try'])
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

# Fail columns: try - suc. Done in a second pass so the new columns keep the same
# order as before (all rates first, then all fails).
for prefix in stat_prefixes:
    df1[f'{prefix}_fail'] = df1[f'{prefix}_try'] - df1[f'{prefix}_suc']

In [None]:
# Success-rate feature columns, one per statistic (in-penalty before out-penalty).
rate_col = [
    f'match_total_{stat}_rate'
    for stat in ('pass',
                 'pass_short',
                 'pass_long',
                 'pass_through',
                 'shoot_inpenalty',
                 'shoot_outpenalty')
]

# Attempt-count feature columns. The order here is out-penalty before in-penalty,
# which differs from rate_col; it is kept as is because it fixes the feature order.
try_col = [
    f'match_total_{stat}_try'
    for stat in ('pass',
                 'pass_short',
                 'pass_long',
                 'pass_through',
                 'shoot_outpenalty',
                 'shoot_inpenalty')
]

# RandomForest Classifier

In [None]:
# Base estimator; the tunable hyper-parameters are supplied by the grid below.
model_rfc = RandomForestClassifier(random_state = 42)

# Shuffled, stratified 5-fold splitter with a fixed seed.
kf_rfc = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# 3 * 4 * 3 * 3 * 2 = 216 candidate settings, each evaluated on 5 folds.
param_grid_rfc = dict(
    n_estimators = [100, 200, 300],
    max_depth = [None, 10, 20, 30],
    min_samples_split = [2, 5, 10],
    min_samples_leaf = [1, 2, 4],
    bootstrap = [True, False],
)

# Exhaustive search over the grid, run with n_jobs = -1.
grid_search_rfc = GridSearchCV(
    model_rfc,
    param_grid_rfc,
    cv = kf_rfc,
    n_jobs = -1,
    verbose = 2,
)

In [None]:
def random_forest_classifier(_df, _col):
    """Tune, evaluate and plot a random forest that predicts win vs. loss.

    Rows whose ``match_result`` is neither '승' (win) nor '패' (loss) are dropped and
    the label is encoded as 1 / 0. The remaining rows are split 80/20 (stratified),
    the features are min-max scaled with statistics taken from the training part only,
    and the module-level ``grid_search_rfc`` is fitted on the training part. Its best
    estimator is then evaluated on the held-out part.

    Parameters
    ----------
    _df : pandas.DataFrame
        Source frame; must contain every column listed in ``_col``.
    _col : list of str
        Columns to use. Must include 'match_result'; all other entries are features.

    Returns
    -------
    None
        Output is printed (accuracy, classification report) and plotted (feature
        importances, confusion matrix, ROC curve).

    Notes
    -----
    ``grid_search_rfc`` is a module-level object, so every call refits it in place.
    Everything here runs on the host; no GPU arrays are created.
    """
    df_temp = _df[_col]

    # Build the mask from the same frame that is being filtered.
    is_decisive = df_temp['match_result'].isin(['승', '패'])
    df_temp = df_temp[is_decisive].reset_index(drop = True)

    df_temp['match_result'] = df_temp['match_result'].map({'승': 1, '패': 0})

    X = df_temp.drop('match_result', axis = 1)
    y = df_temp['match_result']

    X_temp, X_test, y_temp, y_test = train_test_split(X, y,
                                                      test_size = 0.2,
                                                      random_state = 42,
                                                      stratify = y)

    # Fit the scaler on the training part only, then apply it to both parts.
    scaler = MinMaxScaler()
    X_temp_scaled = scaler.fit_transform(X_temp)
    X_test_scaled = scaler.transform(X_test)

    print('Model Random Forest Classifier with Column:', _col)

    grid_search_rfc.fit(X_temp_scaled, y_temp)

    best_model_rfc = grid_search_rfc.best_estimator_

    y_test_pred_rfc = best_model_rfc.predict(X_test_scaled)
    # Probability of the positive class (label 1), used for the ROC curve.
    y_test_pred_proba_rfc = best_model_rfc.predict_proba(X_test_scaled)[:, 1]

    cm_rfc = confusion_matrix(y_test, y_test_pred_rfc)

    fpr_rfc, tpr_rfc, _ = roc_curve(y_test, y_test_pred_proba_rfc)
    roc_auc_score_rfc = roc_auc_score(y_test, y_test_pred_proba_rfc)

    print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred_rfc))
    print("\nClassification Report on Test Set_RFC:\n", classification_report(y_test, y_test_pred_rfc))

    print('------------------------------------------------------------------------------------------------')
    print('------------------------------------------------------------------------------------------------')

    # Importances of the best estimator, sorted ascending so the largest bar is on top.
    feature_importance_avg_rfc = best_model_rfc.feature_importances_
    sorted_idx_rfc = feature_importance_avg_rfc.argsort()
    sorted_importance_rfc = feature_importance_avg_rfc[sorted_idx_rfc]

    fig_imp, ax_imp = plt.subplots(figsize = (12, 4))
    ax_imp.barh(X.columns[sorted_idx_rfc], sorted_importance_rfc)
    ax_imp.set_xlabel("Average Feature Importance (Random Forest Classifier)")
    ax_imp.set_title("Feature Importance Across K-Folds_RFC")

    ax_imp.tick_params(axis = 'y', labelsize = 5)
    ax_imp.tick_params(axis = 'x')

    # Write each value just to the right of its bar.
    for i, v in enumerate(sorted_importance_rfc):
        ax_imp.text(v + 0.001, i, str(round(v, 2)), color = 'black', va = 'center', fontsize = 'small')

    plt.show()

    fig, axes = plt.subplots(1, 2, figsize = (12, 6))

    # Left panel: confusion matrix on the held-out part.
    disp_rfc = ConfusionMatrixDisplay(confusion_matrix = cm_rfc, display_labels = best_model_rfc.classes_)
    disp_rfc.plot(cmap = 'Blues', ax = axes[0], values_format = 'd', colorbar = False)
    axes[0].set_title("Confusion Matrix for Test Data_RFC")

    # Right panel: ROC curve with the chance diagonal for reference.
    axes[1].plot(fpr_rfc, tpr_rfc,
                 label = 'Random Forest Classifier (AUC = %0.2f)' % roc_auc_score_rfc)
    axes[1].plot([0, 1], [0, 1], linestyle = '--', color = 'gray')
    axes[1].set_xlabel('False Positive Rate')
    axes[1].set_ylabel('True Positive Rate')
    axes[1].set_title('Receiver Operating Characteristic Curve')
    axes[1].legend(loc = 'lower right')

    plt.tight_layout()
    plt.show()

In [None]:
# Label first, then the attempt-count features, then the success-rate features.
df_col_temp = ['match_result', *try_col, *rate_col]

random_forest_classifier(_df = df1, _col = df_col_temp)

# XGBoost Classifier with K-fold, GridSearch

In [None]:
# Label first, then the success-rate features, then the attempt-count features.
df_col_temp = ['match_result', *rate_col, *try_col]

# Keep only rows labelled '승' or '패' and encode them as 1 / 0.
df2 = (
    df1.loc[df1['match_result'].isin(['승', '패']), df_col_temp]
    .reset_index(drop = True)
    .assign(match_result = lambda frame: frame['match_result'].map({'승': 1, '패': 0}))
)

y = df2['match_result']
X = df2.drop(columns = 'match_result')

# Stratified 80/20 split with a fixed seed.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42,
    stratify = y,
)

# Scaling statistics come from the training part only.
scaler = MinMaxScaler()
X_temp_scaled = scaler.fit_transform(X_temp)
X_test_scaled = scaler.transform(X_test)

# CuPy copies of the scaled features and of the labels.
X_temp_scaled_gpu, X_test_scaled_gpu, y_temp_gpu, y_test_gpu = (
    cp.array(host_data)
    for host_data in (X_temp_scaled, X_test_scaled, y_temp, y_test)
)

In [None]:
# Base XGBoost classifier; the tunable hyper-parameters are supplied by the grid below.
# NOTE(review): 'gpu_hist' and 'use_label_encoder' look tied to a particular xgboost
# release line - confirm against the installed version before upgrading.
xgb_base_params = dict(
    tree_method = 'gpu_hist',
    eval_metric = "logloss",
    use_label_encoder = False,
    random_state = 42,
)
model_xgb = xgb.XGBClassifier(**xgb_base_params)

# 2 * 2 * 2 * 2 * 2 = 32 candidate settings.
param_grid_xgb = dict(
    n_estimators = [100, 200],
    max_depth = [3, 6],
    learning_rate = [0.01, 0.1],
    subsample = [0.8, 0.9],
    colsample_bytree = [0.8, 0.9],
)

# Shuffled, stratified 5-fold splitter with a fixed seed.
kf_xgb = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# NOTE(review): n_jobs = -1 together with a GPU tree method presumably lets several
# fits share one device at the same time - verify this is intended.
grid_search_xgb = GridSearchCV(
    model_xgb,
    param_grid_xgb,
    cv = kf_xgb,
    verbose = 1,
    n_jobs = -1,
)

In [None]:
# Run the grid search on the scaled training split. The host-side arrays are passed
# directly: copying them to the GPU and immediately pulling them back with .get()
# yields the same values and only adds two transfers per array.
grid_search_xgb.fit(X_temp_scaled, y_temp)

print("Best parameters found: ", grid_search_xgb.best_params_)
print("Best cross-validation score: ", grid_search_xgb.best_score_)

# Estimator refitted on the whole training split with the best parameters.
best_model_xgb = grid_search_xgb.best_estimator_

# Evaluate on the held-out split, which was not seen during the search.
y_test_pred_xgb = best_model_xgb.predict(X_test_scaled)

print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred_xgb))
print("\nClassification Report on Test Set:\n", classification_report(y_test, y_test_pred_xgb))

In [None]:
# Importances of the best XGBoost estimator, sorted ascending so the largest bar is on top.
feature_importance_avg_xgb = best_model_xgb.feature_importances_
sorted_idx_xgb = feature_importance_avg_xgb.argsort()

ranked_features_xgb = X.columns[sorted_idx_xgb]
ranked_importance_xgb = feature_importance_avg_xgb[sorted_idx_xgb]

fig_imp_xgb, ax_imp_xgb = plt.subplots(figsize = (10, 8))
ax_imp_xgb.barh(ranked_features_xgb, ranked_importance_xgb)
ax_imp_xgb.set_xlabel("Average Feature Importance (XGBoost)")
ax_imp_xgb.set_title("Feature Importance Across K-Folds_XGB")

ax_imp_xgb.tick_params(axis = 'y', labelsize = 5)
ax_imp_xgb.tick_params(axis = 'x')

# Write each value just to the right of its bar.
for bar_pos, importance in enumerate(ranked_importance_xgb):
    ax_imp_xgb.text(importance + 0.001, bar_pos, str(round(importance, 2)),
                    color = 'black', va = 'center', fontsize = 'small')

plt.show()

In [None]:
# Side-by-side table of true labels and XGBoost predictions for the held-out split.
test_result_xgb = pd.DataFrame(dict(Actual = y_test.values,
                                    Predicted = y_test_pred_xgb))

# Confusion matrix on the held-out split, labelled with the model's class values.
cm_xgb = confusion_matrix(y_test, y_test_pred_xgb)
disp_rc = ConfusionMatrixDisplay(cm_xgb, display_labels = best_model_xgb.classes_)

disp_rc.plot(values_format = 'd', cmap = 'Blues')
plt.title("Confusion Matrix for Test Data_XGB")
plt.show()