In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np

In [14]:
# Load the match dataset from Excel. The first column becomes the index and
# gameId is read as a string rather than being parsed as a number.
dataset = pd.read_excel(
    "../data/dataset.xlsx",
    index_col=0,
    dtype={"gameId": "str"},
)
# Show column names, non-null counts and dtypes as a quick sanity check.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7194 entries, 0 to 7193
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gameId                        7194 non-null   object 
 1   Blue_Adc_championDamageShare  7194 non-null   float64
 2   Blue_Adc_creepScorePerTime    7194 non-null   float64
 3   Blue_Adc_goldEarnedPerTime    7194 non-null   float64
 4   Blue_Adc_kda                  7194 non-null   float64
 5   Blue_Adc_wardsScorePerTime    7194 non-null   float64
 6   Blue_Jgl_championDamageShare  7194 non-null   float64
 7   Blue_Jgl_creepScorePerTime    7194 non-null   float64
 8   Blue_Jgl_goldEarnedPerTime    7194 non-null   float64
 9   Blue_Jgl_kda                  7194 non-null   float64
 10  Blue_Jgl_wardsScorePerTime    7194 non-null   float64
 11  Blue_Mid_championDamageShare  7194 non-null   float64
 12  Blue_Mid_creepScorePerTime    7194 non-null   float64
 13  Blue_Mid

In [15]:
# Peek at the raw target column before it is encoded.
dataset["winner"].head()

0     Red
1     Red
2     Red
3     Red
4    Blue
Name: winner, dtype: object

In [16]:
# Encode the string labels in "winner" as integers so the classifier can
# consume them. The fitted encoder is kept so the integer codes can be mapped
# back to the original labels later if needed.
label_incoder = LabelEncoder()
encoded_winner = label_incoder.fit_transform(dataset["winner"])
dataset["winner"] = encoded_winner
# Display the first encoded values to confirm the transformation.
dataset["winner"].head()

0    1
1    1
2    1
3    1
4    0
Name: winner, dtype: int32

In [17]:
# Build the feature matrix X and the target vector y.
# gameId is dropped first so that it is not used as a feature.
dropped_dataset = dataset.drop(columns=["gameId"])

# Every remaining column except the target is a feature column.
desired_X_column_list = list(dropped_dataset.columns)
desired_X_column_list.remove("winner")

# Remove rows whose feature values are exact duplicates of an earlier row.
# drop_duplicates returns a new DataFrame (it does not modify in place), so the
# result has to be assigned back; otherwise no rows are actually removed.
dropped_dataset = dropped_dataset.drop_duplicates(subset=desired_X_column_list)

y = dropped_dataset["winner"]
X = dropped_dataset.drop(columns=["winner"])

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7194 entries, 0 to 7193
Data columns (total 52 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Blue_Adc_championDamageShare  7194 non-null   float64
 1   Blue_Adc_creepScorePerTime    7194 non-null   float64
 2   Blue_Adc_goldEarnedPerTime    7194 non-null   float64
 3   Blue_Adc_kda                  7194 non-null   float64
 4   Blue_Adc_wardsScorePerTime    7194 non-null   float64
 5   Blue_Jgl_championDamageShare  7194 non-null   float64
 6   Blue_Jgl_creepScorePerTime    7194 non-null   float64
 7   Blue_Jgl_goldEarnedPerTime    7194 non-null   float64
 8   Blue_Jgl_kda                  7194 non-null   float64
 9   Blue_Jgl_wardsScorePerTime    7194 non-null   float64
 10  Blue_Mid_championDamageShare  7194 non-null   float64
 11  Blue_Mid_creepScorePerTime    7194 non-null   float64
 12  Blue_Mid_goldEarnedPerTime    7194 non-null   float64
 13  Blue_Mid

In [18]:
# Summary statistics (count, mean, std, quartiles, min/max) for every feature.
X.describe()

Unnamed: 0,Blue_Adc_championDamageShare,Blue_Adc_creepScorePerTime,Blue_Adc_goldEarnedPerTime,Blue_Adc_kda,Blue_Adc_wardsScorePerTime,Blue_Jgl_championDamageShare,Blue_Jgl_creepScorePerTime,Blue_Jgl_goldEarnedPerTime,Blue_Jgl_kda,Blue_Jgl_wardsScorePerTime,...,Red_Spt_goldEarnedPerTime,Red_Spt_kda,Red_Spt_wardsScorePerTime,Red_Top_championDamageShare,Red_Top_creepScorePerTime,Red_Top_goldEarnedPerTime,Red_Top_kda,Red_Top_wardsScorePerTime,teamWinrateDiff,headtoHeadWinrate
count,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,...,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0,7194.0
mean,0.276113,0.15395,7.299151,6.215429,0.016289,0.146274,0.091966,5.633765,5.523583,0.014861,...,4.086214,5.626504,0.042669,0.230872,0.134115,6.495914,4.525407,0.012511,0.002245,0.495038
std,0.032409,0.013381,0.531044,2.269533,0.003622,0.023871,0.008473,0.423589,2.188871,0.002938,...,0.326306,2.338421,0.008063,0.030368,0.009408,0.450459,1.798515,0.002395,0.202201,0.184054
min,0.168684,0.08317,5.203467,0.511111,0.001736,0.08305,0.065099,4.314235,0.569286,0.00354,...,3.260939,0.111111,0.004827,0.133625,0.07186,4.76527,0.166667,0.00149,-0.677419,0.083333
25%,0.255236,0.147298,6.965803,4.650417,0.014245,0.130388,0.086253,5.338454,3.968333,0.013018,...,3.876018,3.980893,0.039447,0.210632,0.128138,6.20597,3.28,0.011297,-0.130073,0.333333
50%,0.275305,0.155048,7.286751,6.12933,0.016371,0.145013,0.091962,5.632477,5.484813,0.014856,...,4.059265,5.434881,0.043285,0.230866,0.133585,6.46818,4.427688,0.012669,0.0,0.5
75%,0.297032,0.162799,7.645657,7.652963,0.018121,0.159702,0.097246,5.897875,6.9125,0.016852,...,4.234219,7.019167,0.047787,0.249935,0.140147,6.770159,5.609226,0.014071,0.131168,0.642857
max,0.42415,0.18862,9.544249,17.466667,0.034457,0.27158,0.132075,8.276353,14.45,0.024902,...,6.814723,16.25,0.064336,0.35193,0.166117,8.710932,14.733333,0.026733,0.69697,0.923077


In [19]:
# Standardize every feature column of X (StandardScaler removes the mean and
# scales to unit variance by default).
# NOTE(review): the cells visible in this notebook split and train on the
# unscaled X, so X_scaled does not appear to be consumed - confirm whether
# scaling is intended. If it is, fit the scaler on the training split only so
# that test-set statistics do not leak into preprocessing.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [20]:
# Hold out 20% of the rows as a test set. The fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Baseline XGBoost classifier with 50 boosting rounds, evaluated with log loss.
# NOTE(review): use_label_encoder may be deprecated or ignored depending on the
# installed XGBoost release - confirm against the pinned version.
baseline_params = {
    "use_label_encoder": False,
    "eval_metric": 'logloss',
    "n_estimators": 50,
}
model = xgb.XGBClassifier(**baseline_params)
# Kept as the last expression so the notebook still displays the fitted model.
model.fit(X_train, y_train)

In [22]:
# Predict class labels for the held-out test rows with the baseline model.
y_pred = model.predict(X_test)

In [23]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6282140375260598


In [24]:
# Hyperparameter grid to search (4 * 3 * 3 * 2 = 72 candidate combinations).
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'colsample_bytree': [0.3, 0.7]
}

# Build the GridSearchCV object.
# The estimator is a classifier, so candidates are scored with accuracy rather
# than the regression metric neg_mean_squared_error. This also keeps the tuned
# result directly comparable with the baseline accuracy reported earlier.
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           verbose=1)

# Run the grid search with 3-fold cross-validation on the training split.
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination.
print(f"Best parameters found: {grid_search.best_params_}")

# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True), so best_estimator_ is ready to use without retraining.
best_model = grid_search.best_estimator_

# Predict on the held-out test set with the tuned model.
y_pred_best = best_model.predict(X_test)

# Report test accuracy (comparable to the baseline) and keep the original
# mean-squared-error line; for 0/1 labels the MSE equals the error rate.
accuracy_best = accuracy_score(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
print(f"Accuracy with tuned parameters: {accuracy_best:.4f}")
print(f"mse with tuned parameters: {mse_best:.4f}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters found: {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}
mse with tuned parameters: 0.3391
