In [1]:
# Colab-only step: attach Google Drive to the runtime filesystem so that the
# CSV paths under /content/drive used by later cells resolve.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import xgboost as xgb

In [47]:
# Read the source CSV from the mounted Drive folder into `league_df`.
file_path = '/content/drive/MyDrive/Colab Notebooks/DAT/프로젝트/data/league_5_div_HTR_X.csv'
league_df = pd.read_csv(filepath_or_buffer=file_path)

In [48]:
# List the column names of the freshly loaded frame.
league_df.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'HTOa', 'ATOa', 'HTAt', 'ATAt', 'HTMid',
       'ATMid', 'HTDef', 'ATDef', 'HomeSquad', 'AwaySquad', 'HomeAvgAge',
       'AwayAvgAge', 'HomeMV', 'AwayMV', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR',
       'AR', 'HxG', 'AxG', 'HxA', 'AxA', 'HxPTS', 'AxPTS', 'HPPDA', 'APPDA'],
      dtype='object')

In [49]:
# Columns removed before modelling: the date and team-name identifiers,
# HomeMV/AwayMV, and the HT*/AT* Oa/At/Mid/Def columns.
drop_columns = ['Date','HomeTeam','AwayTeam','HomeMV','AwayMV','ATAt','ATDef','ATMid','ATOa','HTAt','HTDef','HTMid','HTOa']

# `league_df` is overwritten with its own result, so a second execution of this
# cell would raise KeyError (the columns are already gone). errors='ignore'
# skips labels that are no longer present and keeps the cell re-runnable.
league_df = league_df.drop(columns=drop_columns, errors='ignore')

In [50]:
# Bucket edges and the label attached to each bucket. pd.cut uses right-closed
# intervals by default; include_lowest=True additionally puts the left edge
# (0) into the first bucket.
bins = [0, 1, 2, 3]
labels = ['0-1', '1-2', '2-3']

# Add one categorical bucket column per xPTS column (home first, then away).
for source_col, bucket_col in (
    ('HxPTS', 'HxPTS_category'),
    ('AxPTS', 'AxPTS_category'),
):
    league_df[bucket_col] = pd.cut(
        league_df[source_col],
        bins=bins,
        labels=labels,
        include_lowest=True,
    )

In [30]:
# Display the frame to inspect the result of the preceding transformations.
league_df

Unnamed: 0,HomeSquad,AwaySquad,HomeAvgAge,AwayAvgAge,FTHG,FTAG,FTR,HTHG,HTAG,HS,...,HxG,AxG,HxA,AxA,HxPTS,AxPTS,HPPDA,APPDA,HxPTS_category,AxPTS_category
0,29.0,28.0,25.0,24.0,1.0,2.0,A,0.4,0.8,11.0,...,0.828,1.536,0.456,0.974,0.844,1.696,12.440,6.458,0-1,1-2
1,32.0,34.0,25.0,25.0,1.4,1.8,H,1.0,1.0,14.0,...,1.500,1.846,1.376,1.082,1.910,2.104,11.820,8.262,1-2,2-3
2,37.0,41.0,24.0,24.0,1.4,3.6,A,0.2,2.0,13.8,...,1.958,2.104,1.472,1.502,1.710,1.760,13.016,8.634,1-2,1-2
3,27.0,25.0,26.0,28.0,0.8,1.0,D,0.4,1.0,9.2,...,1.002,0.686,0.788,0.552,1.080,1.074,7.822,9.050,1-2,1-2
4,31.0,39.0,25.0,25.0,2.2,0.8,H,0.6,0.2,14.8,...,2.264,0.552,1.444,0.330,2.544,1.222,6.064,8.726,2-3,1-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12543,40.0,34.0,26.0,24.0,1.4,0.6,H,0.8,0.4,11.0,...,1.560,0.902,1.118,0.634,1.762,0.642,16.288,13.936,1-2,0-1
12544,35.0,34.0,24.0,24.0,3.0,1.2,H,1.8,0.8,12.0,...,1.384,1.414,1.140,0.830,1.316,1.930,12.608,11.290,1-2,1-2
12545,40.0,37.0,24.0,25.0,1.6,1.4,H,0.6,0.4,11.8,...,1.776,1.550,1.512,0.816,1.366,1.914,11.720,8.246,1-2,1-2
12546,36.0,31.0,25.0,25.0,1.0,1.8,D,0.6,0.4,13.6,...,1.248,1.738,1.018,1.398,1.384,1.376,7.686,7.152,1-2,1-2


In [51]:
# Integer codes for the FTR letters (presumably home win / away win / draw).
ftr_mapping = {'H': 0, 'A': 1, 'D': 2}

# Expand the two xPTS bucket columns into indicator columns, then swap the FTR
# letters for their integer codes. Series.map leaves NaN for any value that is
# not a key of the mapping.
data_encoded = pd.get_dummies(
    league_df,
    columns=['HxPTS_category', 'AxPTS_category'],
).assign(FTR=lambda frame: frame['FTR'].map(ftr_mapping))

In [52]:
# List the columns after one-hot encoding to confirm the new indicator columns.
data_encoded.columns

Index(['HomeSquad', 'AwaySquad', 'HomeAvgAge', 'AwayAvgAge', 'FTHG', 'FTAG',
       'FTR', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
       'HY', 'AY', 'HR', 'AR', 'HxG', 'AxG', 'HxA', 'AxA', 'HxPTS', 'AxPTS',
       'HPPDA', 'APPDA', 'HxPTS_category_0-1', 'HxPTS_category_1-2',
       'HxPTS_category_2-3', 'AxPTS_category_0-1', 'AxPTS_category_1-2',
       'AxPTS_category_2-3'],
      dtype='object')

In [53]:
# The encoded FTR column is the target; every remaining column is a feature.
y = data_encoded['FTR']
X = data_encoded.drop(columns='FTR')

# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [54]:
# Fit a random forest (default hyperparameters, fixed seed) on the training
# split and predict the held-out rows. fit() returns the estimator itself.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Score the predictions against the held-out labels.
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)

print(
    "RandomForestClassifier",
    f"Accuracy: {rf_accuracy}",
    "Classification Report:",
    rf_report,
    sep="\n",
)

RandomForestClassifier
Accuracy: 0.5118193891102257
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.80      0.63      1666
           1       0.49      0.45      0.47      1140
           2       0.42      0.09      0.15       959

    accuracy                           0.51      3765
   macro avg       0.48      0.45      0.42      3765
weighted avg       0.49      0.51      0.46      3765



In [55]:
# Fit gradient boosting (default hyperparameters, fixed seed) on the training
# split and predict the held-out rows. fit() returns the estimator itself.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)

# Score the predictions against the held-out labels.
gb_accuracy = accuracy_score(y_test, gb_y_pred)
gb_report = classification_report(y_test, gb_y_pred)

print(
    "GradientBoostingClassifier",
    f"Accuracy: {gb_accuracy}",
    "Classification Report:",
    gb_report,
    sep="\n",
)

GradientBoostingClassifier
Accuracy: 0.4964143426294821
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.80      0.63      1666
           1       0.47      0.45      0.46      1140
           2       0.24      0.03      0.05       959

    accuracy                           0.50      3765
   macro avg       0.41      0.43      0.38      3765
weighted avg       0.43      0.50      0.43      3765



In [56]:
# Fit AdaBoost (default hyperparameters, fixed seed) on the training split and
# predict the held-out rows. fit() returns the estimator itself.
ada_model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
ada_y_pred = ada_model.predict(X_test)

# Score the predictions against the held-out labels.
ada_accuracy = accuracy_score(y_test, ada_y_pred)
ada_report = classification_report(y_test, ada_y_pred)

print(
    "AdaBoostClassifier",
    f"Accuracy: {ada_accuracy}",
    "Classification Report:",
    ada_report,
    sep="\n",
)

AdaBoostClassifier
Accuracy: 0.4950863213811421
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.79      0.62      1666
           1       0.45      0.46      0.46      1140
           2       0.39      0.03      0.05       959

    accuracy                           0.50      3765
   macro avg       0.45      0.43      0.38      3765
weighted avg       0.47      0.50      0.43      3765



In [57]:
# Train an XGBoost classifier (fixed seed, multiclass log-loss as the eval
# metric) on the training split and evaluate it on the held-out rows.
# The former `use_label_encoder=False` argument is omitted: it is deprecated in
# recent XGBoost releases and only produces an unused-parameter warning. The
# target is already encoded as the integers 0/1/2, so no label encoding is needed.
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)

# Score the predictions against the held-out labels.
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_report = classification_report(y_test, xgb_y_pred)

print("XGBClassifier")
print(f"Accuracy: {xgb_accuracy}")
print("Classification Report:")
print(xgb_report)

XGBClassifier
Accuracy: 0.47755644090305444
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.70      0.60      1666
           1       0.44      0.43      0.44      1140
           2       0.33      0.15      0.21       959

    accuracy                           0.48      3765
   macro avg       0.43      0.43      0.42      3765
weighted avg       0.45      0.48      0.45      3765



In [59]:
# Pair the random forest's feature_importances_ with the feature names and
# order them from most to least important.
rf_importances = rf_model.feature_importances_
rf_feature_importance = (
    pd.Series(rf_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Show only the ten highest-ranked features.
print("Feature Importances:")
print(rf_feature_importance.head(10))

Feature Importances:
AxPTS    0.052956
HxPTS    0.051623
AxA      0.049620
HxG      0.047546
HPPDA    0.047178
HxA      0.046698
AxG      0.046421
APPDA    0.043807
HS       0.040133
AS       0.039926
dtype: float64


In [58]:
# Pair the gradient boosting model's feature_importances_ with the feature
# names and order them from most to least important.
gb_importances = gb_model.feature_importances_
gb_feature_importance = (
    pd.Series(gb_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Show only the ten highest-ranked features.
print("Feature Importances:")
print(gb_feature_importance.head(10))

Feature Importances:
AxPTS    0.175258
HxPTS    0.129245
AxA      0.086977
HST      0.072641
HxG      0.053784
HPPDA    0.053528
AST      0.046879
HC       0.040569
AF       0.034644
APPDA    0.028305
dtype: float64


In [60]:
# Pair the AdaBoost model's feature_importances_ with the feature names and
# order them from most to least important.
ada_importances = ada_model.feature_importances_
ada_feature_importance = (
    pd.Series(ada_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Show only the ten highest-ranked features.
print("Feature Importances:")
print(ada_feature_importance.head(10))

Feature Importances:
AxPTS    0.16
HxPTS    0.14
HPPDA    0.06
HxA      0.06
HST      0.06
HF       0.06
AxA      0.04
HxG      0.04
AS       0.04
AST      0.04
dtype: float64


In [61]:
# Pair the XGBoost model's feature_importances_ with the feature names and
# order them from most to least important.
xgb_importances = xgb_model.feature_importances_
xgb_feature_importance = (
    pd.Series(xgb_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Show only the ten highest-ranked features.
print("Feature Importances:")
print(xgb_feature_importance.head(10))

Feature Importances:
AxPTS         0.057754
HxPTS         0.043059
AxA           0.039887
HxG           0.037054
HST           0.037039
HPPDA         0.034047
AST           0.034043
HxA           0.033673
AwayAvgAge    0.033501
HC            0.032816
dtype: float32


In [None]:
# `cleaned_df` was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Rebuild it from `data_encoded`: the saved table
# displayed after this cell contains the one-hot xPTS bucket columns but not the
# raw HxPTS/AxPTS columns, so those two are dropped here.
# NOTE(review): confirm that no other columns were removed in the original
# (lost) definition of cleaned_df.
cleaned_df = data_encoded.drop(columns=['HxPTS', 'AxPTS'])

# Save the cleaned and processed DataFrame to a CSV file (row index omitted).
# NOTE(review): the file name spelling ('noramlized') is kept unchanged because
# other notebooks may read this exact path.
output_file_path = '/content/drive/MyDrive/Colab Notebooks/DAT/프로젝트/data/noramlized_df.csv'
cleaned_df.to_csv(output_file_path, index=False)

# Load the cleaned data back from disk to verify the round trip.
df = pd.read_csv(output_file_path)

In [None]:
# Display the frame that was reloaded from the saved CSV.
df

Unnamed: 0,HomeSquad,AwaySquad,HomeAvgAge,AwayAvgAge,FTHG,FTAG,FTR,HTHG,HTAG,HS,...,HxA,AxA,HPPDA,APPDA,HxPTS_category_0-1,HxPTS_category_1-2,HxPTS_category_2-3,AxPTS_category_0-1,AxPTS_category_1-2,AxPTS_category_2-3
0,29.0,28.0,25.0,24.0,1.0,2.0,1,0.4,0.8,11.0,...,0.456,0.974,12.440,6.458,True,False,False,False,True,False
1,32.0,34.0,25.0,25.0,1.4,1.8,0,1.0,1.0,14.0,...,1.376,1.082,11.820,8.262,False,True,False,False,False,True
2,37.0,41.0,24.0,24.0,1.4,3.6,1,0.2,2.0,13.8,...,1.472,1.502,13.016,8.634,False,True,False,False,True,False
3,27.0,25.0,26.0,28.0,0.8,1.0,2,0.4,1.0,9.2,...,0.788,0.552,7.822,9.050,False,True,False,False,True,False
4,31.0,39.0,25.0,25.0,2.2,0.8,0,0.6,0.2,14.8,...,1.444,0.330,6.064,8.726,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12543,40.0,34.0,26.0,24.0,1.4,0.6,0,0.8,0.4,11.0,...,1.118,0.634,16.288,13.936,False,True,False,True,False,False
12544,35.0,34.0,24.0,24.0,3.0,1.2,0,1.8,0.8,12.0,...,1.140,0.830,12.608,11.290,False,True,False,False,True,False
12545,40.0,37.0,24.0,25.0,1.6,1.4,0,0.6,0.4,11.8,...,1.512,0.816,11.720,8.246,False,True,False,False,True,False
12546,36.0,31.0,25.0,25.0,1.0,1.8,2,0.6,0.4,13.6,...,1.018,1.398,7.686,7.152,False,True,False,False,True,False


In [None]:
# Display the reloaded frame.
df

Unnamed: 0,HomeSquad,AwaySquad,HomeAvgAge,AwayAvgAge,FTHG,FTAG,FTR,HTHG,HTAG,HS,...,HxA,AxA,HPPDA,APPDA,HxPTS_category_0-1,HxPTS_category_1-2,HxPTS_category_2-3,AxPTS_category_0-1,AxPTS_category_1-2,AxPTS_category_2-3
0,29.0,28.0,25.0,24.0,1.0,2.0,1,0.4,0.8,11.0,...,0.456,0.974,12.440,6.458,True,False,False,False,True,False
1,32.0,34.0,25.0,25.0,1.4,1.8,0,1.0,1.0,14.0,...,1.376,1.082,11.820,8.262,False,True,False,False,False,True
2,37.0,41.0,24.0,24.0,1.4,3.6,1,0.2,2.0,13.8,...,1.472,1.502,13.016,8.634,False,True,False,False,True,False
3,27.0,25.0,26.0,28.0,0.8,1.0,2,0.4,1.0,9.2,...,0.788,0.552,7.822,9.050,False,True,False,False,True,False
4,31.0,39.0,25.0,25.0,2.2,0.8,0,0.6,0.2,14.8,...,1.444,0.330,6.064,8.726,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12543,40.0,34.0,26.0,24.0,1.4,0.6,0,0.8,0.4,11.0,...,1.118,0.634,16.288,13.936,False,True,False,True,False,False
12544,35.0,34.0,24.0,24.0,3.0,1.2,0,1.8,0.8,12.0,...,1.140,0.830,12.608,11.290,False,True,False,False,True,False
12545,40.0,37.0,24.0,25.0,1.6,1.4,0,0.6,0.4,11.8,...,1.512,0.816,11.720,8.246,False,True,False,False,True,False
12546,36.0,31.0,25.0,25.0,1.0,1.8,2,0.6,0.4,13.6,...,1.018,1.398,7.686,7.152,False,True,False,False,True,False


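피처 중요도를 봤을 때 공통적으로 구단 가치가 가장 높은 중요도를 보였다. 구단 가치가 높을수록 더 강팀인 경우가 많다. 그리고 강팀인 경우에 이길 확률이 높기 때문에 구단 가치가 더 높은 팀일 때 이길 확률이 더 크다. 그렇기 때문에 높은 피처 중요도를 보이는 것이라고 생각해 볼 수 있다. (참고: 이 노트북에서는 구단 가치로 보이는 HomeMV·AwayMV 컬럼을 제외하고 학습했으며, 위 출력에서는 네 모델 모두 AxPTS와 HxPTS가 가장 높은 중요도를 보였다.)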

축구라는 스포츠는 강한 팀이 대개 이기는 편이다. 그렇다면 상대적으로 약한 팀이 이기거나 비겨서 승점을 가져오려면 어떤 것을 잘해야 하는 걸까?