In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import xgboost as xgb

In [3]:
# Load the data
# The CSV is read from an absolute path under the mounted Drive folder, so the
# Drive mount must have succeeded before this cell runs.
file_path = '/content/drive/MyDrive/Colab Notebooks/DAT/프로젝트/data/league_5_div_HTR_X.csv'
league_df = pd.read_csv(file_path)

In [4]:
# Inspect the column names of the frame that was just loaded.
league_df.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'HTOa', 'ATOa', 'HTAt', 'ATAt', 'HTMid',
       'ATMid', 'HTDef', 'ATDef', 'HomeSquad', 'AwaySquad', 'HomeAvgAge',
       'AwayAvgAge', 'HomeMV', 'AwayMV', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR',
       'AR', 'HxG', 'AxG', 'HxA', 'AxA', 'HxPTS', 'AxPTS', 'HPPDA', 'APPDA'],
      dtype='object')

In [6]:
# Date and team names are set aside; `data` is a new frame, so `league_df`
# keeps every original column.
drop_columns = ['Date', 'HomeTeam', 'AwayTeam']
data = league_df.drop(labels=drop_columns, axis='columns')

In [7]:
# Derive one Diff_* column per (home, away) pair of columns: home minus away.
# The pairs are listed in the order the new columns are appended to `data`.
_diff_pairs = {
    'Diff_Oa': ('HTOa', 'ATOa'),
    'Diff_At': ('HTAt', 'ATAt'),
    'Diff_Mid': ('HTMid', 'ATMid'),
    'Diff_Def': ('HTDef', 'ATDef'),
    'Diff_R': ('HR', 'AR'),
    'Diff_Y': ('HY', 'AY'),
    'Diff_S': ('HS', 'AS'),
    'Diff_F': ('HF', 'AF'),
    'Diff_C': ('HC', 'AC'),
    'Diff_xG': ('HxG', 'AxG'),
    'Diff_xA': ('HxA', 'AxA'),
    'Diff_xPTS': ('HxPTS', 'AxPTS'),
    'Diff_PPDA': ('HPPDA', 'APPDA'),
    'Diff_AvgAge': ('HomeAvgAge', 'AwayAvgAge'),
    'Diff_MV': ('HomeMV', 'AwayMV'),
    'Diff_Squad': ('HomeSquad', 'AwaySquad'),
    'Diff_FTG': ('FTHG', 'FTAG'),
    'Diff_HTG': ('HTHG', 'HTAG'),
}

# The operands are read from `league_df`, while the result is written to `data`.
for _diff_col, (_home_col, _away_col) in _diff_pairs.items():
    data[_diff_col] = league_df[_home_col] - league_df[_away_col]

In [10]:
# Remove the raw home/away columns now that each pair is represented by its
# Diff_* column. One pair per line to make the list easy to audit.
data = data.drop(
    labels=[
        'HTOa', 'ATOa',
        'HTAt', 'ATAt',
        'HTMid', 'ATMid',
        'HTDef', 'ATDef',
        'HR', 'AR',
        'HY', 'AY',
        'HS', 'AS',
        'HF', 'AF',
        'HC', 'AC',
        'HxG', 'AxG',
        'HxA', 'AxA',
        'HxPTS', 'AxPTS',
        'HPPDA', 'APPDA',
        'HomeAvgAge', 'AwayAvgAge',
        'HomeMV', 'AwayMV',
        'HomeSquad', 'AwaySquad',
        'FTHG', 'FTAG',
        'HTHG', 'HTAG',
    ],
    axis='columns',
)

In [11]:
# Display the working frame to check its current columns and values.
data

Unnamed: 0,FTR,HST,AST,Diff_Oa,Diff_At,Diff_Mid,Diff_Def,Diff_R,Diff_xG,Diff_xA,...,Diff_PPDA,Diff_Y,Diff_S,Diff_F,Diff_C,Diff_AvgAge,Diff_MV,Diff_Squad,Diff_FTG,Diff_HTG
0,A,3.4,5.4,-1.0,-4.0,-3.0,0.0,0.0,-0.708,-0.518,...,5.982,-1.0,-0.6,-1.8,-1.0,1.0,-21.800000,1.0,-1.0,-0.4
1,H,5.4,5.8,2.0,0.0,3.0,2.0,0.0,-0.346,0.294,...,3.558,0.6,0.2,-5.6,-0.8,0.0,189.250000,-2.0,-0.4,0.0
2,A,4.4,8.4,-9.0,-9.0,-10.0,-8.0,0.0,-0.146,-0.030,...,4.382,0.2,-5.4,2.0,0.8,0.0,-656.100001,-4.0,-2.2,-1.8
3,D,4.0,3.2,7.0,4.0,9.0,7.0,0.0,0.316,0.236,...,-1.228,0.0,-0.6,-0.4,-1.4,-2.0,82.700000,2.0,-0.2,-0.6
4,H,6.0,2.4,11.0,16.0,11.0,10.0,0.2,1.712,1.114,...,-2.662,-1.4,7.8,-4.0,0.8,0.0,536.250000,-8.0,1.4,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12543,H,5.8,4.0,2.0,5.0,1.0,1.0,0.0,0.658,0.484,...,2.352,0.2,-1.0,-2.8,1.0,2.0,50.050001,6.0,0.8,0.4
12544,H,4.8,4.2,5.0,6.0,4.0,0.0,0.2,-0.030,0.310,...,1.318,0.8,-0.2,-7.8,-0.2,0.0,324.020000,1.0,1.8,1.0
12545,H,3.8,4.8,-2.0,-3.0,-2.0,-2.0,0.0,0.226,0.696,...,3.474,-0.4,-1.4,-5.0,0.0,-1.0,-57.870000,3.0,0.2,0.2
12546,D,3.6,4.8,4.0,2.0,3.0,4.0,-0.2,-0.490,-0.380,...,0.534,0.0,-1.4,1.4,0.8,0.0,129.470000,5.0,-0.8,0.2


In [12]:
# Encode the FTR label as an integer class: 'H' -> 0, 'A' -> 1, 'D' -> 2.
ftr_mapping = {'H': 0, 'A': 1, 'D': 2}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on an already-encoded column, or meeting an unexpected
# label, would silently wipe the target. Encode only when every value maps,
# leave an already-encoded column untouched, and fail loudly otherwise.
_ftr_encoded = data['FTR'].map(ftr_mapping)
if _ftr_encoded.notna().all():
    data['FTR'] = _ftr_encoded
elif not data['FTR'].isin(list(ftr_mapping.values())).all():
    raise ValueError(
        "FTR must contain only the labels "
        f"{list(ftr_mapping)} or only their codes {list(ftr_mapping.values())}; "
        f"found {data['FTR'].drop_duplicates().tolist()}"
    )

In [13]:
# Display the frame again to check the FTR column after the mapping step.
data

Unnamed: 0,FTR,HST,AST,Diff_Oa,Diff_At,Diff_Mid,Diff_Def,Diff_R,Diff_xG,Diff_xA,...,Diff_PPDA,Diff_Y,Diff_S,Diff_F,Diff_C,Diff_AvgAge,Diff_MV,Diff_Squad,Diff_FTG,Diff_HTG
0,1,3.4,5.4,-1.0,-4.0,-3.0,0.0,0.0,-0.708,-0.518,...,5.982,-1.0,-0.6,-1.8,-1.0,1.0,-21.800000,1.0,-1.0,-0.4
1,0,5.4,5.8,2.0,0.0,3.0,2.0,0.0,-0.346,0.294,...,3.558,0.6,0.2,-5.6,-0.8,0.0,189.250000,-2.0,-0.4,0.0
2,1,4.4,8.4,-9.0,-9.0,-10.0,-8.0,0.0,-0.146,-0.030,...,4.382,0.2,-5.4,2.0,0.8,0.0,-656.100001,-4.0,-2.2,-1.8
3,2,4.0,3.2,7.0,4.0,9.0,7.0,0.0,0.316,0.236,...,-1.228,0.0,-0.6,-0.4,-1.4,-2.0,82.700000,2.0,-0.2,-0.6
4,0,6.0,2.4,11.0,16.0,11.0,10.0,0.2,1.712,1.114,...,-2.662,-1.4,7.8,-4.0,0.8,0.0,536.250000,-8.0,1.4,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12543,0,5.8,4.0,2.0,5.0,1.0,1.0,0.0,0.658,0.484,...,2.352,0.2,-1.0,-2.8,1.0,2.0,50.050001,6.0,0.8,0.4
12544,0,4.8,4.2,5.0,6.0,4.0,0.0,0.2,-0.030,0.310,...,1.318,0.8,-0.2,-7.8,-0.2,0.0,324.020000,1.0,1.8,1.0
12545,0,3.8,4.8,-2.0,-3.0,-2.0,-2.0,0.0,0.226,0.696,...,3.474,-0.4,-1.4,-5.0,0.0,-1.0,-57.870000,3.0,0.2,0.2
12546,2,3.6,4.8,4.0,2.0,3.0,4.0,-0.2,-0.490,-0.380,...,0.534,0.0,-1.4,1.4,0.8,0.0,129.470000,5.0,-0.8,0.2


In [14]:
# List the columns that remain in the working frame.
data.columns

Index(['FTR', 'HST', 'AST', 'Diff_Oa', 'Diff_At', 'Diff_Mid', 'Diff_Def',
       'Diff_R', 'Diff_xG', 'Diff_xA', 'Diff_xPTS', 'Diff_PPDA', 'Diff_Y',
       'Diff_S', 'Diff_F', 'Diff_C', 'Diff_AvgAge', 'Diff_MV', 'Diff_Squad',
       'Diff_FTG', 'Diff_HTG'],
      dtype='object')

In [16]:
# Target is the FTR column; every other column of `data` is a feature.
y = data['FTR']
X = data.drop(columns=['FTR'])

# Hold out 30% of the rows for evaluation. The fixed random_state keeps the
# same rows in each split from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [17]:
# Build a seeded random forest and fit it on the training split in one step
# (fit returns the fitted estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# One print call with a newline separator emits the same three lines.
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.5389110225763613
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.79      0.66      1666
           1       0.52      0.54      0.53      1140
           2       0.42      0.11      0.17       959

    accuracy                           0.54      3765
   macro avg       0.50      0.48      0.45      3765
weighted avg       0.51      0.54      0.49      3765



In [19]:
# Seeded gradient-boosting classifier, fitted on the training split
# (fit returns the fitted estimator itself).
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels.
gb_y_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_y_pred)
gb_report = classification_report(y_test, gb_y_pred)

# Emit the model name, accuracy and per-class report, one per line.
print(
    "GradientBoostingClassifier",
    f"Accuracy: {gb_accuracy}",
    "Classification Report:",
    gb_report,
    sep="\n",
)

GradientBoostingClassifier
Accuracy: 0.5192563081009296
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.80      0.64      1666
           1       0.50      0.53      0.51      1140
           2       0.25      0.03      0.05       959

    accuracy                           0.52      3765
   macro avg       0.43      0.45      0.40      3765
weighted avg       0.45      0.52      0.45      3765



In [20]:
# Seeded AdaBoost classifier, fitted on the training split
# (fit returns the fitted estimator itself).
ada_model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels.
ada_y_pred = ada_model.predict(X_test)
ada_accuracy = accuracy_score(y_test, ada_y_pred)
ada_report = classification_report(y_test, ada_y_pred)

# Emit the model name, accuracy and per-class report, one per line.
print(
    "AdaBoostClassifier",
    f"Accuracy: {ada_accuracy}",
    "Classification Report:",
    ada_report,
    sep="\n",
)

AdaBoostClassifier
Accuracy: 0.5120849933598938
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.80      0.64      1666
           1       0.47      0.51      0.49      1140
           2       0.31      0.02      0.04       959

    accuracy                           0.51      3765
   macro avg       0.44      0.44      0.39      3765
weighted avg       0.46      0.51      0.44      3765



In [21]:
# Initialize and train the XGBClassifier.
# `use_label_encoder` is deliberately not passed: the flag is deprecated in
# XGBoost and recent releases only warn that it is unused. The target built in
# this notebook is already encoded as the integers 0, 1 and 2.
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels.
xgb_y_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_report = classification_report(y_test, xgb_y_pred)

print("XGBClassifier")
print(f"Accuracy: {xgb_accuracy}")
print("Classification Report:")
print(xgb_report)

XGBClassifier
Accuracy: 0.5054448871181939
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.71      0.62      1666
           1       0.50      0.48      0.49      1140
           2       0.32      0.17      0.22       959

    accuracy                           0.51      3765
   macro avg       0.46      0.46      0.45      3765
weighted avg       0.48      0.51      0.48      3765



In [22]:
# Pair each importance score of the fitted random forest with its feature
# name, ordered from highest to lowest.
rf_importances = model.feature_importances_
rf_feature_importance = (
    pd.Series(data=rf_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Report the ten highest-ranked features.
print("Feature Importances:")
print(rf_feature_importance.head(10))

Feature Importances:
Diff_MV       0.091137
Diff_xPTS     0.065531
Diff_xA       0.062503
Diff_xG       0.061496
Diff_PPDA     0.061015
Diff_S        0.053813
Diff_F        0.052690
Diff_C        0.051855
Diff_At       0.049761
Diff_Squad    0.049644
dtype: float64


In [23]:
# Pair each importance score of the fitted gradient-boosting model with its
# feature name, ordered from highest to lowest.
gb_importances = gb_model.feature_importances_
gb_feature_importance = (
    pd.Series(data=gb_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Report the ten highest-ranked features.
print("Feature Importances:")
print(gb_feature_importance.head(10))

Feature Importances:
Diff_MV       0.517520
Diff_xPTS     0.064960
Diff_Mid      0.042146
Diff_Squad    0.040261
Diff_xA       0.034934
Diff_xG       0.033384
Diff_PPDA     0.032269
Diff_C        0.029954
Diff_At       0.024940
AST           0.023275
dtype: float64


In [24]:
# Pair each importance score of the fitted AdaBoost model with its feature
# name, ordered from highest to lowest.
ada_importances = ada_model.feature_importances_
ada_feature_importance = (
    pd.Series(data=ada_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Report the ten highest-ranked features.
print("Feature Importances:")
print(ada_feature_importance.head(10))

Feature Importances:
Diff_MV        0.26
Diff_Squad     0.12
Diff_xPTS      0.08
Diff_xA        0.08
Diff_C         0.06
HST            0.04
Diff_AvgAge    0.04
Diff_S         0.04
AST            0.04
Diff_PPDA      0.04
dtype: float64


In [25]:
# Pair each importance score of the fitted XGBoost model with its feature
# name, ordered from highest to lowest.
xgb_importances = xgb_model.feature_importances_
xgb_feature_importance = (
    pd.Series(data=xgb_importances, index=X.columns)
    .sort_values(ascending=False)
)

# Report the ten highest-ranked features.
print("Feature Importances:")
print(xgb_feature_importance.head(10))

Feature Importances:
Diff_MV       0.147533
Diff_Mid      0.057802
Diff_Squad    0.047354
Diff_xPTS     0.047089
Diff_xA       0.046338
Diff_PPDA     0.046064
Diff_S        0.045581
Diff_xG       0.045334
Diff_C        0.044761
Diff_Oa       0.044550
dtype: float32
