In [270]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

## 0. DataFrame

In [271]:
football_df = pd.read_csv('data/all_data_with_elo.csv', low_memory = False)
football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHD,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO
0,0,D1,2003/8/1,Bayern Munich,Ein Frankfurt,3.0,1.0,H,17.0,6.0,...,4.50,10.00,,,,,,,1859.379272,1593.249268
1,1,F1,2003/8/1,Lille,Lyon,1.0,0.0,H,,,...,3.00,2.20,,,,,,,1612.968018,1726.539795
2,2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,A,,,...,3.10,5.00,-0.75,2.050,1.850,,,,1702.604858,1611.196045
3,3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,A,,,...,3.10,2.40,0.00,1.925,1.975,,,,1685.016113,1665.625732
4,4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,A,10.0,16.0,...,3.50,5.00,-0.75,1.800,2.100,,,,1718.566284,1649.805298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37776,37776,F1,2024/10/6,Reims,Montpellier,4.0,2.0,H,18.0,10.0,...,3.80,4.00,-0.50,1.780,2.030,-0.75,1.83,2.10,1633.626221,1645.806641
37777,37777,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,H,14.0,11.0,...,3.25,2.88,0.00,1.700,2.100,0.00,1.94,1.99,1676.242676,1709.259521
37778,37778,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,D,16.0,4.0,...,3.20,2.30,0.25,1.890,2.010,0.00,1.98,1.95,1766.551880,1828.522095
37779,37779,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,D,8.0,12.0,...,3.40,2.40,0.25,1.810,2.090,0.00,1.95,1.98,1608.732544,1719.138184


## 1. Descriptive Statistics 

**1.1 DataFrame Shape**

In [272]:
# no. rows and no. cols
football_df.shape

(37781, 29)

In [273]:
# feature names
print(football_df.columns.tolist())

['Unnamed: 0', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'AHCh', 'B365CAHH', 'B365CAHA', 'HomeTeamELO', 'AwayTeamELO']


**1.2 NaN Values**

In [274]:
football_df.isnull().sum()

Unnamed: 0         0
Div                0
Date               0
HomeTeam           0
AwayTeam           0
FTHG               0
FTAG               0
FTR                0
HS              1762
AS              1762
HST             2568
AST             2568
B365H             49
B365D             49
B365A             49
IWH             1324
IWD             1324
IWA             1324
WHH              573
WHD              573
WHA              573
AHh              262
B365AHH          276
B365AHA          276
AHCh           28479
B365CAHH       28481
B365CAHA       28481
HomeTeamELO      125
AwayTeamELO      126
dtype: int64

In [275]:
# total elements in 
football_df.size

1095649

In [276]:
# total number of NaN
football_df.size - football_df.count().sum()

101004

In [277]:
# total number of NaN rows
football_df.isnull().any(axis = 1).sum()

29734

In [278]:
# total number of NaN columns
football_df.isnull().any(axis = 0).sum()

21

## 2. Data Wrangling and Feature Transformation/Development

### 2.1 NaN Handling

`TODO`: drop NaN values along columns: {Date, Home Team, Away Team, FTR} <br>
`TODO`: identify betting odds w/ most available data

In [279]:
# 当前方法仅提取这几个字段 分区 日期 主队 客队 full-time-result 三家机构的胜平负 主队ELO评分 客队ELO评分
# nan_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A', 
#             'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO']
nan_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'B365H', 'B365D', 'B365A', 
            'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO']

In [280]:
asia_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO']

In [281]:
asia_football_df = football_df[asia_mask]
asia_football_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO
0,D1,2003/8/1,Bayern Munich,Ein Frankfurt,3.0,1.0,,,,1859.379272,1593.249268
1,F1,2003/8/1,Lille,Lyon,1.0,0.0,,,,1612.968018,1726.539795
2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045
3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732
4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298
...,...,...,...,...,...,...,...,...,...,...,...
37776,F1,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641
37777,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521
37778,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095
37779,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184


In [282]:
asia_football_df['asia_final_result'] = asia_football_df['FTHG'] - asia_football_df['FTAG'] + asia_football_df['AHh']
asia_football_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result
0,D1,2003/8/1,Bayern Munich,Ein Frankfurt,3.0,1.0,,,,1859.379272,1593.249268,
1,F1,2003/8/1,Lille,Lyon,1.0,0.0,,,,1612.968018,1726.539795,
2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,-1.75
3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,-1.00
4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,-3.75
...,...,...,...,...,...,...,...,...,...,...,...,...
37776,F1,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,1.50
37777,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,1.00
37778,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,0.25
37779,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,0.25


In [283]:
asia_football_df_noNone = asia_football_df.dropna()
asia_football_df_noNone

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result
2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,-1.75
3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,-1.00
4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,-3.75
5,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,-3.75
7,F1,2003/8/2,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,-0.75
...,...,...,...,...,...,...,...,...,...,...,...,...
37776,F1,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,1.50
37777,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,1.00
37778,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,0.25
37779,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,0.25


In [284]:
asia_football_df_noNone.reset_index(inplace=True, drop=True)
asia_football_df_noNone

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result
0,F1,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,-1.75
1,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,-1.00
2,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,-3.75
3,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,-3.75
4,F1,2003/8/2,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,-0.75
...,...,...,...,...,...,...,...,...,...,...,...,...
37251,F1,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,1.50
37252,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,1.00
37253,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,0.25
37254,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,0.25


In [285]:
conditions = [
    asia_football_df_noNone['asia_final_result'] < -0.25,
    asia_football_df_noNone['asia_final_result'] == -0.25,
    asia_football_df_noNone['asia_final_result'] == 0,
    asia_football_df_noNone['asia_final_result'] == 0.25,
    asia_football_df_noNone['asia_final_result'] > 0.25,
]
easy_conditions = [
    asia_football_df_noNone['asia_final_result'] <= -0.25,
    asia_football_df_noNone['asia_final_result'] == 0,
    asia_football_df_noNone['asia_final_result'] >= 0.25,
]
labels = [-2, -1, 0, 1, 2]
easy_labels = [-1, 0, 1]

asia_football_df_noNone['label'] = np.select(conditions, labels)
asia_football_df_noNone['easy_label'] = np.select(easy_conditions, easy_labels)
asia_football_df_noNone

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result,label,easy_label
0,F1,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,-1.75,-2,-1
1,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,-1.00,-2,-1
2,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,-3.75,-2,-1
3,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,-3.75,-2,-1
4,F1,2003/8/2,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,-0.75,-2,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,F1,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,1.50,2,1
37252,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,1.00,2,1
37253,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,0.25,1,1
37254,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,0.25,1,1


In [286]:
asia_football_df_noNone['label'].mean()

0.018547348078161906

In [287]:
asia_football_df_noNone['easy_label'].mean()

0.0003757783981103715

In [288]:
asia_football_df_noNone_E0 = asia_football_df_noNone[asia_football_df_noNone['Div'] == 'E0']
asia_football_df_noNone_E0

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result,label,easy_label
26,E0,2003/8/16,Arsenal,Everton,2.0,1.0,-1.25,1.975,1.925,1863.339844,1674.355957,-0.25,-1,-1
27,E0,2003/8/16,Birmingham,Tottenham,1.0,0.0,-0.25,2.100,1.800,1633.010498,1633.332397,0.75,2,1
28,E0,2003/8/16,Blackburn,Wolves,5.0,1.0,-0.75,1.950,1.950,1726.847656,1598.578247,3.25,2,1
29,E0,2003/8/16,Fulham,Middlesbrough,3.0,2.0,-0.25,2.100,1.800,1653.798340,1664.969727,0.75,2,1
30,E0,2003/8/16,Leicester,Southampton,2.0,2.0,0.00,1.850,2.050,1603.581177,1677.842041,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37226,E0,2024/10/5,Man City,Fulham,3.0,2.0,-1.75,1.900,2.000,2050.554443,1716.248901,-0.75,-2,-1
37235,E0,2024/10/5,West Ham,Ipswich,4.0,1.0,-0.75,2.040,1.860,1726.226807,1591.764038,2.25,2,1
37237,E0,2024/10/6,Aston Villa,Man United,0.0,0.0,-0.25,1.950,1.950,1770.394287,1779.007446,-0.25,-1,-1
37240,E0,2024/10/6,Brighton,Tottenham,3.0,2.0,0.25,1.930,1.970,1713.138794,1790.535156,1.25,2,1


In [289]:
# 队伍名唯一类别编码
asia_football_df_noNone['div'] = pd.Categorical(asia_football_df_noNone['Div']).codes
asia_football_df_noNone['home_team'] = pd.Categorical(asia_football_df_noNone['HomeTeam']).codes
asia_football_df_noNone['away_team'] = pd.Categorical(asia_football_df_noNone['AwayTeam']).codes
asia_football_df_noNone

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result,label,easy_label,div,home_team,away_team
0,F1,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,-1.75,-2,-1,2,16,137
1,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,-1.00,-2,-1,2,80,123
2,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,-3.75,-2,-1,0,81,82
3,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,-3.75,-2,-1,0,86,199
4,F1,2003/8/2,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,-0.75,-2,-1,2,108,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,F1,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,1.50,2,1,2,158,130
37252,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,1.00,2,1,4,167,22
37253,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,0.25,1,1,4,171,14
37254,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,0.25,1,1,2,179,108


### one-hot 编码

In [290]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

div_encoder = OneHotEncoder()
home_encoder = OneHotEncoder()
away_encoder = OneHotEncoder()
onehot_div = div_encoder.fit_transform(asia_football_df_noNone.Div.values.reshape(-1, 1)).toarray().astype(int)
onehot_div_df = pd.DataFrame(onehot_div, columns=["Div " + str(int(i)) for i in range(onehot_div.shape[1])])

onehot_home = home_encoder.fit_transform(asia_football_df_noNone.HomeTeam.values.reshape(-1, 1)).toarray().astype(int)
onehot_home_df = pd.DataFrame(onehot_home, columns=['HomeTeam ' + str(int(i)) for i in np.arange(onehot_home.shape[1])])

onehot_away = away_encoder.fit_transform(asia_football_df_noNone.AwayTeam.values.reshape(-1, 1)).toarray().astype(int)
onehot_away_df = pd.DataFrame(onehot_away, columns=['AwayTeam ' + str(int(i)) for i in np.arange(onehot_away.shape[1])])
asia_football_df_noNone_noDiv = pd.concat([asia_football_df_noNone, onehot_div_df, onehot_home_df, onehot_away_df], axis=1)
asia_football_df_noNone_noDiv.drop(columns=['Div'], inplace=True)

asia_football_df_noNone_noDiv

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,...,AwayTeam 197,AwayTeam 198,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203,AwayTeam 204,AwayTeam 205,AwayTeam 206
0,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,...,0,0,0,0,0,0,0,0,0,0
1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,...,0,0,0,0,0,0,0,0,0,0
2,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,...,0,0,0,0,0,0,0,0,0,0
3,2003/8/2,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,...,0,0,1,0,0,0,0,0,0,0
4,2003/8/2,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,...,0,0,0,0,0,0,0,0,0,0
37252,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,...,0,0,0,0,0,0,0,0,0,0
37253,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,...,0,0,0,0,0,0,0,0,0,0
37254,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,...,0,0,0,0,0,0,0,0,0,0


### 日期时间 one-hot

In [291]:
import copy

In [292]:
asia_football_df_noNone_noDiv_noDate = copy.deepcopy(asia_football_df_noNone_noDiv)

asia_football_df_noNone_noDiv_noDate['Year'] = pd.DatetimeIndex(asia_football_df_noNone_noDiv.Date).year
asia_football_df_noNone_noDiv_noDate['Month'] = pd.DatetimeIndex(asia_football_df_noNone_noDiv.Date).month
asia_football_df_noNone_noDiv_noDate

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,...,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203,AwayTeam 204,AwayTeam 205,AwayTeam 206,Year,Month
0,2003/8/2,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,...,0,0,0,0,0,0,0,0,2003,8
1,2003/8/2,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,...,0,0,0,0,0,0,0,0,2003,8
2,2003/8/2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,...,0,0,0,0,0,0,0,0,2003,8
3,2003/8/2,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,...,1,0,0,0,0,0,0,0,2003,8
4,2003/8/2,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,...,0,0,0,0,0,0,0,0,2003,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,2024/10/6,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,...,0,0,0,0,0,0,0,0,2024,10
37252,2024/10/6,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,...,0,0,0,0,0,0,0,0,2024,10
37253,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,...,0,0,0,0,0,0,0,0,2024,10
37254,2024/10/6,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,...,0,0,0,0,0,0,0,0,2024,10


In [293]:
asia_football_df_noNone_noDiv_noDate['Sin_Month'] = np.sin(2*np.pi*asia_football_df_noNone_noDiv_noDate.Month/12)
asia_football_df_noNone_noDiv_noDate['Cos_Month'] = np.cos(2*np.pi*asia_football_df_noNone_noDiv_noDate.Month/12)

asia_football_df_noNone_noDiv_noDate['DayofYear'] = pd.DatetimeIndex(asia_football_df_noNone_noDiv_noDate.Date).dayofyear
asia_football_df_noNone_noDiv_noDate['Sin_Day'] = np.sin(2*np.pi*asia_football_df_noNone_noDiv_noDate.DayofYear/365)
asia_football_df_noNone_noDiv_noDate['Cos_Day'] = np.cos(2*np.pi*asia_football_df_noNone_noDiv_noDate.DayofYear/365)

asia_football_df_noNone_noDiv_noDate.drop(columns = ['Date','Month'], inplace = True)
# learning_df.drop(columns = ['Date'], inplace = True)
asia_football_df_noNone_noDiv_noDate

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result,...,AwayTeam 203,AwayTeam 204,AwayTeam 205,AwayTeam 206,Year,Sin_Month,Cos_Month,DayofYear,Sin_Day,Cos_Day
0,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,-1.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
1,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,-1.00,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,-3.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
3,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,-3.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
4,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,-0.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,1.50,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37252,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,1.00,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37253,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,0.25,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37254,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,0.25,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381


明确输入：hometeam，awayteam--onhot，AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,

In [294]:
# 保存数据
import gzip
with gzip.open('pkl/asia_football_df_noNone_noDiv.pkl.gz', 'wb') as f:
    asia_football_df_noNone_noDiv_noDate.to_pickle(f)

In [295]:
with gzip.open('pkl/asia_football_df_noNone_noDiv.pkl.gz', 'rb') as f:
    df_loaded = pd.read_pickle(f)

# CNN 架构模型

In [296]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNModel(nn.Module):
    def __init__(self, num_classes=5):
        super(CNNModel, self).__init__()
        # 特征序列 A
        self.convA = nn.Conv1d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.poolA = nn.MaxPool1d(kernel_size=2)
        
        # 特征序列 B
        self.convB = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.poolB = nn.MaxPool1d(kernel_size=2)
        
        # 特征序列 C
        self.convC = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.poolC = nn.MaxPool1d(kernel_size=2)
        
        # 全连接层
        self.fc1 = nn.Linear(in_features=128, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=num_classes)
        
    def forward(self, time_encoding, individual_encoding, featureA, featureB, featureC):
        # 特征序列 A
        xA = self.poolA(F.relu(self.convA(featureA.unsqueeze(1))))
        xA = xA.view(xA.size(0), -1)
        
        # 特征序列 B
        xB = self.poolB(F.relu(self.convB(featureB.unsqueeze(1))))
        xB = xB.view(xB.size(0), -1)
        
        # 特征序列 C
        xC = self.poolC(F.relu(self.convC(featureC.unsqueeze(1))))
        xC = xC.view(xC.size(0), -1)
        
        # 融合特征
        x = torch.cat((xA, xB, xC), dim=1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x
    
        

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# 假设你的数据如下
# df = pd.DataFrame({
#     'feature1': [...],
#     'feature2': [...],
#     'feature3': [...],
#     'label': [...]
# })
index_row_start = 0
index_row_end = -1
# 1. 准备数据
# 提取特征和标签
X = asia_football_df_noNone_noDiv_noDate.iloc[index_row_start:index_row_end][[
    'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO', 'Div 0', 'Div 1', 'Div 2', 'Div 3', 'Div 4', 'Year', 'Sin_Month', 'Cos_Month', 'DayofYear', 'Sin_Day', 'Cos_Day', 'DayofYear', 'Sin_Day', 'Cos_Day'
]]  # 特征
y = asia_football_df_noNone_noDiv_noDate.iloc[index_row_start:index_row_end]['easy_label']  # 标签

# 2. 数据标准化（重要）
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# tsne = TSNE(
#     n_components=2,      # 降维到2维
#     perplexity=30,      # 困惑度，通常在5-50之间
#     learning_rate=200,   # 学习率
#     n_iter=1000,        # 迭代次数
#     random_state=42     # 随机种子
# )
# 3. 使用TSNE降维到2维
tsne = TSNE(n_components=2, perplexity=20, random_state=13)
X_tsne = tsne.fit_transform(X)
# ['grey', 'lightcoral', 'chocolate', 'darkorange',
# 'gold', 'olivedrab', 'lawngreen', 'aquamarine',
# 'darkcyan', 'deepskyblue', 'cornflowerblue', 'blue',
# 'mediumslateblue', 'blueviolet', 'violet', 'deeppink']
# color_dict={'E0':'grey', 'F1':'lightcoral', 'D1':'chocolate', ''}
# 4. 可视化
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis')
plt.colorbar(scatter)
plt.title('TSNE Visualization')
plt.xlabel('TSNE Component 1')
plt.ylabel('TSNE Component 2')
plt.show()

# # 5. 如果想要保存TSNE结果
# df_tsne = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
# df_tsne['label'] = y

# GRU 模型架构

In [297]:
import torch
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, num_classes=5):
        super(GRUModel, self).__init__()
        # 特征序列 A
        self.gruA = nn.GRU(input_size=1, hidden_size=16, batch_first=True)
        # 特征序列 B
        self.gruB = nn.GRU(input_size=1, hidden_size=16, batch_first=True)
        # 特征序列 C
        self.gruC = nn.GRU(input_size=1, hidden_size=16, batch_first=True)
        
        # 全连接层
        self.fc1 = nn.Linear(in_features=16 * 3, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)
        
    def forward(self, time_encoding, individual_encoding, featureA, featureB, featureC):
        # 特征序列 A
        xA, _ = self.gruA(featureA.unsqueeze(2))
        xA = xA[:, -1, :]
        
        # 特征序列 B
        xB, _ = self.gruB(featureB.unsqueeze(2))
        xB = xB[:, -1, :]
        
        # 特征序列 C
        xC, _ = self.gruC(featureC.unsqueeze(2))
        xC = xC[:, -1, :]
        
        # 融合特征
        x = torch.cat((xA, xB, xC), dim=1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x
            

# 改进的CNN 架构

In [298]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNModelWithAttention(nn.Module):
    def __init__(self, num_classes=5):
        super(CNNModelWithAttention, self).__init__()
        # 特征序列 A
        self.convA = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.poolA = nn.MaxPool1d(kernel_size=2)
        
        # 特征序列 B
        self.convB = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.poolB = nn.MaxPool1d(kernel_size=2)
        
        # 特征序列 C
        self.convC = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.poolC = nn.MaxPool1d(kernel_size=2)
        
        # 注意力层
        self.attention = nn.Linear(48, 48)
        
        # 全连接层
        self.fc1 = nn.Linear(in_features=48, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)
        
    def forward(self, time_encoding, individual_encoding, featureA, featureB, featureC):
        # 特征序列 A
        xA = self.poolA(F.relu(self.convA(featureA.unsqueeze(1))))
        xA = xA.view(xA.size(0), -1)
        
        # 特征序列 B
        xB = self.poolB(F.relu(self.convB(featureB.unsqueeze(1))))
        xB = xB.view(xB.size(0), -1)
        
        # 特征序列 C
        xC = self.poolC(F.relu(self.convC(featureC.unsqueeze(1))))
        xC = xC.view(xC.size(0), -1)
        
        # 融合特征
        x = torch.cat((xA, xB, xC), dim=1)
        
        # 注意力机制
        attention_weights = F.softmax(self.attention(x), dim = 1)
        x = x * attention_weights
        
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

# 改进的GRU架构

In [299]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GRUModelWithAttention(nn.Module):
    def __init__(self, num_classes=5):
        super(GRUModelWithAttention, self).__init__()
        # 特征序列 A
        self.gruA = nn.GRU(input_size=1, hidden_size=16, batch_first=True)
        # 特征序列 B
        self.gruB = nn.GRU(input_size=1, hidden_size=16, batch_first=True)
        # 特征序列 C
        self.gruC = nn.GRU(input_size=1, hidden_size=16, batch_first=True)
        # 注意力层
        self.attention = nn.Linear(48, 48)
        # 全连接层
        self.fc1 = nn.Linear(in_features=16 * 3, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)
        
    def forward(self, time_encoding, individual_encoding, featureA, featureB, featureC):
        # 特征序列 A
        xA, _ = self.gruA(featureA.unsqueeze(2))
        xA = xA[:, -1, :]
        
        # 特征序列 B
        xB, _ = self.gruB(featureB.unsqueeze(2))
        xB = xB[:, -1, :]
        
        # 特征序列 C
        xC, _ = self.gruC(featureC.unsqueeze(2))
        xC = xC[:, -1, :]
        
        # 融合特征
        x = torch.cat((xA, xB, xC), dim=1)
        
        # 注意力机制
        attention_weights = F.softmax(self.attention(x), dim = 1)
        x = x * attention_weights
        
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

# 多特征处理模型架构

In [300]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiFeatureModel(nn.Module):
    def __init__(self, num_classes=3):
        super(MultiFeatureModel, self).__init__()
        
        # 数值特征处理
        self.fc_numerical = nn.Sequential(
            nn.Linear(in_features=16 * 3, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64)
        )
        
        # 序列特征处理
        self.gru = nn.GRU(input_size=3, hidden_size=16, batch_first=True)
        
        # 类别特征处理（使用嵌入层）
        self.embedding = nn.Embedding(num_embeddings=3, embedding_dim=128)
        
        # 输出层
        self.fc_output = nn.Linear(64+64+64, num_classes)
        
    def forward(self, numerical_features, sequence_features, categorical_features):
        # 处理数值特征
        numerical_out = self.fc_numeral(numerical_features)
        
        # 处理序列特征
        seq_out, _ = self.gru(sequence_features)
        seq_out = seq_out[:, -1, :] # 取最后的时刻输出
        
        # 处理类别特征
        categorical_out = self.embedding(categorical_features).view(categorical_features.size(0), -1)
        
        # 融合输出
        combined = torch.cat((numerical_out, seq_out, categorical_out), dim=1)
        output = self.fc_output(combined)
        
        return output
        
        

### 更新后的多特征处理模型

* 交互层InteractionLayer使用了一个线性层来捕捉特征之间的交互作用。输入的特征通过张量乘法得到交互特征，然后通过线性层进行处理
* 特征维度调整 在设计输出层时，考虑了来自数值特征、序列特征、类别特征以及交互特征的输出维度
* 灵活性 可以根据实际特征的数量和类型调整输入维度和结构
* 这种设计能够有效地捕捉特征之间的复杂交互，有助于提升模型的表现


In [301]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class InteractionLayer(nn.Module):
    def __init__(self, input_size):
        super(InteractionLayer, self).__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=input_size)
        
    def forward(self, x):
        interactions = torch.bmm(x.unsqueeze(1), x.unsqueeze(2)).view(x.size(0), -1)
        return F.relu(self.linear(interactions))
    
class MultiFeatureModel(nn.Module):
    def __init__(self, num_classes, numerical_inputsize, categorical_inputsize, sequence_inputsize, embedding_size):
        super(MultiFeatureModel, self).__init__()
        # 数值特征处理
        self.fc_numerical = nn.Sequential(
            nn.Linear(in_features=numerical_inputsize, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64)
        )
        
        # 序列特征处理
        self.gru = nn.GRU(input_size=sequence_inputsize, hidden_size=16, batch_first=True)
        
        # 类别特征处理（使用嵌入层）
        self.embedding = nn.Embedding(num_embeddings=categorical_inputsize, embedding_dim=embedding_size)
        
        # 交互层
        self.interaction_layer = InteractionLayer(64*3)
        
        # 输出层
        self.fc_output = nn.Linear(64+64+64, num_classes)
        
    def forward(self, numerical_features, sequence_features, categorical_features):
        # 处理数值特征
        numerical_out = self.fc_numeral(numerical_features)
        
        # 处理序列特征
        seq_out, _ = self.gru(sequence_features)
        seq_out = seq_out[:, -1, :] # 取最后的时刻输出
        
        # 处理类别特征
        categorical_out = self.embedding(categorical_features).view(categorical_features.size(0), -1)
        
        # 融合输出
        combined = torch.cat((numerical_out, seq_out, categorical_out), dim=1)
        
        combined_out = self.interaction_layer(combined)
        final_out = torch.cat((numerical_out, seq_out, categorical_out, combined_out), dim=1)
        
        output = self.fc_output(final_out)
        
        return output

In [302]:
# 删除指定列中含有缺失值的行
#football_df.FTR.replace('nan', np.nan, inplace=True)
nan_football_df = football_df.dropna(subset = nan_mask)
nan_football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHD,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO
2,2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,A,,,...,3.1,5.00,-0.75,2.050,1.850,,,,1702.604858,1611.196045
3,3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,A,,,...,3.1,2.40,0.00,1.925,1.975,,,,1685.016113,1665.625732
4,4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,A,10.0,16.0,...,3.5,5.00,-0.75,1.800,2.100,,,,1718.566284,1649.805298
5,5,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,A,23.0,19.0,...,3.4,4.00,-0.75,2.025,1.875,,,,1719.916748,1692.120972
7,7,F1,2003/8/2,Lens,Le Mans,0.0,0.0,D,,,...,3.3,4.50,-0.75,1.900,2.000,,,,1697.354004,1539.958130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36541,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,3.2,3.40,-0.25,1.940,1.990,-0.25,1.89,2.04,1563.726196,1628.365479
36542,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,6.0,13.00,-1.75,1.990,1.940,-1.75,1.98,1.95,1967.505371,1645.844849
36543,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,4.2,1.53,1.00,1.920,2.010,1.25,1.83,2.10,1629.086792,1858.904297
36544,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,3.9,4.60,-0.75,1.950,1.980,-0.75,1.98,1.95,1649.258545,1554.821045


In [303]:
# resize shape
football_df.shape[0] - nan_football_df.shape[0]

2377

### 2.2 Feature Encoding <br>
* $\phi(Date)$ $\Rightarrow$ one column for *year*, second column for *month*, third column for *day of year*
* One hot encode Division, Home and Away Teams
* Label encode Full Time Result (Win/Draw/Loss)

In [304]:
feats = nan_mask

In [305]:
nan_football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHD,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO
2,2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,A,,,...,3.1,5.00,-0.75,2.050,1.850,,,,1702.604858,1611.196045
3,3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,A,,,...,3.1,2.40,0.00,1.925,1.975,,,,1685.016113,1665.625732
4,4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,A,10.0,16.0,...,3.5,5.00,-0.75,1.800,2.100,,,,1718.566284,1649.805298
5,5,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,A,23.0,19.0,...,3.4,4.00,-0.75,2.025,1.875,,,,1719.916748,1692.120972
7,7,F1,2003/8/2,Lens,Le Mans,0.0,0.0,D,,,...,3.3,4.50,-0.75,1.900,2.000,,,,1697.354004,1539.958130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36541,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,3.2,3.40,-0.25,1.940,1.990,-0.25,1.89,2.04,1563.726196,1628.365479
36542,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,6.0,13.00,-1.75,1.990,1.940,-1.75,1.98,1.95,1967.505371,1645.844849
36543,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,4.2,1.53,1.00,1.920,2.010,1.25,1.83,2.10,1629.086792,1858.904297
36544,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,3.9,4.60,-0.75,1.950,1.980,-0.75,1.98,1.95,1649.258545,1554.821045


In [306]:
learning_df = nan_football_df.copy()[feats]
learning_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,IWH,IWD,IWA,WHH,WHD,WHA,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO
2,F1,2003/8/2,Auxerre,Nice,A,1.727,3.100,4.500,1.70,3.10,4.40,1.66,3.1,5.00,-0.75,2.050,1.850,1702.604858,1611.196045
3,F1,2003/8/2,Guingamp,Marseille,A,2.500,2.875,2.625,2.70,2.90,2.40,2.60,3.1,2.40,0.00,1.925,1.975,1685.016113,1665.625732
4,D1,2003/8/2,Hamburg,Hannover,A,1.571,3.500,5.000,1.65,3.30,4.40,1.57,3.5,5.00,-0.75,1.800,2.100,1718.566284,1649.805298
5,D1,2003/8/2,Hertha,Werder Bremen,A,1.833,3.200,3.750,1.80,3.10,3.80,1.72,3.4,4.00,-0.75,2.025,1.875,1719.916748,1692.120972
7,F1,2003/8/2,Lens,Le Mans,D,1.571,3.250,5.500,1.60,3.30,4.80,1.66,3.3,4.50,-0.75,1.900,2.000,1697.354004,1539.958130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36541,SP1,2024/1/3,Granada,Cadiz,H,2.250,3.100,3.500,2.25,3.20,3.35,2.15,3.2,3.40,-0.25,1.940,1.990,1563.726196,1628.365479
36542,SP1,2024/1/3,Real Madrid,Mallorca,H,1.200,6.500,15.000,1.22,6.50,13.00,1.20,6.0,13.00,-1.75,1.990,1.940,1967.505371,1645.844849
36543,SP1,2024/1/4,Las Palmas,Barcelona,A,5.500,4.330,1.570,5.50,4.30,1.57,5.50,4.2,1.53,1.00,1.920,2.010,1629.086792,1858.904297
36544,SP1,2024/1/4,Osasuna,Almeria,H,1.700,4.000,4.750,1.70,3.90,4.80,1.65,3.9,4.60,-0.75,1.950,1.980,1649.258545,1554.821045


In [307]:
learning_df.reset_index(inplace=True, drop=True)
learning_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,IWH,IWD,IWA,WHH,WHD,WHA,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO
0,F1,2003/8/2,Auxerre,Nice,A,1.727,3.100,4.500,1.70,3.10,4.40,1.66,3.1,5.00,-0.75,2.050,1.850,1702.604858,1611.196045
1,F1,2003/8/2,Guingamp,Marseille,A,2.500,2.875,2.625,2.70,2.90,2.40,2.60,3.1,2.40,0.00,1.925,1.975,1685.016113,1665.625732
2,D1,2003/8/2,Hamburg,Hannover,A,1.571,3.500,5.000,1.65,3.30,4.40,1.57,3.5,5.00,-0.75,1.800,2.100,1718.566284,1649.805298
3,D1,2003/8/2,Hertha,Werder Bremen,A,1.833,3.200,3.750,1.80,3.10,3.80,1.72,3.4,4.00,-0.75,2.025,1.875,1719.916748,1692.120972
4,F1,2003/8/2,Lens,Le Mans,D,1.571,3.250,5.500,1.60,3.30,4.80,1.66,3.3,4.50,-0.75,1.900,2.000,1697.354004,1539.958130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35399,SP1,2024/1/3,Granada,Cadiz,H,2.250,3.100,3.500,2.25,3.20,3.35,2.15,3.2,3.40,-0.25,1.940,1.990,1563.726196,1628.365479
35400,SP1,2024/1/3,Real Madrid,Mallorca,H,1.200,6.500,15.000,1.22,6.50,13.00,1.20,6.0,13.00,-1.75,1.990,1.940,1967.505371,1645.844849
35401,SP1,2024/1/4,Las Palmas,Barcelona,A,5.500,4.330,1.570,5.50,4.30,1.57,5.50,4.2,1.53,1.00,1.920,2.010,1629.086792,1858.904297
35402,SP1,2024/1/4,Osasuna,Almeria,H,1.700,4.000,4.750,1.70,3.90,4.80,1.65,3.9,4.60,-0.75,1.950,1.980,1649.258545,1554.821045


**2.2.1 Division and Home/Away Team Encoding**

In [308]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

div_encoder = OneHotEncoder()
home_encoder = OneHotEncoder()
away_encoder = OneHotEncoder()

In [309]:
onehot_div = div_encoder.fit_transform(learning_df.Div.values.reshape(-1,1)).toarray().astype(int)
onehot_div_df = pd.DataFrame(onehot_div, columns = ["Div "+str(int(i)) for i in range(onehot_div.shape[1])])

onehot_home = home_encoder.fit_transform(learning_df.HomeTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_home_df = pd.DataFrame(onehot_home, columns = ['HomeTeam ' + str(int(i)) for i in np.arange(onehot_home.shape[1])])

onehot_away = away_encoder.fit_transform(learning_df.AwayTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_away_df = pd.DataFrame(onehot_away, columns = ['AwayTeam ' + str(int(i)) for i in np.arange(onehot_away.shape[1])])

In [310]:
learning_df = pd.concat([learning_df, onehot_div_df, onehot_home_df, onehot_away_df], axis = 1)
learning_df.drop(columns = ['Div'], inplace = True)

In [311]:
learning_df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,IWH,IWD,IWA,...,AwayTeam 194,AwayTeam 195,AwayTeam 196,AwayTeam 197,AwayTeam 198,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203
0,2003/8/2,Auxerre,Nice,A,1.727,3.100,4.500,1.70,3.10,4.40,...,0,0,0,0,0,0,0,0,0,0
1,2003/8/2,Guingamp,Marseille,A,2.500,2.875,2.625,2.70,2.90,2.40,...,0,0,0,0,0,0,0,0,0,0
2,2003/8/2,Hamburg,Hannover,A,1.571,3.500,5.000,1.65,3.30,4.40,...,0,0,0,0,0,0,0,0,0,0
3,2003/8/2,Hertha,Werder Bremen,A,1.833,3.200,3.750,1.80,3.10,3.80,...,0,0,1,0,0,0,0,0,0,0
4,2003/8/2,Lens,Le Mans,D,1.571,3.250,5.500,1.60,3.30,4.80,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35399,2024/1/3,Granada,Cadiz,H,2.250,3.100,3.500,2.25,3.20,3.35,...,0,0,0,0,0,0,0,0,0,0
35400,2024/1/3,Real Madrid,Mallorca,H,1.200,6.500,15.000,1.22,6.50,13.00,...,0,0,0,0,0,0,0,0,0,0
35401,2024/1/4,Las Palmas,Barcelona,A,5.500,4.330,1.570,5.50,4.30,1.57,...,0,0,0,0,0,0,0,0,0,0
35402,2024/1/4,Osasuna,Almeria,H,1.700,4.000,4.750,1.70,3.90,4.80,...,0,0,0,0,0,0,0,0,0,0


**2.2.2 Full Time Result Encoding**

In [312]:
target_encoder = LabelEncoder()
learning_df['Result'] = target_encoder.fit_transform(learning_df.FTR) 

**2.2.3 Date Encoding**

In [313]:
learning_df['Year'] = pd.DatetimeIndex(learning_df.Date).year

learning_df['Month'] = pd.DatetimeIndex(learning_df.Date).month
learning_df['Sin_Month'] = np.sin(2*np.pi*learning_df.Month/12)
learning_df['Cos_Month'] = np.cos(2*np.pi*learning_df.Month/12)

learning_df['DayofYear'] = pd.DatetimeIndex(learning_df.Date).dayofyear
learning_df['Sin_Day'] = np.sin(2*np.pi*learning_df.DayofYear/365)
learning_df['Cos_Day'] = np.cos(2*np.pi*learning_df.DayofYear/365)

learning_df.drop(columns = ['Date','Month'], inplace = True)
# learning_df.drop(columns = ['Date'], inplace = True)

In [314]:
learning_df

Unnamed: 0,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,IWH,IWD,IWA,WHH,...,AwayTeam 201,AwayTeam 202,AwayTeam 203,Result,Year,Sin_Month,Cos_Month,DayofYear,Sin_Day,Cos_Day
0,Auxerre,Nice,A,1.727,3.100,4.500,1.70,3.10,4.40,1.66,...,0,0,0,0,2003,-0.866025,-0.500000,214,-0.516062,-0.856551
1,Guingamp,Marseille,A,2.500,2.875,2.625,2.70,2.90,2.40,2.60,...,0,0,0,0,2003,-0.866025,-0.500000,214,-0.516062,-0.856551
2,Hamburg,Hannover,A,1.571,3.500,5.000,1.65,3.30,4.40,1.57,...,0,0,0,0,2003,-0.866025,-0.500000,214,-0.516062,-0.856551
3,Hertha,Werder Bremen,A,1.833,3.200,3.750,1.80,3.10,3.80,1.72,...,0,0,0,0,2003,-0.866025,-0.500000,214,-0.516062,-0.856551
4,Lens,Le Mans,D,1.571,3.250,5.500,1.60,3.30,4.80,1.66,...,0,0,0,1,2003,-0.866025,-0.500000,214,-0.516062,-0.856551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35399,Granada,Cadiz,H,2.250,3.100,3.500,2.25,3.20,3.35,2.15,...,0,0,0,2,2024,0.500000,0.866025,3,0.051620,0.998667
35400,Real Madrid,Mallorca,H,1.200,6.500,15.000,1.22,6.50,13.00,1.20,...,0,0,0,2,2024,0.500000,0.866025,3,0.051620,0.998667
35401,Las Palmas,Barcelona,A,5.500,4.330,1.570,5.50,4.30,1.57,5.50,...,0,0,0,0,2024,0.500000,0.866025,4,0.068802,0.997630
35402,Osasuna,Almeria,H,1.700,4.000,4.750,1.70,3.90,4.80,1.65,...,0,0,0,2,2024,0.500000,0.866025,4,0.068802,0.997630


In [None]:
# For Test


### 2.3 Feature Engineering <br>
* $\phi(x)$ feature transformation $\Rightarrow$ last match result, win/loss streak to date, wins to season date
* $\phi(x)$ feature engineering $\Rightarrow$ average the home, away, and draw odds

**2.3.1 Last Match Result** <br>
Indicate the result from the last match played between both teams

In [315]:
# 定义一个函数来计算两队之间上一场比赛的结果
def compute_last_matches(df):
    
    unique_matchups = list(set((list(zip(df.HomeTeam, df.AwayTeam)))))
    df['Last Match Result'] = np.nan
    for home, away in unique_matchups:
        matchup_df = df[(df.HomeTeam == home) & (df.AwayTeam == away)]
        # 使用 shift(1) 方法将 FTR（全场比赛结果）列中的数据向下移动一行，这样每行的 last_match_result 将对应于这两队之前的一场比赛的结果。fill_value='Na' 确保了数据移动后空出的位置填充为 'Na'。
        last_match_result = matchup_df.FTR.shift(1, fill_value='Na')
        df.loc[matchup_df.index, 'Last Match Result'] = last_match_result
        
    lmr_encoder = LabelEncoder()
    df['Last Match Result'] = lmr_encoder.fit_transform(df['Last Match Result'])
    df.drop(columns = ['FTR'], inplace = True)
    return df

In [316]:
def compute_last_n_matches(df, n=5):
    unique_matchups = list(set(zip(df.HomeTeam, df.AwayTeam)))
    df['Last 5 Match Results'] = np.nan  # 新增一列用于存储过去 5 场比赛的结果
    
    for home, away in unique_matchups:
        matchup_df = df[(df.HomeTeam == home) & (df.AwayTeam == away)]
        
        # 获取过去 n 场比赛的结果
        last_n_results = [matchup_df.FTR.shift(i, fill_value='Na') for i in range(1, n+1)]
        
        # 将计算得到的过去 n 场比赛的结果合并为一个字符串或列表，取决于需求
        # 这里使用字符串形式：'result1/result2/...'
        matchup_df['Last 5 Match Results'] = pd.DataFrame(last_n_results).T.apply(lambda x: '/'.join(x), axis=1)
        
        # 将计算得到的结果更新回原始 df 中
        df.loc[matchup_df.index, 'Last 5 Match Results'] = matchup_df['Last 5 Match Results']
    
    # 对 Last 5 Match Results 列进行标签编码
    lmr_encoder = LabelEncoder()
    df['Last 5 Match Results'] = lmr_encoder.fit_transform(df['Last 5 Match Results'])
    
    # 删除原始的 FTR 列
    df.drop(columns=['FTR'], inplace=True)
    
    return df


In [317]:
learning_df = compute_last_matches(learning_df)
# learning_df.drop(columns = ['FTR'], inplace = True)

  df.loc[matchup_df.index, 'Last Match Result'] = last_match_result


**2.3.2 Home and Away Win/Loss Streak** <br>
Important note about this feature: the win/loss streak is the teams *home* and *away* win streak, *not* its ***consecutive*** win/loss streak.

In [318]:
# https://stackoverflow.com/questions/52976336/compute-winning-streak-with-pandas
# https://joshdevlin.com/blog/calculate-streaks-in-pandas/

In [319]:
def compute_winstreak(df):
    
    years = df.Year.unique()
    df_lst = []    
    for year in years:
        
        year_df = df[df.Year == year]
        year_df['HomeWin'] = year_df.Result.replace([0, 1, 2], [0, 0, 1])
        year_df['AwayWin'] = year_df.Result.replace([0, 1, 2], [1, 0, 0])
        year_df['HomeWinStreak'] = None
        year_df['AwayWinStreak'] = None
        
        hometeams = year_df.HomeTeam.unique()
        awayteams = year_df.AwayTeam.unique()
        
        for team in hometeams:
            team_df = year_df[(year_df.HomeTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.HomeWin != team_df.HomeWin.shift()).cumsum()
            team_df['HomeWinStreak'] = team_df[['HomeWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.HomeWinStreak >0, 'HomeWinStreak'] -= 1
            year_df.loc[team_df.index, 'HomeWinStreak'] = team_df.HomeWinStreak
            
        for team in awayteams:
            team_df = year_df[(year_df.AwayTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.AwayWin != team_df.AwayWin.shift()).cumsum()
            team_df['AwayWinStreak'] = team_df[['AwayWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.AwayWinStreak >0, 'AwayWinStreak'] -= 1
            year_df.loc[team_df.index, 'AwayWinStreak'] = team_df.AwayWinStreak
            
        df_lst.append(year_df)
        
    return pd.concat(df_lst, axis = 0).drop(columns = ['HomeWin', 'AwayWin'])#,'DayofYear'])

In [320]:
learning_df = compute_winstreak(learning_df)

**2.3.4 Season Home/Away Wins to Date** <br>
Indicate the number of wins for a team as home and away to date within current season

In [321]:
toy = learning_df[(learning_df.Year == 2010) & (learning_df.HomeTeam == 'Barcelona')][['HomeTeam', 'AwayTeam', 'Result']]
toy['HomeWin'] = toy.Result.replace([0, 1, 2], [0, 0, 1])
toy['HomeWinsToDate'] = toy.HomeWin.cumsum()

In [322]:
def compute_winstodate(df):
    
    years = df.Year.unique()
    df_lst = []    
    for year in years:
        
        year_df = df[df.Year == year]
        year_df['HomeWin'] = year_df.Result.replace([0, 1, 2], [0, 0, 1])
        year_df['AwayWin'] = year_df.Result.replace([0, 1, 2], [1, 0, 0])
        year_df['HomeWinsToDate'] = None
        year_df['AwayWinsToDate'] = None
        
        hometeams = year_df.HomeTeam.unique()
        awayteams = year_df.AwayTeam.unique()
        
        for team in hometeams:
            team_df = year_df[(year_df.HomeTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_df['HomeWinsToDate'] = team_df.HomeWin.cumsum()
            year_df.loc[team_df.index, 'HomeWinsToDate'] = team_df.HomeWinsToDate
            
        for team in awayteams:
            team_df = year_df[(year_df.AwayTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))
            
            team_df['AwayWinsToDate'] = team_df.AwayWin.cumsum()
            year_df.loc[team_df.index, 'AwayWinsToDate'] = team_df.AwayWinsToDate
            
        df_lst.append(year_df)
        
    return pd.concat(df_lst, axis = 0).drop(columns = ['HomeWin', 'AwayWin','DayofYear'])

In [323]:
learning_df = compute_winstodate(learning_df)
learning_df.drop(columns = ['HomeTeam', 'AwayTeam'], inplace = True)

In [324]:
# learning_df
learning_df

Unnamed: 0,B365H,B365D,B365A,IWH,IWD,IWA,WHH,WHD,WHA,AHh,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,1.727,3.100,4.500,1.70,3.10,4.40,1.66,3.1,5.00,-0.75,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
1,2.500,2.875,2.625,2.70,2.90,2.40,2.60,3.1,2.40,0.00,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
2,1.571,3.500,5.000,1.65,3.30,4.40,1.57,3.5,5.00,-0.75,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
3,1.833,3.200,3.750,1.80,3.10,3.80,1.72,3.4,4.00,-0.75,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
4,1.571,3.250,5.500,1.60,3.30,4.80,1.66,3.3,4.50,-0.75,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35399,2.250,3.100,3.500,2.25,3.20,3.35,2.15,3.2,3.40,-0.25,...,2024,0.500000,0.866025,0.051620,0.998667,1,0,0,1,0
35400,1.200,6.500,15.000,1.22,6.50,13.00,1.20,6.0,13.00,-1.75,...,2024,0.500000,0.866025,0.051620,0.998667,2,0,0,1,0
35401,5.500,4.330,1.570,5.50,4.30,1.57,5.50,4.2,1.53,1.00,...,2024,0.500000,0.866025,0.068802,0.997630,1,0,0,0,1
35402,1.700,4.000,4.750,1.70,3.90,4.80,1.65,3.9,4.60,-0.75,...,2024,0.500000,0.866025,0.068802,0.997630,2,0,0,1,0


In [325]:
# 保存为pkl文件
learning_df.to_pickle('E:/Data/PKL/learning_df.pkl')

**2.3.5 Website Odds** <br>
The `betting odds` recorded by various betting websites offer insight into sentiment surrounding the outcome of a particular game. 

In [326]:
# betting_feats = ['B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', "AHh", "B365AHH", "B365AHA"]
betting_feats = ['B365H', 'B365D', 'B365A']
betting_feats

['B365H', 'B365D', 'B365A']

In [327]:
def compute_meanodds(df, betting_feats):
    """
    """
    home_odds = []
    away_odds = []
    draw_odds = []
    for odd in betting_feats:
        odd_type = odd[-1]
        if odd_type == 'H':
            home_odds.append(odd)
        elif odd_type == 'A':
            away_odds.append(odd)
        else:
            draw_odds.append(odd)
    avg_home_odds = df[home_odds].mean(axis=1)
    avg_away_odds = df[away_odds].mean(axis=1)
    avg_draw_odds = df[draw_odds].mean(axis=1)
    
    ordered_cols = ['HomeOdds', 'AwayOdds', 'DrawOdds'] + df.columns.tolist()
    
    df['HomeOdds'] = avg_home_odds
    df['AwayOdds'] = avg_away_odds
    df['DrawOdds'] = avg_draw_odds
    
    return df[ordered_cols]

In [328]:
learning_df = compute_meanodds(learning_df, betting_feats)

### 2.4 Peek @ Learning DataFrame

In [329]:
learning_df

Unnamed: 0,HomeOdds,AwayOdds,DrawOdds,B365H,B365D,B365A,IWH,IWD,IWA,WHH,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,1.727,4.500,3.100,1.727,3.100,4.500,1.70,3.10,4.40,1.66,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
1,2.500,2.625,2.875,2.500,2.875,2.625,2.70,2.90,2.40,2.60,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
2,1.571,5.000,3.500,1.571,3.500,5.000,1.65,3.30,4.40,1.57,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
3,1.833,3.750,3.200,1.833,3.200,3.750,1.80,3.10,3.80,1.72,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
4,1.571,5.500,3.250,1.571,3.250,5.500,1.60,3.30,4.80,1.66,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35399,2.250,3.500,3.100,2.250,3.100,3.500,2.25,3.20,3.35,2.15,...,2024,0.500000,0.866025,0.051620,0.998667,1,0,0,1,0
35400,1.200,15.000,6.500,1.200,6.500,15.000,1.22,6.50,13.00,1.20,...,2024,0.500000,0.866025,0.051620,0.998667,2,0,0,1,0
35401,5.500,1.570,4.330,5.500,4.330,1.570,5.50,4.30,1.57,5.50,...,2024,0.500000,0.866025,0.068802,0.997630,1,0,0,0,1
35402,1.700,4.750,4.000,1.700,4.000,4.750,1.70,3.90,4.80,1.65,...,2024,0.500000,0.866025,0.068802,0.997630,2,0,0,1,0


In [330]:
learning_df.drop(columns = ['B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'HomeOdds', 'AwayOdds', 'DrawOdds'], inplace = True)

# 3. Model Development

* Establish a baseline Logistic Regression model fit over the entire learning dataframe without special regard to *division* and *team*. 
* Train model over 16 seasons, and predict for the remaining 3 seasons (approximate 80-20 split)

### 3.1 Train and Test Split

In [331]:
split = 0.80
no_seasons = 20

print('No. seasons to train over: ' + str(round(split*no_seasons)))

No. seasons to train over: 16


In [332]:
X, y = learning_df.loc[:, learning_df.columns != 'Result'], learning_df[['Result']]

In [333]:
# full_feat = ['HomeWinStreak','AwayWinStreak','HomeWinsToDate', 'AwayWinsToDate', 'Last Match Result',
#              'HomeTeamELO', 'AwayTeamELO', 'HomeOdds', 'AwayOdds', 'DrawOdds'] + betting_feats

# exclude_feats = ['HomeWinsToDate', 'AwayWinsToDate', 'Last Match Result'] 

In [334]:
# X = X[X.columns[~X.columns.isin(exclude_feats)]]
# X

In [335]:
X

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,1.727,3.100,4.500,-0.75,2.050,1.850,1702.604858,1611.196045,0,0,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
1,2.500,2.875,2.625,0.00,1.925,1.975,1685.016113,1665.625732,0,0,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
2,1.571,3.500,5.000,-0.75,1.800,2.100,1718.566284,1649.805298,1,0,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
3,1.833,3.200,3.750,-0.75,2.025,1.875,1719.916748,1692.120972,1,0,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,1
4,1.571,3.250,5.500,-0.75,1.900,2.000,1697.354004,1539.958130,0,0,...,2003,-0.866025,-0.500000,-0.516062,-0.856551,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35399,2.250,3.100,3.500,-0.25,1.940,1.990,1563.726196,1628.365479,0,0,...,2024,0.500000,0.866025,0.051620,0.998667,1,0,0,1,0
35400,1.200,6.500,15.000,-1.75,1.990,1.940,1967.505371,1645.844849,0,0,...,2024,0.500000,0.866025,0.051620,0.998667,2,0,0,1,0
35401,5.500,4.330,1.570,1.00,1.920,2.010,1629.086792,1858.904297,0,0,...,2024,0.500000,0.866025,0.068802,0.997630,1,0,0,0,1
35402,1.700,4.000,4.750,-0.75,1.950,1.980,1649.258545,1554.821045,0,0,...,2024,0.500000,0.866025,0.068802,0.997630,2,0,0,1,0


In [336]:
y

Unnamed: 0,Result
0,0
1,0
2,0
3,0
4,1
...,...
35399,2
35400,2
35401,0
35402,2


In [337]:
split_year = 2022

In [338]:
# 切分训练集和测试集
xTr, xTe = X[X.Year <= split_year], X[X.Year > split_year]
yTr, yTe = y.loc[xTr.index, :], y.loc[xTe.index, :]

### 3.2 Normalization <br>
Following our various feature transformations and development, we arrived to a sparse dataframe with the exception of a few features(*Year, DayofYear*). It will be important to *normalize* these features as they are in gross magnitudes compared to the remaining features. During model training, having dominating features (in scale relative to others) can be dangerous as the weight updates may mistakengly favor these larger-scale features because it will have the largest influence on the target output. 

In [339]:
# minmax_scaler.fit_transform()：这个方法首先拟合数据，即计算数据的最小值和最大值，这些值用于后续的缩放。然后，它将这些参数用于转换数据，将原始数据缩放到0和1之间。
# minmax_scaler.transform()：这个方法使用在训练数据上计算得到的最小值和最大值来转换测试数据。这确保了训练数据和测试数据使用相同的缩放标准。
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
xTr.loc[:, ['Year']] = minmax_scaler.fit_transform(xTr.loc[:, ['Year']])
xTe.loc[:, ['Year']] = minmax_scaler.transform(xTe.loc[:, ['Year']])

In [340]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
# to_scale = ['HomeWinStreak','AwayWinStreak','HomeWinsToDate', 'AwayWinsToDate', 'HomeTeamELO', 'AwayTeamELO', 'HomeOdds', 'AwayOdds', 'DrawOdds'] + betting_feats
to_scale = ['HomeTeamELO', 'AwayTeamELO'] + betting_feats

xTr.loc[:, to_scale] = std_scaler.fit_transform(xTr.loc[:, to_scale])
xTe.loc[:, to_scale] = std_scaler.transform(xTe.loc[:, to_scale])

In [341]:
xTr

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,-0.501052,-0.605573,-0.026640,-0.75,2.050,1.850,0.111364,-0.690595,0,0,...,0.0,-8.660254e-01,-0.5,-5.160624e-01,-0.856551,3,0,0,0,1
1,-0.070377,-0.793948,-0.534372,0.00,1.925,1.975,-0.043014,-0.213425,0,0,...,0.0,-8.660254e-01,-0.5,-5.160624e-01,-0.856551,3,0,0,0,1
2,-0.587967,-0.270684,0.108756,-0.75,1.800,2.100,0.251460,-0.352118,1,0,...,0.0,-8.660254e-01,-0.5,-5.160624e-01,-0.856551,3,0,0,0,1
3,-0.441994,-0.521851,-0.229732,-0.75,2.025,1.875,0.263313,0.018852,1,0,...,0.0,-8.660254e-01,-0.5,-5.160624e-01,-0.856551,3,0,0,0,1
4,-0.587967,-0.479990,0.244151,-0.75,1.900,2.000,0.065277,-1.315118,0,0,...,0.0,-8.660254e-01,-0.5,-5.160624e-01,-0.856551,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33470,-0.833669,4.334033,3.899822,-2.50,2.070,1.860,3.005253,-0.019842,0,1,...,1.0,-2.449294e-16,1.0,6.432491e-16,1.000000,2,0,0,14,2
33471,-0.655382,0.566537,0.650337,-1.25,2.030,1.900,1.445427,0.154165,0,1,...,1.0,-2.449294e-16,1.0,6.432491e-16,1.000000,1,0,0,12,5
33472,-0.432523,-0.438129,0.041058,-0.50,1.890,2.010,0.527804,-0.198540,0,0,...,1.0,-2.449294e-16,1.0,6.432491e-16,1.000000,2,0,0,10,5
33473,-0.376808,-0.354406,-0.107877,-0.50,1.970,1.930,0.624214,-0.002630,0,0,...,1.0,-2.449294e-16,1.0,6.432491e-16,1.000000,2,0,0,9,3


In [342]:
xTe

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
33475,0.068909,-0.354406,-0.568221,0.00,2.03,1.87,-1.826095,-1.051699,0,0,...,1.052632,0.5,0.866025,0.017213,0.999852,2,0,0,0,1
33476,0.347483,-0.186962,-0.649458,0.25,1.95,1.95,0.259133,1.883610,0,0,...,1.052632,0.5,0.866025,0.017213,0.999852,1,0,0,1,0
33477,-0.627524,0.566537,0.379546,-1.25,2.06,1.84,0.120091,-1.471695,0,0,...,1.052632,0.5,0.866025,0.017213,0.999852,1,0,0,0,1
33478,-0.627524,0.775842,0.244151,-1.00,1.75,2.05,0.240455,-1.465141,0,0,...,1.052632,0.5,0.866025,0.017213,0.999852,2,0,0,1,0
33479,-0.432523,-0.354406,-0.072674,-0.50,1.90,2.00,-0.884874,-1.695822,0,0,...,1.052632,0.5,0.866025,0.017213,0.999852,2,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35399,-0.209664,-0.605573,-0.297430,-0.25,1.94,1.99,-1.107592,-0.540075,0,0,...,1.105263,0.5,0.866025,0.051620,0.998667,1,0,0,1,0
35400,-0.794668,2.240980,2.816660,-1.75,1.99,1.94,2.436431,-0.386838,0,0,...,1.105263,0.5,0.866025,0.051620,0.998667,2,0,0,1,0
35401,1.601064,0.424209,-0.820056,1.00,1.92,2.01,-0.533913,1.480995,0,0,...,1.105263,0.5,0.866025,0.068802,0.997630,1,0,0,0,1
35402,-0.516095,0.147926,0.041058,-0.75,1.95,1.98,-0.356863,-1.184819,0,0,...,1.105263,0.5,0.866025,0.068802,0.997630,2,0,0,1,0


### 3.3 HomeWins Baseline Model

In [343]:
from sklearn.metrics import accuracy_score

In [344]:
xTr.shape

(33475, 431)

In [345]:
xTe.shape

(1929, 431)

In [346]:
# training score
baseline_Tr = np.full((xTr.shape[0], 1), 2) 
accuracy_score(yTr.Result.values, baseline_Tr.ravel())

0.45580283793876025

In [347]:
# testing score
baseline_preds_Te = np.full((xTe.shape[0]  , 1), 2) #predicts home wins all the time
accuracy_score(yTe.Result.values, baseline_preds_Te.ravel())

0.46137895282529806

### 3.4 Multinomial Logistic Regression

**3.4.1** $l2$ Regularized

In [348]:
from sklearn.linear_model import LogisticRegression
l2_lr = LogisticRegression(max_iter = 10000, n_jobs=-1).fit(xTr, yTr.values.ravel())

In [349]:
# training score
accuracy_score(yTr.Result.values, l2_lr.predict(xTr))

0.6424794622852875

In [350]:
# testing score
lr_preds = l2_lr.predict(xTe)
accuracy_score(yTe.Result.values, lr_preds)

0.6153447382063245

**3.4.1** $l2$ Penalty Tuning

In [351]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

logistic_params = {'C':[0.001,0.01,0.10]}

# logistic_randsearch = RandomizedSearchCV(estimator=LogisticRegression(max_iter=10000),
#                                          param_distributions=logistic_params,
logistic_randsearch = GridSearchCV(estimator=LogisticRegression(max_iter=10000),
                                         param_grid=logistic_params,
                                         scoring='accuracy',
                                         verbose=1,
                                         cv=5,
                                         n_jobs=-1)

logistic_rand_results = logistic_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (logistic_rand_results.best_score_, logistic_rand_results.best_params_))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best: 0.621538 using {'C': 0.01}


In [352]:
l2_rs = logistic_rand_results.best_estimator_

In [353]:
# training score
accuracy_score(yTr.Result.values, l2_rs.predict(xTr))

0.6344137415982076

In [354]:
# testing score
accuracy_score(yTe.Result.values, l2_rs.predict(xTe))

0.6241575946086055

**3.4.4** $l1$ Regularized

In [355]:
l1_lr = LogisticRegression(penalty='l1', solver='saga', max_iter = 10000, n_jobs=-1).fit(xTr, yTr.values.ravel())

In [356]:
# training score
accuracy_score(yTr.Result.values, l1_lr.predict(xTr))

0.6411650485436893

In [357]:
# testing score
l1_preds = l1_lr.predict(xTe)
accuracy_score(yTe.Result.values, l1_preds)

0.6158631415241057

**3.4.5** Penalty Tuning

In [358]:
l1_params = {'C':[0.001,0.01,0.10]}

# l1_randsearch = RandomizedSearchCV(estimator=LogisticRegression(penalty='l1',solver='saga', max_iter=10000),
#                                          param_distributions=l1_params,
l1_randsearch = GridSearchCV(estimator=LogisticRegression(penalty='l1',solver='saga', max_iter=10000),
                                         param_grid=l1_params,
                                         scoring='accuracy',
                                         verbose=1,
                                         n_jobs=-1,
                                         cv=5)

l1_rand_results = l1_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (l1_rand_results.best_score_, l1_rand_results.best_params_))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best: 0.624347 using {'C': 0.1}


In [359]:
l1_rs = l1_randsearch.best_estimator_ #LogisticRegression(C=0.10, solver='saga', max_iter=10000).fit(xTr, yTr.values.ravel())#

In [360]:
# training score
accuracy_score(yTr.Result.values, l1_rs.predict(xTr))

0.6306198655713219

In [361]:
# testing score
accuracy_score(yTe.Result.values, l1_rs.predict(xTe))

0.6246759979263867

### 3.5 Support Vector Machine

In [362]:
from sklearn.svm import SVC
svm = SVC(max_iter=100000).fit(xTr, yTr.values.ravel())

In [363]:
# training score
accuracy_score(yTr.Result.values, svm.predict(xTr))

0.6832561613144137

In [364]:
# testing score
accuracy_score(yTe.Result.values, svm.predict(xTe))

0.673924313115604

**3.5.2** Penalty Tuning

In [365]:
svm_params = {'C':[0.001,0.01,0.10]}

# svm_randsearch = RandomizedSearchCV(estimator=SVC(max_iter=100000),
#                                          param_distributions=svm_params,
svm_randsearch = GridSearchCV(estimator=SVC(max_iter=100000),
                                         param_grid=svm_params,
                                         scoring='accuracy',
                                         verbose=2,
                                         cv=5,
                                         n_jobs=-1)

svm_rand_results = svm_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (svm_rand_results.best_score_, svm_rand_results.best_params_))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best: 0.632353 using {'C': 0.1}


In [366]:
svm_rs = svm_rand_results.best_estimator_

In [367]:
# training score
accuracy_score(yTr.Result.values, svm_rs.predict(xTr))

0.6433159073935772

In [368]:
# testing score
accuracy_score(yTe.Result.values, svm_rs.predict(xTe))

0.6485225505443235

### 3.6 Simple Neural Network ####

In [369]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(512,128,32),
                    activation='relu',
                    batch_size=64,
                    max_iter=200,
                    learning_rate_init=1e-4,
                    early_stopping=False,
                    alpha=1e-3,
                   ).fit(xTr, yTr.values.ravel())

In [370]:
# training score
accuracy_score(yTr.Result.values, mlp.predict(xTr))

0.9994025392083644

In [371]:
# testing score
accuracy_score(yTe.Result.values, mlp.predict(xTe))

0.654224987039917

In [1]:
xTr

NameError: name 'xTr' is not defined

### 3.7 Stacked Classifier ###

In [372]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [373]:
from sklearn.ensemble import StackingClassifier
stacked_clf = StackingClassifier(estimators=[('svm', SVC(max_iter=100000)), ('logistic', LogisticRegression(C=0.01, max_iter=10000))],
                                final_estimator=LogisticRegression(max_iter=10000),
                                n_jobs=-1).fit(xTr, yTr.values.ravel())

In [374]:
# training score
accuracy_score(yTr.Result.values, stacked_clf.predict(xTr))

0.6718745332337566

In [375]:
# testing score
accuracy_score(yTe.Result.values, stacked_clf.predict(xTe))

0.6552617936754795

## 4. Result Analysis ##

In [376]:
## TODO: breakdown results across divisions and/or teams; i.e., see how model performs individually at subgroups

## 5. Scrap Code ##

In [377]:
barcelona_df = learning_df[(learning_df['HomeTeam 17'] == 1) | (learning_df['AwayTeam 17'] == 1)]
barcelona_df

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
51,2.60,3.20,2.50,0.25,1.80,2.10,1784.173828,1869.017090,0,0,...,2003,-8.660254e-01,-5.000000e-01,-0.854322,-0.519744,3,0,0,0,1
85,1.57,3.40,6.00,-0.75,1.90,2.00,1876.175049,1769.667114,0,0,...,2003,-1.000000e+00,-1.836970e-16,-0.888057,-0.459733,3,0,0,0,0
113,4.75,3.25,1.72,0.50,1.95,1.95,1628.195068,1871.692749,0,0,...,2003,-1.000000e+00,-1.836970e-16,-0.958718,-0.284359,3,0,1,0,2
135,1.36,4.00,8.50,-1.50,2.05,1.85,1876.988892,1744.449585,0,0,...,2003,-1.000000e+00,-1.836970e-16,-0.982927,-0.183998,3,0,0,0,1
201,2.50,3.20,2.50,0.25,1.80,2.10,1698.131470,1870.665649,0,0,...,2003,-1.000000e+00,-1.836970e-16,-0.998880,-0.047321,3,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35186,2.00,3.75,3.50,-0.50,2.05,1.88,1890.724487,1892.089722,0,0,...,2023,-2.449294e-16,1.000000e+00,-0.463550,0.886071,2,1,0,16,8
35247,1.53,5.00,5.50,-1.00,1.82,2.08,1894.741333,1762.128540,0,0,...,2023,-2.449294e-16,1.000000e+00,-0.353676,0.935368,1,0,6,16,9
35291,5.00,3.80,1.67,0.75,2.05,1.88,1667.329956,1860.806641,0,0,...,2023,-2.449294e-16,1.000000e+00,-0.255353,0.966848,0,0,0,8,10
35323,1.14,8.50,15.00,-2.50,2.03,1.87,1857.604004,1556.121338,0,0,...,2023,-2.449294e-16,1.000000e+00,-0.188227,0.982126,2,0,0,17,1


In [378]:
bxTr = xTr[(xTr['HomeTeam 17'] == 1) | (xTr['AwayTeam 17'] == 1)]
bxTe = xTe[(xTe['HomeTeam 17'] == 1) | (xTe['AwayTeam 17'] == 1)]

In [379]:
byTr, byTe = yTr.loc[bxTr.index,:], yTe.loc[bxTe.index,:]

In [380]:
# training score
accuracy_score(byTr, l1_lr.predict(bxTr))

0.7647887323943662

In [381]:
# testing score
accuracy_score(byTe, l1_lr.predict(bxTe))

0.6666666666666666

In [382]:
# training score
accuracy_score(byTr, l2_lr.predict(bxTr))

0.7619718309859155

In [383]:
# testing score
accuracy_score(byTe, l2_lr.predict(bxTe))

0.7142857142857143

## 6. Pytorch MLP ##

In [384]:
type(xTr)

pandas.core.frame.DataFrame

In [385]:
xTr.shape

(33475, 431)

In [386]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [387]:
class Attention(nn.Module):
    def __init__(self, feature_dim):
        super(Attention, self).__init__()
        self.attention_weights = nn.Parameter(torch.randn(feature_dim))

    def forward(self, x):
        # 应用注意力权重
        weights = F.softmax(self.attention_weights, dim=0)
        # 加权求和
        x = x * weights
        return x
    
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(in_features=xTr.shape[1], out_features=512)
        self.bn1 = nn.BatchNorm1d(num_features=512)
        self.attention = Attention(512)
        self.dropout1 = nn.Dropout(p=0.2)
        
        self.fc2 = nn.Linear(in_features=512, out_features=128)
        self.bn2 = nn.BatchNorm1d(num_features=128)
        self.dropout2 = nn.Dropout(p=0.2)
        
        self.fc3 = nn.Linear(in_features=128, out_features=32)
        self.bn3 = nn.BatchNorm1d(num_features=32)
        self.dropout3 = nn.Dropout(p=0.2)
        
        self.fc4 = nn.Linear(in_features=32, out_features=3)  # 输出层改为3，对应三个类别

    def forward(self, x):
        x = self.dropout1(torch.relu(self.bn1(self.fc1(x))))
        x = self.attention(x)
        x = self.dropout2(torch.relu(self.bn2(self.fc2(x))))
        x = self.dropout3(torch.relu(self.bn3(self.fc3(x))))
        x = self.fc4(x)
        return x

# 数据预处理
scaler = StandardScaler()
xTr_scaled = scaler.fit_transform(xTr)
xTr_tensor = torch.tensor(xTr_scaled, dtype=torch.float32).to(device)
yTr_tensor = torch.tensor(yTr.values.ravel(), dtype=torch.long).to(device)

# 创建数据加载器
dataset = TensorDataset(xTr_tensor, yTr_tensor)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)

# 创建模型实例
model = MLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)

train_start = time.time()
# 训练模型
model.train()
for epoch in range(500):  # 假设训练200个epoch
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()  # 清除之前的梯度

        outputs = model(inputs)  # 前向传播
        loss = criterion(outputs, labels)  # 计算损失
        loss.backward()  # 反向传播
        optimizer.step()  # 更新参数

        running_loss += loss.item() * inputs.size(0)  # 累计损失
        _, predicted = torch.max(outputs.data, 1)  # 获取预测结果
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / total
    epoch_acc = correct / total * 100  # 计算准确率

    # 每个epoch结束后输出
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
print(f'训练时长： {time.time() - train_start}s')

Epoch 1, Loss: 1.1523, Accuracy: 36.34%
Epoch 2, Loss: 1.0893, Accuracy: 40.73%
Epoch 3, Loss: 1.0358, Accuracy: 45.19%
Epoch 4, Loss: 1.0012, Accuracy: 48.50%
Epoch 5, Loss: 0.9692, Accuracy: 51.87%
Epoch 6, Loss: 0.9456, Accuracy: 53.47%
Epoch 7, Loss: 0.9270, Accuracy: 54.67%
Epoch 8, Loss: 0.9107, Accuracy: 56.12%
Epoch 9, Loss: 0.8936, Accuracy: 57.00%
Epoch 10, Loss: 0.8737, Accuracy: 58.13%
Epoch 11, Loss: 0.8606, Accuracy: 58.87%
Epoch 12, Loss: 0.8442, Accuracy: 59.76%
Epoch 13, Loss: 0.8309, Accuracy: 60.65%
Epoch 14, Loss: 0.8192, Accuracy: 61.06%
Epoch 15, Loss: 0.8080, Accuracy: 61.46%
Epoch 16, Loss: 0.7988, Accuracy: 61.98%
Epoch 17, Loss: 0.7891, Accuracy: 62.55%
Epoch 18, Loss: 0.7799, Accuracy: 63.17%
Epoch 19, Loss: 0.7726, Accuracy: 63.36%
Epoch 20, Loss: 0.7625, Accuracy: 63.82%
Epoch 21, Loss: 0.7571, Accuracy: 64.53%
Epoch 22, Loss: 0.7483, Accuracy: 64.95%
Epoch 23, Loss: 0.7405, Accuracy: 64.98%
Epoch 24, Loss: 0.7361, Accuracy: 65.31%
Epoch 25, Loss: 0.7308, A

In [388]:
# 假设 xTe 和 yTe 是 pandas DataFrame 或 Series
# 数据预处理
xTe_scaled = scaler.fit_transform(xTe)  # 使用与训练数据相同的标准化参数
xTe_tensor = torch.tensor(xTe_scaled, dtype=torch.float32).to(device)
yTe_tensor = torch.tensor(yTe.values.ravel(), dtype=torch.long).to(device)

# 创建数据加载器
test_dataset = TensorDataset(xTe_tensor, yTe_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 设置模型为评估模式
model.eval()

# 初始化用于计算准确率的变量
correct = 0
total = 0

# 不计算梯度，因为在评估模式下不需要进行反向传播
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# 计算准确率
accuracy = correct / total
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

Accuracy on test set: 60.65%


## 7. Pytorch Transformer ##

In [389]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class TransformerModel(nn.Module):
    def __init__(self, input_dim, num_classes, num_heads, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.input_dim = input_dim
        self.num_classes = num_classes
        self.model_dim = input_dim  # 通常情况下，模型维度与输入维度相同

        # Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.model_dim,
            nhead=num_heads,
            dim_feedforward=512,  # 前馈网络的维度
            dropout=dropout,
            batch_first=True
        )

        # Transformer Encoder
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 输出层
        self.output_layer = nn.Linear(self.model_dim, self.num_classes)

        # Batch Normalization
        self.bn = nn.BatchNorm1d(self.model_dim)

    def forward(self, x):
        # 增加一个假的序列维度
        x = x.unsqueeze(1)
        # Transformer Encoder
        x = self.transformer_encoder(x)

        # Batch Normalization
        x = self.bn(x[:, 0, :])  # 取序列的第一个元素进行批量归一化

        # 输出层
        x = self.output_layer(x)
        return x

# 补充维度
n_samples_xTr = xTr.shape[0]
n_samples_xTe = xTe.shape[0]
for i in range(1, 4):  # 从 1 到 3，因为需要添加三列
    xTr[f'pad{i}'] = 0  # 添加填充列，初始化为 0
    xTe[f'pad{i}'] = 0  # 添加填充列，初始化为 0

# 参数设置
input_dim = xTr.shape[1]  # 输入特征的维度
num_classes = 3  # 类别数
num_heads = 10  # 注意力头的数量
num_layers = 3  # Transformer层的数量
dropout = 0.8  # Dropout比率

# 创建模型
model = TransformerModel(input_dim, num_classes, num_heads, num_layers, dropout).to(device)

# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# 数据加载
# 假设 xTr 和 xTe 已经是适当的 torch.Tensor 对象
xTr_values = xTr.values.astype(float)
xTe_values = xTe.values.astype(float)

xTr_tensor = torch.tensor(xTr_values, dtype=torch.float32).to(device)
xTe_tensor = torch.tensor(xTe_values, dtype=torch.float32).to(device)
yTr_tensor = torch.tensor(yTr.values, dtype=torch.long).to(device).squeeze(1)
yTe_tensor = torch.tensor(yTe.values, dtype=torch.long).to(device).squeeze(1)
# 转换为 one-hot 编码
yTr_tensor = F.one_hot(yTr_tensor, num_classes=num_classes).float()
yTe_tensor = F.one_hot(yTe_tensor, num_classes=num_classes).float()

# 数据加载器
train_dataset = TensorDataset(xTr_tensor, yTr_tensor)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=False)

test_dataset = TensorDataset(xTe_tensor, yTe_tensor)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

train_start = time.time()
# 训练模型
model.train()
for epoch in range(500):  # 运行更多的 epoch 以获得更好的结果
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)  # 累计损失
        _, predicted = torch.max(outputs.data, 1)  # 获取预测结果
        _, truth = torch.max(labels.data, 1)
        total += truth.size(0)
        correct += (predicted == truth).sum().item()

    epoch_loss = running_loss / total
    epoch_acc = correct / total * 100  # 计算准确率

    # 每个epoch结束后输出
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
print(f'训练时长： {time.time() - train_start}s')

# 设置模型为评估模式
model.eval()

# 初始化用于计算准确率的变量
correct = 0
total = 0

# 不计算梯度，因为在评估模式下不需要进行反向传播
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        _, truth = torch.max(labels.data, 1)
        total += labels.size(0)
        correct += (predicted == truth).sum().item()

# 计算准确率
accuracy = correct / total
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

AssertionError: embed_dim must be divisible by num_heads