In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

## 0. DataFrame

In [2]:
football_df = pd.read_csv('data/all_data_with_elo.csv', low_memory = False)
football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHD,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO
0,0,D1,2003/8/1,Bayern Munich,Ein Frankfurt,3.0,1.0,H,17.0,6.0,...,4.50,10.00,,,,,,,1859.379272,1593.249268
1,1,F1,2003/8/1,Lille,Lyon,1.0,0.0,H,,,...,3.00,2.20,,,,,,,1612.968018,1726.539795
2,2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,A,,,...,3.10,5.00,-0.75,2.050,1.850,,,,1702.604858,1611.196045
3,3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,A,,,...,3.10,2.40,0.00,1.925,1.975,,,,1685.016113,1665.625732
4,4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,A,10.0,16.0,...,3.50,5.00,-0.75,1.800,2.100,,,,1718.566284,1649.805298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37776,37776,F1,2024/10/6,Reims,Montpellier,4.0,2.0,H,18.0,10.0,...,3.80,4.00,-0.50,1.780,2.030,-0.75,1.83,2.10,1633.626221,1645.806641
37777,37777,SP1,2024/10/6,Sevilla,Betis,1.0,0.0,H,14.0,11.0,...,3.25,2.88,0.00,1.700,2.100,0.00,1.94,1.99,1676.242676,1709.259521
37778,37778,SP1,2024/10/6,Sociedad,Ath Madrid,1.0,1.0,D,16.0,4.0,...,3.20,2.30,0.25,1.890,2.010,0.00,1.98,1.95,1766.551880,1828.522095
37779,37779,F1,2024/10/6,Strasbourg,Lens,2.0,2.0,D,8.0,12.0,...,3.40,2.40,0.25,1.810,2.090,0.00,1.95,1.98,1608.732544,1719.138184


## 1. Descriptive Statistics 

**1.1 DataFrame Shape**

In [3]:
# no. rows and no. cols
football_df.shape

(37781, 29)

In [4]:
# feature names
print(football_df.columns.tolist())

['Unnamed: 0', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'AHCh', 'B365CAHH', 'B365CAHA', 'HomeTeamELO', 'AwayTeamELO']


**1.2 NaN Values**

In [5]:
football_df.isnull().sum()

Unnamed: 0         0
Div                0
Date               0
HomeTeam           0
AwayTeam           0
FTHG               0
FTAG               0
FTR                0
HS              1762
AS              1762
HST             2568
AST             2568
B365H             49
B365D             49
B365A             49
IWH             1324
IWD             1324
IWA             1324
WHH              573
WHD              573
WHA              573
AHh              262
B365AHH          276
B365AHA          276
AHCh           28479
B365CAHH       28481
B365CAHA       28481
HomeTeamELO      125
AwayTeamELO      126
dtype: int64

In [6]:
# total elements in 
football_df.size

1095649

In [7]:
# total number of NaN
football_df.size - football_df.count().sum()

101004

In [8]:
# total number of NaN rows
football_df.isnull().any(axis = 1).sum()

29734

In [9]:
# total number of NaN columns
football_df.isnull().any(axis = 0).sum()

21

## 2. Data Wrangling and Feature Transformation/Development

### 2.1 NaN Handling

`TODO`: drop NaN values along columns: {Date, Home Team, Away Team, FTR} <br>
`TODO`: identify betting odds w/ most available data

In [10]:
# 当前方法仅提取这几个字段 分区 日期 主队 客队 full-time-result 三家机构的胜平负 主队ELO评分 客队ELO评分
# nan_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A', 
#             'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO']
nan_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'B365H', 'B365D', 'B365A', 
            'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO']

In [11]:
nan_football_df = football_df.dropna(subset = nan_mask)
nan_football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHD,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO
2,2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,A,,,...,3.1,5.00,-0.75,2.050,1.850,,,,1702.604858,1611.196045
3,3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,A,,,...,3.1,2.40,0.00,1.925,1.975,,,,1685.016113,1665.625732
4,4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,A,10.0,16.0,...,3.5,5.00,-0.75,1.800,2.100,,,,1718.566284,1649.805298
5,5,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,A,23.0,19.0,...,3.4,4.00,-0.75,2.025,1.875,,,,1719.916748,1692.120972
7,7,F1,2003/8/2,Lens,Le Mans,0.0,0.0,D,,,...,3.3,4.50,-0.75,1.900,2.000,,,,1697.354004,1539.958130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36541,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,3.2,3.40,-0.25,1.940,1.990,-0.25,1.89,2.04,1563.726196,1628.365479
36542,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,6.0,13.00,-1.75,1.990,1.940,-1.75,1.98,1.95,1967.505371,1645.844849
36543,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,4.2,1.53,1.00,1.920,2.010,1.25,1.83,2.10,1629.086792,1858.904297
36544,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,3.9,4.60,-0.75,1.950,1.980,-0.75,1.98,1.95,1649.258545,1554.821045


In [12]:
nan_football_df['asia_final_result'] = nan_football_df['FTHG'] - nan_football_df['FTAG'] + nan_football_df['AHh']
nan_football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO,asia_final_result
2,2,F1,2003/8/2,Auxerre,Nice,1.0,2.0,A,,,...,5.00,-0.75,2.050,1.850,,,,1702.604858,1611.196045,-1.75
3,3,F1,2003/8/2,Guingamp,Marseille,0.0,1.0,A,,,...,2.40,0.00,1.925,1.975,,,,1685.016113,1665.625732,-1.00
4,4,D1,2003/8/2,Hamburg,Hannover,0.0,3.0,A,10.0,16.0,...,5.00,-0.75,1.800,2.100,,,,1718.566284,1649.805298,-3.75
5,5,D1,2003/8/2,Hertha,Werder Bremen,0.0,3.0,A,23.0,19.0,...,4.00,-0.75,2.025,1.875,,,,1719.916748,1692.120972,-3.75
7,7,F1,2003/8/2,Lens,Le Mans,0.0,0.0,D,,,...,4.50,-0.75,1.900,2.000,,,,1697.354004,1539.958130,-0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36541,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,3.40,-0.25,1.940,1.990,-0.25,1.89,2.04,1563.726196,1628.365479,1.75
36542,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,13.00,-1.75,1.990,1.940,-1.75,1.98,1.95,1967.505371,1645.844849,-0.75
36543,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,1.53,1.00,1.920,2.010,1.25,1.83,2.10,1629.086792,1858.904297,0.00
36544,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,4.60,-0.75,1.950,1.980,-0.75,1.98,1.95,1649.258545,1554.821045,0.25


In [13]:
nan_football_df_noNone = nan_football_df.dropna()
nan_football_df_noNone

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO,asia_final_result
28479,28479,E0,2019/8/9,Liverpool,Norwich,4.0,1.0,H,15.0,12.0,...,21.00,-2.25,1.96,1.94,-2.25,1.91,1.99,2050.401855,1634.853149,0.75
28480,28480,F1,2019/8/9,Monaco,Lyon,0.0,3.0,A,7.0,13.0,...,2.40,0.00,2.07,1.72,0.25,2.04,1.86,1568.403687,1732.636597,-3.00
28481,28481,F1,2019/8/10,Angers,Bordeaux,3.0,1.0,H,14.0,8.0,...,3.20,-0.25,2.02,1.88,-0.25,1.75,2.05,1560.621948,1558.477295,1.75
28482,28482,E0,2019/8/10,Bournemouth,Sheffield United,1.0,1.0,D,13.0,8.0,...,3.80,-0.50,2.01,1.89,-0.50,1.95,1.95,1695.075562,1623.745361,-0.50
28483,28483,F1,2019/8/10,Brest,Toulouse,1.0,1.0,D,16.0,13.0,...,3.10,-0.25,2.05,1.85,-0.50,2.10,1.80,1512.408203,1496.643433,-0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36541,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,3.40,-0.25,1.94,1.99,-0.25,1.89,2.04,1563.726196,1628.365479,1.75
36542,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,13.00,-1.75,1.99,1.94,-1.75,1.98,1.95,1967.505371,1645.844849,-0.75
36543,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,1.53,1.00,1.92,2.01,1.25,1.83,2.10,1629.086792,1858.904297,0.00
36544,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,4.60,-0.75,1.95,1.98,-0.75,1.98,1.95,1649.258545,1554.821045,0.25


In [14]:
nan_football_df_noNone.reset_index(inplace=True, drop=True)
nan_football_df_noNone

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,WHA,AHh,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO,asia_final_result
0,28479,E0,2019/8/9,Liverpool,Norwich,4.0,1.0,H,15.0,12.0,...,21.00,-2.25,1.96,1.94,-2.25,1.91,1.99,2050.401855,1634.853149,0.75
1,28480,F1,2019/8/9,Monaco,Lyon,0.0,3.0,A,7.0,13.0,...,2.40,0.00,2.07,1.72,0.25,2.04,1.86,1568.403687,1732.636597,-3.00
2,28481,F1,2019/8/10,Angers,Bordeaux,3.0,1.0,H,14.0,8.0,...,3.20,-0.25,2.02,1.88,-0.25,1.75,2.05,1560.621948,1558.477295,1.75
3,28482,E0,2019/8/10,Bournemouth,Sheffield United,1.0,1.0,D,13.0,8.0,...,3.80,-0.50,2.01,1.89,-0.50,1.95,1.95,1695.075562,1623.745361,-0.50
4,28483,F1,2019/8/10,Brest,Toulouse,1.0,1.0,D,16.0,13.0,...,3.10,-0.25,2.05,1.85,-0.50,2.10,1.80,1512.408203,1496.643433,-0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,3.40,-0.25,1.94,1.99,-0.25,1.89,2.04,1563.726196,1628.365479,1.75
8043,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,13.00,-1.75,1.99,1.94,-1.75,1.98,1.95,1967.505371,1645.844849,-0.75
8044,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,1.53,1.00,1.92,2.01,1.25,1.83,2.10,1629.086792,1858.904297,0.00
8045,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,4.60,-0.75,1.95,1.98,-0.75,1.98,1.95,1649.258545,1554.821045,0.25


In [15]:
conditions = [
    nan_football_df_noNone['asia_final_result'] < -0.25,
    nan_football_df_noNone['asia_final_result'] == -0.25,
    nan_football_df_noNone['asia_final_result'] == 0,
    nan_football_df_noNone['asia_final_result'] == 0.25,
    nan_football_df_noNone['asia_final_result'] > 0.25,
]
easy_conditions = [
    nan_football_df_noNone['asia_final_result'] <= -0.25,
    nan_football_df_noNone['asia_final_result'] == 0,
    nan_football_df_noNone['asia_final_result'] >= 0.25,
]
labels = [-2, -1, 0, 1, 2]
easy_labels = [-1, 0, 1]

nan_football_df_noNone['label'] = np.select(conditions, labels)
nan_football_df_noNone['easy_label'] = np.select(easy_conditions, easy_labels)
nan_football_df_noNone

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO,asia_final_result,label,easy_label
0,28479,E0,2019/8/9,Liverpool,Norwich,4.0,1.0,H,15.0,12.0,...,1.96,1.94,-2.25,1.91,1.99,2050.401855,1634.853149,0.75,2,1
1,28480,F1,2019/8/9,Monaco,Lyon,0.0,3.0,A,7.0,13.0,...,2.07,1.72,0.25,2.04,1.86,1568.403687,1732.636597,-3.00,-2,-1
2,28481,F1,2019/8/10,Angers,Bordeaux,3.0,1.0,H,14.0,8.0,...,2.02,1.88,-0.25,1.75,2.05,1560.621948,1558.477295,1.75,2,1
3,28482,E0,2019/8/10,Bournemouth,Sheffield United,1.0,1.0,D,13.0,8.0,...,2.01,1.89,-0.50,1.95,1.95,1695.075562,1623.745361,-0.50,-2,-1
4,28483,F1,2019/8/10,Brest,Toulouse,1.0,1.0,D,16.0,13.0,...,2.05,1.85,-0.50,2.10,1.80,1512.408203,1496.643433,-0.25,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,1.94,1.99,-0.25,1.89,2.04,1563.726196,1628.365479,1.75,2,1
8043,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,1.99,1.94,-1.75,1.98,1.95,1967.505371,1645.844849,-0.75,-2,-1
8044,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,1.92,2.01,1.25,1.83,2.10,1629.086792,1858.904297,0.00,0,0
8045,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,1.95,1.98,-0.75,1.98,1.95,1649.258545,1554.821045,0.25,1,1


In [16]:
nan_football_df_noNone['label'].mean()

-0.012675531253883435

In [17]:
nan_football_df_noNone['easy_label'].mean()

-0.008450354169255624

In [18]:
# resize shape
football_df.shape[0] - nan_football_df_noNone.shape[0]

29734

### 2.2 Feature Encoding <br>
* $\phi(Date)$ $\Rightarrow$ one column for *year*, second column for *month*, third column for *day of year*
* One hot encode Division, Home and Away Teams
* Label encode Full Time Result (Win/Draw/Loss)

In [19]:
feats = nan_mask
feats.append('easy_label')

In [20]:
nan_football_df_noNone

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,B365AHH,B365AHA,AHCh,B365CAHH,B365CAHA,HomeTeamELO,AwayTeamELO,asia_final_result,label,easy_label
0,28479,E0,2019/8/9,Liverpool,Norwich,4.0,1.0,H,15.0,12.0,...,1.96,1.94,-2.25,1.91,1.99,2050.401855,1634.853149,0.75,2,1
1,28480,F1,2019/8/9,Monaco,Lyon,0.0,3.0,A,7.0,13.0,...,2.07,1.72,0.25,2.04,1.86,1568.403687,1732.636597,-3.00,-2,-1
2,28481,F1,2019/8/10,Angers,Bordeaux,3.0,1.0,H,14.0,8.0,...,2.02,1.88,-0.25,1.75,2.05,1560.621948,1558.477295,1.75,2,1
3,28482,E0,2019/8/10,Bournemouth,Sheffield United,1.0,1.0,D,13.0,8.0,...,2.01,1.89,-0.50,1.95,1.95,1695.075562,1623.745361,-0.50,-2,-1
4,28483,F1,2019/8/10,Brest,Toulouse,1.0,1.0,D,16.0,13.0,...,2.05,1.85,-0.50,2.10,1.80,1512.408203,1496.643433,-0.25,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,36541,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,H,15.0,15.0,...,1.94,1.99,-0.25,1.89,2.04,1563.726196,1628.365479,1.75,2,1
8043,36542,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,H,17.0,7.0,...,1.99,1.94,-1.75,1.98,1.95,1967.505371,1645.844849,-0.75,-2,-1
8044,36543,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,A,12.0,15.0,...,1.92,2.01,1.25,1.83,2.10,1629.086792,1858.904297,0.00,0,0
8045,36544,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,H,14.0,4.0,...,1.95,1.98,-0.75,1.98,1.95,1649.258545,1554.821045,0.25,1,1


In [21]:
learning_df = nan_football_df_noNone.copy()[feats]
learning_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A,IWH,...,IWA,WHH,WHD,WHA,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,easy_label
0,E0,2019/8/9,Liverpool,Norwich,4.0,1.0,1.14,10.00,19.00,1.15,...,18.00,1.12,8.5,21.00,-2.25,1.96,1.94,2050.401855,1634.853149,1
1,F1,2019/8/9,Monaco,Lyon,0.0,3.0,3.00,3.30,2.37,2.90,...,2.45,2.80,3.5,2.40,0.00,2.07,1.72,1568.403687,1732.636597,-1
2,F1,2019/8/10,Angers,Bordeaux,3.0,1.0,2.37,3.10,3.20,2.35,...,3.30,2.35,3.1,3.20,-0.25,2.02,1.88,1560.621948,1558.477295,1
3,E0,2019/8/10,Bournemouth,Sheffield United,1.0,1.0,1.95,3.60,3.60,1.97,...,3.80,2.00,3.5,3.80,-0.50,2.01,1.89,1695.075562,1623.745361,-1
4,F1,2019/8/10,Brest,Toulouse,1.0,1.0,2.37,3.20,3.10,2.40,...,3.10,2.30,3.3,3.10,-0.25,2.05,1.85,1512.408203,1496.643433,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,2.25,3.10,3.50,2.25,...,3.35,2.15,3.2,3.40,-0.25,1.94,1.99,1563.726196,1628.365479,1
8043,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,1.20,6.50,15.00,1.22,...,13.00,1.20,6.0,13.00,-1.75,1.99,1.94,1967.505371,1645.844849,-1
8044,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,5.50,4.33,1.57,5.50,...,1.57,5.50,4.2,1.53,1.00,1.92,2.01,1629.086792,1858.904297,0
8045,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,1.70,4.00,4.75,1.70,...,4.80,1.65,3.9,4.60,-0.75,1.95,1.98,1649.258545,1554.821045,1


In [22]:
learning_df.reset_index(inplace=True, drop=True)
learning_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A,IWH,...,IWA,WHH,WHD,WHA,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,easy_label
0,E0,2019/8/9,Liverpool,Norwich,4.0,1.0,1.14,10.00,19.00,1.15,...,18.00,1.12,8.5,21.00,-2.25,1.96,1.94,2050.401855,1634.853149,1
1,F1,2019/8/9,Monaco,Lyon,0.0,3.0,3.00,3.30,2.37,2.90,...,2.45,2.80,3.5,2.40,0.00,2.07,1.72,1568.403687,1732.636597,-1
2,F1,2019/8/10,Angers,Bordeaux,3.0,1.0,2.37,3.10,3.20,2.35,...,3.30,2.35,3.1,3.20,-0.25,2.02,1.88,1560.621948,1558.477295,1
3,E0,2019/8/10,Bournemouth,Sheffield United,1.0,1.0,1.95,3.60,3.60,1.97,...,3.80,2.00,3.5,3.80,-0.50,2.01,1.89,1695.075562,1623.745361,-1
4,F1,2019/8/10,Brest,Toulouse,1.0,1.0,2.37,3.20,3.10,2.40,...,3.10,2.30,3.3,3.10,-0.25,2.05,1.85,1512.408203,1496.643433,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,SP1,2024/1/3,Granada,Cadiz,2.0,0.0,2.25,3.10,3.50,2.25,...,3.35,2.15,3.2,3.40,-0.25,1.94,1.99,1563.726196,1628.365479,1
8043,SP1,2024/1/3,Real Madrid,Mallorca,1.0,0.0,1.20,6.50,15.00,1.22,...,13.00,1.20,6.0,13.00,-1.75,1.99,1.94,1967.505371,1645.844849,-1
8044,SP1,2024/1/4,Las Palmas,Barcelona,1.0,2.0,5.50,4.33,1.57,5.50,...,1.57,5.50,4.2,1.53,1.00,1.92,2.01,1629.086792,1858.904297,0
8045,SP1,2024/1/4,Osasuna,Almeria,1.0,0.0,1.70,4.00,4.75,1.70,...,4.80,1.65,3.9,4.60,-0.75,1.95,1.98,1649.258545,1554.821045,1


**2.2.1 Division and Home/Away Team Encoding**

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

div_encoder = OneHotEncoder()
home_encoder = OneHotEncoder()
away_encoder = OneHotEncoder()

In [24]:
onehot_div = div_encoder.fit_transform(learning_df.Div.values.reshape(-1,1)).toarray().astype(int)
onehot_div_df = pd.DataFrame(onehot_div, columns = ["Div "+str(int(i)) for i in range(onehot_div.shape[1])])

onehot_home = home_encoder.fit_transform(learning_df.HomeTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_home_df = pd.DataFrame(onehot_home, columns = ['HomeTeam ' + str(int(i)) for i in np.arange(onehot_home.shape[1])])

onehot_away = away_encoder.fit_transform(learning_df.AwayTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_away_df = pd.DataFrame(onehot_away, columns = ['AwayTeam ' + str(int(i)) for i in np.arange(onehot_away.shape[1])])

In [25]:
learning_df = pd.concat([learning_df, onehot_div_df, onehot_home_df, onehot_away_df], axis = 1)
learning_df.drop(columns = ['Div'], inplace = True)

In [26]:
learning_df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A,IWH,IWD,...,AwayTeam 123,AwayTeam 124,AwayTeam 125,AwayTeam 126,AwayTeam 127,AwayTeam 128,AwayTeam 129,AwayTeam 130,AwayTeam 131,AwayTeam 132
0,2019/8/9,Liverpool,Norwich,4.0,1.0,1.14,10.00,19.00,1.15,8.00,...,0,0,0,0,0,0,0,0,0,0
1,2019/8/9,Monaco,Lyon,0.0,3.0,3.00,3.30,2.37,2.90,3.35,...,0,0,0,0,0,0,0,0,0,0
2,2019/8/10,Angers,Bordeaux,3.0,1.0,2.37,3.10,3.20,2.35,3.10,...,0,0,0,0,0,0,0,0,0,0
3,2019/8/10,Bournemouth,Sheffield United,1.0,1.0,1.95,3.60,3.60,1.97,3.55,...,0,0,0,0,0,0,0,0,0,0
4,2019/8/10,Brest,Toulouse,1.0,1.0,2.37,3.20,3.10,2.40,3.20,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,2024/1/3,Granada,Cadiz,2.0,0.0,2.25,3.10,3.50,2.25,3.20,...,0,0,0,0,0,0,0,0,0,0
8043,2024/1/3,Real Madrid,Mallorca,1.0,0.0,1.20,6.50,15.00,1.22,6.50,...,0,0,0,0,0,0,0,0,0,0
8044,2024/1/4,Las Palmas,Barcelona,1.0,2.0,5.50,4.33,1.57,5.50,4.30,...,0,0,0,0,0,0,0,0,0,0
8045,2024/1/4,Osasuna,Almeria,1.0,0.0,1.70,4.00,4.75,1.70,3.90,...,0,0,0,0,0,0,0,0,0,0


**2.2.2 Full Time Result Encoding**

In [27]:
target_encoder = LabelEncoder()
learning_df['Result'] = target_encoder.fit_transform(learning_df.easy_label) 

**2.2.3 Date Encoding**

In [28]:
learning_df['Year'] = pd.DatetimeIndex(learning_df.Date).year

learning_df['Month'] = pd.DatetimeIndex(learning_df.Date).month
learning_df['Sin_Month'] = np.sin(2*np.pi*learning_df.Month/12)
learning_df['Cos_Month'] = np.cos(2*np.pi*learning_df.Month/12)

learning_df['DayofYear'] = pd.DatetimeIndex(learning_df.Date).dayofyear
learning_df['Sin_Day'] = np.sin(2*np.pi*learning_df.DayofYear/365)
learning_df['Cos_Day'] = np.cos(2*np.pi*learning_df.DayofYear/365)

learning_df.drop(columns = ['Date','Month'], inplace = True)
# learning_df.drop(columns = ['Date'], inplace = True)

In [29]:
learning_df

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A,IWH,IWD,IWA,...,AwayTeam 130,AwayTeam 131,AwayTeam 132,Result,Year,Sin_Month,Cos_Month,DayofYear,Sin_Day,Cos_Day
0,Liverpool,Norwich,4.0,1.0,1.14,10.00,19.00,1.15,8.00,18.00,...,0,0,0,2,2019,-0.866025,-0.500000,221,-0.615285,-0.788305
1,Monaco,Lyon,0.0,3.0,3.00,3.30,2.37,2.90,3.35,2.45,...,0,0,0,0,2019,-0.866025,-0.500000,221,-0.615285,-0.788305
2,Angers,Bordeaux,3.0,1.0,2.37,3.10,3.20,2.35,3.10,3.30,...,0,0,0,2,2019,-0.866025,-0.500000,222,-0.628763,-0.777597
3,Bournemouth,Sheffield United,1.0,1.0,1.95,3.60,3.60,1.97,3.55,3.80,...,0,0,0,0,2019,-0.866025,-0.500000,222,-0.628763,-0.777597
4,Brest,Toulouse,1.0,1.0,2.37,3.20,3.10,2.40,3.20,3.10,...,0,0,0,0,2019,-0.866025,-0.500000,222,-0.628763,-0.777597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,Granada,Cadiz,2.0,0.0,2.25,3.10,3.50,2.25,3.20,3.35,...,0,0,0,2,2024,0.500000,0.866025,3,0.051620,0.998667
8043,Real Madrid,Mallorca,1.0,0.0,1.20,6.50,15.00,1.22,6.50,13.00,...,0,0,0,0,2024,0.500000,0.866025,3,0.051620,0.998667
8044,Las Palmas,Barcelona,1.0,2.0,5.50,4.33,1.57,5.50,4.30,1.57,...,0,0,0,1,2024,0.500000,0.866025,4,0.068802,0.997630
8045,Osasuna,Almeria,1.0,0.0,1.70,4.00,4.75,1.70,3.90,4.80,...,0,0,0,2,2024,0.500000,0.866025,4,0.068802,0.997630


In [30]:
# For Test


### 2.3 Feature Engineering <br>
* $\phi(x)$ feature transformation $\Rightarrow$ last match result, win/loss streak to date, wins to season date
* $\phi(x)$ feature engineering $\Rightarrow$ average the home, away, and draw odds

**2.3.1 Last Match Result** <br>
Indicate the result from the last match played between both teams

In [31]:
# 定义一个函数来计算两队之间上一场比赛的结果
def compute_last_matches(df):
    
    unique_matchups = list(set((list(zip(df.HomeTeam, df.AwayTeam)))))
    df['Last Match Result'] = np.nan
    for home, away in unique_matchups:
        matchup_df = df[(df.HomeTeam == home) & (df.AwayTeam == away)]
        # 使用 shift(1) 方法将 FTR（全场比赛结果）列中的数据向下移动一行，这样每行的 last_match_result 将对应于这两队之前的一场比赛的结果。fill_value='Na' 确保了数据移动后空出的位置填充为 'Na'。
        # last_match_result = matchup_df.FTR.shift(1, fill_value='Na')
        last_match_result = matchup_df.easy_label.shift(1, fill_value='Na')
        df.loc[matchup_df.index, 'Last Match Result'] = last_match_result
        
    lmr_encoder = LabelEncoder()
    df['Last Match Result'] = lmr_encoder.fit_transform(df['Last Match Result'])
    df.drop(columns = ['easy_label'], inplace = True)
    return df

In [32]:
def compute_last_n_matches(df, n=5):
    unique_matchups = list(set(zip(df.HomeTeam, df.AwayTeam)))
    df['Last 5 Match Results'] = np.nan  # 新增一列用于存储过去 5 场比赛的结果
    
    for home, away in unique_matchups:
        matchup_df = df[(df.HomeTeam == home) & (df.AwayTeam == away)]
        
        # 获取过去 n 场比赛的结果
        # last_n_results = [matchup_df.FTR.shift(i, fill_value='Na') for i in range(1, n+1)]
        last_n_results = [matchup_df.easy_label.shift(i, fill_value='Na') for i in range(1, n+1)]
        
        # 将计算得到的过去 n 场比赛的结果合并为一个字符串或列表，取决于需求
        # 这里使用字符串形式：'result1/result2/...'
        matchup_df['Last 5 Match Results'] = pd.DataFrame(last_n_results).T.apply(lambda x: '/'.join(x), axis=1)
        
        # 将计算得到的结果更新回原始 df 中
        df.loc[matchup_df.index, 'Last 5 Match Results'] = matchup_df['Last 5 Match Results']
    
    # 对 Last 5 Match Results 列进行标签编码
    lmr_encoder = LabelEncoder()
    df['Last 5 Match Results'] = lmr_encoder.fit_transform(df['Last 5 Match Results'])
    
    # 删除原始的 FTR 列
    df.drop(columns=['easy_label'], inplace=True)
    
    return df


In [33]:
# 定义一个函数来计算两队之间上一场比赛的结果
def compute_last_matches(df):
    
    unique_matchups = list(set((list(zip(df.HomeTeam, df.AwayTeam)))))
    df['Last Match Result'] = np.nan
    for home, away in unique_matchups:
        matchup_df = df[(df.HomeTeam == home) & (df.AwayTeam == away)]
        # 使用 shift(1) 方法将 FTR（全场比赛结果）列中的数据向下移动一行，这样每行的 last_match_result 将对应于这两队之前的一场比赛的结果。fill_value='Na' 确保了数据移动后空出的位置填充为 'Na'。
        # last_match_result = matchup_df.FTR.shift(1, fill_value='Na')
        last_match_result = matchup_df.easy_label.shift(1, fill_value=3)
        df.loc[matchup_df.index, 'Last Match Result'] = last_match_result
        
    lmr_encoder = LabelEncoder()
    df['Last Match Result'] = lmr_encoder.fit_transform(df['Last Match Result'])
    df.drop(columns = ['easy_label'], inplace = True)
    return df
learning_df = compute_last_matches(learning_df)
# learning_df.drop(columns = ['FTR'], inplace = True)

**2.3.2 Home and Away Win/Loss Streak** <br>
Important note about this feature: the win/loss streak is the teams *home* and *away* win streak, *not* its ***consecutive*** win/loss streak.

In [34]:
# https://stackoverflow.com/questions/52976336/compute-winning-streak-with-pandas
# https://joshdevlin.com/blog/calculate-streaks-in-pandas/

In [35]:
def compute_winstreak(df):
    
    years = df.Year.unique()
    df_lst = []    
    for year in years:
        
        year_df = df[df.Year == year]
        year_df['HomeWin'] = year_df.Result.replace([0, 1, 2], [0, 0, 1])
        year_df['AwayWin'] = year_df.Result.replace([0, 1, 2], [1, 0, 0])
        year_df['HomeWinStreak'] = None
        year_df['AwayWinStreak'] = None
        
        hometeams = year_df.HomeTeam.unique()
        awayteams = year_df.AwayTeam.unique()
        
        for team in hometeams:
            team_df = year_df[(year_df.HomeTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.HomeWin != team_df.HomeWin.shift()).cumsum()
            team_df['HomeWinStreak'] = team_df[['HomeWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.HomeWinStreak >0, 'HomeWinStreak'] -= 1
            year_df.loc[team_df.index, 'HomeWinStreak'] = team_df.HomeWinStreak
            
        for team in awayteams:
            team_df = year_df[(year_df.AwayTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.AwayWin != team_df.AwayWin.shift()).cumsum()
            team_df['AwayWinStreak'] = team_df[['AwayWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.AwayWinStreak >0, 'AwayWinStreak'] -= 1
            year_df.loc[team_df.index, 'AwayWinStreak'] = team_df.AwayWinStreak
            
        df_lst.append(year_df)
        
    return pd.concat(df_lst, axis = 0).drop(columns = ['HomeWin', 'AwayWin'])#,'DayofYear'])

In [36]:
learning_df = compute_winstreak(learning_df)

**2.3.4 Season Home/Away Wins to Date** <br>
Indicate the number of wins for a team as home and away to date within current season

In [37]:
toy = learning_df[(learning_df.Year == 2010) & (learning_df.HomeTeam == 'Barcelona')][['HomeTeam', 'AwayTeam', 'Result']]
toy['HomeWin'] = toy.Result.replace([0, 1, 2], [0, 0, 1])
toy['HomeWinsToDate'] = toy.HomeWin.cumsum()

In [38]:
def compute_winstodate(df):
    
    years = df.Year.unique()
    df_lst = []    
    for year in years:
        
        year_df = df[df.Year == year]
        year_df['HomeWin'] = year_df.Result.replace([0, 1, 2], [0, 0, 1])
        year_df['AwayWin'] = year_df.Result.replace([0, 1, 2], [1, 0, 0])
        year_df['HomeWinsToDate'] = None
        year_df['AwayWinsToDate'] = None
        
        hometeams = year_df.HomeTeam.unique()
        awayteams = year_df.AwayTeam.unique()
        
        for team in hometeams:
            team_df = year_df[(year_df.HomeTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_df['HomeWinsToDate'] = team_df.HomeWin.cumsum()
            year_df.loc[team_df.index, 'HomeWinsToDate'] = team_df.HomeWinsToDate
            
        for team in awayteams:
            team_df = year_df[(year_df.AwayTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))
            
            team_df['AwayWinsToDate'] = team_df.AwayWin.cumsum()
            year_df.loc[team_df.index, 'AwayWinsToDate'] = team_df.AwayWinsToDate
            
        df_lst.append(year_df)
        
    return pd.concat(df_lst, axis = 0).drop(columns = ['HomeWin', 'AwayWin','DayofYear'])

In [39]:
learning_df = compute_winstodate(learning_df)
learning_df.drop(columns = ['HomeTeam', 'AwayTeam'], inplace = True)

In [40]:
# learning_df
learning_df

Unnamed: 0,FTHG,FTAG,B365H,B365D,B365A,IWH,IWD,IWA,WHH,WHD,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,4.0,1.0,1.14,10.00,19.00,1.15,8.00,18.00,1.12,8.5,...,2019,-0.866025,-0.500000,-0.615285,-0.788305,3,0,0,1,0
1,0.0,3.0,3.00,3.30,2.37,2.90,3.35,2.45,2.80,3.5,...,2019,-0.866025,-0.500000,-0.615285,-0.788305,3,0,0,0,1
2,3.0,1.0,2.37,3.10,3.20,2.35,3.10,3.30,2.35,3.1,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,1,0
3,1.0,1.0,1.95,3.60,3.60,1.97,3.55,3.80,2.00,3.5,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,0,1
4,1.0,1.0,2.37,3.20,3.10,2.40,3.20,3.10,2.30,3.3,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,2.0,0.0,2.25,3.10,3.50,2.25,3.20,3.35,2.15,3.2,...,2024,0.500000,0.866025,0.051620,0.998667,0,0,0,1,0
8043,1.0,0.0,1.20,6.50,15.00,1.22,6.50,13.00,1.20,6.0,...,2024,0.500000,0.866025,0.051620,0.998667,2,0,0,0,1
8044,1.0,2.0,5.50,4.33,1.57,5.50,4.30,1.57,5.50,4.2,...,2024,0.500000,0.866025,0.068802,0.997630,3,0,0,0,0
8045,1.0,0.0,1.70,4.00,4.75,1.70,3.90,4.80,1.65,3.9,...,2024,0.500000,0.866025,0.068802,0.997630,2,0,0,1,0


In [41]:
# 保存为pkl文件
learning_df.to_pickle('E:/Data/PKL/learning_df.pkl')

**2.3.5 Website Odds** <br>
The `betting odds` recorded by various betting websites offer insight into sentiment surrounding the outcome of a particular game. 

In [42]:
# betting_feats = ['B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', "AHh", "B365AHH", "B365AHA"]
betting_feats = ['B365H', 'B365D', 'B365A']
betting_feats

['B365H', 'B365D', 'B365A']

In [43]:
def compute_meanodds(df, betting_feats):
    """
    """
    home_odds = []
    away_odds = []
    draw_odds = []
    for odd in betting_feats:
        odd_type = odd[-1]
        if odd_type == 'H':
            home_odds.append(odd)
        elif odd_type == 'A':
            away_odds.append(odd)
        else:
            draw_odds.append(odd)
    avg_home_odds = df[home_odds].mean(axis=1)
    avg_away_odds = df[away_odds].mean(axis=1)
    avg_draw_odds = df[draw_odds].mean(axis=1)
    
    ordered_cols = ['HomeOdds', 'AwayOdds', 'DrawOdds'] + df.columns.tolist()
    
    df['HomeOdds'] = avg_home_odds
    df['AwayOdds'] = avg_away_odds
    df['DrawOdds'] = avg_draw_odds
    
    return df[ordered_cols]

In [44]:
learning_df = compute_meanodds(learning_df, betting_feats)

### 2.4 Peek @ Learning DataFrame

In [45]:
learning_df

Unnamed: 0,HomeOdds,AwayOdds,DrawOdds,FTHG,FTAG,B365H,B365D,B365A,IWH,IWD,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,1.14,19.00,10.00,4.0,1.0,1.14,10.00,19.00,1.15,8.00,...,2019,-0.866025,-0.500000,-0.615285,-0.788305,3,0,0,1,0
1,3.00,2.37,3.30,0.0,3.0,3.00,3.30,2.37,2.90,3.35,...,2019,-0.866025,-0.500000,-0.615285,-0.788305,3,0,0,0,1
2,2.37,3.20,3.10,3.0,1.0,2.37,3.10,3.20,2.35,3.10,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,1,0
3,1.95,3.60,3.60,1.0,1.0,1.95,3.60,3.60,1.97,3.55,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,0,1
4,2.37,3.10,3.20,1.0,1.0,2.37,3.20,3.10,2.40,3.20,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,2.25,3.50,3.10,2.0,0.0,2.25,3.10,3.50,2.25,3.20,...,2024,0.500000,0.866025,0.051620,0.998667,0,0,0,1,0
8043,1.20,15.00,6.50,1.0,0.0,1.20,6.50,15.00,1.22,6.50,...,2024,0.500000,0.866025,0.051620,0.998667,2,0,0,0,1
8044,5.50,1.57,4.33,1.0,2.0,5.50,4.33,1.57,5.50,4.30,...,2024,0.500000,0.866025,0.068802,0.997630,3,0,0,0,0
8045,1.70,4.75,4.00,1.0,0.0,1.70,4.00,4.75,1.70,3.90,...,2024,0.500000,0.866025,0.068802,0.997630,2,0,0,1,0


In [46]:
learning_df.drop(columns = ['IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'HomeOdds', 'AwayOdds', 'DrawOdds', 'FTHG', 'FTAG'], inplace = True)

# 3. Model Development

* Establish a baseline Logistic Regression model fit over the entire learning dataframe without special regard to *division* and *team*. 
* Train model over 16 seasons, and predict for the remaining 3 seasons (approximate 80-20 split)

### 3.1 Train and Test Split

In [47]:
split = 0.80
no_seasons = 20

print('No. seasons to train over: ' + str(round(split*no_seasons)))

No. seasons to train over: 16


In [48]:
X, y = learning_df.loc[:, learning_df.columns != 'Result'], learning_df[['Result']]

In [49]:
# full_feat = ['HomeWinStreak','AwayWinStreak','HomeWinsToDate', 'AwayWinsToDate', 'Last Match Result',
#              'HomeTeamELO', 'AwayTeamELO', 'HomeOdds', 'AwayOdds', 'DrawOdds'] + betting_feats

# exclude_feats = ['HomeWinsToDate', 'AwayWinsToDate', 'Last Match Result'] 

In [50]:
# X = X[X.columns[~X.columns.isin(exclude_feats)]]
# X

In [51]:
X

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,1.14,10.00,19.00,-2.25,1.96,1.94,2050.401855,1634.853149,0,1,...,2019,-0.866025,-0.500000,-0.615285,-0.788305,3,0,0,1,0
1,3.00,3.30,2.37,0.00,2.07,1.72,1568.403687,1732.636597,0,0,...,2019,-0.866025,-0.500000,-0.615285,-0.788305,3,0,0,0,1
2,2.37,3.10,3.20,-0.25,2.02,1.88,1560.621948,1558.477295,0,0,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,1,0
3,1.95,3.60,3.60,-0.50,2.01,1.89,1695.075562,1623.745361,0,1,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,0,1
4,2.37,3.20,3.10,-0.25,2.05,1.85,1512.408203,1496.643433,0,0,...,2019,-0.866025,-0.500000,-0.628763,-0.777597,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8042,2.25,3.10,3.50,-0.25,1.94,1.99,1563.726196,1628.365479,0,0,...,2024,0.500000,0.866025,0.051620,0.998667,0,0,0,1,0
8043,1.20,6.50,15.00,-1.75,1.99,1.94,1967.505371,1645.844849,0,0,...,2024,0.500000,0.866025,0.051620,0.998667,2,0,0,0,1
8044,5.50,4.33,1.57,1.00,1.92,2.01,1629.086792,1858.904297,0,0,...,2024,0.500000,0.866025,0.068802,0.997630,3,0,0,0,0
8045,1.70,4.00,4.75,-0.75,1.95,1.98,1649.258545,1554.821045,0,0,...,2024,0.500000,0.866025,0.068802,0.997630,2,0,0,1,0


In [52]:
y

Unnamed: 0,Result
0,2
1,0
2,2
3,0
4,0
...,...
8042,2
8043,0
8044,1
8045,2


In [108]:
split_year = 2023

In [109]:
# 切分训练集和测试集
xTr, xTe = X[X.Year <= split_year], X[X.Year > split_year]
yTr, yTe = y.loc[xTr.index, :], y.loc[xTe.index, :]

### 3.2 Normalization <br>
Following our various feature transformations and development, we arrived to a sparse dataframe with the exception of a few features(*Year, DayofYear*). It will be important to *normalize* these features as they are in gross magnitudes compared to the remaining features. During model training, having dominating features (in scale relative to others) can be dangerous as the weight updates may mistakengly favor these larger-scale features because it will have the largest influence on the target output. 

In [110]:
# minmax_scaler.fit_transform()：这个方法首先拟合数据，即计算数据的最小值和最大值，这些值用于后续的缩放。然后，它将这些参数用于转换数据，将原始数据缩放到0和1之间。
# minmax_scaler.transform()：这个方法使用在训练数据上计算得到的最小值和最大值来转换测试数据。这确保了训练数据和测试数据使用相同的缩放标准。
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
xTr.loc[:, ['Year']] = minmax_scaler.fit_transform(xTr.loc[:, ['Year']])
xTe.loc[:, ['Year']] = minmax_scaler.transform(xTe.loc[:, ['Year']])

In [111]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
# to_scale = ['HomeWinStreak','AwayWinStreak','HomeWinsToDate', 'AwayWinsToDate', 'HomeTeamELO', 'AwayTeamELO', 'HomeOdds', 'AwayOdds', 'DrawOdds'] + betting_feats
to_scale = ['HomeTeamELO', 'AwayTeamELO'] + betting_feats

xTr.loc[:, to_scale] = std_scaler.fit_transform(xTr.loc[:, to_scale])
xTe.loc[:, to_scale] = std_scaler.transform(xTe.loc[:, to_scale])

In [112]:
xTr

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,-0.897150,4.971314,4.420250,-2.25,1.96,1.94,2.949087,-0.463331,0,1,...,0.0,-8.660254e-01,-0.5,-6.152846e-01,-0.788305,3,0,0,1,0
1,0.113310,-0.626709,-0.574913,0.00,2.07,1.72,-1.012535,0.339286,0,0,...,0.0,-8.660254e-01,-0.5,-6.152846e-01,-0.788305,3,0,0,0,1
2,-0.228942,-0.793814,-0.325605,-0.25,2.02,1.88,-1.076494,-1.090232,0,0,...,0.0,-8.660254e-01,-0.5,-6.287628e-01,-0.777597,3,0,0,1,0
3,-0.457111,-0.376051,-0.205457,-0.50,2.01,1.89,0.028602,-0.554505,0,1,...,0.0,-8.660254e-01,-0.5,-6.287628e-01,-0.777597,3,0,0,0,1
4,-0.228942,-0.710261,-0.355643,-0.25,2.05,1.85,-1.472771,-1.597771,0,0,...,0.0,-8.660254e-01,-0.5,-6.287628e-01,-0.777597,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8030,0.384939,-0.793814,-0.610958,0.25,1.98,1.95,-0.494205,0.493468,0,0,...,1.0,-2.449294e-16,1.0,-1.721336e-02,0.999852,0,0,0,6,11
8031,-0.478841,-0.543156,-0.025235,-0.50,1.95,1.98,-0.796193,-1.044273,0,0,...,1.0,-2.449294e-16,1.0,-1.721336e-02,0.999852,2,0,0,10,10
8032,-0.087695,-0.626709,-0.475791,0.00,1.91,1.99,0.289395,0.342852,0,1,...,1.0,-2.449294e-16,1.0,-1.721336e-02,0.999852,1,1,0,14,9
8033,1.335640,0.375922,-0.821217,1.00,1.91,1.99,0.112570,1.898553,0,1,...,1.0,-2.449294e-16,1.0,6.432491e-16,1.000000,0,0,0,11,8


In [113]:
xTe

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
8035,-0.72874,0.375922,0.815803,-1.25,1.99,1.91,2.032941,0.914889,0,1,...,1.25,0.5,0.866025,0.017213,0.999852,0,0,0,1,0
8036,-0.22351,-0.877366,-0.295568,-0.25,2.06,1.87,0.017564,-0.43024,0,0,...,1.25,0.5,0.866025,0.034422,0.999407,1,0,0,0,1
8037,-0.647251,-0.250722,0.665617,-1.0,2.08,1.85,1.12685,-0.835428,0,0,...,1.25,0.5,0.866025,0.034422,0.999407,1,0,0,0,1
8038,-0.402785,-0.459604,-0.205457,-0.5,2.05,1.88,-0.093836,0.206109,0,0,...,1.25,0.5,0.866025,0.034422,0.999407,1,0,0,1,0
8039,-0.185482,-0.250722,-0.460772,0.0,1.85,2.05,0.964589,1.061371,0,1,...,1.25,0.5,0.866025,0.034422,0.999407,0,0,0,0,0
8040,-0.212645,-0.626709,-0.38568,-0.25,2.11,1.82,-0.531461,0.388952,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,2,0,0,1,0
8041,-0.131156,-0.376051,-0.496817,0.0,1.92,2.01,0.755387,1.467545,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,0,0,0,1,0
8042,-0.294133,-0.793814,-0.235494,-0.25,1.94,1.99,-1.05098,-0.516583,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,0,0,0,1,0
8043,-0.864554,2.046974,3.218767,-1.75,1.99,1.94,2.267747,-0.37311,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,2,0,0,0,1
8044,1.471455,0.233883,-0.81521,1.0,1.92,2.01,-0.51377,1.375705,0,0,...,1.25,0.5,0.866025,0.068802,0.99763,3,0,0,0,0


### 3.3 HomeWins Baseline Model

In [114]:
from sklearn.metrics import accuracy_score

In [115]:
xTr.shape

(8035, 289)

In [116]:
xTe.shape

(12, 289)

In [117]:
# training score
baseline_Tr = np.full((xTr.shape[0], 1), 2) 
accuracy_score(yTr.Result.values, baseline_Tr.ravel())

0.4616054760423149

In [118]:
# testing score
baseline_preds_Te = np.full((xTe.shape[0]  , 1), 2) #predicts home wins all the time
accuracy_score(yTe.Result.values, baseline_preds_Te.ravel())

0.5

### 3.4 Multinomial Logistic Regression

**3.4.1** $l2$ Regularized

In [119]:
from sklearn.linear_model import LogisticRegression
l2_lr = LogisticRegression(max_iter = 10000, n_jobs=-1).fit(xTr, yTr.values.ravel())

In [120]:
# training score
accuracy_score(yTr.Result.values, l2_lr.predict(xTr))

0.7512134411947728

In [121]:
# testing score
lr_preds = l2_lr.predict(xTe)
accuracy_score(yTe.Result.values, lr_preds)

0.6666666666666666

**3.4.1** $l2$ Penalty Tuning

In [122]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

logistic_params = {'C':[0.001,0.01,0.10]}

# logistic_randsearch = RandomizedSearchCV(estimator=LogisticRegression(max_iter=10000),
#                                          param_distributions=logistic_params,
logistic_randsearch = GridSearchCV(estimator=LogisticRegression(max_iter=10000),
                                         param_grid=logistic_params,
                                         scoring='accuracy',
                                         verbose=1,
                                         cv=5,
                                         n_jobs=-1)

logistic_rand_results = logistic_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (logistic_rand_results.best_score_, logistic_rand_results.best_params_))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best: 0.726571 using {'C': 0.01}


In [123]:
l2_rs = logistic_rand_results.best_estimator_

In [124]:
# training score
accuracy_score(yTr.Result.values, l2_rs.predict(xTr))

0.7411325451151214

In [125]:
# testing score
accuracy_score(yTe.Result.values, l2_rs.predict(xTe))

0.75

**3.4.4** $l1$ Regularized

In [126]:
l1_lr = LogisticRegression(penalty='l1', solver='saga', max_iter = 10000, n_jobs=-1).fit(xTr, yTr.values.ravel())

In [127]:
# training score
accuracy_score(yTr.Result.values, l1_lr.predict(xTr))

0.7494710640945862

In [128]:
# testing score
l1_preds = l1_lr.predict(xTe)
accuracy_score(yTe.Result.values, l1_preds)

0.6666666666666666

**3.4.5** Penalty Tuning

In [129]:
l1_params = {'C':[0.001,0.01,0.10]}

# l1_randsearch = RandomizedSearchCV(estimator=LogisticRegression(penalty='l1',solver='saga', max_iter=10000),
#                                          param_distributions=l1_params,
l1_randsearch = GridSearchCV(estimator=LogisticRegression(penalty='l1',solver='saga', max_iter=10000),
                                         param_grid=l1_params,
                                         scoring='accuracy',
                                         verbose=1,
                                         n_jobs=-1,
                                         cv=5)

l1_rand_results = l1_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (l1_rand_results.best_score_, l1_rand_results.best_params_))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best: 0.746235 using {'C': 0.01}


In [130]:
l1_rs = l1_randsearch.best_estimator_ #LogisticRegression(C=0.10, solver='saga', max_iter=10000).fit(xTr, yTr.values.ravel())#

In [131]:
# training score
accuracy_score(yTr.Result.values, l1_rs.predict(xTr))

0.7489732420659614

In [132]:
# testing score
accuracy_score(yTe.Result.values, l1_rs.predict(xTe))

0.8333333333333334

### 3.5 Support Vector Machine

In [133]:
from sklearn.svm import SVC
svm = SVC(max_iter=100000).fit(xTr, yTr.values.ravel())

In [134]:
# training score
accuracy_score(yTr.Result.values, svm.predict(xTr))

0.7650280024891102

In [135]:
# testing score
accuracy_score(yTe.Result.values, svm.predict(xTe))

0.8333333333333334

**3.5.2** Penalty Tuning

In [136]:
svm_params = {'C':[0.001,0.01,0.10]}

# svm_randsearch = RandomizedSearchCV(estimator=SVC(max_iter=100000),
#                                          param_distributions=svm_params,
svm_randsearch = GridSearchCV(estimator=SVC(max_iter=100000),
                                         param_grid=svm_params,
                                         scoring='accuracy',
                                         verbose=2,
                                         cv=5,
                                         n_jobs=-1)

svm_rand_results = svm_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (svm_rand_results.best_score_, svm_rand_results.best_params_))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best: 0.733790 using {'C': 0.1}


In [137]:
svm_rs = svm_rand_results.best_estimator_

In [138]:
# training score
accuracy_score(yTr.Result.values, svm_rs.predict(xTr))

0.7438705662725575

In [139]:
# testing score
accuracy_score(yTe.Result.values, svm_rs.predict(xTe))

0.8333333333333334

### 3.6 Simple Neural Network ####

In [140]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(512,128,32),
                    activation='relu',
                    batch_size=64,
                    max_iter=200,
                    learning_rate_init=1e-4,
                    early_stopping=False,
                    alpha=1e-3,
                   ).fit(xTr, yTr.values.ravel())

In [141]:
# training score
accuracy_score(yTr.Result.values, mlp.predict(xTr))

1.0

In [142]:
# testing score
accuracy_score(yTe.Result.values, mlp.predict(xTe))

1.0

In [143]:
xTr

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
0,-0.897150,4.971314,4.420250,-2.25,1.96,1.94,2.949087,-0.463331,0,1,...,0.0,-8.660254e-01,-0.5,-6.152846e-01,-0.788305,3,0,0,1,0
1,0.113310,-0.626709,-0.574913,0.00,2.07,1.72,-1.012535,0.339286,0,0,...,0.0,-8.660254e-01,-0.5,-6.152846e-01,-0.788305,3,0,0,0,1
2,-0.228942,-0.793814,-0.325605,-0.25,2.02,1.88,-1.076494,-1.090232,0,0,...,0.0,-8.660254e-01,-0.5,-6.287628e-01,-0.777597,3,0,0,1,0
3,-0.457111,-0.376051,-0.205457,-0.50,2.01,1.89,0.028602,-0.554505,0,1,...,0.0,-8.660254e-01,-0.5,-6.287628e-01,-0.777597,3,0,0,0,1
4,-0.228942,-0.710261,-0.355643,-0.25,2.05,1.85,-1.472771,-1.597771,0,0,...,0.0,-8.660254e-01,-0.5,-6.287628e-01,-0.777597,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8030,0.384939,-0.793814,-0.610958,0.25,1.98,1.95,-0.494205,0.493468,0,0,...,1.0,-2.449294e-16,1.0,-1.721336e-02,0.999852,0,0,0,6,11
8031,-0.478841,-0.543156,-0.025235,-0.50,1.95,1.98,-0.796193,-1.044273,0,0,...,1.0,-2.449294e-16,1.0,-1.721336e-02,0.999852,2,0,0,10,10
8032,-0.087695,-0.626709,-0.475791,0.00,1.91,1.99,0.289395,0.342852,0,1,...,1.0,-2.449294e-16,1.0,-1.721336e-02,0.999852,1,1,0,14,9
8033,1.335640,0.375922,-0.821217,1.00,1.91,1.99,0.112570,1.898553,0,1,...,1.0,-2.449294e-16,1.0,6.432491e-16,1.000000,0,0,0,11,8


In [144]:
yTr

Unnamed: 0,Result
0,2
1,0
2,2
3,0
4,0
...,...
8030,2
8031,0
8032,2
8033,2


In [145]:
xTe

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
8035,-0.72874,0.375922,0.815803,-1.25,1.99,1.91,2.032941,0.914889,0,1,...,1.25,0.5,0.866025,0.017213,0.999852,0,0,0,1,0
8036,-0.22351,-0.877366,-0.295568,-0.25,2.06,1.87,0.017564,-0.43024,0,0,...,1.25,0.5,0.866025,0.034422,0.999407,1,0,0,0,1
8037,-0.647251,-0.250722,0.665617,-1.0,2.08,1.85,1.12685,-0.835428,0,0,...,1.25,0.5,0.866025,0.034422,0.999407,1,0,0,0,1
8038,-0.402785,-0.459604,-0.205457,-0.5,2.05,1.88,-0.093836,0.206109,0,0,...,1.25,0.5,0.866025,0.034422,0.999407,1,0,0,1,0
8039,-0.185482,-0.250722,-0.460772,0.0,1.85,2.05,0.964589,1.061371,0,1,...,1.25,0.5,0.866025,0.034422,0.999407,0,0,0,0,0
8040,-0.212645,-0.626709,-0.38568,-0.25,2.11,1.82,-0.531461,0.388952,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,2,0,0,1,0
8041,-0.131156,-0.376051,-0.496817,0.0,1.92,2.01,0.755387,1.467545,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,0,0,0,1,0
8042,-0.294133,-0.793814,-0.235494,-0.25,1.94,1.99,-1.05098,-0.516583,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,0,0,0,1,0
8043,-0.864554,2.046974,3.218767,-1.75,1.99,1.94,2.267747,-0.37311,0,0,...,1.25,0.5,0.866025,0.05162,0.998667,2,0,0,0,1
8044,1.471455,0.233883,-0.81521,1.0,1.92,2.01,-0.51377,1.375705,0,0,...,1.25,0.5,0.866025,0.068802,0.99763,3,0,0,0,0


In [146]:
yTe

Unnamed: 0,Result
8035,2
8036,0
8037,0
8038,2
8039,1
8040,2
8041,2
8042,2
8043,0
8044,1


### 3.7 Stacked Classifier ###

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [91]:
from sklearn.ensemble import StackingClassifier
stacked_clf = StackingClassifier(estimators=[('svm', SVC(max_iter=100000)), ('logistic', LogisticRegression(C=0.01, max_iter=10000))],
                                final_estimator=LogisticRegression(max_iter=10000),
                                n_jobs=-1).fit(xTr, yTr.values.ravel())

In [92]:
# training score
accuracy_score(yTr.Result.values, stacked_clf.predict(xTr))

0.7562929061784897

In [93]:
# testing score
accuracy_score(yTe.Result.values, stacked_clf.predict(xTe))

0.7589424572317263

## 4. Result Analysis ##

In [94]:
## TODO: breakdown results across divisions and/or teams; i.e., see how model performs individually at subgroups

## 5. Scrap Code ##

In [95]:
barcelona_df = learning_df[(learning_df['HomeTeam 17'] == 1) | (learning_df['AwayTeam 17'] == 1)]
barcelona_df

Unnamed: 0,B365H,B365D,B365A,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,Div 0,Div 1,...,Year,Sin_Month,Cos_Month,Sin_Day,Cos_Day,Last Match Result,HomeWinStreak,AwayWinStreak,HomeWinsToDate,AwayWinsToDate
3568,1.53,4.50,5.75,-1.00,1.99,1.94,1769.324219,1563.644165,1,0,...,2021,-8.660254e-01,-5.000000e-01,-0.680773,-0.732494,3,0,0,5,0
3591,2.62,3.50,2.62,0.00,2.01,1.92,1561.397583,1672.445435,1,0,...,2021,-8.660254e-01,-5.000000e-01,-0.763889,-0.645348,3,0,0,1,8
3648,1.83,3.75,4.20,-0.50,1.90,2.03,1607.371338,1575.017212,1,0,...,2021,-8.660254e-01,-5.000000e-01,-0.835925,-0.548843,3,1,0,6,0
3704,2.80,3.40,2.50,0.00,2.01,1.92,1569.211182,1619.204712,1,0,...,2021,-1.000000e+00,-1.836970e-16,-0.948362,-0.317191,3,0,0,1,4
3738,1.06,13.00,29.00,-3.00,1.86,2.07,2012.453613,1557.261597,1,0,...,2021,-1.000000e+00,-1.836970e-16,-0.976011,-0.217723,3,1,0,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7786,2.20,3.60,3.10,-0.25,1.96,1.97,1577.828491,1609.596436,1,0,...,2023,-5.000000e-01,8.660254e-01,-0.566702,0.823923,3,0,1,3,8
7811,2.63,3.60,2.55,0.00,1.96,1.97,1610.973022,1700.016113,1,0,...,2023,-2.449294e-16,1.000000e+00,-0.478734,0.877960,0,0,0,9,5
7865,1.67,4.20,4.75,-0.75,1.93,2.00,1675.109497,1623.018921,1,0,...,2023,-2.449294e-16,1.000000e+00,-0.385663,0.922640,1,0,0,6,8
7919,2.70,3.40,2.60,0.00,1.98,1.95,1608.252686,1654.181030,1,0,...,2023,-2.449294e-16,1.000000e+00,-0.255353,0.966848,2,1,0,10,6


In [96]:
bxTr = xTr[(xTr['HomeTeam 17'] == 1) | (xTr['AwayTeam 17'] == 1)]
bxTe = xTe[(xTe['HomeTeam 17'] == 1) | (xTe['AwayTeam 17'] == 1)]

In [97]:
byTr, byTe = yTr.loc[bxTr.index,:], yTe.loc[bxTe.index,:]

In [98]:
# training score
accuracy_score(byTr, l1_lr.predict(bxTr))

0.8163265306122449

In [99]:
# testing score
accuracy_score(byTe, l1_lr.predict(bxTe))

0.6

In [100]:
# training score
accuracy_score(byTr, l2_lr.predict(bxTr))

0.7551020408163265

In [101]:
# testing score
accuracy_score(byTe, l2_lr.predict(bxTe))

0.6

## 6. Pytorch MLP ##

In [102]:
type(xTr)

pandas.core.frame.DataFrame

In [103]:
xTr.shape

(6118, 289)

In [104]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [105]:
class Attention(nn.Module):
    def __init__(self, feature_dim):
        super(Attention, self).__init__()
        self.attention_weights = nn.Parameter(torch.randn(feature_dim))

    def forward(self, x):
        # 应用注意力权重
        weights = F.softmax(self.attention_weights, dim=0)
        # 加权求和
        x = x * weights
        return x
    
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(in_features=xTr.shape[1], out_features=512)
        self.bn1 = nn.BatchNorm1d(num_features=512)
        self.attention = Attention(512)
        self.dropout1 = nn.Dropout(p=0.2)
        
        self.fc2 = nn.Linear(in_features=512, out_features=128)
        self.bn2 = nn.BatchNorm1d(num_features=128)
        self.dropout2 = nn.Dropout(p=0.2)
        
        self.fc3 = nn.Linear(in_features=128, out_features=32)
        self.bn3 = nn.BatchNorm1d(num_features=32)
        self.dropout3 = nn.Dropout(p=0.2)
        
        self.fc4 = nn.Linear(in_features=32, out_features=3)  # 输出层改为3，对应三个类别

    def forward(self, x):
        x = self.dropout1(torch.relu(self.bn1(self.fc1(x))))
        x = self.attention(x)
        x = self.dropout2(torch.relu(self.bn2(self.fc2(x))))
        x = self.dropout3(torch.relu(self.bn3(self.fc3(x))))
        x = self.fc4(x)
        return x

# 数据预处理
scaler = StandardScaler()
xTr_scaled = scaler.fit_transform(xTr)
xTr_tensor = torch.tensor(xTr_scaled, dtype=torch.float32).to(device)
yTr_tensor = torch.tensor(yTr.values.ravel(), dtype=torch.long).to(device)

# 创建数据加载器
dataset = TensorDataset(xTr_tensor, yTr_tensor)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)

# 创建模型实例
model = MLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)

train_start = time.time()
# 训练模型
model.train()
for epoch in range(500):  # 假设训练200个epoch
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()  # 清除之前的梯度

        outputs = model(inputs)  # 前向传播
        loss = criterion(outputs, labels)  # 计算损失
        loss.backward()  # 反向传播
        optimizer.step()  # 更新参数

        running_loss += loss.item() * inputs.size(0)  # 累计损失
        _, predicted = torch.max(outputs.data, 1)  # 获取预测结果
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / total
    epoch_acc = correct / total * 100  # 计算准确率

    # 每个epoch结束后输出
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
print(f'训练时长： {time.time() - train_start}s')

Epoch 1, Loss: 1.1728, Accuracy: 30.99%
Epoch 2, Loss: 1.1524, Accuracy: 32.80%
Epoch 3, Loss: 1.1357, Accuracy: 34.90%
Epoch 4, Loss: 1.1194, Accuracy: 36.83%
Epoch 5, Loss: 1.1124, Accuracy: 37.25%
Epoch 6, Loss: 1.0967, Accuracy: 39.36%
Epoch 7, Loss: 1.0850, Accuracy: 41.30%
Epoch 8, Loss: 1.0721, Accuracy: 42.82%
Epoch 9, Loss: 1.0589, Accuracy: 45.88%
Epoch 10, Loss: 1.0478, Accuracy: 46.71%
Epoch 11, Loss: 1.0350, Accuracy: 48.66%
Epoch 12, Loss: 1.0227, Accuracy: 49.12%
Epoch 13, Loss: 1.0122, Accuracy: 51.19%
Epoch 14, Loss: 0.9978, Accuracy: 52.89%
Epoch 15, Loss: 0.9824, Accuracy: 54.66%
Epoch 16, Loss: 0.9681, Accuracy: 56.54%
Epoch 17, Loss: 0.9584, Accuracy: 57.75%
Epoch 18, Loss: 0.9532, Accuracy: 57.65%
Epoch 19, Loss: 0.9416, Accuracy: 58.16%
Epoch 20, Loss: 0.9313, Accuracy: 59.84%
Epoch 21, Loss: 0.9149, Accuracy: 60.77%
Epoch 22, Loss: 0.9033, Accuracy: 61.31%
Epoch 23, Loss: 0.8844, Accuracy: 63.78%
Epoch 24, Loss: 0.8761, Accuracy: 64.86%
Epoch 25, Loss: 0.8645, A

In [106]:
# 假设 xTe 和 yTe 是 pandas DataFrame 或 Series
# 数据预处理
xTe_scaled = scaler.fit_transform(xTe)  # 使用与训练数据相同的标准化参数
xTe_tensor = torch.tensor(xTe_scaled, dtype=torch.float32).to(device)
yTe_tensor = torch.tensor(yTe.values.ravel(), dtype=torch.long).to(device)

# 创建数据加载器
test_dataset = TensorDataset(xTe_tensor, yTe_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 设置模型为评估模式
model.eval()

# 初始化用于计算准确率的变量
correct = 0
total = 0

# 不计算梯度，因为在评估模式下不需要进行反向传播
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# 计算准确率
accuracy = correct / total
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

Accuracy on test set: 69.67%


## 7. Pytorch Transformer ##

In [107]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class TransformerModel(nn.Module):
    def __init__(self, input_dim, num_classes, num_heads, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.input_dim = input_dim
        self.num_classes = num_classes
        self.model_dim = input_dim  # 通常情况下，模型维度与输入维度相同

        # Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.model_dim,
            nhead=num_heads,
            dim_feedforward=512,  # 前馈网络的维度
            dropout=dropout,
            batch_first=True
        )

        # Transformer Encoder
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 输出层
        self.output_layer = nn.Linear(self.model_dim, self.num_classes)

        # Batch Normalization
        self.bn = nn.BatchNorm1d(self.model_dim)

    def forward(self, x):
        # 增加一个假的序列维度
        x = x.unsqueeze(1)
        # Transformer Encoder
        x = self.transformer_encoder(x)

        # Batch Normalization
        x = self.bn(x[:, 0, :])  # 取序列的第一个元素进行批量归一化

        # 输出层
        x = self.output_layer(x)
        return x

# 补充维度
n_samples_xTr = xTr.shape[0]
n_samples_xTe = xTe.shape[0]
for i in range(1, 4):  # 从 1 到 3，因为需要添加三列
    xTr[f'pad{i}'] = 0  # 添加填充列，初始化为 0
    xTe[f'pad{i}'] = 0  # 添加填充列，初始化为 0

# 参数设置
input_dim = xTr.shape[1]  # 输入特征的维度
num_classes = 3  # 类别数
num_heads = 10  # 注意力头的数量
num_layers = 3  # Transformer层的数量
dropout = 0.8  # Dropout比率

# 创建模型
model = TransformerModel(input_dim, num_classes, num_heads, num_layers, dropout).to(device)

# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# 数据加载
# 假设 xTr 和 xTe 已经是适当的 torch.Tensor 对象
xTr_values = xTr.values.astype(float)
xTe_values = xTe.values.astype(float)

xTr_tensor = torch.tensor(xTr_values, dtype=torch.float32).to(device)
xTe_tensor = torch.tensor(xTe_values, dtype=torch.float32).to(device)
yTr_tensor = torch.tensor(yTr.values, dtype=torch.long).to(device).squeeze(1)
yTe_tensor = torch.tensor(yTe.values, dtype=torch.long).to(device).squeeze(1)
# 转换为 one-hot 编码
yTr_tensor = F.one_hot(yTr_tensor, num_classes=num_classes).float()
yTe_tensor = F.one_hot(yTe_tensor, num_classes=num_classes).float()

# 数据加载器
train_dataset = TensorDataset(xTr_tensor, yTr_tensor)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=False)

test_dataset = TensorDataset(xTe_tensor, yTe_tensor)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

train_start = time.time()
# 训练模型
model.train()
for epoch in range(500):  # 运行更多的 epoch 以获得更好的结果
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)  # 累计损失
        _, predicted = torch.max(outputs.data, 1)  # 获取预测结果
        _, truth = torch.max(labels.data, 1)
        total += truth.size(0)
        correct += (predicted == truth).sum().item()

    epoch_loss = running_loss / total
    epoch_acc = correct / total * 100  # 计算准确率

    # 每个epoch结束后输出
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
print(f'训练时长： {time.time() - train_start}s')

# 设置模型为评估模式
model.eval()

# 初始化用于计算准确率的变量
correct = 0
total = 0

# 不计算梯度，因为在评估模式下不需要进行反向传播
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        _, truth = torch.max(labels.data, 1)
        total += labels.size(0)
        correct += (predicted == truth).sum().item()

# 计算准确率
accuracy = correct / total
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

AssertionError: embed_dim must be divisible by num_heads