In [193]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from joblib import dump, load
import joblib

## 0. DataFrame

In [194]:
football_df = pd.read_csv('data/all_data_with_elo.csv', low_memory = False)
football_df

In [195]:
# Move every date 6 months earlier; the shifted date is used to split the data into seasons
# Convert the date strings to datetime values using an explicit format
football_df['Date'] = pd.to_datetime(football_df['Date'], format='%Y/%m/%d')
football_df['Date'] = football_df['Date'] - pd.DateOffset(months=6)
football_df

## 1. Descriptive Statistics 

**1.1 DataFrame Shape**

In [196]:
# no. rows and no. cols
football_df.shape

In [197]:
# feature names
print(football_df.columns.tolist())

**1.2 NaN Values**

In [198]:
football_df.isnull().sum()

In [199]:
# total elements in 
football_df.size

In [200]:
# total number of NaN
football_df.size - football_df.count().sum()

In [201]:
# total number of NaN rows
football_df.isnull().any(axis = 1).sum()

In [202]:
# total number of NaN columns
football_df.isnull().any(axis = 0).sum()

## 2. Data Wrangling and Feature Transformation/Development

### 2.1 NaN Handling

`TODO`: drop NaN values along columns: {Date, Home Team, Away Team, FTR} <br>
`TODO`: identify betting odds w/ most available data

In [203]:
# Original note: the current approach keeps only these fields: division, date, home team, away team,
# full-time result, win/draw/loss odds from three bookmakers, home-team ELO and away-team ELO.
# NOTE(review): the active list below differs from that description (it also holds goals and
# Asian-handicap columns and omits the IW* odds) - the commented-out list is the older variant.
# nan_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A', 
#             'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO']
nan_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'B365H', 'B365D', 'B365A', 'WHH', 'WHD', 'WHA', 'AHh', 'B365AHH', 'B365AHA', 'AHCh', 'B365CAHH', 'B365CAHA','HomeTeamELO', 'AwayTeamELO']

In [204]:
nan_football_df = football_df.dropna(subset = nan_mask)
nan_football_df

In [205]:
# Bucket the closing Asian-handicap line (AHCh) into a coarse integer "balance" value.
# np.select picks the first matching condition; a row matching none of them falls back
# to np.select's default of 0.
closing_handicap = nan_football_df['AHCh']
conditions = [
    closing_handicap >= 2.5,                  # AHCh >= 2.5            -> 3
    closing_handicap.between(1.5, 2.25),      # 1.5 <= AHCh <= 2.25    -> 2
    closing_handicap.between(0.25, 1.25),     # 0.25 <= AHCh <= 1.25   -> 1
    closing_handicap == 0,                    # AHCh == 0              -> 0
    closing_handicap.between(-1.25, -0.25),   # -1.25 <= AHCh <= -0.25 -> -1
    closing_handicap.between(-2.25, -1.5),    # -2.25 <= AHCh <= -1.5  -> -2
    closing_handicap <= -2.5,                 # AHCh <= -2.5           -> -3
]
labels = [3, 2, 1, 0, -1, -2, -3]

nan_football_df['balance_val'] = np.select(conditions, labels)
nan_football_df

In [206]:
nan_football_df['asia_final_result'] = nan_football_df['FTHG'] - nan_football_df['FTAG'] + nan_football_df['balance_val']
nan_football_df

In [207]:
nan_football_df_noNone = nan_football_df.dropna(subset = nan_mask)
nan_football_df_noNone

In [208]:
nan_football_df_noNone.reset_index(inplace=True, drop=True)
nan_football_df_noNone

In [209]:
conditions = [
    nan_football_df_noNone['asia_final_result'] < 0,
    nan_football_df_noNone['asia_final_result'] == 0,
    nan_football_df_noNone['asia_final_result'] > 0,
]
easy_labels = [0, 1, 2]

nan_football_df_noNone['easy_label'] = np.select(conditions, easy_labels)
nan_football_df_noNone

In [210]:
# resize shape
football_df.shape[0] - nan_football_df_noNone.shape[0]

### 2.2 Feature Encoding <br>
* $\phi(Date)$ $\Rightarrow$ one column for *year*, second column for *month*, third column for *day of year*
* One hot encode Division, Home and Away Teams
* Label encode Full Time Result (Win/Draw/Loss)

In [211]:
# Build the feature list as a NEW list. Binding `feats = nan_mask` and appending would
# mutate nan_mask itself: re-running this cell would add duplicate column names, and
# re-running the earlier dropna(subset=nan_mask) cell would then fail on columns that
# do not exist in the raw frame.
feats = nan_mask + ['easy_label', 'balance_val']

In [212]:
nan_football_df_noNone

In [213]:
learning_df_feat = nan_football_df_noNone.copy()[feats]
learning_df_feat

In [214]:
learning_df_feat.reset_index(inplace=True, drop=True)
# Save the selected columns as the historical data file.
# Forward slashes only: the previous '.\p...' spelling relied on an invalid escape
# sequence ('\p') and mixed path separators; './' resolves to the same location on Windows
# and also works on other platforms.
learning_df_feat.to_csv('./prediction_data/history_data_balance.csv', index=False, encoding='utf-8-sig')
learning_df_feat

**2.2.1 Division and Home/Away Team Encoding**

In [215]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

div_encoder = OneHotEncoder()
home_encoder = OneHotEncoder()
away_encoder = OneHotEncoder()

In [216]:
# One-hot encode the division and both team columns. Each encoder is fitted on a single
# (n_rows, 1) column and the dense 0/1 matrix is wrapped in a frame whose columns are
# named "<prefix> <category position>".
div_column = learning_df_feat[['Div']].to_numpy()
onehot_div = div_encoder.fit_transform(div_column).toarray().astype(int)
onehot_div_df = pd.DataFrame(onehot_div, columns=[f'Div {pos}' for pos in range(onehot_div.shape[1])])

home_column = learning_df_feat[['HomeTeam']].to_numpy()
onehot_home = home_encoder.fit_transform(home_column).toarray().astype(int)
onehot_home_df = pd.DataFrame(onehot_home, columns=[f'HomeTeam {pos}' for pos in range(onehot_home.shape[1])])

away_column = learning_df_feat[['AwayTeam']].to_numpy()
onehot_away = away_encoder.fit_transform(away_column).toarray().astype(int)
onehot_away_df = pd.DataFrame(onehot_away, columns=[f'AwayTeam {pos}' for pos in range(onehot_away.shape[1])])

In [217]:
# 保存编码器到本地
joblib.dump(div_encoder, 'div_encoder.pkl')
joblib.dump(home_encoder, 'home_encoder.pkl')
joblib.dump(away_encoder, 'away_encoder.pkl')

In [218]:
learning_df_div = pd.concat([learning_df_feat, onehot_div_df, onehot_home_df, onehot_away_df], axis = 1)
learning_df_div.drop(columns = ['Div'], inplace = True)

In [219]:
learning_df_div

**2.2.2 Full Time Result Encoding**

In [220]:
# Integer-encode the two candidate targets:
#   'Result'     <- easy_label (the handicap-adjusted outcome label)
#   'Result_FTR' <- FTR (the raw full-time result)
# NOTE(review): the same encoder object is fitted twice, so after this cell
# target_encoder holds the FTR classes, not the easy_label classes. Using it to
# inverse_transform predictions of 'Result' would mislabel them - confirm whether a
# separate encoder per column is wanted.
target_encoder = LabelEncoder()
learning_df_div['Result'] = target_encoder.fit_transform(learning_df_div.easy_label) 
learning_df_div['Result_FTR'] = target_encoder.fit_transform(learning_df_div.FTR)

**2.2.3 Date Encoding**

In [221]:
# Derive calendar features from Date (which an earlier cell shifted back by 6 months).
learning_df_div['Year'] = pd.DatetimeIndex(learning_df_div.Date).year

# Month plus a sine/cosine pair so the encoding wraps around with a period of 12.
learning_df_div['Month'] = pd.DatetimeIndex(learning_df_div.Date).month
learning_df_div['Sin_Month'] = np.sin(2*np.pi*learning_df_div.Month/12)
learning_df_div['Cos_Month'] = np.cos(2*np.pi*learning_df_div.Month/12)

# Day of year with the same cyclic encoding; the period is fixed at 365, so day 366 of a
# leap year lands slightly past one full cycle.
learning_df_div['DayofYear'] = pd.DatetimeIndex(learning_df_div.Date).dayofyear
learning_df_div['Sin_Day'] = np.sin(2*np.pi*learning_df_div.DayofYear/365)
learning_df_div['Cos_Day'] = np.cos(2*np.pi*learning_df_div.DayofYear/365)

# Note: drop(..., inplace=True) modifies the original frame and returns None, so its
# result must not be assigned; the non-inplace form below returns the new frame.
# learning_df = learning_df_div.drop(columns = ['Date','Month'], inplace = True)
# learning_df = learning_df_div.drop(columns = ['Date','Month'])
learning_df = learning_df_div.drop(columns = ['Date'])
# learning_df.drop(columns = ['Date'], inplace = True)
# learning_df.drop(columns = ['Date'], inplace = True)

In [222]:
learning_df

In [223]:
# For Test


### 2.3 Feature Engineering <br>
* $\phi(x)$ feature transformation $\Rightarrow$ last match result, win/loss streak to date, wins to season date
* $\phi(x)$ feature engineering $\Rightarrow$ average the home, away, and draw odds

**2.3.1 Last Match Result** <br>
Indicate the result from the last match played between both teams

# Compute the result of the previous match between the same two teams
# NOTE(review): a function with this same name is defined again further down in this
# file with a different body (it shifts `Result` instead of `easy_label`); in a
# top-to-bottom run the later definition replaces this one.
def compute_last_matches(df):
    """Add an integer-encoded 'Last Match Result' column and drop 'easy_label'.

    For every distinct (HomeTeam, AwayTeam) pairing, the pairing's `easy_label`
    values are shifted down by one row, so each match receives the label of the
    preceding row of that pairing (in the frame's current row order); the first
    row of a pairing receives the filler 'Na'. The column is then label-encoded.

    The frame is modified in place and also returned.

    NOTE(review): if `easy_label` is numeric, the column mixes numbers with the
    string 'Na' before encoding - confirm the encoder accepts that.
    """
    
    unique_matchups = list(set((list(zip(df.HomeTeam, df.AwayTeam)))))
    df['Last Match Result'] = np.nan
    for home, away in unique_matchups:
        matchup_df = df[(df.HomeTeam == home) & (df.AwayTeam == away)]
        # shift(1) moves the label column down one row so that each row sees the outcome of
        # the previous match between these two teams; fill_value='Na' fills the slot that
        # the shift leaves empty.
        # last_match_result = matchup_df.FTR.shift(1, fill_value='Na')
        last_match_result = matchup_df.easy_label.shift(1, fill_value='Na')
        df.loc[matchup_df.index, 'Last Match Result'] = last_match_result
        
    lmr_encoder = LabelEncoder()
    df['Last Match Result'] = lmr_encoder.fit_transform(df['Last Match Result'])
    df.drop(columns = ['easy_label'], inplace = True)
    return df

In [224]:
def compute_last_n_matches(df, n=5):
    """Add a label-encoded summary of the previous ``n`` results of each fixture.

    A fixture is a (HomeTeam, AwayTeam) pairing. For every row, the `easy_label`
    values of the ``n`` preceding rows of the same fixture (in the frame's
    current row order, most recent first) are joined as ``'r1/r2/.../rn'``;
    missing history is written as ``'Na'``. The joined strings are then
    integer-encoded with a LabelEncoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HomeTeam', 'AwayTeam' and 'easy_label'. Modified in place.
    n : int, default 5
        Number of previous meetings to include; must be >= 1.

    Returns
    -------
    pandas.DataFrame
        The same frame, with a new 'Last {n} Match Results' column
        ('Last 5 Match Results' for the default) and without 'easy_label'.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    history_col = f'Last {n} Match Results'

    # Work on string labels: the outcome labels may be integers, and str.join-style
    # concatenation with the 'Na' filler only works on strings.
    labels = df['easy_label'].astype(str)

    # One grouped shift per lag replaces the per-fixture boolean-mask loop and avoids
    # writing into a slice of the frame.
    per_fixture = labels.groupby([df['HomeTeam'], df['AwayTeam']], sort=False)
    history = per_fixture.shift(1, fill_value='Na')
    for lag in range(2, n + 1):
        history = history + '/' + per_fixture.shift(lag, fill_value='Na')

    # Integer-encode the history strings.
    lmr_encoder = LabelEncoder()
    df[history_col] = lmr_encoder.fit_transform(history)

    # The raw label is no longer needed once its history has been encoded.
    df.drop(columns=['easy_label'], inplace=True)

    return df


In [225]:
# Compute the result of the previous match between the same two teams
def compute_last_matches(df):
    """Add an integer-encoded 'Last Match Result' column.

    For every (HomeTeam, AwayTeam) pairing, each row receives the `Result` value
    of the preceding row of that pairing (in the frame's current row order). The
    first row of a pairing has no history and receives the filler value 3. The
    column is then label-encoded.

    `Result` is used rather than `easy_label` (the original author notes that
    easy_label is not suitable as the previous-result value).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HomeTeam', 'AwayTeam', 'Result', 'easy_label' and 'FTR'.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with 'Last Match Result' added and the 'easy_label' and
        'FTR' columns removed.
    """
    # A single grouped shift gives every row the previous result of its own pairing;
    # this replaces one full-frame boolean mask per distinct pairing.
    previous_result = (
        df.groupby(['HomeTeam', 'AwayTeam'], sort=False)['Result']
          .shift(1, fill_value=3)
    )

    lmr_encoder = LabelEncoder()
    df['Last Match Result'] = lmr_encoder.fit_transform(previous_result)
    df.drop(columns=['easy_label', 'FTR'], inplace=True)
    return df
learning_df = compute_last_matches(learning_df)
# learning_df.drop(columns = ['FTR'], inplace = True)

In [226]:
learning_df

**2.3.2 Home and Away Win/Loss Streak** <br>
Important note about this feature: the win/loss streak is the team's *home* and *away* win streak, *not* its ***consecutive*** win/loss streak.

In [227]:
# https://stackoverflow.com/questions/52976336/compute-winning-streak-with-pandas
# https://joshdevlin.com/blog/calculate-streaks-in-pandas/

In [228]:
def compute_winstreak(df):
    """Add each team's current home / away winning streak going into every match.

    Within each `Year`, a team's home matches are ordered by `DayofYear` and
    'HomeWinStreak' is the number of consecutive home wins (Result == 2)
    immediately before the match; the match itself is not counted. The same is
    done for away matches with 'AwayWinStreak' (away win: Result == 0). Any other
    Result value (draw, loss, or a placeholder such as 3) ends the streak.
    Streaks restart at 0 in every year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Year', 'DayofYear', 'HomeTeam', 'AwayTeam' and 'Result'.
        Not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of `df`, grouped by year in order of first appearance, with
        integer 'HomeWinStreak' and 'AwayWinStreak' columns appended.
    """
    # (team column, Result value that counts as a win for that side, output column)
    sides = (
        ('HomeTeam', 2, 'HomeWinStreak'),
        ('AwayTeam', 0, 'AwayWinStreak'),
    )

    season_frames = []
    for year in df.Year.unique():
        # Explicit copy so the new columns are written to an independent frame,
        # not to a slice of the caller's frame.
        year_df = df[df.Year == year].copy()
        # Stable sort keeps the original row order for matches on the same day.
        ordered = year_df.sort_values('DayofYear', kind='stable')

        for team_col, winning_result, streak_col in sides:
            teams = ordered[team_col]
            won = (ordered.Result == winning_result).astype(int)
            # Every non-win starts a new run; numbering the runs per team lets a
            # grouped cumsum count the wins inside the current run.
            run_id = (1 - won).groupby(teams).cumsum()
            streak_including_match = won.groupby([teams, run_id]).cumsum()
            # Shift by one match per team so the current match is excluded.
            # Assignment aligns on the index, restoring year_df's row order.
            year_df[streak_col] = streak_including_match.groupby(teams).shift(1, fill_value=0)

        season_frames.append(year_df)

    if not season_frames:
        # Empty input: pd.concat([]) would raise, so return an empty frame with the
        # expected columns instead.
        return df.assign(HomeWinStreak=pd.Series(dtype='int64'),
                         AwayWinStreak=pd.Series(dtype='int64'))

    return pd.concat(season_frames, axis=0)

In [229]:
learning_df = compute_winstreak(learning_df)
learning_df

**2.3.4 Season Home/Away Wins to Date** <br>
Indicate the number of wins for a team as home and away to date within current season

toy = learning_df[(learning_df.Year == 2010) & (learning_df.HomeTeam == 'Barcelona')][['HomeTeam', 'AwayTeam', 'Result']]
toy['HomeWin'] = toy.Result.replace([0, 1, 2], [0, 0, 1])
toy['HomeWinsToDate'] = toy.HomeWin.cumsum()

In [230]:
def compute_winstodate(df):
    """Add each team's season-to-date home / away win counts going into every match.

    Within each `Year`, a team's home matches are ordered by `DayofYear` and
    'HomeWinsToDate' is the number of home wins (Result == 2) in the matches
    before the current one; the current match is not counted, so a team's first
    home match of the year gets 0. 'AwayWinsToDate' is the same for away matches
    (away win: Result == 0).

    Only Result values 2 / 0 count as wins. A placeholder Result such as 3 is
    treated as "not a win" in every year (previously this was only done for
    years after 2024, so a placeholder in an earlier year was summed as wins).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Year', 'DayofYear', 'HomeTeam', 'AwayTeam' and 'Result'.
        Not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of `df`, grouped by year in order of first appearance, with
        integer 'HomeWinsToDate' and 'AwayWinsToDate' columns appended and the
        'DayofYear' column removed.
    """
    # (team column, Result value that counts as a win for that side, output column)
    sides = (
        ('HomeTeam', 2, 'HomeWinsToDate'),
        ('AwayTeam', 0, 'AwayWinsToDate'),
    )

    season_frames = []
    for year in df.Year.unique():
        # Explicit copy so the new columns are written to an independent frame,
        # not to a slice of the caller's frame.
        year_df = df[df.Year == year].copy()
        # Stable sort keeps the original row order for matches on the same day.
        ordered = year_df.sort_values('DayofYear', kind='stable')

        for team_col, winning_result, total_col in sides:
            won = (ordered.Result == winning_result).astype(int)
            # Running total per team minus the current match = wins before this match.
            # This needs no shift/fillna step, so it does not depend on a chained
            # in-place fillna taking effect. Assignment aligns on the index.
            year_df[total_col] = won.groupby(ordered[team_col]).cumsum() - won

        season_frames.append(year_df)

    if not season_frames:
        # Empty input: pd.concat([]) would raise, so return an empty frame with the
        # expected columns instead.
        return df.assign(HomeWinsToDate=pd.Series(dtype='int64'),
                         AwayWinsToDate=pd.Series(dtype='int64')).drop(columns=['DayofYear'])

    return pd.concat(season_frames, axis=0).drop(columns=['DayofYear'])

In [231]:
learning_df = compute_winstodate(learning_df)
learning_df.drop(columns = ['HomeTeam', 'AwayTeam'], inplace = True)

In [232]:
# learning_df
learning_df

In [233]:
# 保存为pkl文件
learning_df.to_pickle('E:/Data/PKL/learning_df.pkl')

**2.3.5 Website Odds** <br>
The `betting odds` recorded by various betting websites offer insight into sentiment surrounding the outcome of a particular game. 

In [234]:
# betting_feats = ['B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', "AHh", "B365AHH", "B365AHA"]
betting_feats = ['B365H', 'B365D', 'B365A']
betting_feats_asia = ['AHh', 'B365AHH', 'B365AHA', 'AHCh', 'B365CAHH', 'B365CAHA']
betting_feats

In [235]:
def compute_meanodds(df, betting_feats):
    """Average the bookmaker odds per outcome and put the averages first.

    Each name in `betting_feats` is assigned to an outcome by its last
    character: 'H' -> home, 'A' -> away, anything else -> draw. The row-wise
    mean of each group is stored in 'HomeOdds', 'AwayOdds' and 'DrawOdds'.

    The three columns are added to `df` in place; the returned frame is a
    column selection of `df` with the three averages placed ahead of the
    columns `df` had on entry.
    """
    by_suffix = {'H': [], 'A': []}
    draw_cols = []
    for feat in betting_feats:
        # Unknown suffixes fall through to the draw group.
        by_suffix.get(feat[-1], draw_cols).append(feat)

    # Compute all three means before any new column is attached to df.
    home_mean = df[by_suffix['H']].mean(axis=1)
    away_mean = df[by_suffix['A']].mean(axis=1)
    draw_mean = df[draw_cols].mean(axis=1)

    # Captured before the assignments below so the entry columns keep their order.
    final_order = ['HomeOdds', 'AwayOdds', 'DrawOdds'] + list(df.columns)

    df['HomeOdds'] = home_mean
    df['AwayOdds'] = away_mean
    df['DrawOdds'] = draw_mean

    return df[final_order]

In [236]:
learning_df = compute_meanodds(learning_df, betting_feats)

### 2.4 Peek @ Learning DataFrame

In [237]:
learning_df

In [238]:
learning_df.drop(columns = ['WHH', 'WHD', 'WHA', 'HomeOdds', 'AwayOdds', 'DrawOdds', 'FTHG', 'FTAG', 'Result_FTR'], inplace = True)

# 3. Model Development

* Establish a baseline Logistic Regression model fit over the entire learning dataframe without special regard to *division* and *team*. 
* Train model over 16 seasons, and predict for the remaining 3 seasons (approximate 80-20 split)

### 3.1 Train and Test Split

In [239]:
split = 0.80
no_seasons = 20

print('No. seasons to train over: ' + str(round(split*no_seasons)))

In [240]:
X, y = learning_df.loc[:, learning_df.columns != 'Result'], learning_df[['Result']]

In [241]:
# full_feat = ['HomeWinStreak','AwayWinStreak','HomeWinsToDate', 'AwayWinsToDate', 'Last Match Result',
#              'HomeTeamELO', 'AwayTeamELO', 'HomeOdds', 'AwayOdds', 'DrawOdds'] + betting_feats

# exclude_feats = ['HomeWinsToDate', 'AwayWinsToDate', 'Last Match Result'] 

In [242]:
# X = X[X.columns[~X.columns.isin(exclude_feats)]]
# X

In [243]:
X

In [244]:
y

In [245]:
split_year = 2024
# start_year = split_year - 15
start_year = 2019
split_month = 12 - 7

In [246]:
# 切分训练集和测试集
# xTr, xTe = X[(X.Year < split_year) & (X.Year >= start_year)], X[X.Year >= split_year]
# yTr, yTe = y.loc[xTr.index, :], y.loc[xTe.index, :]
# 切分训练集和测试集
xTr, xTe = X[((X.Year < split_year) & (X.Year >= start_year))
 | ((X.Year == split_year) & (X.Month < split_month)) ], X[(X.Year > split_year) | ((X.Year == split_year) & (X.Month >= split_month)) ]
yTr, yTe = y.loc[xTr.index, :], y.loc[xTe.index, :]

### 3.2 Normalization <br>
Following our various feature transformations and development, we arrived at a sparse dataframe, with the exception of a few features (*Year, DayofYear*). It will be important to *normalize* these features, as they are much larger in magnitude than the remaining features. During model training, having dominating features (in scale relative to others) can be dangerous, as the weight updates may mistakenly favor these larger-scale features because they will have the largest influence on the target output. 

In [247]:
# minmax_scaler.fit_transform(): first fits the data (computes its min and max, which are
# kept for later scaling) and then uses them to rescale the data into the range [0, 1].
# minmax_scaler.transform(): rescales the test data with the min and max learned from the
# training data, so both sets share the same scaling.
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
# Year is an integer column and the scaled values are floats. Replace the whole column on
# an explicit copy instead of writing through `.loc[:, ['Year']] = ...`, which relies on an
# implicit int -> float upcast during setitem (deprecated in recent pandas) and writes into
# a frame that was sliced from X.
xTr, xTe = xTr.copy(), xTe.copy()
xTr['Year'] = minmax_scaler.fit_transform(xTr[['Year']]).ravel()
xTe['Year'] = minmax_scaler.transform(xTe[['Year']]).ravel()
# Save the fitted scaler to a time-stamped file
import time
local_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
joblib.dump(minmax_scaler, f'minmax_scaler_{local_time}.pkl')  # saved as a .pkl file

In [248]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
# to_scale = ['HomeWinStreak','AwayWinStreak','HomeWinsToDate', 'AwayWinsToDate', 'HomeTeamELO', 'AwayTeamELO', 'HomeOdds', 'AwayOdds', 'DrawOdds'] + betting_feats
to_scale = ['HomeTeamELO', 'AwayTeamELO'] + betting_feats

xTr.loc[:, to_scale] = std_scaler.fit_transform(xTr.loc[:, to_scale])
xTe.loc[:, to_scale] = std_scaler.transform(xTe.loc[:, to_scale])
# 保存到文件
import time
local_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
joblib.dump(std_scaler, f'std_scaler_{local_time}.pkl')  # 保存为 .pkl 文件

In [249]:
xTr

In [250]:
xTe

### 3.3 HomeWins Baseline Model

In [251]:
from sklearn.metrics import accuracy_score

In [252]:
xTr.shape

In [253]:
xTe.shape

In [254]:
# training score
baseline_Tr = np.full((xTr.shape[0], 1), 2) 
accuracy_score(yTr.Result.values, baseline_Tr.ravel())

In [255]:
# testing score
baseline_preds_Te = np.full((xTe.shape[0]  , 1), 2) #predicts home wins all the time
accuracy_score(yTe.Result.values, baseline_preds_Te.ravel())

### 3.4 Multinomial Logistic Regression

**3.4.1** $l2$ Regularized

In [256]:
from sklearn.linear_model import LogisticRegression
l2_lr = LogisticRegression(max_iter = 10000, n_jobs=-1).fit(xTr, yTr.values.ravel())

In [257]:
# training score
accuracy_score(yTr.Result.values, l2_lr.predict(xTr))

In [258]:
# testing score
lr_preds = l2_lr.predict(xTe)
accuracy_score(yTe.Result.values, lr_preds)

**3.4.2** $l2$ Penalty Tuning

In [259]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

logistic_params = {'C':[0.001,0.01,0.10]}

# logistic_randsearch = RandomizedSearchCV(estimator=LogisticRegression(max_iter=10000),
#                                          param_distributions=logistic_params,
logistic_randsearch = GridSearchCV(estimator=LogisticRegression(max_iter=10000),
                                         param_grid=logistic_params,
                                         scoring='accuracy',
                                         verbose=1,
                                         cv=5,
                                         n_jobs=-1)

logistic_rand_results = logistic_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (logistic_rand_results.best_score_, logistic_rand_results.best_params_))

In [260]:
l2_rs = logistic_rand_results.best_estimator_

In [261]:
# training score
accuracy_score(yTr.Result.values, l2_rs.predict(xTr))

In [262]:
# testing score
accuracy_score(yTe.Result.values, l2_rs.predict(xTe))

**3.4.4** $l1$ Regularized

In [263]:
l1_lr = LogisticRegression(penalty='l1', solver='saga', max_iter = 10000, n_jobs=-1).fit(xTr, yTr.values.ravel())

In [264]:
# training score
accuracy_score(yTr.Result.values, l1_lr.predict(xTr))

In [265]:
# testing score
l1_preds = l1_lr.predict(xTe)
accuracy_score(yTe.Result.values, l1_preds)

**3.4.5** Penalty Tuning

In [266]:
l1_params = {'C':[0.001,0.01,0.10]}

# l1_randsearch = RandomizedSearchCV(estimator=LogisticRegression(penalty='l1',solver='saga', max_iter=10000),
#                                          param_distributions=l1_params,
l1_randsearch = GridSearchCV(estimator=LogisticRegression(penalty='l1',solver='saga', max_iter=10000),
                                         param_grid=l1_params,
                                         scoring='accuracy',
                                         verbose=1,
                                         n_jobs=-1,
                                         cv=5)

l1_rand_results = l1_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (l1_rand_results.best_score_, l1_rand_results.best_params_))

In [267]:
l1_rs = l1_randsearch.best_estimator_ #LogisticRegression(C=0.10, solver='saga', max_iter=10000).fit(xTr, yTr.values.ravel())#

In [268]:
# training score
accuracy_score(yTr.Result.values, l1_rs.predict(xTr))

In [269]:
# testing score
accuracy_score(yTe.Result.values, l1_rs.predict(xTe))

### 3.5 Support Vector Machine

In [270]:
from sklearn.svm import SVC
svm = SVC(max_iter=100000).fit(xTr, yTr.values.ravel())

In [271]:
# training score
accuracy_score(yTr.Result.values, svm.predict(xTr))

In [272]:
# testing score
accuracy_score(yTe.Result.values, svm.predict(xTe))

In [273]:
predict_svm = svm.predict(xTe)
series_svm = pd.Series(predict_svm, name='Predicted')
compare_result = pd.concat([series_svm, yTe.reset_index()], axis=1)
compare_result

In [274]:
import matplotlib.pyplot as plt
# Compare predictions with the true labels for the last 200 test rows.
result_subset = compare_result.tail(200)
# DataFrame.plot opens its own figure unless it is given an Axes, so a separate
# plt.figure(figsize=...) call only produced an empty extra figure and the size was
# never applied. Create the Axes first and draw on it.
fig, ax = plt.subplots(figsize=(10, 10))
result_subset.plot(x='index', y=['Predicted', 'Result'], kind='line', ax=ax)
ax.set_title("Prediction vs Real")
plt.show()

In [275]:
import time
local_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
dump(svm, f'./sklearn_svm_{local_time}.joblib')

**3.5.2** Penalty Tuning

In [276]:
svm_params = {'C':[0.001,0.01,0.10]}

# svm_randsearch = RandomizedSearchCV(estimator=SVC(max_iter=100000),
#                                          param_distributions=svm_params,
svm_randsearch = GridSearchCV(estimator=SVC(max_iter=100000),
                                         param_grid=svm_params,
                                         scoring='accuracy',
                                         verbose=2,
                                         cv=5,
                                         n_jobs=-1)

svm_rand_results = svm_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (svm_rand_results.best_score_, svm_rand_results.best_params_))

In [277]:
svm_rs = svm_rand_results.best_estimator_

In [278]:
# training score
accuracy_score(yTr.Result.values, svm_rs.predict(xTr))

In [279]:
# testing score
accuracy_score(yTe.Result.values, svm_rs.predict(xTe))

In [280]:
import time
local_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
dump(svm_rs, f'./sklearn_svm_randsearch_{local_time}.joblib')

### 3.6 Simple Neural Network ####

In [281]:
# from sklearn.neural_network import MLPClassifier
# mlp = MLPClassifier(hidden_layer_sizes=(512,128,32),
#                     activation='relu',
#                     batch_size=512,
#                     max_iter=10000,
#                     learning_rate_init=1e-4,
#                     early_stopping=True,
#                     alpha=1e-3,
#                    ).fit(xTr, yTr.values.ravel())
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(128,64,32),
                    activation='relu',
                    batch_size=64,
                    max_iter=10000,
                    learning_rate_init=1e-4,
                    early_stopping=True,
                    alpha=1e-3,
                    verbose=True,
                    tol=1e-4,
                    random_state=13,
                    learning_rate='adaptive',
                    n_iter_no_change=300
                   ).fit(xTr, yTr.values.ravel())

In [282]:
# training score
accuracy_score(yTr.Result.values, mlp.predict(xTr))

In [283]:
# testing score
accuracy_score(yTe.Result.values, mlp.predict(xTe))

In [284]:
xTr

In [285]:
yTr

In [286]:
xTe

In [287]:
yTe

In [288]:
predict_val = mlp.predict(xTe)
series_pre = pd.Series(predict_val, name='Predicted')
compare_result = pd.concat([series_pre, yTe.reset_index()], axis=1)
compare_result

In [289]:
import matplotlib.pyplot as plt
# Compare predictions with the true labels for the last 200 test rows.
result_subset = compare_result.tail(200)
# DataFrame.plot opens its own figure unless it is given an Axes, so a separate
# plt.figure(figsize=...) call only produced an empty extra figure and the size was
# never applied. Create the Axes first and draw on it.
fig, ax = plt.subplots(figsize=(10, 10))
result_subset.plot(x='index', y=['Predicted', 'Result'], kind='line', ax=ax)
ax.set_title("Prediction vs Real")
plt.show()

In [290]:
# xTr = xTr.drop(columns=['pad1', 'pad2', 'pad3'])
# xTe = xTe.drop(columns=['pad1', 'pad2', 'pad3'])

### 保存模型

In [291]:
import time
local_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
dump(mlp, f'./sklearn_mlp_{local_time}.joblib')

In [292]:
# ### 加载模型
# # model_name = 'sklearn_mlp_' + local_time + '.joblib'
# model_name = './' + 'sklearn_mlp_2025_01_21_22_41_15.joblib'
# mlp = load(model_name)

In [299]:
# 获取每个样本的类别概率
probs = mlp.predict_proba(xTe)

# 统计 top 2 类别
top2_indices = np.argsort(probs, axis=1)[:, -2:]  # 获取每个样本概率最高的两个类别的索引
top2_probs = np.take_along_axis(probs, top2_indices, axis=1)  # 获取这两个类别的概率

print("Top 2 Categories for each sample:")
for i in range(xTe.shape[0]):
    print(f"Sample {i+1}: Class {top2_indices[i, 1]} (prob={top2_probs[i, 1]:.4f}), Class {top2_indices[i, 0]} (prob={top2_probs[i, 0]:.4f})")

### 随机森林

In [294]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create the random forest classifier
rf = RandomForestClassifier(n_estimators=300,  # 300 trees
                            max_depth=None,    # no limit on tree depth
                            min_samples_split=2,  # minimum number of samples required to split a node
                            random_state=42,  # fixed seed so results are reproducible
                            n_jobs=-1)  # use all CPU cores

# Train the classifier
rf.fit(xTr, yTr.values.ravel())

# Predict on the test set
y_pred_rf = rf.predict(xTe)

# Evaluate accuracy
accuracy_rf = accuracy_score(yTe, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

In [295]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 定义超参数网格
param_grid = {
    'n_estimators': [100, 200, 500, 1000],  # 不同的树数
    'max_depth': [None, 10, 20, 30],  # 树的最大深度
}

# 随机森林分类器
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# 进行网格搜索
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# 拟合网格搜索
grid_search.fit(xTr, yTr.values.ravel())

# 输出最佳参数
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

In [296]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# 记录不同树数下的交叉验证准确率
n_estimators_range = [50, 100, 200, 300, 500, 1000]
cv_scores = []

for n in n_estimators_range:
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    scores = cross_val_score(rf, xTr, yTr.values.ravel(), cv=5)
    cv_scores.append(scores.mean())

# 绘制树数与交叉验证准确率的关系
plt.plot(n_estimators_range, cv_scores)
plt.xlabel('Number of Trees')
plt.ylabel('Cross-validation Accuracy')
plt.title('Random Forest: Trees vs Accuracy')
plt.show()

In [297]:
from sklearn.metrics import accuracy_score

# 1. 获取最佳模型
best_rf = grid_search.best_estimator_

# 2. 在测试集上进行预测
y_pred = best_rf.predict(xTe)

# 3. 计算准确率
test_accuracy = accuracy_score(yTe, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


In [298]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(yTe, y_pred)
print("Confusion Matrix:")
print(cm)


### pytorch MLP

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 假设 xTr, yTr 已经准备好，xTe, yTe 作为测试集

# Convert to tensors.
# torch.tensor() cannot be built directly from a pandas DataFrame, so the feature frames
# are first converted to float32 NumPy arrays.
xTr_tensor = torch.tensor(xTr.to_numpy(dtype='float32'), dtype=torch.float32)
yTr_tensor = torch.tensor(yTr.values.ravel(), dtype=torch.long)  # long dtype: class labels for classification
xTe_tensor = torch.tensor(xTe.to_numpy(dtype='float32'), dtype=torch.float32)
yTe_tensor = torch.tensor(yTe.values.ravel(), dtype=torch.long)

# 创建DataLoader
train_data = TensorDataset(xTr_tensor, yTr_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# 定义神经网络结构
class MLP(nn.Module):
    """Fully connected network: a Linear + ReLU pair per hidden layer, then a linear output layer.

    Args:
        input_size: number of input features.
        hidden_sizes: iterable of hidden-layer widths, applied in order.
        output_size: number of output units (raw scores; no softmax is applied here).
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        widths = [input_size, *hidden_sizes]
        stack = []
        # Consecutive width pairs define each hidden Linear layer
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output layer maps the last width to the requested number of outputs
        stack.append(nn.Linear(widths[-1], output_size))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw outputs."""
        return self.model(x)

# Network dimensions: one input per feature column, three hidden layers,
# and one output per class (three-class problem).
input_size = xTr.shape[1]
hidden_sizes = [512, 128, 32]  # adjust as needed
output_size = 3
model = MLP(input_size, hidden_sizes, output_size)

# Cross-entropy loss (multi-class classification), optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Train for a fixed number of epochs, printing the mean batch loss after each one
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for inputs, labels in train_loader:
        optimizer.zero_grad()              # clear gradients from the previous step
        outputs = model(inputs)            # forward pass
        loss = criterion(outputs, labels)  # loss for this batch
        loss.backward()                    # back-propagate
        optimizer.step()                   # update parameters
        batch_losses.append(loss.item())

    # Sum of per-batch losses; divided by the batch count in the report below
    running_loss = sum(batch_losses)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}")

# Evaluate on the test split: switch to eval mode and disable gradient tracking
model.eval()
with torch.no_grad():
    y_pred = model(xTe_tensor)
    # Predicted class = index of the largest output per row
    predicted = y_pred.argmax(dim=1)
    accuracy = accuracy_score(yTe_tensor.numpy(), predicted.numpy())

print(f"Test Accuracy: {accuracy:.4f}")


### XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np

# Build DMatrix objects (XGBoost's input format). The missing-value marker is a
# DMatrix argument, not a booster parameter, so it is declared here rather than
# in `params` (where it would be reported as unused).
dtrain = xgb.DMatrix(xTr, label=yTr.values.ravel(), missing=np.nan)
dtest = xgb.DMatrix(xTe, missing=np.nan)

# Booster parameters
params = {
    'objective': 'multi:softmax',  # multi-class task; predict() returns class ids
    'num_class': 3,  # number of classes (three-class problem)
    'max_depth': 6,  # maximum tree depth
    'eta': 0.05,  # learning rate
    'subsample': 0.8,  # row sampling ratio
    'colsample_bytree': 0.8,  # feature sampling ratio per tree
    'eval_metric': 'mlogloss',  # multi-class log loss
    'gamma': 1,  # minimum loss reduction required to split
    'lambda': 1,  # L2 regularisation
    'alpha': 0.5,  # L1 regularisation
    'tree_method': 'hist',  # histogram-based tree construction
}

# Train the booster
num_round = 100  # number of boosting rounds
bst = xgb.train(params, dtrain, num_round)

# Predict; multi:softmax yields class ids as floats, so cast them to int labels
y_pred_xgb = bst.predict(dtest).astype(int)

# Evaluate accuracy on the test split
accuracy_xgb = accuracy_score(yTe, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")


### 3.7 Stacked Classifier ###

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  # needed here: SVC is one of the base estimators below
from sklearn.ensemble import StackingClassifier

# Stack an SVM and a regularised logistic regression as base estimators, with a
# logistic regression as the final estimator; fit on the training split.
stacked_clf = StackingClassifier(
    estimators=[
        ('svm', SVC(max_iter=100000)),
        ('logistic', LogisticRegression(C=0.01, max_iter=10000)),
    ],
    final_estimator=LogisticRegression(max_iter=10000),
    n_jobs=-1,
).fit(xTr, yTr.values.ravel())

# training score: accuracy of the stacked classifier on the training split
accuracy_score(yTr.Result.values, stacked_clf.predict(xTr))

# testing score: accuracy of the stacked classifier on the test split
accuracy_score(yTe.Result.values, stacked_clf.predict(xTe))

## 4. Result Analysis ##

In [293]:
## TODO: breakdown results across divisions and/or teams; i.e., see how model performs individually at subgroups

## 5. Scrap Code ##

def _team17_rows(frame):
    """Return the rows of ``frame`` whose 'HomeTeam 17' or 'AwayTeam 17' indicator equals 1."""
    plays_home = frame['HomeTeam 17'] == 1
    plays_away = frame['AwayTeam 17'] == 1
    return frame[plays_home | plays_away]


# Matches involving team 17 (presumably Barcelona, going by the variable name)
barcelona_df = _team17_rows(learning_df)
barcelona_df

# Same filter applied to the train / test feature splits
bxTr = _team17_rows(xTr)
bxTe = _team17_rows(xTe)

# Labels aligned to the filtered feature rows by index
byTr = yTr.loc[bxTr.index, :]
byTe = yTe.loc[bxTe.index, :]

# training score of l1_lr on the team-17 training subset
accuracy_score(byTr, l1_lr.predict(bxTr))

# testing score of l1_lr on the team-17 test subset
accuracy_score(byTe, l1_lr.predict(bxTe))

# training score of l2_lr on the team-17 training subset
accuracy_score(byTr, l2_lr.predict(bxTr))

# testing score of l2_lr on the team-17 test subset
accuracy_score(byTe, l2_lr.predict(bxTe))

## 6. PyTorch MLP ##

# Quick inspection: container type of the training features
type(xTr)

# Quick inspection: (rows, columns) of the training features
xTr.shape

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import time
# Use the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using device: {device}')

class Attention(nn.Module):
    """Learned per-feature gate: scales each feature by a softmax-normalised weight.

    Args:
        feature_dim: number of features; one learnable score is kept per feature.
    """

    def __init__(self, feature_dim):
        super().__init__()
        # Raw (unnormalised) scores, drawn from a standard normal at construction
        self.attention_weights = nn.Parameter(torch.randn(feature_dim))

    def forward(self, x):
        """Return ``x`` multiplied element-wise by softmax(attention_weights).

        The result has the same shape as ``x``; no summation is performed.
        """
        return x * torch.softmax(self.attention_weights, dim=0)
    
class MLP(nn.Module):
    """Four-layer classifier with batch norm, dropout, and a feature gate after the first layer.

    Layout: Linear(input, 512) -> BN -> ReLU -> Dropout -> Attention(512)
            -> Linear(512, 128) -> BN -> ReLU -> Dropout
            -> Linear(128, 32) -> BN -> ReLU -> Dropout
            -> Linear(32, num_classes)

    NOTE(review): this notebook defines another class named ``MLP`` earlier;
    whichever definition ran last is the one in effect.

    Args:
        input_size: number of input features. When None (the default, kept for
            backward compatibility with ``MLP()``), the width is read from the
            notebook-level ``xTr`` at construction time.
        num_classes: number of output units (raw scores, one per class).
        dropout: dropout probability used after each hidden layer.
    """

    def __init__(self, input_size=None, num_classes=3, dropout=0.2):
        super(MLP, self).__init__()
        if input_size is None:
            # Legacy behaviour: size the first layer from the global training features
            input_size = xTr.shape[1]

        self.fc1 = nn.Linear(in_features=input_size, out_features=512)
        self.bn1 = nn.BatchNorm1d(num_features=512)
        self.attention = Attention(512)
        self.dropout1 = nn.Dropout(p=dropout)

        self.fc2 = nn.Linear(in_features=512, out_features=128)
        self.bn2 = nn.BatchNorm1d(num_features=128)
        self.dropout2 = nn.Dropout(p=dropout)

        self.fc3 = nn.Linear(in_features=128, out_features=32)
        self.bn3 = nn.BatchNorm1d(num_features=32)
        self.dropout3 = nn.Dropout(p=dropout)

        # Output layer: one unit per class
        self.fc4 = nn.Linear(in_features=32, out_features=num_classes)

    def forward(self, x):
        """Return the raw class scores for a batch of feature rows ``x``."""
        x = self.dropout1(torch.relu(self.bn1(self.fc1(x))))
        x = self.attention(x)  # per-feature gating of the 512 hidden activations
        x = self.dropout2(torch.relu(self.bn2(self.fc2(x))))
        x = self.dropout3(torch.relu(self.bn3(self.fc3(x))))
        x = self.fc4(x)
        return x

# Preprocessing: standardise the training features (the scaler is fitted here)
scaler = StandardScaler()
xTr_scaled = scaler.fit_transform(xTr)

# Build the training tensors directly on the selected device
xTr_tensor = torch.tensor(xTr_scaled, dtype=torch.float32, device=device)
yTr_tensor = torch.tensor(yTr.values.ravel(), dtype=torch.long, device=device)

# Shuffled mini-batch loader over the training tensors
dataset = TensorDataset(xTr_tensor, yTr_tensor)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1024)

# Model, loss, and optimiser (Adam with weight decay)
model = MLP()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-4, lr=1e-4)

train_start = time.time()
# Training: 500 passes over the training loader, reporting sample-weighted loss
# and training accuracy after every epoch
model.train()
for epoch in range(500):
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()              # clear gradients from the previous step
        outputs = model(inputs)            # forward pass
        loss = criterion(outputs, labels)  # batch loss
        loss.backward()                    # back-propagate
        optimizer.step()                   # update parameters

        # Weight the batch-mean loss by the batch size so the epoch value is a per-sample mean
        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.detach().argmax(dim=1)  # predicted class per row
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / total
    epoch_acc = correct / total * 100  # accuracy in percent

    # Per-epoch report
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
print(f'训练时长： {time.time() - train_start}s')

# Assumes xTe and yTe are pandas objects (DataFrame / Series).
# Preprocessing: reuse the scaler fitted on the training split. Calling
# fit_transform here would re-estimate mean/variance from the test data, so the
# test features would be scaled differently from what the model was trained on.
xTe_scaled = scaler.transform(xTe)
xTe_tensor = torch.tensor(xTe_scaled, dtype=torch.float32).to(device)
yTe_tensor = torch.tensor(yTe.values.ravel(), dtype=torch.long).to(device)

# Test loader; order is preserved (no shuffling)
test_dataset = TensorDataset(xTe_tensor, yTe_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Evaluation mode
model.eval()

# Running counts for the accuracy computation
correct, total = 0, 0

# No gradients are needed while evaluating
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)  # predicted class per row
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Fraction of correctly classified test samples
accuracy = correct / total
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

## 7. PyTorch Transformer ##

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class TransformerModel(nn.Module):
    """Transformer-encoder classifier that treats each feature row as a length-1 sequence.

    Args:
        input_dim: number of input features; also used as the encoder's d_model.
        num_classes: number of output units.
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        dropout: dropout rate passed to the encoder layers.
    """

    def __init__(self, input_dim, num_classes, num_heads, num_layers, dropout=0.1):
        super().__init__()
        self.input_dim = input_dim
        self.num_classes = num_classes
        self.model_dim = input_dim  # model width equals the input width

        # Stack of identical encoder layers (batch-first, feed-forward width 512)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=self.model_dim,
                nhead=num_heads,
                dim_feedforward=512,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )

        # Classification head
        self.output_layer = nn.Linear(self.model_dim, self.num_classes)

        # Batch normalisation applied to the encoder output before the head
        self.bn = nn.BatchNorm1d(self.model_dim)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, num_classes) raw scores."""
        # Insert a sequence axis of length 1, encode, then take that single position back out
        encoded = self.transformer_encoder(x.unsqueeze(1))
        first_token = encoded[:, 0, :]
        return self.output_layer(self.bn(first_token))

# Hyper-parameters
num_classes = 3  # number of classes
num_heads = 10  # attention heads
num_layers = 3  # encoder layers
dropout = 0.8  # dropout rate  NOTE(review): unusually high — confirm this is intended

# Pad the feature dimension. The encoder's model width (= number of feature
# columns) must be divisible by num_heads, so append just enough all-zero
# columns to reach the next multiple instead of a hard-coded count.
# NOTE: this adds the columns to xTr / xTe in place; cells that run afterwards
# see the padded frames. Re-running is a no-op once the width is a multiple.
n_samples_xTr = xTr.shape[0]
n_samples_xTe = xTe.shape[0]
n_pad = (-xTr.shape[1]) % num_heads
for i in range(1, n_pad + 1):
    xTr[f'pad{i}'] = 0  # zero-filled padding column
    xTe[f'pad{i}'] = 0  # zero-filled padding column

input_dim = xTr.shape[1]  # feature width after padding

# Build the model
model = TransformerModel(input_dim, num_classes, num_heads, num_layers, dropout).to(device)

# Loss and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Data preparation: pull the feature frames out as float arrays
xTr_values = xTr.values.astype(float)
xTe_values = xTe.values.astype(float)

# Feature tensors, created directly on the target device
xTr_tensor = torch.tensor(xTr_values, dtype=torch.float32, device=device)
xTe_tensor = torch.tensor(xTe_values, dtype=torch.float32, device=device)

# Labels: drop the singleton column axis, then one-hot encode as float targets
yTr_tensor = F.one_hot(
    torch.tensor(yTr.values, dtype=torch.long, device=device).squeeze(1),
    num_classes=num_classes,
).float()
yTe_tensor = F.one_hot(
    torch.tensor(yTe.values, dtype=torch.long, device=device).squeeze(1),
    num_classes=num_classes,
).float()

# Data loaders. The training loader reshuffles every epoch, like the other
# training loaders in this notebook, so mini-batches are not always drawn in
# the same row order; the test loader keeps the original order.
train_dataset = TensorDataset(xTr_tensor, yTr_tensor)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

test_dataset = TensorDataset(xTe_tensor, yTe_tensor)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

train_start = time.time()
# Training: 500 passes over the training loader, reporting sample-weighted loss
# and training accuracy after every epoch
model.train()
for epoch in range(500):
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch value is a per-sample mean
        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.detach().argmax(dim=1)  # predicted class per row
        truth = labels.argmax(dim=1)                # class index recovered from the one-hot target
        total += truth.size(0)
        correct += (predicted == truth).sum().item()

    epoch_loss = running_loss / total
    epoch_acc = correct / total * 100  # accuracy in percent

    # Per-epoch report
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
print(f'训练时长： {time.time() - train_start}s')

# Evaluation mode
model.eval()

# Running counts for the accuracy computation
correct, total = 0, 0

# No gradients are needed while evaluating
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)  # predicted class per row
        truth = labels.argmax(dim=1)       # class index recovered from the one-hot target
        total += labels.size(0)
        correct += (predicted == truth).sum().item()

# Fraction of correctly classified test samples
accuracy = correct / total
print(f'Accuracy on test set: {accuracy * 100:.2f}%')