In [75]:
import numpy as np
import gzip
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

In [61]:
with gzip.open('pkl/asia_football_df_noNone_noDiv.pkl.gz', 'rb') as f:
    df_loaded = pd.read_pickle(f)
df_loaded

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,asia_final_result,...,AwayTeam 203,AwayTeam 204,AwayTeam 205,AwayTeam 206,Year,Sin_Month,Cos_Month,DayofYear,Sin_Day,Cos_Day
0,Auxerre,Nice,1.0,2.0,-0.75,2.050,1.850,1702.604858,1611.196045,-1.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
1,Guingamp,Marseille,0.0,1.0,0.00,1.925,1.975,1685.016113,1665.625732,-1.00,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
2,Hamburg,Hannover,0.0,3.0,-0.75,1.800,2.100,1718.566284,1649.805298,-3.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
3,Hertha,Werder Bremen,0.0,3.0,-0.75,2.025,1.875,1719.916748,1692.120972,-3.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
4,Lens,Le Mans,0.0,0.0,-0.75,1.900,2.000,1697.354004,1539.958130,-0.75,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,Reims,Montpellier,4.0,2.0,-0.50,1.780,2.030,1633.626221,1645.806641,1.50,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37252,Sevilla,Betis,1.0,0.0,0.00,1.700,2.100,1676.242676,1709.259521,1.00,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37253,Sociedad,Ath Madrid,1.0,1.0,0.25,1.890,2.010,1766.551880,1828.522095,0.25,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37254,Strasbourg,Lens,2.0,2.0,0.25,1.810,2.090,1608.732544,1719.138184,0.25,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381


In [39]:
def convert_labels_to_onehot(labels):
    """
    将 -1, 0, 1 的标签转换为 one-hot 编码的概率向量
    
    参数:
    labels: numpy array 或 torch tensor，包含 -1, 0, 1 的标签
    
    返回:
    torch tensor，形状为 (len(labels), 3)，每行是一个概率向量
    """
    # 首先确保标签是 numpy 数组
    if isinstance(labels, torch.Tensor):
        labels = labels.numpy()
    
    # 创建 one-hot 编码
    # 映射: -1 -> [1, 0, 0]
    #       0 -> [0, 1, 0]
    #       1 -> [0, 0, 1]
    label_map = {-1: 0, 0: 1, 1: 2}
    
    # 使用 torch.nn.functional.one_hot 转换
    onehot_labels = torch.nn.functional.one_hot(
        torch.tensor([label_map[label] for label in labels]), 
        num_classes=3
    ).float()
    
    return onehot_labels

In [50]:
# 假设 df 是你的 DataFrame
# 示例特征列名：'hometeam', 'awayteam', 'AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO'
# 示例标签列名：'asia_final_result'

# 标准化数值型特征
scaler = StandardScaler()
numerical_features = ['AHh', 'B365AHH', 'B365AHA', 'HomeTeamELO', 'AwayTeamELO']
df_loaded[numerical_features] = scaler.fit_transform(df_loaded[numerical_features])

# 将标签转换为 tensor
labels = torch.tensor(df_loaded['easy_label'].values, dtype=torch.float32)
onehot_labels = convert_labels_to_onehot(labels)

# 提取 One-Hot 编码特征和数值型特征
one_hot_home = torch.tensor(df_loaded['home_team'].values.tolist(), dtype=torch.int32)
one_hot_away = torch.tensor(df_loaded['away_team'].values.tolist(), dtype=torch.int32)
numerical_features = torch.tensor(df_loaded[numerical_features].values, dtype=torch.float32)


In [51]:
class MatchDataset(Dataset):
    def __init__(self, one_hot_home, one_hot_away, numerical_features, labels):
        self.one_hot_home = one_hot_home
        self.one_hot_away = one_hot_away
        self.numerical_features = numerical_features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.one_hot_home[idx], self.one_hot_away[idx], self.numerical_features[idx], self.labels[idx])


In [53]:
# 创建 DataLoader
dataset = MatchDataset(one_hot_home, one_hot_away, numerical_features, onehot_labels)
# Split the dataset into training and testing sets
total_size = len(dataset)
train_size = int(0.9 * total_size)  # 90% for training
test_size = total_size - train_size  # 10% for testing

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

batch_size = 512
# Create DataLoaders for training and testing
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [71]:
drop_col_list = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'asia_final_result', 'label', 'easy_label']
df_dropped_cols = df_loaded.drop(columns=drop_col_list)
df_dropped_cols

Unnamed: 0,AHh,B365AHH,B365AHA,HomeTeamELO,AwayTeamELO,div,home_team,away_team,Div 0,Div 1,...,AwayTeam 203,AwayTeam 204,AwayTeam 205,AwayTeam 206,Year,Sin_Month,Cos_Month,DayofYear,Sin_Day,Cos_Day
0,-0.75,2.050,1.850,1702.604858,1611.196045,2,16,137,0,0,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
1,0.00,1.925,1.975,1685.016113,1665.625732,2,80,123,0,0,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
2,-0.75,1.800,2.100,1718.566284,1649.805298,0,81,82,1,0,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
3,-0.75,2.025,1.875,1719.916748,1692.120972,0,86,199,1,0,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
4,-0.75,1.900,2.000,1697.354004,1539.958130,2,108,103,0,0,...,0,0,0,0,2003,-0.866025,-0.5,214,-0.516062,-0.856551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,-0.50,1.780,2.030,1633.626221,1645.806641,2,158,130,0,0,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37252,0.00,1.700,2.100,1676.242676,1709.259521,4,167,22,0,0,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37253,0.25,1.890,2.010,1766.551880,1828.522095,4,171,14,0,0,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381
37254,0.25,1.810,2.090,1608.732544,1719.138184,2,179,108,0,0,...,0,0,0,0,2024,-0.866025,0.5,280,-0.994218,0.107381


In [78]:
split_year = 2023
# 切分训练集和测试集v
xTr, xTe = df_dropped_cols[df_dropped_cols.Year <= split_year], df_dropped_cols[df_dropped_cols.Year > split_year]
yTr, yTe = df_loaded.loc[xTr.index, :]['easy_label'], df_loaded.loc[xTe.index, :]['easy_label']
yTr

0       -1
1       -1
2       -1
3       -1
4       -1
        ..
36004    1
36005   -1
36006    1
36007    1
36008    1
Name: easy_label, Length: 36009, dtype: int32

In [None]:
class MultiFeatureModel(nn.Module):
    def __init__(self, num_classes=3, categorical_num_classes=3):
        super(MultiFeatureModel, self).__init__()
        
        # 数值特征组 1：假设第一个特征组有 16 * 3 维度
        self.fc_numerical_1 = nn.Sequential(
            nn.Linear(16 * 3, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )
        
        # 数值特征组 2：假设第二个特征组有 8 * 3 维度
        self.fc_numerical_2 = nn.Sequential(
            nn.Linear(8 * 3, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )
        
        # 数值特征组 3：假设第三个特征组有 4 * 3 维度
        self.fc_numerical_3 = nn.Sequential(
            nn.Linear(4 * 3, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )
        
        # 序列特征处理
        self.gru = nn.GRU(input_size=3, hidden_size=16, batch_first=True)
        
        # 类别特征处理（使用嵌入层）
        self.embedding = nn.Embedding(categorical_num_classes, 128)
        
        # 输出层
        self.fc_output = nn.Linear(64 * 3 + 16 + 128, num_classes)  # 3组数值特征 + GRU输出 + 嵌入层输出
        
        
    def forward(self, numerical_features_1, numerical_features_2, numerical_features_3, sequence_features, categorical_features):
        # 处理第一个数值特征组
        numerical_out_1 = self.fc_numerical_1(numerical_features_1)
        
        # 处理第二个数值特征组
        numerical_out_2 = self.fc_numerical_2(numerical_features_2)
        
        # 处理第三个数值特征组
        numerical_out_3 = self.fc_numerical_3(numerical_features_3)
        
        # 融合数值特征组输出
        numerical_out = torch.cat((numerical_out_1, numerical_out_2, numerical_out_3), dim=1)
        
        # 处理序列特征
        seq_out, _ = self.gru(sequence_features)
        seq_out = seq_out[:, -1, :]  # 取最后时刻的输出
        
        # 处理类别特征
        categorical_out = self.embedding(categorical_features).view(categorical_features.size(0), -1)
        
        # 融合所有特征
        combined = torch.cat((numerical_out, seq_out, categorical_out), dim=1)
        
        # 输出层
        output = self.fc_output(combined)
        
        return output

In [86]:
mlp = MLPClassifier(hidden_layer_sizes=(512,128,32),
                    activation='relu',
                    batch_size=64,
                    max_iter=200,
                    learning_rate_init=1e-4,
                    early_stopping=False,
                    alpha=1e-3,
                   ).fit(xTr, yTr.values)

In [85]:
# training score
accuracy_score(yTr.values, mlp.predict(xTr))

0.45135938237662804

In [83]:
# testing score
accuracy_score(yTe.values, mlp.predict(xTe))

0.5172413793103449

In [54]:
# 定义模型
embedding_dim = 100  # 可以根据任务需要调整
num_categories = torch.max(one_hot_away) + 1  # 类别数量，即 One-Hot 编码的长度
numerical_input_dim = 5  # 5 个数值型特征（赔率和评分）

model = OptimizedMultiFeatureNN(num_categories, embedding_dim, numerical_input_dim)

# 训练模型
num_epochs = 5000

# 优化器
optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=1e-3, 
    weight_decay=1e-5  # L2正则化
)

# 学习率调度器
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, 
    T_max=num_epochs
)

# 损失函数
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)


In [None]:
# train model
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for home, away, numerical_features, labels in train_dataloader:
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(home, away, numerical_features)
        
        # 计算损失
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        # 计算准确率
        _, predicted = torch.max(outputs, 1)
        _, truth = labels.max(1)
        temp = (predicted == truth)
        correct += temp.sum().item()
        total += labels.size(0)
        
        # 反向传播和优化
        loss.backward()
        optimizer.step()
    
    epoch_loss = total_loss / len(train_dataloader)
    epoch_accuracy = correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

In [60]:
# evaluation
total_eval = 0
correct_eval = 0
model.eval()
with torch.no_grad():
    for home, away, numerical_features, labels in test_dataloader:
        outputs = model(home, away, numerical_features)
        loss = criterion(outputs, labels)
        _, predicted = torch.max(outputs, 1)
        _, truth = labels.max(1)
        temp = (predicted == truth)
        correct_eval += temp.sum().item()
        total_eval += labels.size(0)
    accuracy = correct_eval / total_eval
    print(f"{accuracy:.4f}")

0.4675


class MultiFeatureNN(nn.Module):
    def __init__(self, num_categories, embedding_dim, numerical_input_dim):
        super(MultiFeatureNN, self).__init__()
        
        # Embedding 层处理类别特征
        self.embedding_home = nn.Embedding(num_categories, embedding_dim)
        self.embedding_away = nn.Embedding(num_categories, embedding_dim)
        
        # 全连接层处理数值型特征
        self.fc_numerical = nn.Sequential(
            nn.Linear(numerical_input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # 合并嵌入向量和数值特征后进行分类
        self.fc_combined = nn.Sequential(
            nn.Linear(embedding_dim * 2 + 64, 128),  # 2 * embedding_dim + 64（数值特征处理后的维度）
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 3)  # 输出3类：胜、平、负
        )

    def forward(self, home, away, numerical_features):
        # 通过嵌入层处理类别特征
        home_embedded = self.embedding_home(home)
        away_embedded = self.embedding_away(away)
        
        # 对嵌入向量进行合并
        embedded_features = torch.cat([home_embedded, away_embedded], dim=-1)
        
        # 通过全连接层处理数值型特征
        numerical_features_processed = self.fc_numerical(numerical_features)
        
        # 合并嵌入向量和数值特征
        combined_features = torch.cat([embedded_features, numerical_features_processed], dim=-1)
        
        # 通过最终的全连接层进行分类
        output = F.softmax(self.fc_combined(combined_features), dim=-1)
        
        return output

# 定义模型
embedding_dim = 50  # 可以根据任务需要调整
num_categories = torch.max(one_hot_away) + 1  # 类别数量，即 One-Hot 编码的长度
numerical_input_dim = 5  # 5 个数值型特征（赔率和评分）

model = MultiFeatureNN(num_categories, embedding_dim, numerical_input_dim)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()  # 分类任务，使用交叉熵损失
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 2000

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for home, away, numerical_features, labels in dataloader:
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(home, away, numerical_features)
        
        # 计算损失
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        # 计算准确率
        _, predicted = torch.max(outputs, 1)
        _, truth = labels.max(1)
        temp = (predicted == truth)
        correct += temp.sum().item()
        total += labels.size(0)
        
        # 反向传播和优化
        loss.backward()
        optimizer.step()
    
    epoch_loss = total_loss / len(dataloader)
    epoch_accuracy = correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")


## 优化后模型
class OptimizedMultiFeatureNN(nn.Module):
    def __init__(self, num_categories, embedding_dim, numerical_input_dim):
        super(OptimizedMultiFeatureNN, self).__init__()
        
        # 类别特征嵌入
        self.embedding_home = nn.Embedding(num_categories, embedding_dim)
        self.embedding_away = nn.Embedding(num_categories, embedding_dim)
        
        # 数值特征处理使用更复杂的结构
        self.numerical_layers = nn.Sequential(
            nn.Linear(numerical_input_dim, 128),
            nn.BatchNorm1d(128),  # 添加批归一化
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # 合并特征后的分类层
        self.classification_head = nn.Sequential(
            nn.Linear(embedding_dim * 2 + 64, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 3)
        )
    
    def forward(self, home, away, numerical_features):
        # 类别特征嵌入
        home_embedded = self.embedding_home(home)
        away_embedded = self.embedding_away(away)
        
        # 合并嵌入向量
        embedded_features = torch.cat([home_embedded, away_embedded], dim=-1)
        
        # 处理数值特征
        numerical_features_processed = self.numerical_layers(numerical_features)
        
        # 合并所有特征
        combined_features = torch.cat([embedded_features, numerical_features_processed], dim=-1)
        
        # 分类
        logits = self.classification_head(combined_features)
        probabilities = F.softmax(logits, dim=-1)
        
        return probabilities