# Install pytorch_tabnet


In [None]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch_tabnet
Successfully installed pytorch_tabnet-4.1.0


# Imports


In [None]:
import copy
import os
import random
from itertools import product

import numpy as np
import pandas as pd
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

In [None]:
def fix_random(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then puts cuDNN into deterministic mode with
    auto-tuning disabled.

    Parameters:
    - seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # NumPy only accepts seeds in [0, 2**32 - 1]; wrap so that any integer
    # accepted by the other generators keeps working here.
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Seed the random number generators once for the whole notebook.
fix_random(42)

# Switch for the PCA dimensionality-reduction step applied to the features.
apply_pca = True

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Read dataset

In [None]:
# Load the dataset from Google Drive (Colab only).
from google.colab import drive

# Mount the drive so its files are reachable under /content/drive.
drive.mount('/content/drive')

# Despite its name, `dataset_file` is the folder that contains dataset.csv.
dataset_file = '/content/drive/MyDrive/W-Workspace/MovieLens_da_li/dataset'
df = pd.read_csv(os.path.join(dataset_file, 'dataset.csv'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preprocessing


In [None]:
# Separate the target ('rating') from the features (every other column).
y = df['rating'].to_numpy()
X = df.drop(columns=['rating']).to_numpy()

# First split: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Second split: take 10% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1, random_state=42
)

# Turn the training/validation targets into column vectors, the 2-D shape
# some models require.
y_train, y_val = y_train.reshape(-1, 1), y_val.reshape(-1, 1)

# Print the size of each split to confirm the partitioning.
print("Number of train set: ", X_train.shape[0])
print("Number of test set: ", X_test.shape[0])
print("Number of validation set: ", X_val.shape[0])



Number of train set:  9946
Number of test set:  2764
Number of validation set:  1106


In [None]:
# Optional dimensionality reduction, controlled by `apply_pca`.
# NOTE(review): this cell overwrites X_train / X_val / X_test, so running it
# twice without re-running the split cell applies PCA to already-reduced data.
if not apply_pca:
    print("PCA is not applied")
else:
    # A float n_components asks PCA to keep that fraction (95%) of the variance.
    pca = PCA(n_components=0.95)
    # Fit on the training data only, then reuse the fitted projection for the
    # validation and test sets.
    X_train = pca.fit_transform(X_train)
    X_val, X_test = pca.transform(X_val), pca.transform(X_test)

In [None]:
# Hyperparameter search space: each list holds the candidate values for one
# setting. The wider ranges noted below are the ones originally explored.
batch_sizes = [256]     # batch size (kept fixed)
n_epochs = [200]        # number of training epochs (kept fixed)
n_d = [16]              # prediction-layer dimension; originally [8, 16, 32]
n_a = [16]              # attention-layer dimension; originally [8, 16, 32]
n_steps = [3, 5]        # number of sequential steps; originally [3, 5, 7]
n_indipendent = [2]     # independent GLU layers per GLU block; originally [2, 3]

# Cartesian product of all candidate values: one tuple per configuration, in
# the order (batch size, epochs, n_d, n_a, n_steps, n_independent).
params = [
    combo
    for combo in product(batch_sizes, n_epochs, n_d, n_a, n_steps, n_indipendent)
]

# Show how many configurations will be evaluated.
print("Number of combinations: ", len(params))

Number of combinations:  2


In [None]:
def get_model(n_d, n_a, n_steps, n_independent):
    """
    Build a TabNetRegressor configured with the given hyperparameters.

    Parameters:
    - n_d: dimension of the prediction layer.
    - n_a: dimension of the attention layer.
    - n_steps: number of steps in the model.
    - n_independent: number of independent GLU layers.

    Returns:
    - A newly constructed TabNetRegressor instance.
    """
    architecture = dict(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        n_independent=n_independent,
    )
    return TabNetRegressor(**architecture)

In [None]:
# Grid search: train one TabNet model per configuration in `params`, log its
# hyperparameters and metrics to TensorBoard, and keep the best model.
#
# The best model is chosen by its MSE on the VALIDATION set, so the test set
# stays untouched by model selection. After the loop:
#   best_model  - deep copy of the selected model (None if nothing was trained)
#   best_params - its (batch size, epochs, n_d, n_a, n_steps, n_independent)
#   best_mse    - TEST MSE of the selected model
best_mse = float('inf')
best_val_mse = float('inf')
best_model = None
best_params = None

# Loop-invariant location of the TensorBoard runs.
base_path = "/content/drive/MyDrive/W-Workspace/MovieLens_da_li/results/TabNet"
sub_dir = "pca" if apply_pca else "no_pca"

for iteration, (b, n_e, n_d, n_a, n_s, n_i) in enumerate(params, start=1):
    print(f'\nIteration {iteration}/{len(params)}')
    print(f"Configuration: batch size = {b}, epochs = {n_e}, n_d = {n_d}, n_a = {n_a}, steps = {n_s}, n_independent = {n_i}")

    log_dir = os.path.join(base_path, sub_dir, f"batch_size_{b}_nEpochs_{n_e}_nd_{n_d}_na_{n_a}_nSteps_{n_s}_nIndependent_{n_i}")

    # An existing run directory marks this configuration as already done.
    if os.path.exists(log_dir):
        print("Model already trained. Skipping...")
        continue

    model = get_model(n_d, n_a, n_s, n_i)

    # Fit the model with early stopping on the validation MSE.
    model.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=[(X_val, y_val)],
        eval_name=['val'],      # name of the evaluation SET (logged as "val_mse")
        eval_metric=['mse'],    # metric computed on it and used for early stopping
        max_epochs=n_e,         # without this the configured epoch count is ignored
        patience=10,
        batch_size=b,
        virtual_batch_size=128
    )

    # Validation MSE drives model selection; test metrics are only reported.
    val_mse = mean_squared_error(y_val, model.predict(X_val))
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Create the run directory only after training and evaluation succeeded,
    # so a crashed run is not mistaken for a finished one by the skip check.
    os.makedirs(os.path.dirname(log_dir), exist_ok=True)
    # The context manager closes the writer even if logging raises.
    with SummaryWriter(log_dir) as writer:
        # Save the hyperparameters and metrics of this iteration.
        writer.add_hparams(
            {'batch_size': b, 'n_epochs': n_e, 'n_d': n_d, 'n_a': n_a, 'n_steps': n_s, 'n_indipendent': n_i},
            {'hparam/mse': mse, 'hparam/r2': r2, 'hparam/val_mse': val_mse}
        )

    print('Validation MSE:', val_mse)
    print('MSE:', mse)
    print('R2 Score:', r2)
    if val_mse < best_val_mse:
        best_val_mse = val_mse
        best_mse = mse
        best_model = copy.deepcopy(model)
        best_params = (b, n_e, n_d, n_a, n_s, n_i)
        print('Best model updated')

# Skipped configurations are never evaluated, so nothing may have been selected.
if best_model is None:
    print("No model was trained in this run (every configuration was skipped); best_model is None.")


Iteration 1/2
Configuration: batch size = 256, epochs = 200, n_d = 16, n_a = 16, steps = 3, n_independent = 2




epoch 0  | loss: 1.86584 | mse_mse: 0.63007 |  0:00:04s
epoch 1  | loss: 0.30308 | mse_mse: 0.31684 |  0:00:08s
epoch 2  | loss: 0.24654 | mse_mse: 0.26662 |  0:00:18s
epoch 3  | loss: 0.2296  | mse_mse: 0.21074 |  0:00:28s
epoch 4  | loss: 0.2268  | mse_mse: 0.19321 |  0:00:33s
epoch 5  | loss: 0.21625 | mse_mse: 0.19237 |  0:00:37s
epoch 6  | loss: 0.20631 | mse_mse: 0.19008 |  0:00:42s
epoch 7  | loss: 0.2003  | mse_mse: 0.19423 |  0:00:46s
epoch 8  | loss: 0.18557 | mse_mse: 0.14818 |  0:00:51s
epoch 9  | loss: 0.16535 | mse_mse: 0.14568 |  0:00:56s
epoch 10 | loss: 0.13521 | mse_mse: 0.08652 |  0:01:00s
epoch 11 | loss: 0.10368 | mse_mse: 0.08014 |  0:01:05s
epoch 12 | loss: 0.08161 | mse_mse: 0.06133 |  0:01:10s
epoch 13 | loss: 0.06801 | mse_mse: 0.05045 |  0:01:14s
epoch 14 | loss: 0.05899 | mse_mse: 0.04497 |  0:01:18s
epoch 15 | loss: 0.05507 | mse_mse: 0.04635 |  0:01:23s
epoch 16 | loss: 0.04928 | mse_mse: 0.0393  |  0:01:27s
epoch 17 | loss: 0.0417  | mse_mse: 0.03102 |  0



MSE: 0.007878176810709618
R2 Score: 0.9644592608757693
Best model updated

Iteration 2/2
Configuration: batch size = 256, epochs = 200, n_d = 16, n_a = 16, steps = 5, n_independent = 2




epoch 0  | loss: 2.67814 | mse_mse: 1.13887 |  0:00:07s
epoch 1  | loss: 0.35488 | mse_mse: 0.57869 |  0:00:14s
epoch 2  | loss: 0.29018 | mse_mse: 0.33975 |  0:00:21s
epoch 3  | loss: 0.26346 | mse_mse: 0.28686 |  0:00:29s
epoch 4  | loss: 0.26972 | mse_mse: 0.27527 |  0:00:36s
epoch 5  | loss: 0.25729 | mse_mse: 0.25326 |  0:00:44s
epoch 6  | loss: 0.24918 | mse_mse: 0.32913 |  0:00:50s
epoch 7  | loss: 0.25287 | mse_mse: 0.32665 |  0:00:58s
epoch 8  | loss: 0.25665 | mse_mse: 0.24613 |  0:01:09s
epoch 9  | loss: 0.24577 | mse_mse: 0.23293 |  0:01:17s
epoch 10 | loss: 0.23597 | mse_mse: 0.22477 |  0:01:24s
epoch 11 | loss: 0.23785 | mse_mse: 0.22116 |  0:01:31s
epoch 12 | loss: 0.23973 | mse_mse: 0.2471  |  0:01:39s
epoch 13 | loss: 0.23831 | mse_mse: 0.22552 |  0:01:46s
epoch 14 | loss: 0.23231 | mse_mse: 0.2271  |  0:01:54s
epoch 15 | loss: 0.23177 | mse_mse: 0.21862 |  0:02:00s
epoch 16 | loss: 0.22228 | mse_mse: 0.20898 |  0:02:08s
epoch 17 | loss: 0.23451 | mse_mse: 0.20807 |  0



MSE: 0.010323455207568978
R2 Score: 0.9534279012506903


In [None]:
# Report the best model found by the grid search.
# The search leaves best_model as None when every configuration was skipped;
# fail with an explicit message instead of an AttributeError on `.predict`.
if best_model is None:
    raise RuntimeError(
        "No best model available: run the training cell first and make sure "
        "at least one configuration was actually trained (not skipped)."
    )

# Print the details of the best model.
print("Best Model:", best_model)

# Print the MSE recorded for the best model.
print("Best MSE:", best_mse)

# Predict on the test set with the retained copy of the best model.
y_pred_best = best_model.predict(X_test)

# Compute and print the R2 score of the best model.
r2_best = r2_score(y_test, y_pred_best)
print("R2 Score:", r2_best)

Best Model: TabNetRegressor(n_d=16, n_a=16, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=[], n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='sparsemax', input_dim=552, output_dim=1, device_name='auto', n_shared_decoder=1, n_indep_decoder=1, grouped_features=[])
Best MSE: 0.007878176810709618
R2 Score: 0.9644592608757693


In [None]:
# Save the best TabNet model to disk.
# `save_model` writes a zip archive and appends ".zip" to the given path, so
# the path is passed without an extension. The name is derived from the run
# settings (PCA on/off and the batch size of the best configuration) instead
# of being hard-coded, so it cannot contradict `apply_pca`.
# NOTE(review): the fitted `pca` object is not stored by this call; when
# apply_pca is True, new inputs need the same projection before predicting.
best_model.save_model(
    f"best_model_tabnet_{'pca' if apply_pca else 'nopca'}_{best_params[0]}"
)

Successfully saved model at best_model_tabnet_nopca_256.csv.zip


'best_model_tabnet_nopca_256.csv.zip'