In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, DataLoader
from torchfm.model.fm import FactorizationMachineModel 
from torchfm.model.wd import WideAndDeepModel
from torchfm.model.dfm import DeepFactorizationMachineModel
from sklearn.metrics import mean_squared_error, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the holdings table. The code below reads its CIK, CUSIP and hold columns.
df = pd.read_parquet("data/holdings_data.parquet")

# pd.factorize encodes missing values with the sentinel code -1, which is not a
# valid 0-based category index. Fail early instead of feeding -1 to the models.
if df[["CIK", "CUSIP"]].isna().any().any():
    raise ValueError(
        "CIK / CUSIP contain missing values; drop or impute them before "
        "building integer indices."
    )

# Build contiguous integer codes (0 .. n_unique - 1) for each identifier column.
df["CIK_idx"], cik_uniques = pd.factorize(df["CIK"])
df["CUSIP_idx"], cusip_uniques = pd.factorize(df["CUSIP"])

# Vocabulary size per field, taken directly from the factorize result so it
# always matches the code range above.
field_dims = np.array([len(cik_uniques), len(cusip_uniques)], dtype=np.int64)

# Features are the two integer codes; the target is the `hold` column as float32.
X_all = df[["CIK_idx", "CUSIP_idx"]].values.astype("int64")
y_all = df["hold"].astype("float32").values
# NOTE(review): every model below reports precision = recall = 1.0. Verify that
# `hold` actually contains both classes (e.g. df["hold"].value_counts()) --
# if only positive rows are stored, the metrics are trivially perfect.

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

# Wrap the numpy arrays as tensors (torch.from_numpy shares memory, no copy).
X_tr_tensor = torch.from_numpy(X_train)
y_tr_tensor = torch.from_numpy(y_train)

X_te_tensor = torch.from_numpy(X_test)
y_te_tensor = torch.from_numpy(y_test)

train_dataset = TensorDataset(X_tr_tensor, y_tr_tensor)
test_dataset = TensorDataset(X_te_tensor, y_te_tensor)

# Only the training loader shuffles; evaluation keeps the original order.
batch_size = 4096
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

Factorization Machine(FM) 모델을 이용한 결과 확인

In [3]:
# --- Factorization Machine: train on train_loader, evaluate on test_loader ---

# Seed torch's global RNG so weight initialisation and the DataLoader shuffle
# order are repeatable every time this cell is run.
torch.manual_seed(42)

model = FactorizationMachineModel(field_dims, embed_dim=16).to(device)

# BCELoss expects probabilities in [0, 1].
# NOTE(review): assumes the torchfm model already applies a sigmoid -- confirm.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 100

model.train()
for epoch in range(1, n_epochs + 1):
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        # reshape(-1) instead of squeeze(): squeeze() would turn a final batch
        # of size 1 into a 0-d tensor that no longer matches y's shape (1,).
        batch_out = model(X).reshape(-1)
        loss = criterion(batch_out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Collect predictions and labels over the whole test set without gradients.
model.eval()
preds, labs = [], []
with torch.no_grad():
    for Xi, yi in test_loader:
        preds.append(model(Xi.to(device)).reshape(-1).cpu().numpy())
        labs.append(yi.numpy())
y_pred = np.concatenate(preds)
y_true = np.concatenate(labs)

# RMSE is computed on the raw scores; precision / recall on scores
# thresholded at 0.5.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
y_bin = (y_pred >= 0.5).astype(int)
prec = precision_score(y_true, y_bin, zero_division=0)
rec = recall_score(y_true, y_bin, zero_division=0)

print(f"RMSE      : {rmse:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")

RMSE      : 0.0012
Precision : 1.0000
Recall    : 1.0000


Wide&Deep 모델을 이용한 결과 확인

In [4]:
# --- Wide & Deep: train on train_loader, evaluate on test_loader ---

# Seed torch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffle order are repeatable every time this cell is run.
torch.manual_seed(42)

model = WideAndDeepModel(
    field_dims=field_dims, embed_dim=16, mlp_dims=(64, 32), dropout=0.2
).to(device)

# BCELoss expects probabilities in [0, 1].
# NOTE(review): assumes the torchfm model already applies a sigmoid -- confirm.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 100

model.train()
for epoch in range(1, n_epochs + 1):
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        # reshape(-1) instead of squeeze(): squeeze() would turn a final batch
        # of size 1 into a 0-d tensor that no longer matches y's shape (1,).
        batch_out = model(X).reshape(-1)
        loss = criterion(batch_out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Collect predictions and labels over the whole test set without gradients;
# eval() also switches dropout off.
model.eval()
preds, labs = [], []
with torch.no_grad():
    for Xi, yi in test_loader:
        preds.append(model(Xi.to(device)).reshape(-1).cpu().numpy())
        labs.append(yi.numpy())
y_pred = np.concatenate(preds)
y_true = np.concatenate(labs)

# RMSE is computed on the raw scores; precision / recall on scores
# thresholded at 0.5.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
y_bin = (y_pred >= 0.5).astype(int)
prec = precision_score(y_true, y_bin, zero_division=0)
rec = recall_score(y_true, y_bin, zero_division=0)

print(f"RMSE      : {rmse:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")

RMSE      : 0.0000
Precision : 1.0000
Recall    : 1.0000


DeepFM을 이용한 결과 확인

In [5]:
# --- DeepFM: train on train_loader, evaluate on test_loader ---

# Seed torch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffle order are repeatable every time this cell is run.
torch.manual_seed(42)

model = DeepFactorizationMachineModel(
    field_dims=field_dims, embed_dim=16, mlp_dims=(64, 32), dropout=0.2
).to(device)

# BCELoss expects probabilities in [0, 1].
# NOTE(review): assumes the torchfm model already applies a sigmoid -- confirm.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 100

model.train()
for epoch in range(1, n_epochs + 1):
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        # reshape(-1) instead of squeeze(): squeeze() would turn a final batch
        # of size 1 into a 0-d tensor that no longer matches y's shape (1,).
        batch_out = model(X).reshape(-1)
        loss = criterion(batch_out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Collect predictions and labels over the whole test set without gradients;
# eval() also switches dropout off.
model.eval()
preds, labs = [], []
with torch.no_grad():
    for Xi, yi in test_loader:
        preds.append(model(Xi.to(device)).reshape(-1).cpu().numpy())
        labs.append(yi.numpy())
y_pred = np.concatenate(preds)
y_true = np.concatenate(labs)

# RMSE is computed on the raw scores; precision / recall on scores
# thresholded at 0.5.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
y_bin = (y_pred >= 0.5).astype(int)
prec = precision_score(y_true, y_bin, zero_division=0)
rec = recall_score(y_true, y_bin, zero_division=0)

print(f"RMSE      : {rmse:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")

RMSE      : 0.0000
Precision : 1.0000
Recall    : 1.0000
