In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import scipy.sparse as sp
from scipy.special import expit
from implicit.als import AlternatingLeastSquares
from torch.utils.data import Dataset, DataLoader, TensorDataset, DataLoader
from torchfm.model.fm import FactorizationMachineModel 
from torchfm.model.wd import WideAndDeepModel
from torchfm.model.dfm import DeepFactorizationMachineModel
from sklearn.metrics import mean_squared_error, precision_score, recall_score
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

ALS 성능 확인

In [None]:
# Location of the holdings table (parquet).
DATA_PATH = "data/holdings_data.parquet"

# ALS hyper-parameters.
FACTORS, REG = 16, 0.1   # latent dimension and L2 regularisation strength
ALPHA = 40               # confidence weight given to positive interactions
ITERATIONS = 100         # number of ALS iterations

# Passed as random_state to the train/test split and to the ALS model.
SEED = 42

In [None]:
# --- Data preparation --------------------------------------------------------
# Keep the identifier columns and the target flag, then map raw CIK (user) and
# CUSIP (item) values to dense 0-based integer ids.
df = pd.read_parquet(DATA_PATH)[["CIK", "CUSIP", "TOP25_FLAG"]]
df["CIK_id"],   _ = pd.factorize(df["CIK"])
df["CUSIP_id"], _ = pd.factorize(df["CUSIP"])

USER_COL, ITEM_COL, VALUE_COL = "CIK_id", "CUSIP_id", "TOP25_FLAG"
n_users, n_items = df[USER_COL].nunique(), df[ITEM_COL].nunique()

train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df[VALUE_COL])

# ALS is trained on positive interactions only.
train_pos = train_df[train_df[VALUE_COL] > 0]

# Build the user x item matrix from the raw flag values. The confidence weight
# is NOT applied here: the model is constructed with alpha=ALPHA and implicit
# scales the matrix by alpha inside fit(), so pre-multiplying as well would
# yield an effective confidence of ALPHA ** 2.
train_mat = sp.coo_matrix(
    (train_pos[VALUE_COL].astype(np.float32),
     (train_pos[USER_COL], train_pos[ITEM_COL])),
    shape=(n_users, n_items)
).tocsr()

# --- Training ----------------------------------------------------------------
model = AlternatingLeastSquares(
    factors = FACTORS,
    regularization = REG,
    alpha = ALPHA,
    iterations = ITERATIONS,
    random_state = SEED,
    use_cg = True,
    num_threads = 0
)

model.fit(train_mat, show_progress=True)

# --- Evaluation --------------------------------------------------------------
user_vecs = model.user_factors
item_vecs = model.item_factors
# The GPU backend exposes factors as implicit.gpu.Matrix rather than ndarray;
# convert so the NumPy fancy indexing below works on either backend.
if hasattr(user_vecs, "to_numpy"):
    user_vecs = user_vecs.to_numpy()
if hasattr(item_vecs, "to_numpy"):
    item_vecs = item_vecs.to_numpy()

u_idx = test_df[USER_COL].values
i_idx = test_df[ITEM_COL].values

# Row-wise dot product: one score per (user, item) pair in the test set.
raw_scores = np.einsum("ij,ij->i", user_vecs[u_idx], item_vecs[i_idx])
# NOTE(review): ALS scores are not calibrated logits. expit(score) >= 0.5 is the
# same as score >= 0, so the thresholded metrics below depend on that cut-off;
# AUC is unaffected because expit is monotonic.
y_prob = expit(raw_scores)
y_true = test_df[VALUE_COL].values.astype(np.float32)
y_label = (y_prob >= 0.5).astype(int)

print(f"AUC       : {roc_auc_score(y_true, y_prob):.4f}")
print(f"Acc@0.5   : {accuracy_score(y_true, y_label):.4f}")
print(f"Precision : {precision_score(y_true, y_label):.4f}")
print(f"Recall    : {recall_score(y_true, y_label):.4f}")
print(f"RMSE      : {np.sqrt(mean_squared_error(y_true, y_prob)):.4f}")

torch-fm을 이용한 모델 성능 확인

In [None]:
# Location of the holdings table (parquet).
DATA_PATH = "data/holdings_data.parquet"

# Columns fed to the models as categorical fields, and the target column.
FIELD_COLS = ["CIK_id", "CUSIP_id"]
LABEL_COL = "TOP25_FLAG"

# Optimisation settings.
BATCH_SIZE, EPOCHS = 4096, 100

# Model hyper-parameters.
EMBED_DIM = 16
MLP_DIMS = (64, 32)
DROPOUT = 0.2

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
class StockDataset(Dataset):
    """Map-style dataset over a frame of integer-encoded fields and a label.

    The columns named in ``FIELD_COLS`` are stored as an int64 array in ``X``
    and the ``LABEL_COL`` column as a float32 array in ``y``.
    """

    def __init__(self, frame):
        """Copy the field and label columns of ``frame`` into NumPy arrays."""
        field_values = frame[FIELD_COLS].values
        label_values = frame[LABEL_COL].values
        self.X = field_values.astype(np.int64)
        self.y = label_values.astype(np.float32)

    def __len__(self):
        """Return the number of samples."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return ``(fields, label)`` tensors for the sample at ``idx``."""
        fields = torch.from_numpy(self.X[idx])
        label = torch.tensor(self.y[idx])
        return (fields, label)

Factorization Machine (FM) 모델

In [None]:
# --- Data preparation --------------------------------------------------------
# Keep the identifier columns and the target flag, then encode CIK / CUSIP as
# dense 0-based integer ids.
df = pd.read_parquet(DATA_PATH)[["CIK", "CUSIP", "TOP25_FLAG"]]
df["CIK_id"],   _ = pd.factorize(df["CIK"])
df["CUSIP_id"], _ = pd.factorize(df["CUSIP"])

# Number of distinct ids per field, handed to the model as `field_dims`.
field_dims = np.array([df[c].nunique() for c in FIELD_COLS])

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[LABEL_COL])

# Seed PyTorch so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# Pinned host memory is only useful when batches are copied to a CUDA device.
train_loader = DataLoader(
    StockDataset(train_df),
    batch_size=BATCH_SIZE, shuffle=True,
    num_workers=0, pin_memory=(DEVICE == "cuda")
)
test_loader  = DataLoader(
    StockDataset(test_df ),
    batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, pin_memory=(DEVICE == "cuda")
)

# --- Model -------------------------------------------------------------------
model = FactorizationMachineModel(
    field_dims = field_dims,
    embed_dim  = EMBED_DIM
).to(DEVICE)

# torchfm models apply a sigmoid inside forward() and therefore return
# probabilities, so the matching loss is BCELoss (not BCEWithLogitsLoss).
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Training ----------------------------------------------------------------
for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0.0
    for X, y in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        X, y = X.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        # reshape(-1) keeps a 1-D output even when the last batch has one row
        # (a bare squeeze() would collapse it to a 0-d tensor).
        probs = model(X).reshape(-1)
        loss  = criterion(probs, y)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean.
        running_loss += loss.item() * y.size(0)
    print(f"[{epoch:02d}] Train Loss = {running_loss / len(train_loader.dataset):.4f}")

# --- Evaluation --------------------------------------------------------------
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X, y in test_loader:
        X = X.to(DEVICE)
        # The model output is already a probability; no extra sigmoid.
        probs = model(X).reshape(-1).cpu()
        y_true.append(y)
        y_pred.append(probs)

y_true = torch.cat(y_true).numpy()
y_prob = torch.cat(y_pred).numpy()
y_pred_label = (y_prob >= 0.5).astype(int)

precision = precision_score(y_true, y_pred_label)
recall    = recall_score(y_true, y_pred_label)
rmse      = np.sqrt(mean_squared_error(y_true, y_prob))

# AUC / accuracy are reported as well, matching the other model cells.
print(f"AUC:       {roc_auc_score(y_true, y_prob):.4f}")
print(f"Acc@0.5:   {accuracy_score(y_true, y_pred_label):.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"RMSE:      {rmse:.4f}")


Wide&Deep 모델

In [None]:
# --- Data preparation --------------------------------------------------------
# Keep the identifier columns and the target flag, then encode CIK / CUSIP as
# dense 0-based integer ids. The unique-value indexes are kept so ids can be
# mapped back to the raw identifiers.
df = pd.read_parquet(DATA_PATH)[["CIK", "CUSIP", "TOP25_FLAG"]]
df["CIK_id"], cik_uniques = pd.factorize(df["CIK"])
df["CUSIP_id"], cusip_uniques = pd.factorize(df["CUSIP"])

# Number of distinct ids per field, handed to the model as `field_dims`.
field_dims = np.array([df[c].nunique() for c in FIELD_COLS])

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[LABEL_COL])

# Seed PyTorch so weight initialisation, dropout and shuffling are repeatable.
torch.manual_seed(42)

train_loader = DataLoader(StockDataset(train_df), batch_size=BATCH_SIZE, shuffle=True,  num_workers=0)
test_loader  = DataLoader(StockDataset(test_df ), batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# --- Model -------------------------------------------------------------------
model = WideAndDeepModel(
    field_dims = field_dims,
    embed_dim  = EMBED_DIM,
    mlp_dims   = MLP_DIMS,
    dropout    = DROPOUT
).to(DEVICE)

# torchfm models apply a sigmoid inside forward() and therefore return
# probabilities, so the matching loss is BCELoss (not BCEWithLogitsLoss).
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Training ----------------------------------------------------------------
for epoch in range(1, EPOCHS + 1):
    model.train()
    epoch_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        # reshape(-1) keeps a 1-D output even when the last batch has one row
        # (a bare squeeze() would collapse it to a 0-d tensor).
        probs = model(x).reshape(-1)
        loss = criterion(probs, y)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean.
        epoch_loss += loss.item() * y.size(0)
    print(f"  ▸ train loss = {epoch_loss / len(train_loader.dataset):.4f}")

# --- Evaluation --------------------------------------------------------------
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(DEVICE)
        # The model output is already a probability; no extra sigmoid.
        y_true.append(y)
        y_pred.append(model(x).reshape(-1).cpu())
y_true = torch.cat(y_true).numpy()
y_pred = torch.cat(y_pred).numpy()

y_pred_label = (y_pred >= 0.5).astype(int)

print(f"AUC  : {roc_auc_score(y_true, y_pred):.4f}")
print(f"Acc@0.5: {accuracy_score(y_true, y_pred_label):.4f}")

precision = precision_score(y_true, y_pred_label)
recall    = recall_score(y_true, y_pred_label)
rmse      = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"RMSE     : {rmse:.4f}")


DeepFM

In [None]:
# --- Data preparation --------------------------------------------------------
# Keep the identifier columns and the target flag, then encode CIK / CUSIP as
# dense 0-based integer ids.
df = pd.read_parquet(DATA_PATH)[["CIK", "CUSIP", "TOP25_FLAG"]]
df["CIK_id"],   _ = pd.factorize(df["CIK"])
df["CUSIP_id"], _ = pd.factorize(df["CUSIP"])

# Number of distinct ids per field, handed to the model as `field_dims`.
field_dims = np.array([df[c].nunique() for c in FIELD_COLS])

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[LABEL_COL])

# Seed PyTorch so weight initialisation, dropout and shuffling are repeatable.
torch.manual_seed(42)

# Pinned host memory is only useful when batches are copied to a CUDA device.
train_loader = DataLoader(
    StockDataset(train_df),
    batch_size=BATCH_SIZE, shuffle=True,
    num_workers=0, pin_memory=(DEVICE == "cuda")
)
test_loader  = DataLoader(
    StockDataset(test_df ),
    batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, pin_memory=(DEVICE == "cuda")
)

# --- Model -------------------------------------------------------------------
model = DeepFactorizationMachineModel(
    field_dims = field_dims,
    embed_dim  = EMBED_DIM,
    mlp_dims   = MLP_DIMS,
    dropout    = DROPOUT
).to(DEVICE)

# torchfm models apply a sigmoid inside forward() and therefore return
# probabilities, so the matching loss is BCELoss (not BCEWithLogitsLoss).
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Training ----------------------------------------------------------------
for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0.0
    for X, y in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        X, y = X.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        # reshape(-1) keeps a 1-D output even when the last batch has one row
        # (a bare squeeze() would collapse it to a 0-d tensor).
        probs = model(X).reshape(-1)
        loss  = criterion(probs, y)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean.
        running_loss += loss.item() * y.size(0)
    print(f"[{epoch:02d}] Train Loss = {running_loss / len(train_loader.dataset):.4f}")

# --- Evaluation --------------------------------------------------------------
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X, y in test_loader:
        X = X.to(DEVICE)
        # The model output is already a probability; no extra sigmoid.
        probs = model(X).reshape(-1).cpu()
        y_true.append(y)
        y_pred.append(probs)

y_true = torch.cat(y_true).numpy()
y_pred = torch.cat(y_pred).numpy()

y_pred_label = (y_pred >= 0.5).astype(int)

print(f"AUC      : {roc_auc_score(y_true, y_pred):.4f}")
print(f"Acc@0.5  : {accuracy_score(y_true, y_pred_label):.4f}")

precision = precision_score(y_true, y_pred_label)
recall    = recall_score(y_true, y_pred_label)
rmse      = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"RMSE     : {rmse:.4f}")