In [None]:
!pip uninstall -y torch torchvision torchaudio transformers sentence-transformers
!pip install torch==2.6.0 torchvision==0.17.0 torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers==4.41.2 sentence-transformers==2.6.1

In [None]:
!pip install scikit-learn==1.7.0

In [None]:
!pip show scikit-learn

In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.2/509.2 kB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [20]:
import ast
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import random
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.notebook import tqdm

In [4]:
# 1. Seed 고정
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [11]:
# 2. 데이터 로드
df = pd.read_csv('dataset_2023_2025.csv')

In [12]:
# 3. 스케일링
header_str = """
news_id,summary,d_minus_5_date_close,d_minus_5_date_volume,d_minus_5_date_foreign,d_minus_5_date_institution,d_minus_5_date_individual,d_minus_4_date_close,d_minus_4_date_volume,d_minus_4_date_foreign,d_minus_4_date_institution,d_minus_4_date_individual,d_minus_3_date_close,d_minus_3_date_volume,d_minus_3_date_foreign,d_minus_3_date_institution,d_minus_3_date_individual,d_minus_2_date_close,d_minus_2_date_volume,d_minus_2_date_foreign,d_minus_2_date_institution,d_minus_2_date_individual,d_minus_1_date_close,d_minus_1_date_volume,d_minus_1_date_foreign,d_minus_1_date_institution,d_minus_1_date_individual,d_plus_1_date_close,d_plus_2_date_close,d_plus_3_date_close,d_plus_4_date_close,d_plus_5_date_close,fx,bond10y,base_rate,토픽 1,토픽 2,토픽 3,토픽 4,토픽 5,토픽 6,토픽 7,토픽 8,토픽 9,similar_news_id,similar_summary,similar_d_minus_5_date_close,similar_d_minus_5_date_volume,similar_d_minus_5_date_foreign,similar_d_minus_5_date_institution,similar_d_minus_5_date_individual,similar_d_minus_4_date_close,similar_d_minus_4_date_volume,similar_d_minus_4_date_foreign,similar_d_minus_4_date_institution,similar_d_minus_4_date_individual,similar_d_minus_3_date_close,similar_d_minus_3_date_volume,similar_d_minus_3_date_foreign,similar_d_minus_3_date_institution,similar_d_minus_3_date_individual,similar_d_minus_2_date_close,similar_d_minus_2_date_volume,similar_d_minus_2_date_foreign,similar_d_minus_2_date_institution,similar_d_minus_2_date_individual,similar_d_minus_1_date_close,similar_d_minus_1_date_volume,similar_d_minus_1_date_foreign,similar_d_minus_1_date_institution,similar_d_minus_1_date_individual,similar_d_plus_1_date_close,similar_d_plus_2_date_close,similar_d_plus_3_date_close,similar_d_plus_4_date_close,similar_d_plus_5_date_close,similar_fx,similar_bond10y,similar_base_rate,similar_토픽 1,similar_토픽 2,similar_토픽 3,similar_토픽 4,similar_토픽 5,similar_토픽 6,similar_토픽 7,similar_토픽 8,similar_토픽 9,target
"""

header_cols = [col.strip() for col in header_str.strip().split(',')]

ext_a = [col for col in header_cols if col.startswith('d_minus_') or col.startswith('d_plus_') or col.startswith('D_day_') or col in ['fx', 'bond10y', 'base_rate'] or col.startswith('토픽 ')]
ext_b = [col for col in header_cols if col.startswith('similar_d_minus_') or col.startswith('similar_d_plus_') or col.startswith('similar_D_day_') or col in ['similar_fx', 'similar_bond10y', 'similar_base_rate'] or col.startswith('similar_토픽 ')]

group_ext_price_close = [c for c in ext_a if 'date_close' in c]
group_ext_volume = [c for c in ext_a if 'date_volume' in c]
group_ext_foreign = [c for c in ext_a if 'date_foreign' in c]
group_ext_institution = [c for c in ext_a if 'date_institution' in c]
group_ext_individual = [c for c in ext_a if 'date_individual' in c]
group_ext_macro = ['fx', 'bond10y', 'base_rate']
group_ext_topic = [f'토픽 {i}' for i in range(1, 10) if f'토픽 {i}' in ext_a]

group_ext_similar_price_close = [c for c in ext_b if 'date_close' in c]
group_ext_similar_volume = [c for c in ext_b if 'date_volume' in c]
group_ext_similar_foreign = [c for c in ext_b if 'date_foreign' in c]
group_ext_similar_institution = [c for c in ext_b if 'date_institution' in c]
group_ext_similar_individual = [c for c in ext_b if 'date_individual' in c]
group_ext_similar_macro = ['similar_fx', 'similar_bond10y', 'similar_base_rate']
group_ext_similar_topic = [f'similar_토픽 {i}' for i in range(1, 10) if f'similar_토픽 {i}' in ext_b]

ext_a_groups = [
    group_ext_price_close, group_ext_volume, group_ext_foreign,
    group_ext_institution, group_ext_individual,
    group_ext_macro, group_ext_topic
]

ext_b_groups = [
    group_ext_similar_price_close, group_ext_similar_volume, group_ext_similar_foreign,
    group_ext_similar_institution, group_ext_similar_individual,
    group_ext_similar_macro, group_ext_similar_topic
]

scaler_choices_a = [
    StandardScaler(), RobustScaler(), MinMaxScaler(),
    MinMaxScaler(), MinMaxScaler(), StandardScaler(), MinMaxScaler()
]

scaler_choices_b = [
    StandardScaler(), RobustScaler(), MinMaxScaler(),
    MinMaxScaler(), MinMaxScaler(), StandardScaler(), MinMaxScaler()
]

scaler_dict_a = {}
scaler_dict_b = {}

train_idx, val_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

extA_scaled_parts = []
for group, scaler in zip(ext_a_groups, scaler_choices_a):
    X_train = df.loc[train_idx, group].replace([np.inf, -np.inf], np.nan).fillna(df[group].mean())
    scaler.fit(X_train)
    X_all = df[group].replace([np.inf, -np.inf], np.nan).fillna(df[group].mean())
    X_scaled = scaler.transform(X_all)
    extA_scaled_parts.append(X_scaled)
    scaler_dict_a[group[0]] = scaler

extB_scaled_parts = []
for group, scaler in zip(ext_b_groups, scaler_choices_b):
    X_train = df.loc[train_idx, group].replace([np.inf, -np.inf], np.nan).fillna(df[group].mean())
    scaler.fit(X_train)
    X_all = df[group].replace([np.inf, -np.inf], np.nan).fillna(df[group].mean())
    X_scaled = scaler.transform(X_all)
    extB_scaled_parts.append(X_scaled)
    scaler_dict_b[group[0]] = scaler

extA_scaled = np.concatenate(extA_scaled_parts, axis=1)
extB_scaled = np.concatenate(extB_scaled_parts, axis=1)

extA_tensor = torch.tensor(extA_scaled, dtype=torch.float32)
extB_tensor = torch.tensor(extB_scaled, dtype=torch.float32)
y_tensor = torch.tensor(df['target'].values, dtype=torch.float32).unsqueeze(1)

print("ext_a / ext_b 그룹별 성격별 스케일링 + tensor 변환까지 최종 완료")

ext_a / ext_b 그룹별 성격별 스케일링 + tensor 변환까지 최종 완료


In [13]:
# 4. 임베딩 생성
model_name = "snunlp/KR-SBERT-V40K-klueNLI-augSTS"
model_emb = SentenceTransformer(model_name)
if torch.cuda.is_available():
    model_emb = model_emb.to("cuda")

print("[임베딩 중] summary/유사 summary 임베딩 생성 중...")
df['embedding'] = model_emb.encode(df['summary'].tolist(), show_progress_bar=True).tolist()
df['similar_embedding'] = model_emb.encode(df['similar_summary'].tolist(), show_progress_bar=True).tolist()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/336k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/967k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[임베딩 중] summary/유사 summary 임베딩 생성 중...


Batches:   0%|          | 0/1344 [00:00<?, ?it/s]

Batches:   0%|          | 0/1344 [00:00<?, ?it/s]

In [15]:
tokenizer = model_emb.tokenizer

In [16]:
# 5. 오토인코더 정의 (latent_dim=256)
class EmbeddingAutoencoder(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=256):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512), nn.GELU(), nn.BatchNorm1d(512), nn.Dropout(0.1),
            nn.Linear(512, 384), nn.GELU(), nn.BatchNorm1d(384), nn.Dropout(0.1),
            nn.Linear(384, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, 384), nn.GELU(), nn.BatchNorm1d(384), nn.Dropout(0.1),
            nn.Linear(384, 512), nn.GELU(), nn.BatchNorm1d(512), nn.Dropout(0.1),
            nn.Linear(512, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

# AE 학습
print("[AE 학습 중]...")
text_emb = df['embedding'].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x)
X = torch.stack([torch.tensor(e, dtype=torch.float32) for e in text_emb.tolist()])
loader = DataLoader(TensorDataset(X), batch_size=64, shuffle=True)

ae_model = EmbeddingAutoencoder(hidden_dim=256)
optimizer = torch.optim.Adam(ae_model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.5)
criterion = nn.MSELoss()

for epoch in range(1, 31):
    ae_model.train()
    epoch_loss = 0
    for batch in loader:
        x_batch = batch[0]
        x_hat, z = ae_model(x_batch)
        loss = criterion(x_hat, x_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        epoch_loss += loss.item() * x_batch.size(0)
    scheduler.step()
    print(f"AE Epoch {epoch}/30, Loss: {epoch_loss / len(X):.6f}")

[AE 학습 중]...
AE Epoch 1/30, Loss: 0.110500
AE Epoch 2/30, Loss: 0.057536
AE Epoch 3/30, Loss: 0.051610
AE Epoch 4/30, Loss: 0.049034
AE Epoch 5/30, Loss: 0.047281
AE Epoch 6/30, Loss: 0.046127
AE Epoch 7/30, Loss: 0.045263
AE Epoch 8/30, Loss: 0.044702
AE Epoch 9/30, Loss: 0.044229
AE Epoch 10/30, Loss: 0.043861
AE Epoch 11/30, Loss: 0.043490
AE Epoch 12/30, Loss: 0.043235
AE Epoch 13/30, Loss: 0.042994
AE Epoch 14/30, Loss: 0.042732
AE Epoch 15/30, Loss: 0.042476
AE Epoch 16/30, Loss: 0.042403
AE Epoch 17/30, Loss: 0.042271
AE Epoch 18/30, Loss: 0.042166
AE Epoch 19/30, Loss: 0.041942
AE Epoch 20/30, Loss: 0.041889
AE Epoch 21/30, Loss: 0.041819
AE Epoch 22/30, Loss: 0.041746
AE Epoch 23/30, Loss: 0.041583
AE Epoch 24/30, Loss: 0.041511
AE Epoch 25/30, Loss: 0.041481
AE Epoch 26/30, Loss: 0.040425
AE Epoch 27/30, Loss: 0.040388
AE Epoch 28/30, Loss: 0.040389
AE Epoch 29/30, Loss: 0.040297
AE Epoch 30/30, Loss: 0.040308


In [None]:
# 6. 벡터 유사도 모델 정의 (외부 변수에 α=2.0 가중치 적용)
class SimilarityRankerWithText(nn.Module):
    def __init__(self, embedding_model, autoencoder_encoder, tokenizer, ext_dim=42, latent_dim=256, alpha=2.0, rank_weight=0.3, rank_margin=0.05):
        super().__init__()
        self.embedding_model = embedding_model
        self.tokenizer = tokenizer
        self.encoder = autoencoder_encoder
        self.ext_dim = ext_dim
        self.latent_dim = latent_dim
        self.input_dim = latent_dim + ext_dim
        self.alpha = alpha
        self.rank_weight = rank_weight
        self.rank_margin = rank_margin

        self.news_fc = nn.Sequential(
            nn.Linear(self.input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.2)
        )

    def encode_text(self, text_list):
        with torch.no_grad():
            emb = self.embedding_model.encode(text_list, convert_to_tensor=True)
            emb = emb.to(next(self.encoder.parameters()).device).float()
            reduced = self.encoder(emb)
            return reduced

    def forward(self, summary_list, similar_summary_list, extA, extB):
        zA = self.encode_text(summary_list)
        zB = self.encode_text(similar_summary_list)

        extA = self.alpha * extA.to(zA.device)
        extB = self.alpha * extB.to(zB.device)

        inputA = torch.cat([zA, extA], dim=1)
        inputB = torch.cat([zB, extB], dim=1)

        hA = self.news_fc(inputA)
        hB = self.news_fc(inputB)

        # FC 벡터 간 cosine similarity 반환
        return F.cosine_similarity(hA, hB, dim=1)

    def compute_loss(self, pred, target):
        # ranking loss 기준
        rank = self.ranking_loss(pred, target)
        return self.rank_weight * rank

    def ranking_loss(self, pred, target):
        loss = 0.0
        count = 0
        for i in range(len(pred)):
            for j in range(i + 1, len(pred)):
                if target[i] == target[j]:
                    continue

                label = 1.0 if target[i] > target[j] else -1.0

                diff = pred[i] - pred[j]

                loss += torch.clamp(self.rank_margin - label * diff, min=0.0)
                
                count += 1
        return loss / (count + 1e-6)

In [18]:
# 7. Dataset 및 DataLoader 정의
class NewsSimilarityDataset(Dataset):
    def __init__(self, df, extA_tensor, extB_tensor, y_tensor, indices):
        self.summary = df["summary"].iloc[indices].tolist()
        self.similar_summary = df["similar_summary"].iloc[indices].tolist()
        self.extA = extA_tensor[indices]
        self.extB = extB_tensor[indices]
        self.y = y_tensor[indices]

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (
            self.summary[idx],
            self.similar_summary[idx],
            self.extA[idx],
            self.extB[idx],
            self.y[idx]
        )

train_dataset = NewsSimilarityDataset(df, extA_tensor, extB_tensor, y_tensor, train_idx)
val_dataset = NewsSimilarityDataset(df, extA_tensor, extB_tensor, y_tensor, val_idx)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [21]:
# 8. 학습 loop (Early Stopping 적용)
model = SimilarityRankerWithText(
    embedding_model=model_emb,
    autoencoder_encoder=ae_model.encoder,
    tokenizer=tokenizer,
    ext_dim=extA_tensor.shape[1],
    latent_dim=256,
    alpha=2.0
).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# 옵티마이저 설정
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Early Stopping 설정
best_val_loss = float('inf')
patience, trigger = 5, 0

# 에폭 루프 시작
for epoch in range(1, 51):
    start_time = time.time()  # 시간 측정 시작
    model.train()
    train_losses = []

    print(f'\nEpoch {epoch}/50 (Patience: {patience - trigger} left)')

    # 학습 루프
    for summary, similar_summary, extA, extB, y in tqdm(train_loader, desc=f"Training Epoch {epoch}", leave=False):
        # 디바이스로 이동
        device = model.encoder[0].weight.device
        extA, extB, y = extA.to(device), extB.to(device), y.to(device)

        # forward + loss 계산
        pred = model(summary, similar_summary, extA, extB)
        loss = model.compute_loss(pred, y)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # 검증 루프
    model.eval()
    val_losses = []

    with torch.no_grad():
        for summary, similar_summary, extA, extB, y in tqdm(val_loader, desc=f"🔎 Validating Epoch {epoch}", leave=False):
            device = model.encoder[0].weight.device
            extA, extB, y = extA.to(device), extB.to(device), y.to(device)

            pred = model(summary, similar_summary, extA, extB)
            val_loss = model.compute_loss(pred, y)
            val_losses.append(val_loss.item())

    avg_train_loss = np.mean(train_losses)
    avg_val_loss = np.mean(val_losses)
    elapsed_time = time.time() - start_time

    print(f"Epoch {epoch} Summary | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | ⏱️ Time: {elapsed_time:.2f}s")

    # Early Stopping 로직
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger = 0
        print("Validation loss improved. Resetting patience.")
    else:
        trigger += 1
        print(f"No improvement. Patience counter: {trigger}/{patience}")
        if trigger >= patience:
            print("Early stopping triggered!")
            break


Epoch 1/50 (Patience: 5 left)


🧪 Training Epoch 1:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 1:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 1 Summary | Train Loss: 0.0205 | Val Loss: 0.0179 | ⏱️ Time: 300.66s
Validation loss improved. Resetting patience.

Epoch 2/50 (Patience: 5 left)


🧪 Training Epoch 2:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 2:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 2 Summary | Train Loss: 0.0189 | Val Loss: 0.0166 | ⏱️ Time: 299.27s
Validation loss improved. Resetting patience.

Epoch 3/50 (Patience: 5 left)


🧪 Training Epoch 3:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 3:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 3 Summary | Train Loss: 0.0179 | Val Loss: 0.0160 | ⏱️ Time: 300.63s
Validation loss improved. Resetting patience.

Epoch 4/50 (Patience: 5 left)


🧪 Training Epoch 4:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 4:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 4 Summary | Train Loss: 0.0173 | Val Loss: 0.0153 | ⏱️ Time: 299.93s
Validation loss improved. Resetting patience.

Epoch 5/50 (Patience: 5 left)


🧪 Training Epoch 5:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 5:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 5 Summary | Train Loss: 0.0169 | Val Loss: 0.0147 | ⏱️ Time: 300.52s
Validation loss improved. Resetting patience.

Epoch 6/50 (Patience: 5 left)


🧪 Training Epoch 6:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 6:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 6 Summary | Train Loss: 0.0163 | Val Loss: 0.0144 | ⏱️ Time: 300.45s
Validation loss improved. Resetting patience.

Epoch 7/50 (Patience: 5 left)


🧪 Training Epoch 7:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 7:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 7 Summary | Train Loss: 0.0160 | Val Loss: 0.0141 | ⏱️ Time: 300.17s
Validation loss improved. Resetting patience.

Epoch 8/50 (Patience: 5 left)


🧪 Training Epoch 8:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 8:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 8 Summary | Train Loss: 0.0156 | Val Loss: 0.0139 | ⏱️ Time: 301.01s
Validation loss improved. Resetting patience.

Epoch 9/50 (Patience: 5 left)


🧪 Training Epoch 9:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 9:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 9 Summary | Train Loss: 0.0154 | Val Loss: 0.0137 | ⏱️ Time: 300.71s
Validation loss improved. Resetting patience.

Epoch 10/50 (Patience: 5 left)


🧪 Training Epoch 10:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 10:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 10 Summary | Train Loss: 0.0152 | Val Loss: 0.0135 | ⏱️ Time: 299.36s
Validation loss improved. Resetting patience.

Epoch 11/50 (Patience: 5 left)


🧪 Training Epoch 11:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 11:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 11 Summary | Train Loss: 0.0150 | Val Loss: 0.0132 | ⏱️ Time: 299.68s
Validation loss improved. Resetting patience.

Epoch 12/50 (Patience: 5 left)


🧪 Training Epoch 12:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 12:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 12 Summary | Train Loss: 0.0147 | Val Loss: 0.0131 | ⏱️ Time: 300.12s
Validation loss improved. Resetting patience.

Epoch 13/50 (Patience: 5 left)


🧪 Training Epoch 13:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 13:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 13 Summary | Train Loss: 0.0145 | Val Loss: 0.0129 | ⏱️ Time: 300.19s
Validation loss improved. Resetting patience.

Epoch 14/50 (Patience: 5 left)


🧪 Training Epoch 14:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 14:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 14 Summary | Train Loss: 0.0146 | Val Loss: 0.0127 | ⏱️ Time: 301.23s
Validation loss improved. Resetting patience.

Epoch 15/50 (Patience: 5 left)


🧪 Training Epoch 15:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 15:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 15 Summary | Train Loss: 0.0143 | Val Loss: 0.0124 | ⏱️ Time: 299.64s
Validation loss improved. Resetting patience.

Epoch 16/50 (Patience: 5 left)


🧪 Training Epoch 16:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 16:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 16 Summary | Train Loss: 0.0142 | Val Loss: 0.0127 | ⏱️ Time: 300.49s
No improvement. Patience counter: 1/5

Epoch 17/50 (Patience: 4 left)


🧪 Training Epoch 17:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 17:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 17 Summary | Train Loss: 0.0141 | Val Loss: 0.0123 | ⏱️ Time: 298.64s
Validation loss improved. Resetting patience.

Epoch 18/50 (Patience: 5 left)


🧪 Training Epoch 18:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 18:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 18 Summary | Train Loss: 0.0139 | Val Loss: 0.0120 | ⏱️ Time: 298.98s
Validation loss improved. Resetting patience.

Epoch 19/50 (Patience: 5 left)


🧪 Training Epoch 19:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 19:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 19 Summary | Train Loss: 0.0137 | Val Loss: 0.0121 | ⏱️ Time: 299.77s
No improvement. Patience counter: 1/5

Epoch 20/50 (Patience: 4 left)


🧪 Training Epoch 20:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 20:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 20 Summary | Train Loss: 0.0137 | Val Loss: 0.0119 | ⏱️ Time: 300.08s
Validation loss improved. Resetting patience.

Epoch 21/50 (Patience: 5 left)


🧪 Training Epoch 21:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 21:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 21 Summary | Train Loss: 0.0137 | Val Loss: 0.0118 | ⏱️ Time: 298.51s
Validation loss improved. Resetting patience.

Epoch 22/50 (Patience: 5 left)


🧪 Training Epoch 22:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 22:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 22 Summary | Train Loss: 0.0135 | Val Loss: 0.0117 | ⏱️ Time: 298.81s
Validation loss improved. Resetting patience.

Epoch 23/50 (Patience: 5 left)


🧪 Training Epoch 23:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 23:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 23 Summary | Train Loss: 0.0134 | Val Loss: 0.0115 | ⏱️ Time: 298.93s
Validation loss improved. Resetting patience.

Epoch 24/50 (Patience: 5 left)


🧪 Training Epoch 24:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 24:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 24 Summary | Train Loss: 0.0133 | Val Loss: 0.0115 | ⏱️ Time: 301.49s
Validation loss improved. Resetting patience.

Epoch 25/50 (Patience: 5 left)


🧪 Training Epoch 25:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 25:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 25 Summary | Train Loss: 0.0133 | Val Loss: 0.0115 | ⏱️ Time: 299.85s
Validation loss improved. Resetting patience.

Epoch 26/50 (Patience: 5 left)


🧪 Training Epoch 26:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 26:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 26 Summary | Train Loss: 0.0131 | Val Loss: 0.0115 | ⏱️ Time: 298.58s
Validation loss improved. Resetting patience.

Epoch 27/50 (Patience: 5 left)


🧪 Training Epoch 27:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 27:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 27 Summary | Train Loss: 0.0130 | Val Loss: 0.0113 | ⏱️ Time: 299.70s
Validation loss improved. Resetting patience.

Epoch 28/50 (Patience: 5 left)


🧪 Training Epoch 28:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 28:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 28 Summary | Train Loss: 0.0130 | Val Loss: 0.0113 | ⏱️ Time: 299.73s
Validation loss improved. Resetting patience.

Epoch 29/50 (Patience: 5 left)


🧪 Training Epoch 29:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 29:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 29 Summary | Train Loss: 0.0129 | Val Loss: 0.0112 | ⏱️ Time: 300.23s
Validation loss improved. Resetting patience.

Epoch 30/50 (Patience: 5 left)


🧪 Training Epoch 30:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 30:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 30 Summary | Train Loss: 0.0128 | Val Loss: 0.0111 | ⏱️ Time: 301.32s
Validation loss improved. Resetting patience.

Epoch 31/50 (Patience: 5 left)


🧪 Training Epoch 31:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 31:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 31 Summary | Train Loss: 0.0128 | Val Loss: 0.0112 | ⏱️ Time: 299.51s
No improvement. Patience counter: 1/5

Epoch 32/50 (Patience: 4 left)


🧪 Training Epoch 32:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 32:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 32 Summary | Train Loss: 0.0128 | Val Loss: 0.0111 | ⏱️ Time: 298.71s
Validation loss improved. Resetting patience.

Epoch 33/50 (Patience: 5 left)


🧪 Training Epoch 33:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 33:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 33 Summary | Train Loss: 0.0127 | Val Loss: 0.0109 | ⏱️ Time: 300.20s
Validation loss improved. Resetting patience.

Epoch 34/50 (Patience: 5 left)


🧪 Training Epoch 34:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 34:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 34 Summary | Train Loss: 0.0125 | Val Loss: 0.0111 | ⏱️ Time: 300.59s
No improvement. Patience counter: 1/5

Epoch 35/50 (Patience: 4 left)


🧪 Training Epoch 35:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 35:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 35 Summary | Train Loss: 0.0126 | Val Loss: 0.0109 | ⏱️ Time: 299.79s
No improvement. Patience counter: 2/5

Epoch 36/50 (Patience: 3 left)


🧪 Training Epoch 36:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 36:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 36 Summary | Train Loss: 0.0125 | Val Loss: 0.0109 | ⏱️ Time: 299.13s
Validation loss improved. Resetting patience.

Epoch 37/50 (Patience: 5 left)


🧪 Training Epoch 37:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 37:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 37 Summary | Train Loss: 0.0124 | Val Loss: 0.0109 | ⏱️ Time: 299.47s
No improvement. Patience counter: 1/5

Epoch 38/50 (Patience: 4 left)


🧪 Training Epoch 38:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 38:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 38 Summary | Train Loss: 0.0124 | Val Loss: 0.0108 | ⏱️ Time: 299.79s
Validation loss improved. Resetting patience.

Epoch 39/50 (Patience: 5 left)


🧪 Training Epoch 39:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 39:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 39 Summary | Train Loss: 0.0126 | Val Loss: 0.0108 | ⏱️ Time: 299.90s
No improvement. Patience counter: 1/5

Epoch 40/50 (Patience: 4 left)


🧪 Training Epoch 40:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 40:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 40 Summary | Train Loss: 0.0123 | Val Loss: 0.0109 | ⏱️ Time: 299.84s
No improvement. Patience counter: 2/5

Epoch 41/50 (Patience: 3 left)


🧪 Training Epoch 41:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 41:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 41 Summary | Train Loss: 0.0123 | Val Loss: 0.0107 | ⏱️ Time: 300.87s
Validation loss improved. Resetting patience.

Epoch 42/50 (Patience: 5 left)


🧪 Training Epoch 42:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 42:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 42 Summary | Train Loss: 0.0123 | Val Loss: 0.0107 | ⏱️ Time: 300.38s
Validation loss improved. Resetting patience.

Epoch 43/50 (Patience: 5 left)


🧪 Training Epoch 43:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 43:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 43 Summary | Train Loss: 0.0123 | Val Loss: 0.0108 | ⏱️ Time: 297.93s
No improvement. Patience counter: 1/5

Epoch 44/50 (Patience: 4 left)


🧪 Training Epoch 44:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 44:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 44 Summary | Train Loss: 0.0122 | Val Loss: 0.0106 | ⏱️ Time: 299.64s
Validation loss improved. Resetting patience.

Epoch 45/50 (Patience: 5 left)


🧪 Training Epoch 45:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 45:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 45 Summary | Train Loss: 0.0120 | Val Loss: 0.0105 | ⏱️ Time: 302.05s
Validation loss improved. Resetting patience.

Epoch 46/50 (Patience: 5 left)


🧪 Training Epoch 46:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 46:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 46 Summary | Train Loss: 0.0119 | Val Loss: 0.0107 | ⏱️ Time: 298.92s
No improvement. Patience counter: 1/5

Epoch 47/50 (Patience: 4 left)


🧪 Training Epoch 47:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 47:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 47 Summary | Train Loss: 0.0120 | Val Loss: 0.0107 | ⏱️ Time: 302.19s
No improvement. Patience counter: 2/5

Epoch 48/50 (Patience: 3 left)


🧪 Training Epoch 48:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 48:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 48 Summary | Train Loss: 0.0121 | Val Loss: 0.0105 | ⏱️ Time: 303.64s
Validation loss improved. Resetting patience.

Epoch 49/50 (Patience: 5 left)


🧪 Training Epoch 49:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 49:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 49 Summary | Train Loss: 0.0120 | Val Loss: 0.0105 | ⏱️ Time: 303.36s
Validation loss improved. Resetting patience.

Epoch 50/50 (Patience: 5 left)


🧪 Training Epoch 50:   0%|          | 0/1075 [00:00<?, ?it/s]

🔎 Validating Epoch 50:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch 50 Summary | Train Loss: 0.0119 | Val Loss: 0.0104 | ⏱️ Time: 304.10s
Validation loss improved. Resetting patience.


In [23]:
# 9. 검증
from sklearn.metrics import mean_squared_error, r2_score, ndcg_score
from scipy.stats import spearmanr

# 예측
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for summary, similar_summary, extA, extB, y in tqdm(val_loader, desc='🔍 Evaluation'):
        device = model.encoder[0].weight.device
        extA, extB, y = extA.to(device), extB.to(device), y.to(device)
        pred = model(summary, similar_summary, extA, extB)

        y_true.extend(y.cpu().numpy())
        y_pred.extend(pred.cpu().numpy())

# 평탄화
y_true = np.array(y_true).flatten()
y_pred = np.array(y_pred).flatten()

# 1. Spearman Correlation
spearman_corr, _ = spearmanr(y_true, y_pred)
print(f"Spearman Rank Correlation: {spearman_corr:.4f}")

# 2. NDCG
ndcg = ndcg_score([y_true], [y_pred])  # input: 2D list
print(f"NDCG Score: {ndcg:.4f}")

# 3. RMSE (보조 지표)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"RMSE: {rmse:.4f}")

# 4. R^2 (보조 지표)
r2 = r2_score(y_true, y_pred)
print(f"R^2: {r2:.4f}")

🔍 Evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

Spearman Rank Correlation: 0.6665
NDCG Score: 0.9935
RMSE: 0.0828
R^2: 0.2588


- Spearman Rank Correlation: 예측값과 실제값의 순위 간 일치도 측정 (순위의 일관성 측정에 최적)
- NDCG (Normalized DCG): 예측된 순위가 실제값 순위와 얼마나 잘 맞는지를 측정 (랭킹 품질에 대한 절대적 평가)

| 지표                      | 현재 값     | 실무 기준                                         | 해석                                               |
| ----------------------- | -------- | --------------------------------------------- | ------------------------------------------------ |
| **Spearman Rank Corr.** | `0.6665` | `0.6 이상 → 실사용 가능`, `0.7 이상 → 우수`              | **랭킹 순서 예측이 잘 됨**, 실무 적용 충분                      |
| **NDCG\@k**             | `0.9935` | `0.9 이상 → 거의 완벽`                              | 상위 N개 뉴스 재정렬/추천 목적이라면 **매우 우수**                  |
| **RMSE**                | `0.0828` | `0.05~0.10 → 괜찮음`, `0.01~0.03 → 매우 우수`        | 수치 자체가 \[-1,1] 범위인 cosine 유사도이기 때문에 **이 정도면 양호** |
| **R²**                  | `0.2588` | `>0.2 → 약한 설명력`, `>0.5 → 실사용 가능`, `>0.7 → 우수` | **정량 예측엔 약간 부족**, 하지만 지금 모델은 그게 주 목적이 아님         |

In [26]:
# 10. onnx로 저장
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimilarityRankerONNX(nn.Module):
    def __init__(self, input_dim=298):
        super().__init__()
        self.news_fc = nn.Sequential(
            nn.Linear(input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.2)
        )

    def forward(self, inputA, inputB):
        hA = self.news_fc(inputA)
        hB = self.news_fc(inputB)
        sim = F.cosine_similarity(hA, hB, dim=1)
        return sim.unsqueeze(1)  # [batch, 1]

# ===== 1. 학습된 모델에서 news_fc 파라미터만 이식 =====
onnx_model = SimilarityRankerONNX().eval().cpu()
onnx_model.news_fc.load_state_dict(model.news_fc.state_dict())

# ===== 2. 더미 입력 설정 =====
dummy_A = torch.randn(1, 298)  # 256 + 42
dummy_B = torch.randn(1, 298)

# ===== 3. ONNX로 export =====
torch.onnx.export(
    onnx_model,
    (dummy_A, dummy_B),
    'similarity_ranker.onnx',
    input_names=['inputA', 'inputB'],
    output_names=['similarity'],
    opset_version=13,
    dynamic_axes={
        'inputA': {0: 'batch_size'},
        'inputB': {0: 'batch_size'},
        'similarity': {0: 'batch_size'}
    }
)

print('SimilarityRanker ONNX 저장 완료!')

SimilarityRanker ONNX 저장 완료!


In [25]:
!pip install onnx

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting onnx
  Downloading onnx-1.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting protobuf>=4.25.1 (from onnx)
  Downloading protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Downloading onnx-1.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.1/321.1 kB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf, onnx
Successfully installed onnx-1.18.0 protobuf-6.31.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32