# 모델 아키텍처

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# 1. 모델 아키텍처
class NewsSimilarityModel(nn.Module):
    """Two-tower similarity model over frozen text embeddings and external features.

    Each article is encoded by a frozen pretrained transformer; the hidden
    state at sequence position 0 is concatenated with a vector of external
    numeric features and passed through a tower (MLP followed by a linear
    projection). The "current" and "past" articles use separate towers.

    Args:
        embedding_model_name: Hugging Face model id used for both the encoder
            and its tokenizer.
        embedding_dim: Width of the encoder's hidden state. Must match the
            chosen encoder, since it sizes the first linear layer.
        external_feature_dim: Number of external features per article.
        fcl_hidden_dim: Hidden and output width of each MLP.
        linear_output_dim: Width of the final projected vectors.
    """

    def __init__(self,
                 embedding_model_name='klue/bert-base',
                 embedding_dim=768,
                 external_feature_dim=9,
                 fcl_hidden_dim=512,
                 linear_output_dim=256):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(embedding_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

        # The encoder is a fixed feature extractor: no gradients, and always
        # in eval mode so its dropout layers stay disabled.
        for param in self.embedding_model.parameters():
            param.requires_grad = False
        self.embedding_model.eval()

        input_dim = embedding_dim + external_feature_dim

        # Creation order (current MLP, past MLP, current proj, past proj) is
        # kept so seeded initialisation and state_dict keys are unchanged.
        self.fcl_current = self._build_tower(input_dim, fcl_hidden_dim)
        self.fcl_past = self._build_tower(input_dim, fcl_hidden_dim)

        self.linear_current = nn.Linear(fcl_hidden_dim, linear_output_dim)
        self.linear_past = nn.Linear(fcl_hidden_dim, linear_output_dim)

    @staticmethod
    def _build_tower(input_dim, hidden_dim):
        """Return the Linear -> ReLU -> Dropout(0.2) -> Linear MLP used by both towers."""
        return nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim)
        )

    def train(self, mode=True):
        """Set the training mode of the towers while keeping the encoder in eval mode.

        ``nn.Module.train`` recurses into every submodule, which would turn
        the frozen encoder's dropout back on and make its embeddings random
        from one forward pass to the next. The encoder is therefore forced
        back to eval mode after the default behaviour has been applied.
        """
        super().train(mode)
        self.embedding_model.eval()
        return self

    def get_text_embedding(self, texts, device):
        """Tokenize ``texts`` and return one encoder vector per text.

        Args:
            texts: A string or list of strings accepted by the tokenizer.
            device: Device the tokenized inputs are moved to; it must be the
                device the encoder lives on.

        Returns:
            The encoder's ``last_hidden_state`` at sequence position 0 (the
            ``[CLS]`` token for BERT-style tokenizers), computed without
            gradient tracking.
        """
        inputs = self.tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
            return outputs.last_hidden_state[:, 0, :]

    def forward(self, news_a, news_b, ext_a, ext_b, device):
        """Project a batch of (current, past) article pairs into the shared space.

        Args:
            news_a: Texts fed to the "current" tower.
            news_b: Texts fed to the "past" tower.
            ext_a: External features for ``news_a``, one row per text.
            ext_b: External features for ``news_b``, one row per text.
            device: Device used for tokenized inputs and external features.

        Returns:
            Tuple ``(current_vec, past_vec)`` of projected vectors.
        """
        emb_a = self.get_text_embedding(news_a, device)
        emb_b = self.get_text_embedding(news_b, device)
        # Match the embedding dtype so float64 (e.g. NumPy-derived) features
        # do not break or silently up-cast the concatenation.
        ext_a = ext_a.to(device=device, dtype=emb_a.dtype)
        ext_b = ext_b.to(device=device, dtype=emb_b.dtype)
        combined_a = torch.cat([emb_a, ext_a], dim=1)
        combined_b = torch.cat([emb_b, ext_b], dim=1)
        current_vec = self.linear_current(self.fcl_current(combined_a))
        past_vec = self.linear_past(self.fcl_past(combined_b))
        return current_vec, past_vec


# 2. 손실 함수
class SimilarityLoss(nn.Module):
    """Contrastive loss on the cosine similarity of paired vectors.

    Pairs labelled 1 are penalised by ``(1 - cos)^2``; pairs labelled 0 are
    penalised by ``max(cos - margin, 0)^2``, i.e. only when their similarity
    exceeds ``margin``.

    Args:
        margin: Cosine similarity above which a 0-labelled pair is penalised.
    """

    def __init__(self, margin=0.5):
        super().__init__()
        self.margin = margin

    def forward(self, vec_a, vec_b, labels):
        """Compute the mean loss over a batch.

        Args:
            vec_a: Tensor with one vector per row.
            vec_b: Tensor with the same shape as ``vec_a``.
            labels: One label per pair (1 or 0). Any shape holding one value
                per pair is accepted, e.g. ``(B,)`` or ``(B, 1)``.

        Returns:
            Tuple ``(loss, cosine_sim)``: the scalar mean loss and the
            per-pair cosine similarities.
        """
        cosine_sim = F.cosine_similarity(vec_a, vec_b, dim=1)
        # Flatten so a (B, 1) label tensor cannot broadcast against the (B,)
        # similarity vector into a (B, B) matrix and corrupt the mean.
        labels = labels.reshape(-1).to(dtype=cosine_sim.dtype)
        pos_loss = labels * torch.pow(1 - cosine_sim, 2)
        neg_loss = (1 - labels) * torch.pow(torch.clamp(cosine_sim - self.margin, min=0), 2)
        return torch.mean(pos_loss + neg_loss), cosine_sim

# 데이터셋 및 학습 코드

In [16]:
# 3. 데이터셋 클래스
class NewsDataset(Dataset):
    """Dataset of article pairs with external features and a label per pair.

    All five inputs are indexable collections addressed by the same index;
    the dataset length is taken from ``news_a``.
    """

    def __init__(self, news_a, news_b, feat_a, feat_b, labels):
        self.news_a, self.news_b = news_a, news_b
        self.feat_a, self.feat_b = feat_a, feat_b
        self.labels = labels

    def __len__(self):
        """Return the number of pairs (the length of ``news_a``)."""
        return len(self.news_a)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as a dict of texts and float tensors."""
        sample = {
            'news_a': self.news_a[idx],
            'news_b': self.news_b[idx],
        }
        # Features become float tensors; the label is wrapped in a
        # one-element float tensor.
        sample['feat_a'] = torch.FloatTensor(self.feat_a[idx])
        sample['feat_b'] = torch.FloatTensor(self.feat_b[idx])
        sample['label'] = torch.FloatTensor([self.labels[idx]])
        return sample

def custom_collate(batch):
    """Collate dataset samples into one batch dict.

    Text fields are gathered into plain Python lists (they are tokenized
    later, inside the model); tensor fields are stacked along a new leading
    batch dimension.
    """
    list_keys = ('news_a', 'news_b')
    tensor_keys = ('feat_a', 'feat_b', 'label')

    collated = {}
    for key in list_keys:
        collated[key] = [sample[key] for sample in batch]
    for key in tensor_keys:
        collated[key] = torch.stack([sample[key] for sample in batch])
    return collated


# 4. 학습 함수 (기존 모델을 인자로 받음)
def train_model(model, dataloader, num_epochs=10, lr=1e-4):
    """Train ``model`` in place with ``SimilarityLoss`` and Adam.

    Args:
        model: Module called as ``model(news_a, news_b, feat_a, feat_b,
            device)`` and returning a pair of vector batches.
        dataloader: Iterable of batch dicts with keys ``news_a``, ``news_b``,
            ``feat_a``, ``feat_b`` and ``label``.
        num_epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        The same ``model`` object, after training.
    """
    device = next(model.parameters()).device
    criterion = SimilarityLoss()
    # Only hand parameters that can receive gradients to the optimizer;
    # frozen ones would never be updated anyway.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=lr)

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            optimizer.zero_grad()
            vec_a, vec_b = model(
                batch['news_a'],
                batch['news_b'],
                batch['feat_a'].to(device),
                batch['feat_b'].to(device),
                device
            )
            # reshape(-1) always yields shape (B,), whereas squeeze() turns a
            # batch of one into a 0-d tensor.
            labels = batch['label'].reshape(-1).to(device)
            loss, _ = criterion(vec_a, vec_b, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Count batches instead of calling len(dataloader) so an empty or
        # length-less loader reports NaN rather than raising.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch+1}: Loss={mean_loss:.4f}')
    return model


# 5. 유사 뉴스 탐색 함수
def find_similar_news(model, current_news, current_feat, past_news_list, past_feat_list, top_k=5, batch_size=32):
    """Rank past articles by cosine similarity to the current article.

    The model is switched to eval mode, and the candidates are scored in
    chunks of ``batch_size`` with gradient tracking disabled.

    Args:
        model: Module called as ``model(news_a, news_b, feat_a, feat_b,
            device)`` and returning a pair of vector batches.
        current_news: Text of the query article.
        current_feat: External features of the query article.
        past_news_list: Candidate article texts.
        past_feat_list: External features for each candidate, in the same
            order as ``past_news_list``.
        top_k: Number of best matches to return.
        batch_size: Number of candidates scored per forward pass.

    Returns:
        Up to ``top_k`` tuples ``(index, similarity, text)`` sorted by
        descending similarity, where ``index`` is the position in
        ``past_news_list``.
    """
    model.eval()
    device = next(model.parameters()).device

    query_feat = torch.FloatTensor(current_feat).unsqueeze(0).to(device)
    scored = []

    with torch.no_grad():
        for start in range(0, len(past_news_list), batch_size):
            stop = start + batch_size
            news_chunk = past_news_list[start:stop]
            feat_chunk = torch.FloatTensor(past_feat_list[start:stop]).to(device)
            chunk_len = len(news_chunk)

            # The query is repeated so it pairs row-by-row with the chunk.
            vec_curr, vec_past = model(
                [current_news] * chunk_len,
                news_chunk,
                query_feat.repeat(chunk_len, 1),
                feat_chunk,
                device
            )
            sims = F.cosine_similarity(vec_curr, vec_past, dim=1).tolist()
            scored.extend(
                (start + offset, sims[offset], news_chunk[offset])
                for offset in range(chunk_len)
            )

    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored[:top_k]


# 6. 실행 예시
# Example run: load the news CSV, pick a reference article, train briefly on
# (reference, older article) pairs, then print the most similar older articles.
if __name__ == '__main__':
    import time
    # NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
    df = pd.read_csv('/Users/han-yeeun/final/db/news_2023_2025_with_stock_impact.csv')

    # Drop articles that have no text.
    # NOTE(review): only 'text_combined' is checked for missing values here;
    # confirm the ext_cols columns contain no NaN, since they are cast to
    # float and fed to the model as-is.
    df = df.dropna(subset=['text_combined'])

    # Parse the publication date column.
    df['wdate'] = pd.to_datetime(df['wdate'])

    # External (numeric) feature columns; their count sets the model's
    # external_feature_dim.
    ext_cols = ['D-3', 'D-2', 'D-1', 'D+1', 'D+2', 'D+3', 'D+7', 'D+14', 'D+30']
    ext_dim = len(ext_cols)

    # Choose the reference article (here: row 0, assumed to be the most recent;
    # the frame is not sorted in this script, so that depends on the CSV order).
    base_idx = 0
    base_news = df.iloc[base_idx]['text_combined']
    base_feat = df.iloc[base_idx][ext_cols].astype(float).values
    base_date = df.iloc[base_idx]['wdate']

    # Only articles dated strictly before the reference article are compared.
    compare_df = df[df['wdate'] < base_date].reset_index(drop=True)
    compare_news = compare_df['text_combined'].tolist()
    compare_feats = compare_df[ext_cols].astype(float).values.tolist()

    dummy_labels = [1] * len(compare_news)  # placeholder: every pair is labelled 1; real training needs proper labels

    # Build the dataset and loader: the reference article is paired with
    # every comparison article.
    dataset = NewsDataset(
        news_a=[base_news] * len(compare_news),
        news_b=compare_news,
        feat_a=[base_feat] * len(compare_news),
        feat_b=compare_feats,
        labels=dummy_labels
    )
    dataloader = DataLoader(dataset, batch_size=8, collate_fn=custom_collate)

    # Create the model on GPU when available, otherwise CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NewsSimilarityModel(external_feature_dim=ext_dim).to(device)

    print(f'총 비교 대상 뉴스 개수: {len(compare_news)}')
    # The timer covers both the training epoch and the similarity search.
    start = time.time()

    model = train_model(model, dataloader, num_epochs=1, lr=1e-4)

    results = find_similar_news(
        model=model,
        current_news=base_news,
        current_feat=base_feat,
        past_news_list=compare_news,
        past_feat_list=compare_feats,
        top_k=5
    )

    end = time.time()

    print('\n기준 뉴스:\n', base_news[:200], '...\n')
    print('유사 뉴스 Top-5:')
    # Each result is (index in compare_news, cosine similarity, article text).
    for i, sim, text in results:
        print(f'[유사도 {sim:.4f}] {text[:100]}...')

    print(f'\n비교 대상 {len(compare_news)}건 | 소요 시간: {end - start:.2f}초')

총 비교 대상 뉴스 개수: 35197


KeyboardInterrupt: 