# 1. 라이브러리 및 API key 설정

In [57]:
import os
import torch
import numpy as np
import datasets
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import roc_auc_score
from openai import OpenAI
from serpapi import GoogleSearch 

In [101]:
from dotenv import load_dotenv
import json
import os
import openai
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langsmith.wrappers import wrap_openai
from langsmith import traceable
from langchain_anthropic import ChatAnthropic, AnthropicLLM
from langchain.llms import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain
from langchain.memory import SimpleMemory
from datetime import datetime
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import pandas as pd

# Load environment variables (e.g. API keys) via python-dotenv
load_dotenv()

# Wrap the OpenAI SDK client so LangSmith auto-traces LLM calls made through it.
# NOTE(review): `client` is assigned again in a later cell of this notebook.
client = wrap_openai(openai.Client())

# Create the LangChain chat model used by the chains in this notebook
llm = ChatOpenAI(
    temperature=0,  # Sampling temperature (0.0 ~ 2.0); 0 keeps output as stable as possible
    max_tokens=2048,  # Upper bound on generated tokens
    model_name='gpt-4',  # Model name
    streaming=True,  # Enable streaming output
    callbacks=[StreamingStdOutCallbackHandler()]  # Streamed tokens are handled by the stdout callback
)



In [102]:
# Read the OpenAI and SerpAPI keys from the environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
SERP_API_KEY = os.getenv("SERPAPI_API_KEY")

# Initialise the OpenAI SDK client.
# The module-qualified `openai.OpenAI` is required here: the bare name `OpenAI`
# was rebound by `from langchain.llms import OpenAI`, so `OpenAI(...)` would
# build a LangChain LLM wrapper instead of an SDK client.
# The client stays wrapped so LangSmith tracing set up earlier is not lost.
client = wrap_openai(openai.OpenAI(api_key=OPENAI_API_KEY))

# Directory where the FAISS index is stored
FAISS_INDEX_PATH = Path('local/news-please/faiss_index')

# 2. DPR 기반 사내 뉴스 검색 시스템

In [60]:
class DPR():
    """Dense-passage retriever over the news dataset, backed by a FAISS index.

    The index is built over DPR embeddings of each example's ``title`` and is
    persisted to ``FAISS_INDEX_PATH / 'my_index.faiss'`` so later runs can
    load it instead of re-encoding the dataset.
    """

    def __init__(self):
        """Load the dataset, choose the compute device, and prepare the index."""
        self.ds = datasets.load_dataset('sanxing/advfake_news_please')['train']
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.index_dpr()

    @torch.no_grad()
    def index_dpr(self):
        """Attach a FAISS index named ``'embeddings'`` to ``self.ds``.

        Loads the index from disk when the file exists; otherwise encodes
        every title with the DPR context encoder, builds the index, saves it,
        and keeps the indexed dataset on the instance.
        """
        from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

        faiss_path = FAISS_INDEX_PATH / 'my_index.faiss'
        if faiss_path.exists():
            print('🔹 FAISS 인덱스 로드 중...')
            self.ds.load_faiss_index('embeddings', str(faiss_path))
            return

        print('🔹 FAISS 인덱스 생성 중...')
        ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(self.device)
        ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

        def embed_batch(examples):
            # Truncate so an unusually long title cannot exceed the encoder's input limit.
            inputs = ctx_tokenizer(
                examples["title"], return_tensors="pt", padding=True, truncation=True
            ).to(self.device)
            # Element 0 of the encoder output is the pooled embedding, one row per title.
            return {'embeddings': ctx_encoder(**inputs)[0].cpu().numpy()}

        # Convert news titles into embedding vectors
        ds_with_embeddings = self.ds.map(embed_batch, batched=True, batch_size=64)
        ds_with_embeddings.add_faiss_index(column='embeddings')

        print('🔹 FAISS 인덱스 저장 중...')
        # The target directory may not exist yet on a first run.
        faiss_path.parent.mkdir(parents=True, exist_ok=True)
        ds_with_embeddings.save_faiss_index('embeddings', str(faiss_path))

        # `map` returns a new dataset; keep the indexed one so `search` can
        # query it on this very run instead of failing on a missing index.
        self.ds = ds_with_embeddings

    def _load_question_models(self):
        """Return the ``(encoder, tokenizer)`` pair for queries, loading it once per instance."""
        from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

        models = getattr(self, '_q_models', None)
        if models is None:
            q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to(self.device)
            q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
            models = self._q_models = (q_encoder, q_tokenizer)
        return models

    @torch.no_grad()
    def search(self, query):
        """Retrieve the 5 dataset examples nearest to ``query``.

        Args:
            query: Query text (a news headline in this notebook).

        Returns:
            A ``(scores, examples)`` tuple: the scores exactly as returned by
            ``get_nearest_examples``, and a list with one dict per retrieved row.
        """
        q_encoder, q_tokenizer = self._load_question_models()

        # Encode the query; [0] is the pooled output, the second [0] selects the single query row.
        inputs = q_tokenizer(query, return_tensors="pt", truncation=True).to(self.device)
        question_embedding = q_encoder(**inputs)[0][0].cpu().numpy()

        # Run the FAISS lookup
        scores, retrieved_examples = self.ds.get_nearest_examples('embeddings', question_embedding, k=5)

        # Turn the column-oriented result into a list of row dicts
        retrieved_examples = [dict(zip(retrieved_examples, t)) for t in zip(*retrieved_examples.values())]

        return scores, retrieved_examples

In [115]:
import torch
import datasets
import faiss
from pathlib import Path
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer
)

# NOTE: this rebinds FAISS_INDEX_PATH, which an earlier cell set to a different path.
FAISS_INDEX_PATH = Path("faiss_index")  # Folder that holds the FAISS index file
DATASET_SAVE_PATH = Path("news_dataset")  # Where the dataset is saved on disk
FAISS_FILE = FAISS_INDEX_PATH / "my_index.faiss"  # Full path of the index file

class DPR():
    """Dense-passage retriever over the news dataset, backed by a FAISS index.

    Both the dataset (``DATASET_SAVE_PATH``) and the index (``FAISS_FILE``)
    are cached on disk so later runs can skip the download and the encoding.
    """

    def __init__(self):
        """Load (or download and save) the dataset, then prepare the FAISS index."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Load the saved dataset if present, otherwise download it and save a copy
        if DATASET_SAVE_PATH.exists():
            print("🔹 저장된 데이터셋 로드 중...")
            self.ds = datasets.load_from_disk(str(DATASET_SAVE_PATH))
        else:
            print("🔹 새로운 데이터셋 다운로드 중...")
            self.ds = datasets.load_dataset('sanxing/advfake_news_please')['train']
            self.ds.save_to_disk(str(DATASET_SAVE_PATH))

        # FAISS indexing
        self.index_dpr()

    @torch.no_grad()
    def index_dpr(self):
        """Attach a FAISS index named ``'embeddings'`` to ``self.ds``.

        Loads the index from ``FAISS_FILE`` when it exists; otherwise encodes
        every title with the DPR context encoder, saves the embedded dataset
        and the index, and keeps the indexed dataset on the instance.
        """
        if FAISS_FILE.exists():
            print('🔹 FAISS 인덱스 로드 중...')
            self.ds.load_faiss_index('embeddings', str(FAISS_FILE))
            return

        print('🔹 FAISS 인덱스 생성 중...')
        ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(self.device)
        ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

        def embed_batch(examples):
            inputs = ctx_tokenizer(examples["title"], return_tensors="pt", padding=True, truncation=True).to(self.device)
            embeddings = ctx_encoder(**inputs).pooler_output.cpu().numpy()
            return {"embeddings": embeddings}

        ds_with_embeddings = self.ds.map(embed_batch, batched=True, batch_size=64)

        # Save the embedded dataset BEFORE attaching the index: `save_to_disk`
        # rejects a dataset that still carries an index.
        try:
            ds_with_embeddings.save_to_disk(str(DATASET_SAVE_PATH))
        except PermissionError as err:
            # `datasets` refuses to overwrite the folder a dataset is backed by,
            # which can happen when `self.ds` was loaded from DATASET_SAVE_PATH.
            # The index alone is enough for search, so carry on without the copy.
            print(f'⚠️ Skipping dataset re-save: {err}')

        ds_with_embeddings.add_faiss_index(column='embeddings')

        print('🔹 FAISS 인덱스 저장 중...')
        FAISS_INDEX_PATH.mkdir(parents=True, exist_ok=True)
        ds_with_embeddings.save_faiss_index('embeddings', str(FAISS_FILE))

        # Keep the indexed dataset in memory so `search` can use it on this run
        self.ds = ds_with_embeddings

    def _load_question_models(self):
        """Return the ``(encoder, tokenizer)`` pair for queries, loading it once per instance."""
        models = getattr(self, '_q_models', None)
        if models is None:
            q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to(self.device)
            q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
            models = self._q_models = (q_encoder, q_tokenizer)
        return models

    @torch.no_grad()
    def search(self, query):
        """Retrieve the 5 dataset examples nearest to ``query``.

        Args:
            query: Query text (a news headline in this notebook).

        Returns:
            A ``(scores, examples)`` tuple: the scores exactly as returned by
            ``get_nearest_examples``, and a list with one dict per retrieved row.
        """
        q_encoder, q_tokenizer = self._load_question_models()

        # Encode the query with DPR; row 0 is the single query's embedding
        inputs = q_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(self.device)
        question_embedding = q_encoder(**inputs).pooler_output.cpu().numpy()[0]

        # Run the FAISS lookup
        scores, retrieved_examples = self.ds.get_nearest_examples('embeddings', question_embedding, k=5)

        # Turn the column-oriented result into a list of row dicts
        retrieved_examples = [dict(zip(retrieved_examples, t)) for t in zip(*retrieved_examples.values())]

        return scores, retrieved_examples

# 3. Google SerpAPI 기반

In [61]:
def search_serpapi(q):
    """Send query ``q`` to Google through SerpAPI and return the response as a dict."""
    request = dict(
        api_key=SERP_API_KEY,
        engine="google",
        q=q,
        location="Austin, Texas, United States",
        google_domain="google.com",
        gl="us",
        hl="en",
        num="30",
    )
    return GoogleSearch(request).get_dict()

def concat_snippets(organic_results):
    """Filter search results and render the first five survivors as one text block.

    A result is kept only if it has a ``snippet``, neither its ``source`` nor
    its ``title`` contains ``'NBC'``, and its ``link`` does not contain
    ``'fact'``. Missing ``source``/``title``/``link``/``date`` fields are
    treated as empty strings rather than raising ``KeyError``.

    Args:
        organic_results: Iterable of result dicts (``organic_results`` of a
            SerpAPI response); ``None`` is treated as empty.

    Returns:
        The kept results formatted as ``Title`` / ``Source, date`` /
        ``Content`` lines joined by newlines; ``""`` when nothing is kept.
    """
    rendered = []
    for result in organic_results or []:
        if 'snippet' not in result:
            continue

        # `or ''` also covers a field that is present but None.
        source = result.get('source') or ''
        title = result.get('title') or ''
        link = result.get('link') or ''

        # NOTE(review): the NBC / 'fact' exclusions presumably keep the article
        # under test and fact-check pages out of the context — confirm intent.
        if 'NBC' in source or 'NBC' in title or 'fact' in link:
            continue

        rendered.append(
            f'Title: {title}\nSource: {source}, {result.get("date", "")}\nContent: {result["snippet"]}'
        )
        if len(rendered) == 5:
            break

    return '\n'.join(rendered)

def get_google_ctx(q):
    """Search for ``q`` and return the formatted snippets, or ``""`` if there are no organic results."""
    response = search_serpapi(q)
    if 'organic_results' not in response:
        return ""
    return concat_snippets(response['organic_results'])

# 4. RAG

In [128]:
def retrieve_relevant_news(news_headline):
    """Collect context for a headline from the DPR index and from Google search.

    Args:
        news_headline: Headline text used as the query for both retrievers.

    Returns:
        A list holding the DPR hits (one dict per retrieved example) followed
        by a single string with the formatted Google snippets.
    """
    # NOTE(review): a new DPR() is built on every call, which repeats the
    # dataset and index loading done in its constructor.
    dpr = DPR()
    _, retrieved_news = dpr.search(news_headline)
    # The DPR hits used to be overwritten with [] at this point, so the
    # in-house results never reached the caller; they are kept now.
    print(f"retrieved_news:{retrieved_news}")
    google_results = get_google_ctx(news_headline)
    print(google_results)

    return retrieved_news + [google_results]

def get_plausibility_score(news_text, retrieved_news):
    """Invoke ``chain_1`` on the news text and its retrieved context, print the output, and return it."""
    output = chain_1.invoke({"news_text": news_text, "retrieved_news": retrieved_news})
    print(output)
    return output


def _extract_score(raw):
    """Coerce one scorer output (dict with 'plausibility_score', str, or number) to float."""
    if isinstance(raw, dict):
        raw = raw['plausibility_score']
    try:
        return float(raw)
    except (TypeError, ValueError) as err:
        raise ValueError(f"Cannot interpret plausibility score: {raw!r}") from err


def generate_final_score(news_text, retrieved_news, n_samples=3, *, score_fn=None):
    """Average several sampled plausibility scores and normalise the result.

    The scorer's output may be a dict carrying a ``'plausibility_score'``
    entry, a numeric string, or a plain number. The numeric value is extracted
    before averaging, because summing raw dict outputs raises ``TypeError``.

    Args:
        news_text: The news text to score.
        retrieved_news: Retrieved context passed through to the scorer.
        n_samples: How many times to query the scorer (default 3).
        score_fn: Callable ``(news_text, retrieved_news) -> score``. Defaults
            to ``get_plausibility_score``.

    Returns:
        The mean score divided by 10.
        NOTE(review): dividing by 10 assumes the scorer answers on a 0-10
        scale — confirm against the prompt.

    Raises:
        ValueError: If ``n_samples`` is less than 1 or a score is not numeric.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")
    if score_fn is None:
        # Resolved at call time so a redefined scorer in the notebook is picked up.
        score_fn = get_plausibility_score

    scores = [_extract_score(score_fn(news_text, retrieved_news)) for _ in range(n_samples)]
    final_score = sum(scores) / len(scores)
    return final_score / 10

# Test

In [11]:
# Load test data (placeholder headlines, not real news)
test_news = ["테스트 뉴스 1", "테스트 뉴스 2", "테스트 뉴스 3"]
true_labels = [1, 0, 1]  # 1: Real, 0: Fake

# Predict a plausibility score for each headline using its retrieved context
predicted_scores = [generate_final_score(news, retrieve_relevant_news(news)) for news in test_news]

# Compute AUC-ROC of the predicted scores against the labels
auc_score = roc_auc_score(true_labels, predicted_scores)
print(f"AUC-ROC Score: {auc_score:.4f}")

FileNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.

In [129]:
# Smoke test of chain_1: both inputs are literal placeholder strings, not real news content
input_data = {"news_text":'news_text', "retrieved_news" : 'retrieved_news'}
result = chain_1.invoke(input_data)
# Bare expression so the notebook displays the chain output
result

{'plausibility_score': '1',
 'reason': "I apologize, but I cannot provide a plausibility score or reasoning for the news you've mentioned. The 'news_text' and 'retrieved_news' fields in your prompt are empty placeholders, so there's no actual news content for me to analyze. Without specific news text to evaluate, it's not possible to make an informed judgment about its plausibility. If you'd like me to assess a particular news item, please provide the actual news text and any relevant retrieved information."}