# 1. 라이브러리 및 API key 설정

In [1]:
import os
import torch
import numpy as np
import datasets
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import roc_auc_score
from openai import OpenAI
from serpapi import GoogleSearch 
from dotenv import load_dotenv
import pandas as pd
import os
import openai
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langsmith.wrappers import wrap_openai
from langsmith import traceable
from langchain_anthropic import ChatAnthropic, AnthropicLLM
from langchain.llms import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain
from langchain.memory import SimpleMemory
from datetime import datetime
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import json
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [25]:


# Load environment variables
load_dotenv()

# Auto-trace LLM calls in-context
client = wrap_openai(openai.Client())


In [26]:
# OpenAI 및 SerpAPI 키 설정
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
SERP_API_KEY = os.getenv("SERPAPI_API_KEY")

# OpenAI 클라이언트 초기화
client = OpenAI(api_key=OPENAI_API_KEY)

# FAISS 인덱스 저장 경로
FAISS_INDEX_PATH = Path('local/news-please/faiss_index')

# 2. DPR 기반 사내 뉴스 검색 시스템

In [27]:
class DPR():
    def __init__(self):
        """사내 뉴스 데이터베이스를 로드하고 FAISS 인덱스를 활용한 검색을 수행하는 DPR 클래스"""
        self.ds = datasets.load_dataset('sanxing/advfake_news_please')['train']
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.index_dpr()

    @torch.no_grad()
    def index_dpr(self):
        """DPR 기반 뉴스 임베딩을 생성하고 FAISS 인덱스를 구축"""
        from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

        faiss_path = FAISS_INDEX_PATH / 'my_index.faiss'
        if faiss_path.exists():
            print('🔹 FAISS 인덱스 로드 중...')
            self.ds.load_faiss_index('embeddings', str(faiss_path))
            return

        print('🔹 FAISS 인덱스 생성 중...')
        ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(self.device)
        ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

        # 🔹 뉴스 제목을 임베딩 벡터로 변환하여 저장
        ds_with_embeddings = self.ds.map(lambda example: {
            'embeddings': ctx_encoder(**ctx_tokenizer(example["title"], return_tensors="pt", padding=True).to(self.device))[0].cpu().numpy()
        }, batched=True, batch_size=64)

        ds_with_embeddings.add_faiss_index(column='embeddings')

        print('🔹 FAISS 인덱스 저장 중...')
        ds_with_embeddings.save_faiss_index('embeddings', str(faiss_path))

    @torch.no_grad()
    def search(self, query):
        """입력된 쿼리를 기반으로 가장 관련성 높은 뉴스 10개 검색"""
        from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

        q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to(self.device)
        q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

        # 🔹 입력 쿼리를 DPR 임베딩으로 변환
        question_embedding = q_encoder(**q_tokenizer(query, return_tensors="pt").to(self.device))[0][0].cpu().numpy()
        
        # 🔹 FAISS 검색 수행
        scores, retrieved_examples = self.ds.get_nearest_examples('embeddings', question_embedding, k=5)

        # 🔹 결과를 딕셔너리 리스트 형태로 변환
        retrieved_examples = [dict(zip(retrieved_examples, t)) for t in zip(*retrieved_examples.values())]

        return scores, retrieved_examples

In [115]:
import torch
import datasets
import faiss
from pathlib import Path
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer
)

FAISS_INDEX_PATH = Path("faiss_index")  # 저장할 폴더
DATASET_SAVE_PATH = Path("news_dataset")  # 데이터셋 저장 위치
FAISS_FILE = FAISS_INDEX_PATH / "my_index.faiss"

class DPR():
    def __init__(self):
        """사내 뉴스 데이터베이스를 로드하고 FAISS 인덱스를 활용한 검색을 수행하는 DPR 클래스"""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # 🔹 저장된 데이터셋이 있으면 로드, 없으면 새로 다운로드
        if DATASET_SAVE_PATH.exists():
            print("🔹 저장된 데이터셋 로드 중...")
            self.ds = datasets.load_from_disk(str(DATASET_SAVE_PATH))
        else:
            print("🔹 새로운 데이터셋 다운로드 중...")
            self.ds = datasets.load_dataset('sanxing/advfake_news_please')['train']
            self.ds.save_to_disk(str(DATASET_SAVE_PATH))  # 저장

        # FAISS 인덱싱
        self.index_dpr()

    @torch.no_grad()
    def index_dpr(self):
        """DPR 기반 뉴스 임베딩을 생성하고 FAISS 인덱스를 구축"""
        if FAISS_FILE.exists():
            print('🔹 FAISS 인덱스 로드 중...')
            self.ds.load_faiss_index('embeddings', str(FAISS_FILE))
            return

        print('🔹 FAISS 인덱스 생성 중...')
        ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(self.device)
        ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

        def embed_batch(examples):
            inputs = ctx_tokenizer(examples["title"], return_tensors="pt", padding=True, truncation=True).to(self.device)
            embeddings = ctx_encoder(**inputs).pooler_output.cpu().numpy()
            return {"embeddings": embeddings}

        ds_with_embeddings = self.ds.map(embed_batch, batched=True, batch_size=64)
        ds_with_embeddings.add_faiss_index(column='embeddings')

        print('🔹 FAISS 인덱스 저장 중...')
        FAISS_INDEX_PATH.mkdir(parents=True, exist_ok=True)
        ds_with_embeddings.save_faiss_index('embeddings', str(FAISS_FILE))

        # 🔹 데이터셋도 함께 저장
        ds_with_embeddings.save_to_disk(str(DATASET_SAVE_PATH))
        self.ds = ds_with_embeddings  # 메모리에 저장

    @torch.no_grad()
    def search(self, query):
        """입력된 쿼리를 기반으로 가장 관련성 높은 뉴스 5개 검색"""
        q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to(self.device)
        q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

        # 🔹 입력 쿼리를 DPR 임베딩으로 변환
        inputs = q_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(self.device)
        question_embedding = q_encoder(**inputs).pooler_output.cpu().numpy()[0]

        # 🔹 FAISS 검색 수행
        scores, retrieved_examples = self.ds.get_nearest_examples('embeddings', question_embedding, k=5)

        # 🔹 결과를 딕셔너리 리스트 형태로 변환
        retrieved_examples = [dict(zip(retrieved_examples, t)) for t in zip(*retrieved_examples.values())]

        return scores, retrieved_examples

# 3. Google SerpAPI 기반

In [3]:
def search_serpapi(q):
    """Google SerpAPI를 사용하여 실시간 뉴스 검색"""
    params = {
        "api_key": SERP_API_KEY,  # SerpAPI 키
        "engine": "google",  # Google 검색 엔진 사용
        "q": q,  # 검색어
        "location": "Austin, Texas, United States",  # 검색 지역 지정
        "google_domain": "google.com",
        "gl": "us",  # 국가 설정 (미국)
        "hl": "en",  # 언어 설정 (영어)
        "num": "30"  # 검색 결과 최대 30개 가져오기
    }

    search = GoogleSearch(params)  # SerpAPI를 사용한 Google 검색 객체 생성
    results = search.get_dict()  # JSON 형식으로 검색 결과 가져오기
    return results  # 결과 반환

def concat_snippets(organic_results):
    """검색된 기사에서 유의미한 정보를 필터링하고 요약"""
    organic_results = [result for result in organic_results if 'snippet' in result]  # snippet이 있는 기사만 필터링
    organic_results = [result for result in organic_results if 'NBC' not in result['source'] and 'NBC' not in result['title']]  # NBC 뉴스 제외
    organic_results = [result for result in organic_results if 'fact' not in result['link']]  # 'fact'가 포함된 링크 제외 (팩트체크 기사 필터링)
    organic_results = organic_results[:5]  # 상위 5개 기사만 선택

    return '\n'.join([
        f'Title: {result["title"]}\nSource: {result["source"]}, {result["date"] if "date" in result else ""}\nContent: {result["snippet"]}' 
        for result in organic_results
    ])

def get_google_ctx(q):
    """실시간 검색 결과를 LLM 입력에 사용할 형태로 변환"""
    search_results = search_serpapi(q)  # Google 검색 실행
    if 'organic_results' in search_results:
        return concat_snippets(search_results['organic_results'])  # 검색 결과 정리 후 반환
    else:
        return ""

# 4. RAG

In [35]:
def retrieve_relevant_news(news_headline):
    """사내 뉴스 및 Google 검색을 결합하여 관련 뉴스 검색"""
    #dpr = DPR()
    #_, retrieved_news = dpr.search(news_headline)
    retrieved_news=[]
    print(f"retrieved_news:{retrieved_news}")
    google_results = get_google_ctx(news_headline)
    print(google_results)
    
    return retrieved_news + [google_results]

def get_plausibility_score(news_text, retrieved_news):
    """LLM을 이용하여 뉴스의 개연성 점수 예측"""
    input_data = {"news_text":news_text, "retrieved_news" : retrieved_news}
    result = chain1.invoke(input_data)
    print(result)
    return result


def generate_final_score(news_text, retrieved_news):
    """샘플링을 통해 최종 개연성 점수 생성 (0~1 범위)"""
    scores = [get_plausibility_score(news_text, retrieved_news)['plausibility_score'] for _ in range(3)]
    scores = list(map(int, scores))
    final_score = sum(scores) / len(scores)
    return final_score / 10  # 0~1 범위로 정규화

In [43]:
llm1 = ChatOpenAI(temperature=1,               # 창의성 (0.0 ~ 2.0) 
                 max_tokens=2048,             # 최대 토큰수
                 model_name='gpt-4o',  # 모델명
                )

llm2 = ChatAnthropic(model='claude-3-5-sonnet-20240620',max_tokens=8192)




response_schemas = [
    ResponseSchema(
        name="plausibility_score",
        description="A numerical score between 1 and 10 that represents how plausible the given statement is. "
                    "1 means 'completely false', while 10 means 'fully plausible'."
    ),
    # ResponseSchema(
    #     name="reason",
    #     description="A textual explanation providing the reasoning behind the assigned plausibility score."
    # ),
]

# 응답 스키마를 기반으로 한 구조화된 출력 파서 초기화
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# 출력 형식 지시사항을 파싱
format_instructions = output_parser.get_format_instructions()

prompt_template_1 = """

You are responsible for evaluating the credibility of a news article.  
Your task is to critically assess whether the given news aligns with facts, logic, and established patterns of misinformation. 

### **Evaluation Criteria**:
1. **Lack of Factual Basis**: Does the article make claims without clear sources or evidence?  
2. **Logical Fallacies**: Are there exaggerated or unnatural cause-effect relationships?  
3. **Bias and Framing**: Does the article present a one-sided narrative while omitting counterarguments?  
4. **Pattern Recognition**: Does the article resemble past misinformation or hoaxes?  

### **Input**:
#### News Article:
"{news_text}"

#### Retrieved News:
{retrieved_news}

### **Assessment**:
- **1** = Completely False (No factual basis, likely misinformation)  
- **5** = Uncertain (Some facts may be true, but unverified claims exist)  
- **9** = Highly Plausible (Strong evidence and logical consistency)  

Be as **conservative** as possible in your rating.

{format_instructions}
    """



prompt1 = PromptTemplate(
    template=prompt_template_1,
    input_variables=["news_text","retrieved news"],
    partial_variables={"format_instructions": format_instructions},
)

chain1 = prompt1 | llm2 | output_parser  # 프롬프트, 모델, 출력 파서를 연결




# Test

In [42]:
# 🔹 테스트 데이터 로드 (가상의 뉴스 데이터)
test_news = ["테스트 뉴스 1", "테스트 뉴스 2", "테스트 뉴스 3"]
true_labels = [1, 0, 1]  # 1: Real, 0: Fake

# 🔹 개연성 점수 예측
predicted_scores = [generate_final_score(news, retrieve_relevant_news(news)) for news in test_news]

# 🔹 AUC-ROC 계산
auc_score = roc_auc_score(true_labels, predicted_scores)
print(f"AUC-ROC Score: {auc_score:.4f}")

retrieved_news:[]
Title: 뉴스1
Source: 뉴스1, 
Content: '불행한 대한민국' 삶의 만족도 6.4점…OECD 38개국 중 33위 · 작년 가구순자산 301만원 늘어난 3.9억원… · 韓 자살률 10만명당 27.3명…'OECD 1등' 오명 계속 ...
Title: 정치
Source: 뉴스1, 
Content: 뉴스1 정치 뉴스를 제공합니다. 대통령실ㆍ총리실, 국회ㆍ정당, 감사원ㆍ위원회, 국방ㆍ외교, 통일, 정치일반 등의 뉴스를 제공합니다.
Title: 뉴스1 (@News1Kr) / X
Source: x.com, 
Content: 사실 앞에 겸손한 민영 통신 뉴스1입니다. 유튜브 : http://youtube.com/user/news1korea. Translate bio.
Title: 뉴스 테스트
Source: 나무위키, 
Content: 다양한 분야의 뉴스/시사 상식의 습득 수준을 객관적이고 체계적인 지표로 측정할 수 있도록 개발된 뉴스 상식 검정 시험이다. 2017년 3월 25일 시행된 1회 시험에서 최고 ...
Title: Minuteman III test launch showcases readiness of US ...
Source: Space Force (.mil), 4 days ago
Content: Minuteman III test launch showcases readiness of US nuclear force's safe, effective deterrent. Published Feb. 19, 2025; By Staff Sgt. Joshua ...


KeyboardInterrupt: 

In [44]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# 신뢰도 평가 수행
results = []
for _, row in data.iterrows():
    retrieved_news = retrieve_relevant_news(row['title'])
    output = get_plausibility_score(row['text'], retrieved_news)
    results.append({
        "title": row['title'],
        "content": row['text'],
        "label": row['label'],
        "score": output['plausibility_score'],
        # "reason": output['reason']
    })

eval_df = pd.DataFrame(results)

# 다양한 역치 값을 시도하여 분류 성능 평가
thresholds = np.arange(1, 10)  # 역치 1~9 시도
best_threshold = None
best_f1 = 0

all_results = []

for threshold in thresholds:
    eval_df['predicted'] = eval_df['score'] >= threshold  # 역치 적용
    y_true = eval_df['label'].astype(int)
    y_pred = eval_df['predicted'].astype(int)
    
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_true, y_pred)
    
    all_results.append({
        "threshold": threshold,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1,
        "roc_auc": roc_auc
    })
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

# 결과 정리 및 출력
eval_results_df = pd.DataFrame(all_results)
import ace_tools as tools
tools.display_dataframe_to_user(name="Evaluation Results", dataframe=eval_results_df)

# 최적의 역치 및 성능 출력
print(f"Best Threshold: {best_threshold}")
print(eval_results_df[eval_results_df['threshold'] == best_threshold])


retrieved_news:[]
Title: US Attorney General Calls for Efficient Review of ...
Source: Voice of America English News, Dec 6, 2017
Content: US Attorney General Jeff Sessions on Wednesday called on the nation's immigration courts to decide cases more efficiently, amid a burgeoning backlog.
Title: Attorney General Pam Bondi plans review of cases brought ...
Source: CNN, Feb 5, 2025
Content: Attorney General Pam Bondi is expected to order a review of the cases brought against President Donald Trump, including those undertaken by prosecutors in New ...
Title: Departments of Homeland Security and ...
Source: Department of Justice (.gov), May 16, 2024
Content: “The Justice Department's immigration courts are committed to the just and efficient enforcement of the immigration laws,” said Attorney General ...
Title: Attorney General Pam Bondi orders review of Trump ...
Source: CBS News, Feb 5, 2025
Content: The attorney general established a "weaponization working group" to review Biden administ

OutputParserException: Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 

In [65]:

eval_df = pd.DataFrame(results)
# Convert 'score' column to numeric, setting invalid values to NaN
eval_df['score'] = pd.to_numeric(eval_df['score'], errors='coerce')

# Fill NaN values with a default score (e.g., 0 or another strategy)
eval_df['score'].dropna(inplace=True)  # You can change 0 to another default

# 다양한 역치 값을 시도하여 분류 성능 평가
thresholds = np.arange(1, 10)  # 역치 1~9 시도
best_threshold = None
best_f1 = 0

all_results = []

for threshold in thresholds:
    eval_df['predicted'] = eval_df['score'].astype(float) >= threshold  # 역치 적용
    y_true = eval_df['label'].astype(int)
    y_pred = eval_df['predicted'].astype(int)
    
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_true, y_pred)
    
    all_results.append({
        "threshold": threshold,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1,
        "roc_auc": roc_auc
    })
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

# 결과 정리 및 출력
eval_results_df = pd.DataFrame(all_results)

# 최적의 역치 및 성능 출력
print(f"Best Threshold: {best_threshold}")
print(eval_results_df[eval_results_df['threshold'] == best_threshold])


Best Threshold: 8
   threshold  accuracy  precision    recall  f1_score   roc_auc
7          8  0.973545   0.956522  0.988764  0.972376  0.974382


In [51]:
import openpyxl
pd.DataFrame(results).to_excel("result_middle.xlsx")

In [61]:
pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0
Note: you may need to restart the kernel to use updated packages.


In [52]:
results_middle=pd.DataFrame(results)

In [53]:
len(results_middle)

189

In [50]:
pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


# Fake News 전처리

In [34]:
import pandas as pd
import numpy as np
# Load Dataset
true_data = pd.read_csv('/Users/yoonjincho/Desktop/FakeNews_kaggle/True.csv')
fake_data = pd.read_csv('/Users/yoonjincho/Desktop/FakeNews_kaggle/Fake.csv')

# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
true_data['label'] = 1
fake_data['label'] = 0

# Merge 'true_data' and 'fake_data', by random mixing into a single df called 'data'
data = pd.concat([true_data, fake_data], ignore_index = True).sample(frac=1).reset_index().drop(columns = ['index'])

# See how the data looks like
print(data.shape)
data.head(30)

(44898, 5)


Unnamed: 0,title,text,subject,date,label
0,U.S. attorney general calls for efficient revi...,WASHINGTON (Reuters) - U.S. Attorney General J...,politicsNews,"December 6, 2017",1
1,Ted Nugent Just Posted The Most Racist F*ckin...,"If you thought Trump supporters wered bad, wai...",News,"March 31, 2016",0
2,SEC approves budget of independent accounting ...,WASHINGTON (Reuters) - The Securities and Exch...,politicsNews,"March 14, 2016",1
3,Ukraine says ammo depot explosions huge blow t...,KIEV (Reuters) - The destruction of two ammuni...,worldnews,"September 28, 2017",1
4,"In re-election bid, Ohio senator keeps safe di...","COLUMBUS, Ohio/WASHINGTON (Reuters) - Rob Port...",politicsNews,"August 23, 2016",1
5,PM May is driving Britain to cliff-edge Brexit...,"BRIGHTON, England (Reuters) - A cliff-edge Bre...",worldnews,"September 27, 2017",1
6,VA WHISTLEBLOWER TELLS OF “LAWLESSNESS AND CHA...,This is nothing short of outrageous! Billions ...,Government News,"May 16, 2015",0
7,Germany must stop relying on U.S. for foreign ...,BERLIN (Reuters) - Germany should be more asse...,worldnews,"December 4, 2017",1
8,North Korea calls terror relisting 'serious pr...,SEOUL (Reuters) - North Korea denounced on Wed...,worldnews,"November 22, 2017",1
9,Trump touts urban policy following detour to o...,"CHARLOTTE, N.C. (Reuters) - Republican preside...",politicsNews,"October 26, 2016",1
