# 1. 라이브러리 및 API key 설정

In [None]:
import os
import torch
import numpy as np
import datasets
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import roc_auc_score
from openai import OpenAI
from serpapi import GoogleSearch 
from dotenv import load_dotenv
import pandas as pd
import os
import openai
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langsmith.wrappers import wrap_openai
from langsmith import traceable
from langchain_anthropic import ChatAnthropic, AnthropicLLM
from langchain.llms import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain
from langchain.memory import SimpleMemory
from datetime import datetime
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import json
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Load environment variables (python-dotenv) so the API keys read later with
# os.getenv() are available.
load_dotenv()

# Auto-trace LLM calls in-context: wrap the OpenAI SDK client with LangSmith's
# wrap_openai.
# NOTE(review): `client` is assigned again in a later cell, which replaces
# this wrapped client — confirm which one is meant to be used.
client = wrap_openai(openai.Client())


In [4]:
# Read the OpenAI and SerpAPI keys from the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
SERP_API_KEY = os.getenv("SERPAPI_API_KEY")

# Initialise the OpenAI client.
# This file imports the name `OpenAI` from both `openai` and `langchain.llms`;
# the later import wins, so the bare name would build a LangChain LLM wrapper
# instead of an API client. Go through the `openai` module to get the SDK
# client unambiguously.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Location of the FAISS index on disk.
FAISS_INDEX_PATH = Path('local/news-please/faiss_index')

# 2. Entity 5개 선정


In [5]:
# The dataset is built around the article body: the CSV's "text" column is
# exposed under the name "news".
data = pd.read_csv(
    "/Users/yoonjincho/Desktop/윤진 2025/YAI/YAI_personal_archive/NLP_ToyProject/20250226data.csv"
).rename(columns={"text": "news"})
data

Unnamed: 0.1,Unnamed: 0,label,news,title
0,49241,1,u.s. senate republican leader mitch mcconnell ...,senate leader opposes lecturing myanmar leader...
1,31990,1,four u.s. senators - two democrats and two rep...,senators introduce bill aimed at getting gener...
2,15919,1,note little johnny might want consider use dia...,
3,34664,0,president obama is clearly in good spirits tha...,president obama gets his troll game on by than...
4,34518,1,turkey said on monday its former economy minis...,turkey says u.s. indictment of former minister...
...,...,...,...,...
65852,34389,0,if you ve at all been watching the news covera...,the backstreet boys send harsh message to trum...
65853,43991,0,donald trump and his deplorable supporters are...,muslim family restaurant finds creative way to...
65854,49494,1,a russian-backed congress of syrian peoples in...,syrian congress in russia postponed to februar...
65855,47493,0,senate republicans have publicly said they are...,this is the first sign republicans are already...


In [7]:
import os
import json
import wikipedia
import nltk
import pandas as pd

# LangChain 관련
from langchain.llms import OpenAI  # 예시로 OpenAI 사용
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# OpenAI chat model: temperature 1 (creativity, 0.0 ~ 2.0) and a 2048-token
# reply cap.
# NOTE(review): this model was labelled a "medical-science overview review"
# LLM, which does not match the news-entity task in this file — confirm.
llm1 = ChatOpenAI(
    model_name='gpt-4o-mini',
    temperature=1,
    max_tokens=2048,
)

# Anthropic chat model with an 8192-token reply cap.
llm2 = ChatAnthropic(
    model='claude-3-5-haiku-20241022',
    max_tokens=8192,
)

########################################
# 1. LangChain을 이용한 키워드 추출 함수
########################################
def extract_entities_langchain(news_text: str, llm) -> list:
    """
    Ask an LLM, through a LangChain pipeline, for the named entities in a news text.

    Args:
        news_text: News article text substituted into the prompt.
        llm: LangChain-compatible model that can be piped after a prompt.

    Returns:
        The value stored under the "entities" key of the model's JSON reply.
        The prompt asks for five entities; the count is not enforced here.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON.
        KeyError: If the parsed JSON object has no "entities" key.
    """
    # Prompt template; the doubled braces in the example are literal braces
    # escaped for the template engine.
    prompt = PromptTemplate(
        input_variables=["s"],
        template=(
            """
            News: {s}
            Task: Identify five named entities within the news above that necessitate elucidation for the populace to understand the news comprehensively.
            Ensure a diverse selection of the entities. The answer should be in the form of python list.
            Return the answer strictly as a json format with no additional explanation or commentary. The output should contain only the json and nothing else.
            
            <output example>
            {{"entities": ["a","b","c"]}}
            </output example>
            """
        )
    )

    chain = prompt | llm | StrOutputParser()
    raw_reply = chain.invoke({"s": news_text})

    # Models often wrap the JSON in a Markdown code fence or add a short
    # preamble despite the instructions. Keep only the outermost {...} span so
    # that decoration does not make json.loads fail.
    start = raw_reply.find("{")
    end = raw_reply.rfind("}")
    if start != -1 and end > start:
        raw_reply = raw_reply[start:end + 1]

    response = json.loads(raw_reply)
    return response['entities']


########################################
# 2. 위키 검색 및 요약을 가져오는 함수
########################################
def get_full_text(page_title):
    """
    Return the full text of a Wikipedia page, or None if it cannot be loaded.

    Args:
        page_title: Page title to load; auto-suggest is disabled.

    Returns:
        The page's ``content``. If the title is a disambiguation page, the
        first listed option is loaded instead. None on any lookup failure.
    """
    try:
        return wikipedia.page(page_title, auto_suggest=False).content
    except wikipedia.DisambiguationError as e:
        # Ambiguous title: retry once with the first suggested option.
        try:
            return wikipedia.page(e.options[0], auto_suggest=False).content
        except Exception:
            return None
    except Exception:
        # Any other lookup error is reported as "no text". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return None

def get_wikipedia_summary(entity: str) -> tuple:
    """
    Search English Wikipedia for an entity and return its summary and full text.

    The first search hit is used. If that title turns out to be a
    disambiguation page, the first listed option is tried instead.

    Args:
        entity: Search query.

    Returns:
        A ``(summary_text, full_text)`` tuple of strings. ``("", "")`` is
        returned when the search has no hits or any lookup fails, so callers
        should test for empty strings rather than None.
    """
    try:
        # Search the English edition (use "ko" for the Korean edition).
        wikipedia.set_lang("en")
        search_results = wikipedia.search(entity)
        if not search_results:
            return ("", "")
        # Build the result from the first search hit.
        page_title = search_results[0]
        summary_text = wikipedia.summary(page_title, auto_suggest=False)
        full_text = wikipedia.page(page_title, auto_suggest=False).content
    except wikipedia.DisambiguationError as e:
        # Ambiguous title: retry once with the first suggested option.
        try:
            s = e.options[0]
            summary_text = wikipedia.summary(s, auto_suggest=False)
            full_text = wikipedia.page(s, auto_suggest=False).content
        except Exception:
            return ("", "")
    except Exception:
        # Any other lookup error yields the empty result. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long batch run can still be interrupted.
        return ("", "")
    return summary_text, full_text


########################################


In [None]:
########################################
# 3. Execution (can be run directly in Jupyter)
########################################

# Example file paths
input_path = "input_news.csv"             # NOTE(review): never read below; the in-memory `data` frame is used instead
output_path = "output_with_entities.csv"  # final results CSV

# Work on the DataFrame loaded earlier (same object, not a copy).
df = data
if "news" not in df.columns:
    raise ValueError("데이터프레임에 'news' 컬럼이 없습니다. CSV 구조를 확인하세요.")

# The LLM used below must already be constructed, with its API key available
# in the environment (e.g. os.environ["OPENAI_API_KEY"]).

# Result columns
# NOTE(review): these columns are added to `df` but never filled in below;
# results are collected in `result_rows` instead.
df["extracted_entities"] = None
df["wiki_summary"] = None
df["wiki_full"] = None

# Process each row
result_rows = []
for idx, row in df.iterrows():
    # NOTE(review): this guard does nothing (`pass`). If the intent was to
    # resume after row 250, it should be `continue` — confirm before changing.
    if idx<=250 :
        pass

    news_text = row["news"]

    # 1) Extract the entities (keywords) from the news text
    entities = extract_entities_langchain(news_text, llm2)
    print(entities)
    # 2) Look up each entity on Wikipedia
    wiki_texts = []
    wiki_full=[]
    for ent in entities:
        print(ent)
        summary, full = get_wikipedia_summary(ent)
        # The lookup signals failure with empty strings, never None, so test
        # truthiness; an `is not None` check would never reach the placeholders.
        if summary:
            wiki_texts.append(f"{ent}: {summary}")
        else:
            wiki_texts.append(f"{ent}: [No summary found]")
        if full:
            wiki_full.append(f"{ent}: {full}")
        else:
            wiki_full.append(f"{ent}: [No full text found]")

    # Assemble the output columns for this row
    entities_str = ", ".join(entities)
    wiki_summaries_str = "\n\n".join(wiki_texts)
    wiki_full_str="\n\n".join(wiki_full)
    result_rows.append({
        "news": news_text,
        "extracted_entities": entities_str,
        "wiki_summary": wiki_summaries_str,
        "wiki_full": wiki_full_str
    })
    # Periodic checkpoint so a crash does not lose all progress
    if idx%10==0 :
        out_df = pd.DataFrame(result_rows, columns=["news", "extracted_entities", "wiki_summary","wiki_full"])
        out_df.to_csv(output_path+"250", index=False, encoding="utf-8-sig")

# Build the final DataFrame and save it. Without this, rows processed after
# the last checkpoint were never written and `output_path` was never created,
# even though the message below reports it.
out_df = pd.DataFrame(result_rows, columns=["news", "extracted_entities", "wiki_summary","wiki_full"])
out_df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"처리가 완료되었습니다. 결과가 '{output_path}'에 저장되었습니다.")



['Mitch McConnell', 'Aung San Suu Kyi', 'Rohingya', 'John McCain', 'Myanmar']
Mitch McConnell
Aung San Suu Kyi
Rohingya
John McCain
Myanmar


In [24]:
# Ad-hoc check: repeat the lookup for `ent`, which still holds the last entity
# left over from the processing loop.
summary, full = get_wikipedia_summary(ent)

In [25]:
# Display the summary text as the cell output.
summary

'Myanmar, officially the Republic of the Union of Myanmar and also rendered as Burma (the official English form until 1989), is a country in northwest Southeast Asia. It is the largest country by area in Mainland Southeast Asia and has a population of about 55 million. It is bordered by India and Bangladesh to its northwest, China to its northeast, Laos and Thailand to its east and southeast, and the Andaman Sea and the Bay of Bengal to its south and southwest. The country\'s capital city is Naypyidaw, and its largest city is Yangon (formerly Rangoon).\nEarly civilisations in the area included the Tibeto-Burman-speaking Pyu city-states in Upper Myanmar and the Mon kingdoms in Lower Myanmar. In the 9th century, the Bamar people entered the upper Irrawaddy valley, and following the establishment of the Pagan Kingdom in the 1050s, the Burmese language and culture and Theravada Buddhism slowly became dominant in the country. The Pagan Kingdom fell to Mongol invasions, and several warring s