# 뉴스 수집

> 로컬 환경에서 실행하세요. (Selenium이 Chrome 브라우저를 직접 실행합니다.)

- https://finance.naver.com/news/news_search.naver
- 기간: **2025/09/26 ~ 2025/11/05**
- 수집 검색어: **'삼성전자', 'SK하이닉스'**

In [None]:
import pandas as pd
import time
import re
from collections import defaultdict
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import requests
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs

In [None]:
def news_href_crawl(keyward, start_date, end_date):
    '''
    Collect news links from the Naver Pay Securities news search page.

    Opens the search page in Chrome, types the keyword and the date range into
    the search form, gathers the URL of every result page from the pagination
    bar, then visits each result page and reads the link, title and date of
    every listed article.

    Args:
        keyward: Search keyword, typed into the first ``inputTxt`` field.
        start_date: Start of the search period, typed into the second
            ``inputTxt`` field (callers in this file pass 'YYYY-MM-DD').
        end_date: End of the search period, typed into the third ``inputTxt``
            field (same format as ``start_date``).

    Returns:
        pandas.DataFrame with the columns ``href``, ``title`` and ``date``,
        one row per collected article.
    '''
    url = "https://finance.naver.com/news/news_search.naver"

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--incognito')

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )

    url_list = []
    href_list = []
    title_list = []
    time_list = []

    try:
        driver.get(url)
        time.sleep(2)

        # Fill in the search keyword and the date range
        elements = driver.find_elements(By.CLASS_NAME, "inputTxt")
        keyward_input = elements[0]
        keyward_input.clear()
        keyward_input.send_keys(keyward)

        start_date_input, end_date_input = elements[1], elements[2]
        start_date_input.clear()
        end_date_input.clear()
        start_date_input.send_keys(start_date)
        end_date_input.send_keys(end_date)

        # The last <input> inside the keyword area is used as the search button
        search = driver.find_element(By.CLASS_NAME, "keywordArea").find_elements(By.TAG_NAME, 'input')[-1]
        search.click()

        page_list = driver.find_element(By.CLASS_NAME, "Nnavi").find_elements(By.TAG_NAME, 'td')

        # URLs of result pages 1-10 (the two trailing navigation cells are skipped)
        for page in page_list[:-2]:
            url_list.append(page.find_element(By.TAG_NAME, 'a').get_property('href'))

        page_list[-1].click()  # click the "last page" cell
        time.sleep(2)

        page_list = driver.find_element(By.CLASS_NAME, "Nnavi").find_elements(By.TAG_NAME, 'td')
        end_page_num = int(page_list[-1].text)  # total number of result pages
        roof_num = end_page_num // 10           # number of 10-page blocks

        # Page URLs of the last block (the two leading navigation cells are skipped)
        for page in page_list[2:]:
            url_list.append(page.find_element(By.TAG_NAME, 'a').get_property('href'))

        page_list[1].click()  # click the "previous" cell

        # Walk backwards through the remaining blocks, one "previous" click at a time
        for i in range(0, roof_num-2):
            page_list = driver.find_element(By.CLASS_NAME, "Nnavi").find_elements(By.TAG_NAME, 'td')
            for page in page_list[2:-2]:
                url_list.append(page.find_element(By.TAG_NAME, 'a').get_property('href'))
            page_list[1].click()
            time.sleep(2)

        # Collect link, title and date of every article on every result page.
        # dict.fromkeys() drops duplicate page URLs while keeping the order in
        # which they were discovered, so the row order is reproducible.
        for page_url in dict.fromkeys(url_list):
            driver.get(page_url)
            time.sleep(2)

            news_table = driver.find_element(By.CLASS_NAME, "newsList")
            news_list = news_table.find_elements(By.CLASS_NAME, 'articleSubject')
            tt_list = news_table.find_elements(By.CLASS_NAME, 'articleSummary')

            # Subject and summary elements are paired by position
            for i, subject in enumerate(news_list):
                href_list.append(subject.find_element(By.TAG_NAME, 'a').get_property('href'))
                title_list.append(subject.text)
                time_list.append(tt_list[i].find_element(By.CLASS_NAME, 'wdate').text)
    finally:
        # Always release the browser, even when a lookup above raises.
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()

    return pd.DataFrame({"href": href_list,
                         "title": title_list,
                         "date": time_list
                        })

In [None]:
def news_content_crawl(url_list):
    '''
    Collect the article body text for the links gathered by news_href_crawl.

    Each URL is opened in Chrome and the text of its <article> element is read
    and cleaned.

    Args:
        url_list: Iterable of article URLs.

    Returns:
        list of str: the cleaned article text, one entry per URL, in the same
        order as ``url_list``.
    '''
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--incognito')

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )

    content_list = []
    try:
        for url in tqdm(url_list):
            driver.get(url)
            time.sleep(2)

            text = driver.find_element(By.TAG_NAME, 'article').text
            # Remove "(... 기자)" reporter bylines and e-mail addresses
            text = re.sub(r"\([^)]*기자\)|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "", text)
            text = re.sub(r"\[.*?\]", "", text)      # remove text enclosed in square brackets
            text = re.sub(r"\s+", " ", text).strip() # collapse newlines and repeated whitespace

            content_list.append(text)
    finally:
        # Always release the browser, even when a page fails to load or has no
        # <article> element. quit() ends the whole WebDriver session; close()
        # would only close the current window.
        driver.quit()

    return content_list

In [None]:
# Every calendar day of the collection period as a 'YYYY-MM-DD' string
d_list = [
    day.strftime('%Y-%m-%d')
    for day in pd.date_range(start='20250926', end='20251105', freq='D')
]
d_list[:5]

['2025-09-26', '2025-09-27', '2025-09-28', '2025-09-29', '2025-09-30']

In [None]:
import os

# Create the output folder up front: to_csv() raises when the directory is
# missing, which would otherwise surface only after a full day's crawl.
os.makedirs('news_data', exist_ok=True)

# Crawl one day at a time for each keyword and save one CSV per (keyword, day)
for key in ['삼성전자', 'SK하이닉스']:
    for date in tqdm(d_list):
        df = news_href_crawl(key, date, date)
        content_list = news_content_crawl(list(df['href']))
        df['content'] = content_list
        df.to_csv(f'news_data/{key}_{date}.csv', index=False)

# 데이터 병합

In [None]:
import os
import pandas as pd

In [None]:
# Folder to read the CSV files from.
# NOTE(review): this is an absolute, machine-specific path, while the other
# cells of this notebook write to the relative 'news_data/' folder - confirm
# that both refer to the same directory.
path = 'C:/Users/eunkk/Sejong/4-2/GMSW_media/project/news_data'

# Names of all entries in that folder (the count is displayed as the cell output)
file_list = os.listdir(path)
len(file_list)

In [None]:
# Load every '.csv' entry of the folder into its own DataFrame
all_dataframes = [
    pd.read_csv(os.path.join(path, name))
    for name in file_list
    if name.endswith('.csv')
]

In [None]:
# Stack all loaded DataFrames into a single one with a fresh 0..n-1 index
merged_df = pd.concat(objs=all_dataframes, axis=0, ignore_index=True)

In [None]:
# Remove any character that is not an ASCII letter, a digit or a Hangul
# syllable when it is directly followed by a space.
# regex=True is spelled out because Series.str.replace defaults to
# regex=False since pandas 2.0; without it the pattern is searched for as
# literal text and practically never matches.
# NOTE(review): the trailing space sits outside the character class - confirm
# this is intended rather than "[^A-Za-z0-9가-힣 ]".
merged_df['content'] = merged_df['content'].str.replace("[^A-Za-z0-9가-힣] ", "", regex=True)
# Strip straight/curly quotation marks and the middle dot
merged_df['content'] = merged_df['content'].str.replace(r"[\'\"·‘’]", "", regex=True)
# Drop rows that have no article text
merged_df = merged_df[merged_df['content'].notna()]

In [None]:
# Keep one row per (href, date), sort by the 'date' column and renumber the
# index from zero; the last rows are displayed as the cell output.
merged_df = (
    merged_df
    .drop_duplicates(subset=['href', 'date'])
    .sort_values('date')
    .reset_index(drop=True)
)
merged_df.tail()

In [None]:
# Write the merged, cleaned data set to a single CSV without the index column
merged_df.to_csv(path_or_buf='news_data/all_data.csv', index=False)