In [None]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from datetime import datetime
import csv
import os

# Browser-like User-Agent header sent with every HTTP request.
headers = {'User-Agent': 'Mozilla/5.0'}
# CSV file that the scraped rows are written to.
csv_path = 'D:/csv/hist_news/hist_news_data.csv'

# open(..., 'w') creates the file but not its missing parent directories,
# so make sure the target directory exists before touching the file.
csv_dir = os.path.dirname(csv_path)
if csv_dir:
    os.makedirs(csv_dir, exist_ok=True)

# Write the column header row when the file is missing or empty.
if not os.path.exists(csv_path) or os.stat(csv_path).st_size == 0:
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Date', 'Header', 'Content'])  # column names

def _parse_published(raw):
    """Convert date text such as 'Published Mar 04, 2024 05:30PM ET' to a datetime.

    The 'Published ' prefix and ' ET' suffix are stripped; the returned
    datetime is naive (no tzinfo is attached).

    Raises:
        ValueError: If the text matches neither supported format.
    """
    stamp = raw.strip().replace("Published ", "").replace(" ET", "")
    try:
        # %p only affects the parsed hour when paired with the 12-hour %I
        # directive; combined with %H the AM/PM marker is silently ignored,
        # which would turn every PM time into its AM counterpart.
        return datetime.strptime(stamp, '%b %d, %Y %I:%M%p')
    except ValueError:
        # Fall back to the previously used format so that any text it
        # accepted (e.g. an hour outside 01-12) still parses.
        return datetime.strptime(stamp, '%b %d, %Y %H:%M%p')


def _extract_article(news_soup):
    """Return ``[published, headline, body]`` for a parsed article page, or None.

    None is returned when the headline, body or date element is not found.
    """
    title_element = news_soup.select_one("#leftColumn > h1")
    if title_element is None:
        return None
    header = title_element.text.strip()

    article_content = news_soup.select_one('#leftColumn > div.WYSIWYG.articlePage')
    if article_content is not None:
        # Drop image-carousel captions and related-instrument widgets so they
        # do not end up in the extracted body text.
        for unwanted in article_content.select(
            '#imgCarousel > span, div.relatedInstrumentsWrapper > div'
        ):
            unwanted.decompose()

    date_info = news_soup.select_one("#leftColumn > div:nth-child(6) > span:nth-child(1)")
    if date_info is None or article_content is None:
        return None

    published = _parse_published(date_info.text)
    # Commas and newlines are flattened to spaces to keep each row on one line.
    body = article_content.text.strip().replace(",", " ").replace("\n", " ")
    return [published, header, body]


def fetch_news(page, timeout=10):
    """Scrape one listing page of investing.com cryptocurrency news.

    Downloads the listing page, follows every article link found on it and
    extracts the publication time, headline and body text of each article.

    Args:
        page: Page number appended to the listing URL.
        timeout: Seconds to wait on each HTTP request, so that a stalled
            connection cannot block the calling thread indefinitely.

    Returns:
        A list of ``[published_datetime, headline, body_text]`` rows, one for
        each article whose headline, body and date elements were all found.

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If an article's date text cannot be parsed.
    """
    from urllib.parse import urljoin

    base_url = 'https://www.investing.com'
    results = []
    with requests.Session() as session:
        url = f'https://www.investing.com/news/cryptocurrency-news/{page}'
        response = session.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.select('#leftColumn div.textDiv a.title'):
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be followed.
                continue
            # urljoin handles site-relative paths as well as hrefs that are
            # already absolute URLs; plain concatenation would corrupt the latter.
            news_url = urljoin(base_url, href)
            news_response = session.get(news_url, headers=headers, timeout=timeout)
            news_soup = BeautifulSoup(news_response.text, 'html.parser')

            row = _extract_article(news_soup)
            if row is not None:
                results.append(row)
    return results

# Run the page fetches on a thread pool and keep only the non-empty page results.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fetch_news, page) for page in range(1, 2)]
    results = []
    for future in futures:
        page_rows = future.result()
        if page_rows:
            results.append(page_rows)

# Flatten the per-page lists into a single list of rows, build a DataFrame
# from it and append the rows to the CSV file (no header row is written here).
flattened_results = []
for page_rows in results:
    flattened_results.extend(page_rows)

df = pd.DataFrame(flattened_results, columns=['Date', 'Header', 'Content'])
df.to_csv(csv_path, mode='a', header=False, index=False)

print("데이터 수집 및 저장 완료")