https://www.yomiuri.co.jp/news/

* 위 URL에 있는 요미우리 뉴스 목록을 수집하고자 합니다.

### Request URL:
https://www.yomiuri.co.jp/y_ajax/latest_list_news_more/////50//50/1/?action=latest_list_news_more&others=%2F%2F%2F%2F50%2F%2F50%2F1%2F
Request Method:
GET
Status Code:
200 OK
Remote Address:
146.75.50.133:443
Referrer Policy:
strict-origin-when-cross-origin

## Payload 정보

action=latest_list_news_more&others=%2F%2F%2F%2F50%2F%2F50%2F1%2F


## 응답 일부
```

{
    "contents": "    <article class=\"news-top-latest__list-item\">\n      <div class=\"news-top-latest__list-item__inner\">\n        <h3><a href=\"\/sports\/koshien\/20240723-OYTAT50033\/\">\u300c\u798f\u5ca1\u5927\u300d\u5bfe\u6c7a\u3001\u300c\u5927\u6fe0\u300d\u304c\u300c\u82e5\u8449\u300d\u3092\u5236\u3059\u2026\uff13\uff15\u5e74\u3076\u308a\u306e\u7532\u5b50\u5712\u3078\u897f\u65e5\u672c\u77ed\u5927\u4ed8\u3068\u6c7a\u52dd\u3067\u5bfe\u6c7a<\/a><\/h3>\n        <div class=\"c-list-date\">\n          <time datetime=\"2024-07-23T16:34\">16:34<\/time>        <\/div>\n\n                                    <div class=\"c-matome-title\">\n          <a href=\"\/feature\/titlelist\/fukuoka\/\">\u9ad8\u6821\u91ce\u7403\u30fb\u798f\u5ca1<\/a>\n          <\/div>\n                  \n                      <\/div>\n                      <figure class=\"news-top-latest__list-item__thumb c-list-thumb c-list-thumb--small\">\n            <a href=\"\/sports\/koshien\/20240723-OYTAT50033\/\">\n                <img width=\"400\" height=\"287\" src=\"\/media\/2024\/07\/20240723-OYTAI50023-T.jpg?type=medium\" class=\"attachment-medium size-medium wp-post-image\" alt=\"\" loading=\"lazy\" decoding=\"async\" \/>            <\/a>\n        <\/figure>\n                  <\/article>\n    <article class=\"news-top-latest__list-item\">\n      <div class=\"news-top-latest__list-item__inner\">\n        <h3><a href=\"\/sports\/koshien\/20240723-OYTAT50027\/\">\u521d\u306e\uff14\u5f37\u5165\u308a\u3092\u3051\u3093\u5f15\u300c\u52dd\u8ca0\u5f37\u304f\u3084\u3063\u3066\u3053\u3089\u308c\u305f\u306e\u306f\u3088\u304b\u3063\u305f\u300d\u2026\u8fd1\u5927\u798f\u5ca1\uff13\u5e74\u30fb\u7530\u8fba\u5468\u4e3b\u5c06<\/a><\/h3>\n        <div class=\"c-list-date\">\n          <time datetime=\"2024-07-23T16:31\">16:31<\/time>        <\/div>\n\n                                    <div class=\"c-matome-title\">\n
```      

In [None]:
import csv
import json
import logging
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Configure root logging: INFO level, "timestamp - level - message" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def _parse_article(article):
    """Extract one news entry from an ``<article>`` element.

    Args:
        article: A BeautifulSoup tag for one
            ``article.news-top-latest__list-item`` element.

    Returns:
        A dict with the keys "title", "link" and "time", or ``None`` when
        the ``<h3><a href=...>`` headline link or the ``<time>`` element
        is missing.
    """
    heading = article.find('h3')
    anchor = heading.find('a') if heading is not None else None
    time_element = article.find('time')
    # .get() returns None for a missing attribute instead of raising KeyError.
    href = anchor.get('href') if anchor is not None else None

    if not href or time_element is None:
        return None

    return {
        "title": anchor.text.strip(),
        # urljoin resolves site-relative hrefs against the site root and
        # leaves hrefs that are already absolute untouched.
        "link": urljoin("https://www.yomiuri.co.jp", href),
        "time": time_element.text.strip(),
    }


def fetch_yomiuri_news(page=1):
    """Fetch one page of the Yomiuri "latest news" AJAX list.

    The endpoint answers with JSON whose "contents" field holds an HTML
    fragment; every ``<article class="news-top-latest__list-item">`` in that
    fragment is turned into one entry.

    Args:
        page: Page number placed in the request path and in the "others"
            query parameter (default 1).

    Returns:
        A list of dicts with the keys "title", "link" and "time". An empty
        list is returned when the request fails, the body is not valid JSON,
        the "contents" field is missing or empty, or no article is found.
        Articles lacking a headline link or a time element are logged and
        skipped individually. This function logs errors instead of raising.
    """
    url = f"https://www.yomiuri.co.jp/y_ajax/latest_list_news_more/////50//50/{page}/"
    params = {
        "action": "latest_list_news_more",
        "others": f"/////50//50/{page}/"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Request failed: {e}")
        return []

    try:
        data = json.loads(response.text)
    except json.JSONDecodeError as e:
        logging.error(f"JSON decoding failed: {e}")
        return []

    # Guard against a JSON payload that is not an object, or whose
    # "contents" value is not a string.
    html_content = data.get("contents", "") if isinstance(data, dict) else ""
    if not html_content or not isinstance(html_content, str):
        logging.warning("No HTML content found in the response")
        return []

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        articles = soup.find_all('article', class_='news-top-latest__list-item')

        if not articles:
            logging.warning("No articles found in the HTML content")
            return []

        news_list = []
        for article in articles:
            item = _parse_article(article)
            if item is None:
                # Skip only the malformed article; keep the rest of the page.
                logging.error("Error parsing article: missing headline link or time element")
                continue
            news_list.append(item)

        return news_list
    except Exception as e:
        # Last-resort boundary: callers rely on this function never raising.
        logging.error(f"Unexpected error: {e}")
        return []

def fetch_all_pages(max_pages=None, delay=1):
    """Collect news items page by page, starting at page 1.

    Paging stops when a page returns no items, when a page contains only
    items whose link was already collected (so an endpoint that keeps
    repeating its last page cannot cause an endless loop), or when
    ``max_pages`` pages have been fetched.

    Args:
        max_pages: Optional upper bound on the number of pages to request.
            ``None`` (the default) means no explicit bound.
        delay: Seconds to wait between consecutive requests (default 1),
            to avoid putting load on the server.

    Returns:
        A list of item dicts as produced by ``fetch_yomiuri_news``, in fetch
        order, with items whose link repeats removed.
    """
    all_news = []
    if max_pages is not None and max_pages < 1:
        return all_news

    seen_links = set()
    page = 1
    while True:
        news_items = fetch_yomiuri_news(page)
        if not news_items:
            break

        # Keep only items not seen on an earlier page (or earlier on this one).
        new_items = []
        for item in news_items:
            link = item.get("link")
            if link in seen_links:
                continue
            seen_links.add(link)
            new_items.append(item)

        if not new_items:
            logging.info(f"Page {page} contained no new items; stopping")
            break

        all_news.extend(new_items)
        logging.info(f"Fetched page {page} with {len(new_items)} new items")

        if max_pages is not None and page >= max_pages:
            break

        page += 1
        time.sleep(delay)  # wait before the next request only
    return all_news

def validate_news_data(news_items):
    """Check that the collected items are non-empty and well-formed.

    Each item must contain the keys 'title', 'link' and 'time', and its link
    must start with "https://www.yomiuri.co.jp". Checking stops at the first
    item that fails, and a warning describing that item is logged.

    Args:
        news_items: Collection of item dicts to check.

    Returns:
        True when every item passes (the total count is logged at INFO
        level); False when the collection is empty or any item fails.
    """
    if not news_items:
        logging.warning("No news items collected")
        return False

    required_keys = ('title', 'link', 'time')
    expected_prefix = "https://www.yomiuri.co.jp"

    for entry in news_items:
        if any(key not in entry for key in required_keys):
            logging.warning(f"Invalid news item format: {entry}")
            return False

        url = entry['link']
        if not url.startswith(expected_prefix):
            logging.warning(f"Invalid link format: {entry['link']}")
            return False

    total = len(news_items)
    logging.info(f"Successfully collected {total} news items")
    return True

def save_to_csv(news_items, filename='yomiuri_news.csv'):
    """Write the news items to a UTF-8 encoded CSV file.

    The file gets a header row with the columns 'title', 'link' and 'time',
    followed by one row per item. An existing file is overwritten.

    Args:
        news_items: Iterable of dicts holding the keys 'title', 'link' and
            'time'.
        filename: Path of the CSV file to write.
    """
    fieldnames = ['title', 'link', 'time']
    # An explicit encoding keeps Japanese text intact regardless of the
    # platform's locale encoding, and matches pandas.read_csv's UTF-8 default.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(news_items)
    logging.info(f"Saved {len(news_items)} items to {filename}")

def save_to_excel(news_items, filename='yomiuri_news.xlsx'):
    """Write the news items to an Excel workbook via pandas.

    Args:
        news_items: Records to place in a DataFrame, one row per item.
        filename: Path of the workbook to write; the index column is omitted.
    """
    pd.DataFrame(news_items).to_excel(filename, index=False)
    saved_count = len(news_items)
    logging.info(f"Saved {saved_count} items to {filename}")

def load_from_csv(filename='yomiuri_news.csv'):
    """Read a CSV file into a pandas DataFrame and log its row count.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    frame = pd.read_csv(filename)
    row_count = len(frame)
    logging.info(f"Loaded {row_count} items from {filename}")
    return frame

def load_from_excel(filename='yomiuri_news.xlsx'):
    """Read an Excel workbook into a pandas DataFrame and log its row count.

    Args:
        filename: Path of the workbook to read.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.
    """
    frame = pd.read_excel(filename)
    row_count = len(frame)
    logging.info(f"Loaded {row_count} items from {filename}")
    return frame

# Script entry point: collect, validate, save, then reload and preview.
if __name__ == "__main__":
    # Collect the items from every page.
    all_news = fetch_all_pages()

    # Persist the data only when it passes validation.
    if not validate_news_data(all_news):
        print("Data collection failed or validation errors occurred")
    else:
        # Save as CSV, then as an Excel workbook.
        save_to_csv(all_news)
        save_to_excel(all_news)

        # Read both files back in.
        loaded_csv_data = load_from_csv()
        loaded_excel_data = load_from_excel()

        # Preview the reloaded data and report the row counts.
        print(loaded_csv_data.head())
        print(f"Total items in CSV: {len(loaded_csv_data)}")
        print(loaded_excel_data.head())
        print(f"Total items in Excel: {len(loaded_excel_data)}")