https://mainichi.jp/editorial/

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3

def create_db_and_schema(db_path):
    """Create the SQLite database file and the ``editorials`` table.

    The table is created only if it does not exist yet, so calling this
    function repeatedly is safe.

    Args:
        db_path: Path of the SQLite database file to open (created by
            SQLite if it does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS editorials (
                title TEXT,
                link TEXT,
                text_ellipsis TEXT,
                date TEXT,
                content TEXT
            )
        ''')
        conn.commit()
    finally:
        # Close the connection even when the DDL statement fails, so the
        # database file handle is never leaked.
        conn.close()

def fetch_mainichi_editorials(url):
    """Fetch one listing page and extract the editorials it contains.

    Every ``<li>`` element that contains an ``.articlelist-title`` node and
    an anchor with an ``href`` is turned into one entry.

    Args:
        url: URL of the listing page to download.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``text_ellipsis``
        and ``date``. Missing summary or date values are stored as ``"N/A"``.
        An empty list is returned when the HTTP request fails.
    """
    # Function-scope import so this block does not depend on edits to the
    # file-level import section.
    from urllib.parse import urljoin

    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    editorial_list = []
    for article in soup.find_all('li'):
        title_tag = article.select_one('.articlelist-title')
        if title_tag is None:
            continue  # skip list items that have no title

        # Skip items without a usable link instead of crashing on
        # ``None['href']`` / a missing ``href`` attribute.
        anchor = article.find('a', href=True)
        if anchor is None:
            continue

        date_tag = article.select_one('.articletag-date')
        summary_tag = article.select_one('.text-ellipsis-2')
        editorial_list.append({
            'title': title_tag.text.strip(),
            # urljoin also copes with absolute and protocol-relative hrefs,
            # which plain string concatenation would turn into broken URLs.
            'link': urljoin("https://mainichi.jp", anchor['href']),
            'text_ellipsis': summary_tag.text.strip() if summary_tag else "N/A",
            'date': date_tag.text.strip() if date_tag else "N/A",
        })
    return editorial_list

def fetch_article_content(url):
    """Download an article page and return the text of its body.

    The body is looked up with the CSS selector ``#main > div > article``;
    the text of every ``<p>`` inside the first match is joined with
    newlines.

    Args:
        url: URL of the article page.

    Returns:
        The article text as a string. This function never raises: when the
        body cannot be found or the request/parsing fails, a (Korean)
        message string describing the problem is returned instead, so
        callers cannot distinguish it from real content by type.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # A timeout keeps a stalled connection from hanging the crawl forever;
        # a timeout error is a RequestException and is reported below.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        article_body = soup.select_one("#main > div > article")
        if article_body is None:
            return "본문을 찾을 수 없습니다."
        paragraphs = article_body.find_all('p')
        return '\n'.join(p.get_text(strip=True) for p in paragraphs)
    except requests.RequestException as e:
        return f"요청 중 오류 발생: {e}"
    except Exception as e:
        # Deliberately broad: the caller stores whatever string is returned,
        # so a single bad page must not abort the whole crawl.
        return f"예상치 못한 오류 발생: {e}"

def save_to_db(editorials, db_path):
    """Append editorial records to the ``editorials`` table.

    Args:
        editorials: Records to store, in any form accepted by
            ``pandas.DataFrame`` (the caller in this file passes a list of
            dicts). The resulting columns become the table columns.
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        editorials_df = pd.DataFrame(editorials)
        editorials_df.to_sql('editorials', conn, if_exists='append', index=False)
    finally:
        # Close the connection even when the insert fails, so the database
        # file handle is never leaked.
        conn.close()

def collect_all_editorials(base_url, db_path, max_pages=None):
    """Crawl the paginated editorial listing and store every article.

    Pages are requested as ``{base_url}{page}?&_=1722162257351`` starting at
    page 1. For each editorial the article body is downloaded and the record
    is written to the database immediately, so progress is kept if the crawl
    is interrupted.

    The crawl stops when a page yields no editorial that has not been seen
    already (this covers both an empty page and a server that keeps
    returning the same page), or when ``max_pages`` pages were processed.

    Args:
        base_url: Listing URL to which the page number is appended.
        db_path: Path of the SQLite database file.
        max_pages: Optional upper bound on the number of pages to crawl;
            ``None`` (the default) means no limit.
    """
    seen_links = set()
    page = 1  # first page to request
    while max_pages is None or page <= max_pages:
        print(f"Collecting page {page}...")  # progress output
        url = f"{base_url}{page}?&_=1722162257351"
        editorials = fetch_mainichi_editorials(url)

        stored_any = False
        for editorial in editorials:
            link = editorial['link']
            if link in seen_links:
                continue  # already stored during this crawl
            seen_links.add(link)
            editorial['content'] = fetch_article_content(link)
            save_to_db([editorial], db_path)
            stored_any = True

        # Nothing new on this page: either the listing ended or the server
        # is repeating pages, so continuing would loop forever.
        if not stored_any:
            break
        page += 1

# Base URL of the editorial listing; the page number is appended to it
base_url = "https://mainichi.jp/editorial/"
# Path of the SQLite database file
db_path = 'mainichi_editorials.db'

# Create the database and its schema
create_db_and_schema(db_path)

# Collect the editorial listing and store it in the database
collect_all_editorials(base_url, db_path)

# Inspect the SQLite DB
def verify_db(db_path):
    """Load the whole ``editorials`` table, display it and return it.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        A pandas DataFrame holding every row of the ``editorials`` table.
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql('SELECT * FROM editorials', conn)
    finally:
        # Close the connection even when the query fails.
        conn.close()
    # `display` is not defined in this file; presumably the IPython/Jupyter
    # built-in, since this code comes from a notebook cell.
    display(df)
    return df

# Verify the database contents
verify_db(db_path)


In [None]:
# SQLite database file name
DB_NAME = 'mainichi_editorials.db'

# Open the database connection; it is closed in the `finally` clause below
# even when the query fails.
conn = sqlite3.connect(DB_NAME)
try:
    # The cursor is not used in this cell; the name is kept in case other
    # cells rely on it.
    cursor = conn.cursor()

    # Load the table contents into a DataFrame
    df = pd.read_sql('SELECT * FROM editorials', conn)
finally:
    conn.close()
df