# 어린이 경제신문 Web Crawling
>## Library
* re
* requests
* BeautifulSoup

>## Crawling Target
1. news_url
2. news_title
3. news_sub_title
4. news_author
5. news_data
6. news_article
7. news_img_path
8. news_source


In [35]:
%%time
import re
import requests
from bs4 import BeautifulSoup

news_list = []
data = []
start = 1
end = 3
pages = list(range(start, end+1))

for menu in range(1,4):
    for num in pages:
        url = "http://www.econoi.com/news/articleList.html?page={}&total=&box_idxno=&sc_section_code=S1N{}&view_type=sm".format(num,menu)
        req = requests.get(url)
        html = req.text
        soup = BeautifulSoup(req.text, "lxml")
        econoi = "http://www.econoi.com"
        news_list += [econoi + p.get("href") for p in soup.select("h4 > a")][:20]
        req.raise_for_status()

# url = "http://www.econoi.com/news/articleList.html?page=4&total=&box_idxno=&sc_section_code=S1N3&view_type=sm".format(num,menu)
# req = requests.get(url)
# html = req.text
# soup = BeautifulSoup(req.text, "lxml")
# econoi = "http://www.econoi.com"
# news_list += [econoi + p.get("href") for p in soup.select("h4 > a")][:20]
# for i in news_list:
# print(i)

data = []
for news_url in news_list:
    json_data = {}
    news_req = requests.get(news_url)
    news_html = news_req.text
    news_soup = BeautifulSoup(news_req.text, "lxml")

    # news_url: 기사 원문 URL
    json_data["news_url"] = news_url

    # news_title: 기사 제목
    news_title = news_soup.select_one(" header > h3 ").text
    json_data["news_title"] = news_title

# news_subtitle: 부제목
if news_soup.select_one(".subheading"):
    news_subtitle = news_soup.select_one(".subheading").text
else:
    news_subtitle = "null"    
json_data["news_subtitle"] = news_subtitle

# news_writer: 기자
news_writer = news_soup.select_one("article > ul > li").text
news_writer = re.sub("[\\n|기자명|\\t|\\r]","", news_writer).strip()
json_data["news_writer"] = news_writer
news_writer

# news_date: 날짜
news_date = news_soup.select("article > ul > li")[1].text
news_date = re.sub("[입력]","", news_date).strip()
news_date = re.sub("\.", "-", news_date).strip()
json_data["news_date"] = news_date

# news_article: 기사 내용
news_article = news_soup.select("p")
news_article = [re.sub("\\r|\\n|\\xa0","",p.text) for p in news_article]
news_article = " ".join(news_article)
json_data["news_article"] = news_article

# news_img: 기사 img 경로
if news_soup.select("figure"):
    news_img = news_soup.select_one("figure").img.get("src")
else:
    news_img = "null"
json_data["news_img"] = news_img

# news_source : 신문사
json_data["news_source"] = "어린이 경제신문"

data.append(json_data)


CPU times: user 10.6 s, sys: 131 ms, total: 10.7 s
Wall time: 1min 34s


In [36]:
len(data)

200

In [37]:
check = data[0]
for key in check:
    print(f"{key}: {check[key][:50]}")

news_url: http://www.econoi.com/news/articleView.html?idxno=
news_title: 고교학점제 대비, 다중지능 관심 가질 때
news_subtitle: 현재 초등 6학년 고교 진학 시기 시행하는
news_wrtier: 어린이 경제신문
news_date: 2022-02-23 15:36
news_article: “지난해 한 대학의 의뢰로 졸업을 앞둔 대학교 4학년 학생 120명을 대상으로 다중지능 검
news_img_path: https://cdn.econoi.com/news/photo/202202/32621_124
news_source: 어린이 경제신문


In [39]:
# json 파일로 저장
import json
with open("econoi_new.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file,  ensure_ascii=False)