# 어린이 경제신문 Web Crawling
>## Library
* re
* requests
* BeautifulSoup

>## Crawling Target
1. news_url
2. news_title
3. news_subtitle
4. news_author
5. news_data
6. news_article
7. news_img_path
8. news_source


In [2]:

%%time
import re
import requests
from bs4 import BeautifulSoup

# Collect article URLs from the paginated article-list pages.
# Site root; the hrefs found on the list pages are appended to it.
econoi = "http://www.econoi.com"
news_list = []
start = 1
end = 7
pages = range(start, end + 1)

for num in pages:
    url = "http://www.econoi.com/news/articleList.html?page={}&total=928&box_idxno=&sc_section_code=S1N1&view_type=sm".format(num)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    req = requests.get(url, timeout=10)
    # Check the status BEFORE parsing so an HTTP error page is never scraped
    # for links.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "lxml")
    # Keep at most the first 20 headline links of each list page.
    news_list += [econoi + p.get("href") for p in soup.select("h4 > a")][:20]

# for i in news_list:
#   print(i)

# Scrape every article page listed in `news_list` into one dict per article
# and collect the dicts in `data`.

# Patterns are compiled once, outside the loop.
# Strip the literal label "기자명" plus newline/tab/carriage-return characters.
# (A character class such as "[기자명]" would instead delete those syllables
# anywhere in the byline and corrupt names that contain them.)
author_noise = re.compile(r"기자명|[\n\t\r]")
# Line breaks and non-breaking spaces removed from each paragraph.
article_noise = re.compile(r"\r|\n|\xa0")

data = []
for news_url in news_list:
    json_data = {}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    news_req = requests.get(news_url, timeout=10)
    # Stop on an HTTP error instead of parsing an error page.
    news_req.raise_for_status()
    news_soup = BeautifulSoup(news_req.text, "lxml")

    # news_url: URL of the original article
    json_data["news_url"] = news_url

    # news_title: article headline
    news_title = news_soup.select_one(" header > h3 ").text
    json_data["news_title"] = news_title

    # news_subtitle: sub-headline, or the string "null" when the page has none
    subheading = news_soup.select_one(".subheading")
    news_subtitle = subheading.text if subheading is not None else "null"
    json_data["news_subtitle"] = news_subtitle

    # The first two items of the article info list hold the byline and the date.
    info_items = news_soup.select("article > ul > li")

    # news_author: reporter
    news_author = author_noise.sub("", info_items[0].text).strip()
    json_data["news_author"] = news_author

    # news_data: date, with the "입력" label removed and dots turned into dashes
    news_data = info_items[1].text.replace("입력", "").strip()
    news_data = news_data.replace(".", "-").strip()
    json_data["news_data"] = news_data

    # news_article: text of every <p> element, joined with single spaces
    paragraphs = [article_noise.sub("", p.text) for p in news_soup.select("p")]
    news_article = " ".join(paragraphs)
    json_data["news_article"] = news_article

    # news_img_path: src of the image in the first <figure>, or the string
    # "null" when there is no <figure> or it contains no <img>
    figure = news_soup.select_one("figure")
    image = figure.img if figure is not None else None
    news_img_path = image.get("src") if image is not None else "null"
    json_data["news_img_path"] = news_img_path

    # news_source: name of the newspaper
    json_data["news_source"] = "어린이 경제신문"

    data.append(json_data)


CPU times: user 8.27 s, sys: 123 ms, total: 8.39 s
Wall time: 1min 41s


In [3]:
# Number of article records collected in `data`.
len(data)

140

In [13]:
# Preview one record: print each field name with at most the first
# 50 characters of its value.
check = data[10]
for field, value in check.items():
    print(f"{field}: {value[:50]}")

news_url: http://www.econoi.com/news/articleView.html?idxno=
news_title: 2021 CEO 클럽 31명, 창업에 도전!
news_subtitle: 달고나, 비빔밥, 주식 교육 등 다양한 사업 전개
news_author: 어린이 경제신문
news_data: 2021-12-09 09:51
news_article: ‘좋아하고, 잘하는 것은 무엇인가요?’ ‘지금 유행하는 게 뭔가요?’창업에서 가장 중요한 
news_img_path: https://cdn.econoi.com/news/photo/202112/32267_121
news_source: 어린이 경제신문


In [None]:
# Save the crawled records to a JSON file.
import json

with open("econoi.json", "w", encoding="utf-8") as out_file:
    # ensure_ascii=False writes the Korean text as-is instead of \u escapes.
    serialized = json.dumps(data, ensure_ascii=False)
    out_file.write(serialized)