## Web Scraping — Covid-19 News

In [22]:
import time
import logging
import requests
import sys
import urllib


from bs4 import BeautifulSoup
from collections import OrderedDict
from urllib.parse import urlencode

def get_list(comp, page):
    """Fetch one Sina news-search result page and parse its entries.

    Args:
        comp: Query-string fragment inserted verbatim right after ``?`` in the
            search URL (the script below passes a URL-encoded ``q=...`` pair).
        page: The result page number.

    Returns:
        newsData: An OrderedDict mapping each news title to the list
            ``[date, source, abstract, url]``, in the order the entries appear
            on the page. Entries without a title link, or whose
            "source date time" stamp cannot be parsed, are skipped.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.

    """  # view this text with help(get_list)

    newsData = OrderedDict()
    # comp -> first %s; page -> 2nd %s; col=1_7 -> financial news in sina
    href = 'http://search.sina.com.cn/?%s&range=title&c=news&num=20&col=1_7&page=%s' % (comp, page)
    # A timeout keeps a stalled connection from hanging the whole crawl.
    html = requests.get(href, timeout=10)
    # Parsing html
    soup = BeautifulSoup(html.content, 'html.parser')
    for div in soup.find_all('div', {"class": "r-info r-info2"}):
        head = div.find('h2')
        titleinfo = head.find('a') if head is not None else None
        if titleinfo is None or not titleinfo.get('href'):
            # No usable title link: nothing to key the entry on.
            continue
        # News title
        title = titleinfo.get_text()
        # News url
        url = titleinfo['href']
        # Other info: "<source> <date> <time>". Split from the right so that a
        # source name containing whitespace stays in one piece.
        stamp = head.find('span', {"class": "fgray_time"})
        parts = stamp.get_text().rsplit(None, 2) if stamp is not None else []
        if len(parts) != 3:
            logging.warning('Skipping entry with unparsable source/date: %s', url)
            continue
        # The time-of-day field is not stored (and must not shadow `time`).
        source, date, _ = parts
        # News abstract (empty when the result has no summary paragraph)
        summary = div.find('p', {"class": "content"})
        abstract = summary.get_text() if summary is not None else ''
        newsData[title] = [date, source, abstract, url]
    return newsData



if __name__ == "__main__":
    today = time.strftime("%Y%m%d")
    compRawStr = '新冠肺炎疫情'
    # Dealing with character encoding: the keyword is GBK-encoded before being
    # URL-encoded into the "q=..." query fragment.
    comp = compRawStr.encode('gbk')
    d = {'q': comp}
    pname = urlencode(d)
    filename = 'Covid19news_' + today + '.txt'
    # Scrape result pages 1-19, printing each record and writing it as one
    # line of the output file, with "\001" as the field separator.
    # The `with` block closes the file even if a page fails to download/parse.
    with open(filename, 'w', encoding="utf-8") as News:
        for page in range(1, 20):
            newsData = get_list(pname, page)
            for ky in newsData:
                each = '\001'.join([ky] + newsData[ky])
                print(each)
                News.write(each + '\n')

推特CEO将捐10亿美元股票 资助防控新冠肺炎疫情的工作2020-04-08证券时报e公司原标题：推特CEO将捐10亿美元股票 资助防控新冠肺炎疫情的工作  来源：澎湃新闻                                                           e公司讯 4月7日 推特（Twitter）首席执行官（CEO）杰克·多西（Jackhttps://finance.sina.com.cn/roll/2020-04-08/doc-iirczymi5176533.shtml
中巴军队召开新冠肺炎疫情防控经验分享视频会议2020-04-08中国青年报分别在北京、武汉和拉瓦尔品第三地召开新冠肺炎疫情防控经验分享视频会议  https://k.sina.com.cn/article_1726918143_66eeadff02000ww2l.html?from=mil
国务院联防联控机制印发《关于进一步做好重点场所重点单位重点人群新冠肺炎疫情防控相关工作的通知》2020-04-08环球网国务院联防联控机制印发《关于进一步做好重点场所重点单位重点人群新冠肺炎疫情防控相关工作的通知》 强调结合当前疫情防控形势 落实分区分级防控要求 推进生产生活秩序逐步恢复   https://k.sina.com.cn/article_1686546714_6486a91a020011a6c.html?from=news&subch=onews
中共中央政治局常务委员会召开会议，分析国内外新冠肺炎疫情防控和经济运行形势...2020-04-08新浪财经中共中央政治局常务委员会召开会议 分析国内外新冠肺炎疫情防控和经济运行形势 研究部署落实常态化疫情防控举措全面推进复工复产工作 中共中央总书记习近平主持会议 https://finance.sina.com.cn/7x24/2020-04-08/doc-iircuyvh6640750.shtml
上海举办抗击新冠肺炎疫情美术摄影主题展2020-04-08新华网原标题：上海举办抗击新冠肺炎疫情美术摄影主题展 https://news.sina.com.cn/o/2020-04-08/doc-iircuyvh6636202.shtml
东盟-中日韩特别会议将就应对新冠肺炎疫情形成

In [28]:
# Close the output file handle `News` opened by the list-scraping cell above
# (calling close() on an already-closed file is a no-op).
News.close()

In [33]:
import logging
import requests
import sys

from bs4 import BeautifulSoup


def get_body(href):
    """Function to retrieve news content given its url.

    Args:
        href: url of the news to be crawled.

    Returns:
        content: the text of every <p> inside the page's
            ``<div id="artibody">``, each stripped of surrounding whitespace
            and embedded newlines, concatenated without a separator. An empty
            string is returned when the page has no such div.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    html = requests.get(href, timeout=10)
    soup = BeautifulSoup(html.content, 'html.parser')
    div = soup.find('div', {"id": "artibody"})
    if div is None:
        # Page without the expected article container: report it and return
        # empty content instead of failing on a None lookup.
        logging.warning('No article body found at %s', href)
        return ''
    return ''.join(
        p.get_text().strip().replace("\n", "") for p in div.find_all('p')
    )



if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    # Getting and printing content for each url in the crawled web list pages.
    # `filename` is the "\001"-separated list file produced by the list-scraping
    # step; it was written as UTF-8, so it is read back (and the enriched
    # output is written) with the same explicit encoding rather than the
    # platform default. The `with` block closes both files even on error.
    with open('covid19_sorted.text', 'w', encoding='utf-8') as article, \
            open(filename, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\001')
            if len(fields) != 5:
                # Blank or malformed record: skip it rather than abort the run.
                logging.warning('Skipping malformed line: %r', line)
                continue
            title, date, source, abstract, href = fields
            # Printing progress onto console
            logging.info('Scraping %s', href)
            content = get_body(href)
            # Original five fields plus the article body, same separator.
            record = '\001'.join([title, date, source, abstract, href, content])
            print(record)
            article.write(record + '\n')