In [None]:
# Add a few attributes to the Content and Website objects
class Content:
    """One article found on a website while searching for a topic."""

    def __init__(self, topic, url, title, body):
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """Write the topic, URL, title and body to standard output."""
        labelled_fields = (
            ('\nNew artice found for topic: ', self.topic),
            ('URL: ', self.url),
            ('TITLE: ', self.title),
            ('BODY: \n', self.body),
        )
        # One print call per field, in the same order as the labels above.
        for label, value in labelled_fields:
            print(f'{label}{value}')

class Website:
    """Holds the URLs and selectors describing how to search one website."""

    # searchUrl: where search results come from once the search term is
    #            appended to this URL
    # resultListing: the box that holds the information about each result
    # resultUrl: tag information used to extract the exact URL from a result
    # absoluteUrl: boolean telling whether the result URLs are absolute
    #              or relative
    def __init__(self, name, url, searchUrl, resultListing,
                 resultUrl, absoluteUrl, titleTag, bodyTag):
        # Site identity
        self.name, self.url = name, url
        # How to run a search and read its result list
        self.searchUrl, self.resultListing = searchUrl, resultListing
        self.resultUrl, self.absoluteUrl = resultUrl, absoluteUrl
        # Where the title and body sit on a result page
        self.titleTag, self.bodyTag = titleTag, bodyTag
        
import requests
from bs4 import BeautifulSoup

class Crawler:
    """Searches a website for a topic and prints every article it can parse."""

    def getPage(self, url, timeout=10):
        """Download *url* and return it parsed with BeautifulSoup.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl indefinitely.

        Returns:
            The parsed page, or None when the request raises a
            requests.exceptions.RequestException (timeouts included).
        """
        try:
            req = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of the first element matching *selector*.

        Returns an empty string when nothing on the page matches.
        """
        matches = pageObj.select(selector)
        if matches:
            return matches[0].get_text()
        return ''

    def search(self, topic, site):
        """Search *site* for *topic* and print every usable result page.

        A result is printed only when both its title and its body are
        non-empty. Results that have no link, or whose page cannot be
        downloaded, are skipped so the remaining results are still processed.

        Args:
            topic: Search term appended to ``site.searchUrl``.
            site: Object providing searchUrl, resultListing, resultUrl,
                absoluteUrl, url, titleTag and bodyTag.
        """
        searchPage = self.getPage(site.searchUrl + topic)
        if searchPage is None:
            # The search itself failed; there is no result list to walk.
            print('Something was wrong with the search page or URL. SKIP!')
            return

        for result in searchPage.select(site.resultListing):
            links = result.select(site.resultUrl)
            url = links[0].attrs.get('href') if links else None
            if not url:
                # Listing entry without a usable link: nothing to fetch.
                continue

            # Relative result URLs need the site's base URL in front.
            pageUrl = url if site.absoluteUrl else site.url + url
            page = self.getPage(pageUrl)
            if page is None:
                print("Something was wrong with that page or URL. SKIP!")
                # Skip only this result; the others may still be fine.
                continue

            title = self.safeGet(page, site.titleTag)
            body = self.safeGet(page, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, url, title, body)
                content.print()

In [None]:
# Example
crawler = Crawler()

# Each row supplies the Website constructor arguments in order: name, url,
# searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag.
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]

# Build the complete site list before crawling, so every site is searched
# exactly once per topic rather than again each time a new site is appended.
sites = [Website(*row) for row in siteData]

topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        # Print the site name; the object itself has no readable repr.
        print(targetSite.name)
        crawler.search(topic, targetSite)