In [3]:
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup

class Content:
    """Holds the data extracted from one crawled page.

    Attributes are plain strings: the page ``url``, its ``title`` and its
    ``body`` text.
    """

    def __init__(self, url='', title='', body=''):
        """Create a content record.

        All three arguments are optional and default to the empty string, so
        ``Content()`` still yields a blank record while
        ``Content(url, title, body)`` (the form used by ``Crawler.parse``)
        no longer raises ``TypeError``.
        """
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """Write the URL, title and body to standard output."""
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))
        

class Website:
    """Describes how to scrape one site: its name, home URL and CSS selectors."""

    def __init__(self, name, url, titleTag, bodyTag):
        """Store the site's identity and the selectors for its title and body."""
        # Identity of the site.
        self.name, self.url = name, url
        # CSS selectors handed to the crawler when extracting text.
        self.titleTag, self.bodyTag = titleTag, bodyTag

class Crawler:
    """Downloads a page and prints the title/body found with a site's selectors."""

    # Seconds to wait for a server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def getPage(self, url):
        """Fetch ``url`` and return it parsed with BeautifulSoup.

        Returns ``None`` when ``requests`` raises a ``RequestException``
        (which includes timeouts).
        """
        try:
            # Without a timeout a stalled server would block the crawl forever.
            req = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of every element matching ``selector``.

        The texts are joined with newlines; an empty string is returned when
        nothing matches.
        """
        selectedElems = pageObj.select(selector)
        if selectedElems:
            return '\n'.join(elem.get_text() for elem in selectedElems)
        return ''

    def parse(self, site, url):
        """Extract the content at ``url`` using ``site``'s selectors and print it.

        Nothing is printed when the page cannot be fetched or when either the
        title or the body comes back empty.
        """
        bs = self.getPage(url)
        if bs is None:
            return

        title = self.safeGet(bs, site.titleTag)
        body = self.safeGet(bs, site.bodyTag)
        if title != '' and body != '':
            # Content's constructor takes no required arguments, so the fields
            # are filled in after construction instead of passed positionally.
            content = Content()
            content.url = url
            content.title = title
            content.body = body
            content.print()
            

    

In [5]:
# Each row: site name, home URL, CSS selector for the title, CSS selector for the body.
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'p.story-content'],
]

# Page to parse for each site, in the same order as siteData.
urls = [
    'http://oreilly.com',
    'http://reuters.com',
    'http://www.brookings.edu',
    'http://nytimes.com',
]

websites = []
for row in siteData:
    websites.append(Website(*row))

crawler = Crawler()

# Only the first three sites are parsed, each against its paired entry in urls.
for idx in range(3):
    crawler.parse(websites[idx], urls[idx])


# 모델링 방법 1: 검색을 통한 사이트 크롤링 모델 

In [7]:
class Content:
    """One article found while searching a site for a topic."""

    def __init__(self, topic, url, title, body):
        """Record the search topic plus the article's URL, title and body text."""
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """Write the topic, URL, title and body to standard output, one line group each."""
        report = (
            ('New article found for topic: {}', self.topic),
            ('URL: {}', self.url),
            ('TITLE: {}', self.title),
            ('BODY:\n{}', self.body),
        )
        for template, value in report:
            print(template.format(value))
        
class Website:
    """Stores the structure of a website needed to search it and scrape results."""

    def __init__(self,name,url,searchUrl,resultLListing,reasultUrl,titleTag,bodyTag):
        """Keep the site's identity, search URL prefix and CSS selectors.

        ``resultLListing`` is stored as ``resultListing`` and ``reasultUrl`` as
        ``resultUrl``; the parameter spellings are kept for existing callers.
        """
        # Identity and search entry point.
        self.name, self.url, self.searchUrl = name, url, searchUrl
        # Selectors for a search-result entry and the link inside it.
        self.resultListing, self.resultUrl = resultLListing, reasultUrl
        # Selectors for the title and body of an article page.
        self.titleTag, self.bodyTag = titleTag, bodyTag
    
class Crawler:
    """Searches a site for a topic and prints every result page that has a title and body."""

    # Seconds to wait for a server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def getPage(self,url):
        """Fetch ``url`` and return it parsed with BeautifulSoup.

        Returns ``None`` when ``requests`` raises a ``RequestException``
        (which includes timeouts).
        """
        try:
            # Without a timeout a stalled server would block the crawl forever.
            req = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text,'html.parser')

    def safeGet(self,pageObj,selector):
        """Return the text of the first element matching ``selector``, or '' if none match."""
        childObj = pageObj.select(selector)
        if childObj:
            return childObj[0].get_text()
        return ''

    def search(self,topic,site):
        """Search ``site`` for ``topic`` and print each result page's content.

        The topic is percent-encoded and appended to ``site.searchUrl``. For
        every element matching ``site.resultListing``, the first link matching
        ``site.resultUrl`` is followed. A result is skipped (and the crawl
        continues with the next one) when it has no link, no ``href``, or its
        page cannot be fetched. Pages lacking a title or a body are not
        printed.
        """
        # Encode the topic so spaces and reserved characters survive in the URL.
        searchPage = self.getPage(site.searchUrl + quote(topic, safe=''))
        if searchPage is None:
            print('Something was wrong with that page or URL. Skipping!')
            return

        for result in searchPage.select(site.resultListing):
            links = result.select(site.resultUrl)
            href = links[0].attrs.get('href') if links else None
            if not href:
                # Result entry without a usable link: nothing to follow.
                continue

            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones against the site's base URL, so no per-site flag is needed.
            pageUrl = urljoin(site.url, href)
            bs = self.getPage(pageUrl)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                # Skip only this result; the remaining ones may still be fine.
                continue

            title = self.safeGet(bs,site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                # Report the resolved URL so relative links are usable as printed.
                content = Content(topic,pageUrl,title,body)
                content.print()

# Each row: name, home URL, search URL prefix, result-listing selector,
# result-link selector, title selector, body selector.
siteData = [
    [
        'O\'Reilly Media',
        'http://oreilly.com',
        'https://ssearch.oreilly.com/?q=',
        'article.product-result',
        'p.title a',
        'h1',
        'section#product-description',
    ],
    [
        'Reuters',
        'http://reuters.com',
        'http://www.reuters.com/search/news?blob=',
        'div.search-result-content',
        'h3.search-result-title a',
        'h1',
        'div.StandardArticleBody_body_1gnLA',
    ],
    [
        'Brookings',
        'http://www.brookings.edu',
        'https://www.brookings.edu/search/?s=',
        'div.list-content article',
        'h4.title a',
        'h1',
        'div.post-body',
    ],
    [
        'New York Times',
        'http://nytimes.com',
        'https://www.nytimes.com/search?query=',
        'div.SearchResults-main',
        'a.css-11rzbny',
        'h4',
        'p.story-content',
    ],
]

sites = []
for row in siteData:
    sites.append(Website(*row))

topics = ['python', 'data science']

crawler = Crawler()

# Outer loop over topics, inner loop over sites: each site is queried once per topic.
for topic in topics:
    print('GETTING INFO ABOUT: {}'.format(topic))
    for targetSite in sites:
        crawler.search(topic, targetSite)


GETTING INFO ABOUT: python
GETTING INFO ABOUT: data science


서버 부하를 줄이기 위해 topics를 순회하면서 토픽마다 각 사이트를 한 번씩 검색하는 방법이다. (만약 한 사이트에서 모든 토픽을 연달아 검색했다면 그 사이트에 요청이 몰려 부하가 증가했겠지만, 토픽별로 사이트를 순회했으므로
각 사이트에 걸리는 부하가 줄었다.)


# 2. 링크를 통한 사이트 크롤링
