## 웹 크롤링의 핵심은 재귀이다.
## 재귀는  URL에서 페이지를 가져오고, 그 페이지를 검사해서 다른 URL..을 찾고 다시 그 페이지를 가져오기를 무한 반복한다.
# 주의할 점
## 반드시 대역폭에 세심한 주의를 기울여 서버 부하를 줄일 방법을 강구해야한다.


# 단일 도메인으로의 이동
## 위키백과를 크롤링해보자. (위키백과는 API도 제공하니까 크롤링만 하지말고 API가 되면 사용하자)

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

for link in bs.find('div', {'id':'bodyContent'}).findAll('a', href=True):
    if 'href' in link.attrs:
        print(link.attrs['href'])
    

/wiki/Wikipedia:Protection_policy#semi
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_in_2022.jpg
/wiki/Peabody_Awards
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
http://baconbros.com
#cite_note-1
#cite_note-actor-2
/wiki/Leading_man
/wiki/Character_actor
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/National_Lampoon%27s_Animal_House
/wiki/Footloose_(1984_film)
/wiki/Diner_(1982_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Frost/Nixon_(film)
/wiki/Friday_the_13th_(1980_film)
/wiki/Tremors_(1990_film)
/wiki/The_River_Wild
/wiki/The_Woodsman_(2004_film)
/wiki/Crazy,_Stupid,_Love
/wiki/X-Men:_First_Class
/wiki/Patriots_Day_(film)
/wiki/Losing_Chase
/wiki/Loverboy_(2005_film)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Miniseries_or_Television_Film
/wiki/Screen_Actors_Guild

# 최하위 도메인까지 내려가는 함수( 특별한 형태의 URL을 입력받아서)

In [11]:
def getLink(article_url):
    html = urlopen('http://en.wikipedia.org{}'.format(article_url))
    bs = BeautifulSoup(html, 'html.parser')
    urls =bs.find('div', {'id':'bodyContent'}).findAll('a', href=True)
    for url in urls:
        new_url =getLink(url)
        if new_url is not None: 
            print(new_url)
            getLink(new_url)

getLink('/wiki/Kevin_Bacon')        

InvalidURL: URL can't contain control characters. 'en.wikipedia.org<a href="' (found at least ' ')

In [17]:
import re
from random import random


def getLinks(article_url):
    html = urlopen('http://en.wikipedia.org{}'.format(article_url))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).findAll('a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')

# 랜덤으로 하나의 링크를 선택하고 출력하고, 그 링크의 하위 링크를 가져와서 다시 랜덤으로 선택하는 방식으로 계속 반복한다.
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
    


AttributeError: 'builtin_function_or_method' object has no attribute 'randint'

In [18]:
# 중복된 URL을 방지하기 위해 set을 사용한다.
# 돌아다니며 모든 하위 URL 다 조사합니다. (python은 재귀를 1000단계까지 허용합니다)
pages = set()

def getLinks(page_url):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(page_url))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                new_page = link.attrs['href']
                print(new_page)
                pages.add(new_page)
                getLinks(new_page)

In [19]:
pages = set()
def getLinks(pageUrl):
    global pages 
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    
    try: 
        print(bs.h1.get_text())
        print(bs.find(id='mw-content-text').findAll('p')[0])
        print(bs.find(id = 'ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! No worries though!')
    for link in bs.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print('-----\n'+newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')

SyntaxError: incomplete input (339627775.py, line 2)

In [34]:
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from urllib.request import urlopen
import datetime
import re
import random

pages = set()
random.seed(datetime.datetime.now().timestamp())  # 정수형 seed 사용

# 페이지에서 발견된 내부 링크를 모두 목록으로 만든다.
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    # /로 시작하는 링크를 모두 찾는다.
    for link in bs.findAll('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

# 페이지에서 발견된 외부 링크를 모두 목록으로 만든다.
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    # 현재 URL을 포함하지 않으면서 http나 www로 시작하는 링크를 모두 찾는다.
    for link in bs.findAll('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        if len(internalLinks) == 0:
            return None
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    if externalLink is None:
        print("No external links found.")
        return
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)

# 로 시작해서 외부 링크로 무작위로 이동한다. 
followExternalOnly('http://oreilly.com')


Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://support.google.com/youtube?p=korea_report
Random external link is: https://www.google.co.kr/intl/en/about/products?tab=uh
Random external link is: https://youtube-global.blogspot.com/2015/02/youtube-kids.html
Random external link is: https://www.youtube.com/@YouTubeLiaison
Random external link is: https://developers.google.com/youtube
Random external link is: http://stackoverflow.com/questions/ask?tags=youtube-api


HTTPError: HTTP Error 403: Forbidden

In [None]:
allExternalLinks = set()
allInternalLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme, urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)
    
    for link in externalLinks:
        if link not in allExternalLinks:
            allExternalLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allInternalLinks:
            allInternalLinks.add(link)
            getAllExternalLinks(link)

allInternalLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')
