In [38]:
## These days there are more and more occasions where data has to be fetched from the web.
## So this cell tries parsing an HTML document with the BeautifulSoup library.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
## Build a BeautifulSoup instance from the HTML text and the name of the parser to use.
soup = BeautifulSoup(html_doc, 'html.parser')
def prettify(input_soup, indent_level=1):
    """Print ``input_soup.prettify()`` with its indentation widened.

    The original author notes that ``prettify()`` indents by a single space
    per nesting level and found that hard to read. This helper multiplies the
    run of leading spaces on every line by ``indent_level`` before printing.

    Args:
        input_soup: Any object with a ``prettify()`` method returning a
            string (a ``BeautifulSoup`` document or tag).
        indent_level: Multiplier applied to each line's leading spaces.
            ``1`` prints the text exactly as ``prettify()`` produced it.

    Returns:
        None. The result is written to standard output, one line at a time.
    """
    # Use the argument, not the module-level ``soup``, so the helper works
    # for whichever document is passed in.
    for line in input_soup.prettify().split("\n"):
        stripped = line.lstrip(" ")
        # A line that is empty or consists only of spaces gets no extra
        # padding; otherwise count the spaces before the first other char.
        leading = len(line) - len(stripped) if stripped else 0
        print(" " * (indent_level - 1) * leading + line)
## Print the parsed document with every line's leading whitespace doubled (indent_level=2).
prettify(soup, 2)

<html>
  <head>
    <title>
      The Dormouse's story
    </title>
  </head>
  <body>
    <p class="title">
      <b>
        The Dormouse's story
      </b>
    </p>
    <p class="story">
      Once upon a time there were three little sisters; and their names were
      <a class="sister" href="http://example.com/elsie" id="link1">
        Elsie
      </a>
      ,
      <a class="sister" href="http://example.com/lacie" id="link2">
        Lacie
      </a>
      and
      <a class="sister" href="http://example.com/tillie" id="link3">
        Tillie
      </a>
      ;
and they lived at the bottom of a well.
    </p>
    <p class="story">
      ...
    </p>
  </body>
</html>


In [65]:
## A soup can be walked hierarchically, like nested objects, down to the value you want.
## Tags can be reached as if they were attributes; the original note adds that
## parent, children and similar navigation can be used as well.
separator = "=" * 20
title_tag = soup.title
print(title_tag)
## the tag's name
print(title_tag.name)
## the string inside the tag
print(title_tag.string)
print(separator)
## Look at every 'p' tag (the tag name can be passed as a plain string)
## and print only those whose first class is 'story'.
for paragraph in soup.find_all('p'):
    if paragraph['class'][0] != 'story':
        continue
    print(paragraph)
print(separator)
## Find everything in the document with id 'link3' and keep only the first hit.
link3_matches = soup.find_all(id='link3')
print(link3_matches[0])
print(separator)
## Print only the text content of the document.
print(soup.get_text())

<title>The Dormouse's story</title>
title
The Dormouse's story
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [173]:
def _parse_key_value_items(section):
    """Return a dict built from the ``key: value`` text of each ``<li>`` in *section*."""
    info = {}
    for item in section.find_all('li'):
        # Split on the first colon only, so a value that itself contains ":"
        # no longer breaks the two-way unpacking.
        key, colon, value = item.text.strip().partition(":")
        if not colon:
            # The item is not in "key: value" form; there is nothing to record.
            continue
        info[key] = value.strip()
    return info


def parsing_paper_info_from_url(input_url, timeout=30):
    """Fetch a Scopus paper page and extract its bibliographic fields.

    Args:
        input_url: URL of the paper page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block forever.

    Returns:
        A dict with the keys ``'title'``, ``'authors'`` (a list of dicts with
        ``'author'`` and ``'author_url'``), ``'affiliation'``, ``'abstract'``,
        ``'author keyword'`` (a list, empty when the page has no
        ``authorKeywords`` element), ``'publication title'``,
        ``'journal info'`` and ``'other_info'`` (a dict merged from the
        ``citationInfo`` and ``documentInfo`` list items).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError, IndexError: If an expected element (other than
            ``authorKeywords``) is missing from the page.
    """
    import requests

    response = requests.get(input_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of an opaque parsing error
    # further down when the server did not return the paper page.
    response.raise_for_status()
    paper_soup = BeautifulSoup(response.text, 'html.parser')

    r_dict = {}
    ## paper title: text of the second <h2> on the page
    r_dict['title'] = paper_soup.find_all('h2')[1].text.strip()
    ## author list: only anchors titled 'Show Author Details' are authors
    r_dict['authors'] = [
        {'author': link.text,
         # Drop a leftover "amp;" from the link target, as the original did.
         'author_url': link.get('href').replace("amp;", "")}
        for link in paper_soup.find(id='authorlist').find_all('a')
        if link.get('title') == 'Show Author Details'
    ]
    ## affiliation: kept as one string rather than a list (hard to parse further)
    r_dict['affiliation'] = paper_soup.find(id='affiliationlist').find('ul').find('li').text.strip()
    ## abstract
    r_dict['abstract'] = paper_soup.find(id='abstractSection').find('p').text.strip()
    ## author keywords; the element may be absent from the page
    author_kwds = paper_soup.find(id='authorKeywords')
    if author_kwds is None:
        r_dict['author keyword'] = []
    else:
        r_dict['author keyword'] = [kwd.text.strip() for kwd in author_kwds.find_all('span')]
    ## journal name
    r_dict['publication title'] = paper_soup.find(id='publicationTitle').text
    ## journal details such as when the paper was published
    r_dict['journal info'] = paper_soup.find(id='journalInfo').text
    ## other information: citationInfo first, then documentInfo, so a key
    ## present in both ends up with the documentInfo value
    other_info_dict = {}
    for section_id in ('citationInfo', 'documentInfo'):
        other_info_dict.update(_parse_key_value_items(paper_soup.find(id=section_id)))
    r_dict['other_info'] = other_info_dict
    return r_dict
## Paper pages to fetch; each one is parsed and its resulting dict printed.
url_lst = [
    'https://www.scopus.com/record/display.uri?eid=2-s2.0-85052489557&origin=resultslist&sort=plf-f&src=s&st1=entrepreneur&nlo=&nlr=&nls=&sid=545757d8f3aad35ae7a462c1d42ef88d&sot=b&sdt=b&sl=27&s=TITLE-ABS-KEY%28entrepreneur%29&relpos=80&citeCnt=0&searchTerm=',
    'https://www.scopus.com/record/display.uri?eid=2-s2.0-85050285124&origin=resultslist&sort=plf-f&src=s&st1=entrepreneur&nlo=&nlr=&nls=&sid=545757d8f3aad35ae7a462c1d42ef88d&sot=b&sdt=b&sl=27&s=TITLE-ABS-KEY%28entrepreneur%29&relpos=81&citeCnt=0&searchTerm=',
    'https://www.scopus.com/record/display.uri?eid=2-s2.0-85052543315&origin=resultslist&sort=plf-f&src=s&st1=entrepreneur&nlo=&nlr=&nls=&sid=545757d8f3aad35ae7a462c1d42ef88d&sot=b&sdt=b&sl=27&s=TITLE-ABS-KEY%28entrepreneur%29&relpos=82&citeCnt=0&searchTerm=',
]
separator = "=" * 20
for paper_url in url_lst:
    paper_info = parsing_paper_info_from_url(paper_url)
    print(paper_info)
    print(separator)
print("complete")

{'title': 'Musicians as entrepreneurs or entrepreneurs as musicians?(Article)', 'authors': [{'author': 'Albinsson, S.', 'author_url': 'https://www.scopus.com/authid/detail.uri?authorId=55329063400&eid=2-s2.0-85052489557'}], 'affiliation': 'Department of Economy and Society, University of Gothenburg, Sweden', 'abstract': 'This study investigates how Swedish musicians in the non-profit sector accept the “entrepreneur” label for themselves. The background section describes how identity work among “serious” musicians has, historically, made them shun any facet related to money-making. The qualitative study shows that there is some remaining reluctance. But it is obvious that freelance musicians recognize the need for entrepreneurial skills. A majority of those interviewed regard themselves as “entrepreneurs”, although in most cases it is out of necessity. It seems that the older generations are more hesitant than the younger. I conclude that it would be beneficial if musicians, during thei