# Extract article on brookings.edu

In [43]:
import time
import requests
from bs4 import BeautifulSoup

In [16]:
class Content:
	def __init__(self, url, title, body) -> None:
		self.url = url
		self.title = title
		self.body = body


def scrapBrookings(url):
	req = requests.get(url)
	bsObj = BeautifulSoup(req.text, 'html.parser')
	title = bsObj.find('h1').text
	body = bsObj.find('div', {'class': 'post-body'}).text
	return Content(url, title, body)

In [None]:
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'

content = scrapBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
# print(content.body)

---
# Reuter Contents 

In [None]:
url = 'https://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0'

---
# Internal-link, External-link

In [19]:
class Content:
	def __init__(self, topic, url, title, body):
		self.topic = topic
		self.url = url
		self.title = title
		self.body = body

	def __repr__(self):
		'New article found for topic: {}\n'.format(self.topic)
		'URL: {}\n'.format(self.url)
		'TITLE: {}\n'.format(self.title)
		'BODY:\n{}\n'.format(self.body)


class Website():
	def __init__(self, name, url, searchURL, resultList,
				 resultURL, absoluteURL, titleTag, bodyTag):
		self.name = name
		self.url = url
		self.searchULR = searchURL
		self.resultList = resultList
		self.resultURL = resultURL
		self.absoluteURL = absoluteURL
		self.titleTag = titleTag
		self.bodyTag = bodyTag


class Crawler:
	def getPage(self, url):
		try:
			req = requests.get(url)
		except requests.exceptions.RequestException:
			return None
		return BeautifulSoup(req.text, 'html.parser')


	def search(self, topic, site):
		# site: <class Website> object
		print('searchURL + topic:', site.searchURL + topic)

		bsObj = self.getPage(site.searchURL + topic)
		searchResults = bsObj.select(site.resultList)

		for result in searchResults:
			url = result.select(site.resultURL)[0].attrs['href']
			bsObj = self.getPage(self.url + url)
			if bsObj is None:
				print('Something was wrong with that page or URL')
				return
			title = self.safeGet(bsObj, site.titleTag)
			body = self.getAllBody(bsObj, site.bodyTag)

			if title != '' and body != '':
				content = Content(topic, url, title, body)
				content.__repr__

In [28]:
siteData1 = [
	'Reuters',  # name
	'https://www.reuters.com',  # url
	'https://www.reuters.com/search/news?blob=',  # searchULR
	'div.search-result-content',  # resultList
	'h3.search-result-title > a',  # resultURL
	False,  # absoluteURL
	'h1',  # titleTag
	'p.Paragraph-paragraph-2Bgue ArticleBody-para-TD_9x'  # bodyTag
]

sites = (Website(siteData1[0], siteData1[1], siteData1[2], siteData1[3],\
				 siteData1[4], siteData1[5], siteData1[6], siteData1[7]))

sites.url

'https://www.reuters.com'

---
# Crawling by a link

In [40]:
url = 'https://www.reuters.com'
req = requests.get(url)
bsObj = BeautifulSoup(req.text, 'html.parser')

data_testid_links = bsObj.find_all('a', attrs={'data-testid': ['Heading', 'Link']})

link_list = set()
for link in data_testid_links:
	link_list.add(link)

print('link_list length is', len(link_list))
for i, link in enumerate(link_list):
	print('[{:4}]: {}'.format(i, link['href']))

link_list length is 80
[   0]: /world/us/
[   1]: /world/europe/russia-says-it-hit-military-boat-odesa-port-ukraine-2022-07-24/
[   2]: /investigates/
[   3]:  https://www.refinitiv.com/en/financial-data
[   4]: /world/sherman-kennedy-visit-solomons-where-fathers-fought-us-now-vies-with-china-2022-07-24/
[   5]: /lifestyle/sports/nigerias-amusan-breaks-100-metres-hurdles-world-record-2022-07-25/
[   6]: /world/
[   7]: /technology/
[   8]: /lifestyle/sports/quiet-man-vingegaard-wins-maiden-tour-de-france-title-2022-07-24/
[   9]: https://www.reuters.com/fact-check/
[  10]: /lifestyle/sports/mu-holds-off-hodgkinson-take-800m-gold-2022-07-25/
[  11]:  https://www.refinitiv.com/en/products/refinitiv-workspace
[  12]: /world/asia-pacific/
[  13]: /breakingviews/
[  14]: /info-pages/advertising-guidelines/
[  15]: https://trdigital.iad1.qualtrics.com/jfe/form/SV_8kte8gArGyCGVhz
[  16]: https://www.refinitiv.com/en/products/world-check-kyc-screening
[  17]: /world/europe/god-give-us-rain-rom

---
# Webcrawling with Selenium

In [21]:
import os
import time
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver


def coffeebean_store(store_list):
	url = 'https://www.coffeebeankorea.com/store/store.asp'
	driver = webdriver.Chrome()

	for i in range(1, 11):
		driver.get(url)
		time.sleep(1)

		driver.execute_script('storePop2(%d)' % i)
		time.sleep(1)
		try:
			html = driver.page_source
			bsObj = BeautifulSoup(html, 'html.parser')
		# except Exception as e:
		# 	print(e)
		# 	continue
		# else:
			store_name = bsObj.select_one('div.store_txt > h2').text
			store_info = bsObj.select('div.store_txt > table.store_table > tbody > tr > td')
			store_addr_list = list(store_info[2])
			store_addr = store_addr_list[0]  # 매장 주소
			store_phone = store_info[3].text  # 매장 전화번호
			print('{} {} {}'.format(i+1, store_name, store_addr, store_phone))
			store_list.append([store_name, store_addr, store_phone])
		except:
			continue


def main():
	store_info = []
	coffeebean_store(store_info)

	# set store_info data as a DataFrame
	coffeebean_table = pd.DataFrame(store_info, columns=('매장이름', '주소', '전화번호'))
	print(coffeebean_table.head())

	coffeebean_table.to_csv('coffeebean_store.csv', mode='w', encoding='utf-8', index=True)


if __name__ == '__main__':
	main()

2 학동역 DT점 서울시 강남구 학동로 211 1층  
4 차병원점 서울시 강남구 논현로 566 강남차병원1층  
7 강남대로점 서울시 서초구 강남대로 369 1층  
      매장이름                         주소          전화번호
0  학동역 DT점       서울시 강남구 학동로 211 1층    02-3444-9973
1     차병원점  서울시 강남구 논현로 566 강남차병원1층     02-538-7615
2    강남대로점      서울시 서초구 강남대로 369 1층     02-588-5778


In [51]:
os.chdir('C:\\Users\\lh\\Downloads\\chromedriver_win32')  # excutable path will be deprecated
driver = webdriver.Chrome()
driver.get('https://www.coffeebeankorea.com/store/store.asp')
driver.execute_script('storePop2(1)')  # 학동역(DT)점

html = driver.page_source
bsObj = BeautifulSoup(html, 'html.parser')
# print(bsObj.prettify())

store_name = bsObj.select('div.store_txt > p.name > span')
store_name_list = []
for name in store_name:
	store_name_list.append(name.get_text())

print('[매장 개수] :', len(store_name_list))
print(*store_name_list, sep='\n')

store_address = bsObj.select('p.address > span')
store_address_list = []

for addr in store_address:
	store_address_list.append(addr.get_text())
print()
print('[매장 주소] :', *store_address_list, sep='\n')