#  Data Programming - Data Gathering, Cleaning, Transforming

## Practice: Web_crawling_scraping

---
### HTTP Request  
__Basic Example__  
_slide 7_

~~~ python
import requests
r = requests.get('https://api.github.com/user', auth=('user', 'pass'))
print(r.status_code)
print(r.headers['content-type'])
print(r.json())
~~~

In [1]:
# Your code here
import requests

# requests.get() sends an HTTP GET to the given URL; `auth` supplies the
# (username, password) pair. The timeout keeps the cell from hanging forever
# if the server never answers.
r = requests.get('https://api.github.com/user', auth=('user', 'pass'), timeout=10)
print(type(r))
print(r)

print(r.status_code)
print(r.headers['content-type'])
print(r.json())

<class 'requests.models.Response'>
<Response [401]>
401
application/json; charset=utf-8
{'message': 'Requires authentication', 'documentation_url': 'https://docs.github.com/rest/reference/users#get-the-authenticated-user'}


In [None]:
# GET method: requests.get()
# POST method: requests.post()
# PUT method: requests.put()
# DELETE method: requests.delete()

__With Parameters__  
_slide 8_
~~~python
import requests
params = { "q": "cmsc320" }
r = requests.get("https://www.google.com", params=params)
print(r)
~~~

In [11]:
# Your code here
import requests

params = {"q": "cmsc320"}
# `params` is encoded into the URL query string (?q=cmsc320). The timeout
# keeps the cell from hanging forever if the server never answers.
r = requests.get("https://www.google.com", params=params, timeout=10)
print(r)
print(r.status_code)
print(r.headers['content-type'])  # the content type is text, so the body can be read via r.text
print()
print(r.text)  # prints the entire response body

<Response [200]>
200
text/html; charset=ISO-8859-1

<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="u949FzFBIsStYWtUvxRk2A">(function(){window.google={kEI:'YkvZY5LaFLiWwbkP3b6TqA0',kEXPI:'0,1359409,6059,206,2414,2390,2316,383,246,5,1129120,1197716,380774,16115,19398,9286,22430,1362,12318,17581,4998,13228,3847,10622,22741,5081,1593,1279,2742,149,562,541,840,1983,4314,3514,606,2023,2297,14670,3227,2845,7,29075,3041,1654,1851,6397,8927,432,3,346,1244,1,5445,148,11327,2648,4,1528,2304,7039,22023,9872,3193,11442,2216,2980,1457,9358,

---
### CSV Files  
_slide 10_
~~~ python
import csv
with open("../data/tips.csv", "rt") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
~~~

In [12]:
# Mount Google Drive at /content/drive so that later cells can open the data
# files stored under it. google.colab is a Colab-specific module.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# Your code here
import csv

# The csv module expects its file to be opened with newline="" so that line
# endings embedded inside quoted fields reach the reader untouched.
with open("/content/drive/MyDrive/1. Python, Data Processing/data/tips.csv", "rt", newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)  # each row is a list of strings; the first row is the header

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
['16.99', '1.01', 'Female', 'No', 'Sun', 'Dinner', '2']
['10.34', '1.66', 'Male', 'No', 'Sun', 'Dinner', '3']
['21.01', '3.5', 'Male', 'No', 'Sun', 'Dinner', '3']
['23.68', '3.31', 'Male', 'No', 'Sun', 'Dinner', '2']
['24.59', '3.61', 'Female', 'No', 'Sun', 'Dinner', '4']
['25.29', '4.71', 'Male', 'No', 'Sun', 'Dinner', '4']
['8.77', '2.0', 'Male', 'No', 'Sun', 'Dinner', '2']
['26.88', '3.12', 'Male', 'No', 'Sun', 'Dinner', '4']
['15.04', '1.96', 'Male', 'No', 'Sun', 'Dinner', '2']
['14.78', '3.23', 'Male', 'No', 'Sun', 'Dinner', '2']
['10.27', '1.71', 'Male', 'No', 'Sun', 'Dinner', '2']
['35.26', '5.0', 'Female', 'No', 'Sun', 'Dinner', '4']
['15.42', '1.57', 'Male', 'No', 'Sun', 'Dinner', '2']
['18.43', '3.0', 'Male', 'No', 'Sun', 'Dinner', '4']
['14.83', '3.02', 'Female', 'No', 'Sun', 'Dinner', '2']
['21.58', '3.92', 'Male', 'No', 'Sun', 'Dinner', '2']
['10.33', '1.67', 'Female', 'No', 'Sun', 'Dinner', '3']
['16.29', '3.71

---
### JSON Files & Strings  
_slide 14_
~~~ python
import json
import requests

r = requests.get('https://api.github.com/events')
data = json.loads(r.text)

print(data[0]['repo']['url'])
print(data[0]['type'])
~~~

In [21]:
# Your code here
import json
import requests

# The timeout keeps the cell from hanging forever if the server never answers.
r = requests.get('https://api.github.com/events', timeout=10)
print(r.headers['content-type'])  # the body is JSON, so decode it as JSON
print()
data = r.json()  # Option 1: let requests decode the JSON body
print(data)

data = json.loads(r.text)  # Option 2: decode the raw response text ourselves
# json.loads: JSON string   -> Python object
# json.dumps: Python object -> JSON string (the opposite direction; it does
#             not read JSON)
print(data)

print(data[0])
print(data[0]['repo']['url'])
print(data[0]['type'])

# Serialize the parsed object back into a JSON string. Passing r.text here
# would JSON-encode text that is already JSON, producing one big quoted and
# escaped string literal instead.
print(json.dumps(data))

application/json; charset=utf-8

{'id': '26776006360', 'type': 'PushEvent', 'actor': {'id': 77367653, 'login': 'jnsgdm', 'display_login': 'jnsgdm', 'gravatar_id': '', 'url': 'https://api.github.com/users/jnsgdm', 'avatar_url': 'https://avatars.githubusercontent.com/u/77367653?'}, 'repo': {'id': 593760993, 'name': 'jnsgdm/bmr-calculator', 'url': 'https://api.github.com/repos/jnsgdm/bmr-calculator'}, 'payload': {'push_id': 12462435631, 'size': 1, 'distinct_size': 1, 'ref': 'refs/heads/master', 'head': '627cfa7db09cf331256fc42fb697691fd5869985', 'before': 'ed35eb758429d0b0ee86b1e9609572fbc4123e20', 'commits': [{'sha': '627cfa7db09cf331256fc42fb697691fd5869985', 'author': {'email': 'JonasDmoreira@hotmail.com', 'name': 'jnsgdm'}, 'message': 'result component + styles', 'distinct': True, 'url': 'https://api.github.com/repos/jnsgdm/bmr-calculator/commits/627cfa7db09cf331256fc42fb697691fd5869985'}]}, 'public': True, 'created_at': '2023-01-31T17:11:15Z'}
https://api.github.com/repos/jnsgdm/bmr-

---
### Getting Pages: How to Request on the Internet  

__With urllib Library__  
_slide 23_

~~~ python
import urllib
from urllib.request import urlopen

google = urlopen('http://google.com')
google = google.read()
print(google[:200])

url = 'https://google.com?q='
url_with_query = url + urllib.parse.quote_plus('python web scraping')

web_search = urlopen(url_with_query)
web_search = web_search.read()

print(web_search[:200])
~~~

In [22]:
# Your code here
import urllib.parse  # imported explicitly: quote_plus lives in urllib.parse
from urllib.request import urlopen

# urlopen() returns a response object and .read() returns its body as bytes.
# The `with` block closes the connection once the body has been read, and the
# timeout keeps the cell from hanging forever.
with urlopen('http://google.com', timeout=10) as response:
    google = response.read()
print(google[:200])

url = 'https://google.com?q='
# quote_plus() percent-encodes the text and turns spaces into '+'
query = urllib.parse.quote_plus('python web scraping')
url_with_query = url + query

print(query)

with urlopen(url_with_query, timeout=10) as response:
    web_search = response.read()

print(web_search[:200])

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many speci'
python+web+scraping
b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many speci'


In [None]:
# Your code here


__With requests Library__  
_slide 24_  
_You may also need to use a proxy here._

~~~ python
import requests

google = requests.get('http://google.com')      #(1)

print("status code = ", google.status_code)     #(2)
print("content = ", google.content[:200])
print("header = ", google.headers)              #(3)
print("cookie = ", google.cookies.items())      #(4)
~~~

In [None]:
# Your code here
import requests

# The timeout keeps the cell from hanging forever if the server never answers.
google = requests.get('http://google.com', timeout=10)      #(1)

print("status code = ", google.status_code)     #(2)
print("content = ", google.content[:200])       # raw body bytes, first 200 only
print("header = ", google.headers)              #(3)
print("cookie = ", google.cookies.items())      #(4)

status code =  200
content =  b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many speci'
header =  {'Date': 'Tue, 31 Jan 2023 06:44:31 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'Content-Length': '6292', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2023-01-31-06; expires=Thu, 02-Mar-2023 06:44:31 GMT; path=/; domain=.google.com; Secure, AEC=ARSKqsJ8qCzC7WTl4BbrX_e2f5Wlu6RZbRvi7T04bxTXaKf14W_tW9j1aw; expires=Sun, 30-Jul-2023 06:44:31 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax, NID=511=qCY1OGBNESWoq3CQt5SDNjoFE2_ICfHy0h-XX58kqvhVDOylaKTRtxnJw808pWnUd_bS7II4-r-7OpqHatrM5dJprOW5jVFHKxfS8G_AfjHXX4RGOzACZWRAITJ

---
### Beautiful Soup   
__Basic Example__  
_slide 26_

~~~ python
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup                                     #(1)
from lxml import html

page = urlopen("file:../data/take_action_enough_project.html")    #(2)
bs = BeautifulSoup(page, "html") # if not working, try "lxml"     #(3)

print(bs.title)
print(bs.find_all('a'))                                           #(4)
print(bs.find_all('p'))
~~~

In [24]:
# Your code here
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup                                     #(1)
from lxml import html

# Parse inside the `with` block so the file handle is closed afterwards.
with urlopen("file:/content/drive/MyDrive/1. Python, Data Processing/data/take_action_enough_project.html") as page:    #(2)
    # Name the parser explicitly. The generic "html" feature makes Beautiful
    # Soup pick whichever HTML parser happens to be installed and warn that it
    # had to guess; lxml is already imported in this cell.
    bs = BeautifulSoup(page, "lxml")                              #(3)

print(bs.title)
print(bs.find_all('a'))                                           #(4) find_all returns every element with the given tag
print(bs.find_all('p'))

<title>Take Action | Enough</title>
[<a id="navigation-top" name="top"></a>, <a href="#navigation">Skip to Navigation</a>, <a href="/"><!-- --></a>, <a href="/events" title="">Events &amp; Appearances</a>, <a href="/multimedia" title="">Multimedia</a>, <a href="/news" title="">Press Room</a>, <a href="/about" title="About"><span></span>About</a>, <a href="/blog" title="Blog"><span></span>Blog</a>, <a href="/conflicts" title="Conflicts"><span></span>Conflicts</a>, <a href="/reports" title="Reports"><span></span>Reports</a>, <a class="selected" href="/take_action" title="Take Action"><span></span>Take Action</a>, <a href="/shop" title="Shop"><span></span>Shop</a>, <a href="/donate" title="Donate"><span></span>Donate</a>, <a href="/">Home</a>, <a href="https://ssl1.americanprogress.org/o/507/p/dia/action3/common/public/?action_KEY=391">South Sudan: On August 17th, Implement "Plan B" </a>, <a href="http://eno.ug/1TtgaLd">Beyond Deadlock: Recommendations for Obama's Plan B on South Sudan</a

__Traversing Relationships__  
_slide 28_

~~~ python
header_children = [c for c in bs.head.children]

navigation_bar = bs.find(id='globalNavigation')

for d in navigation_bar.descendants:
    print("\ndescendants = \n",d)
    
print("\nsiblings = ")
for s in d.previous_siblings:
    print(s)
~~~

In [25]:
# Your code here
# .children    : direct children only
# .descendants : every nested node (children, their children, and so on)
header_children = list(bs.head.children)
print(header_children)
print()
navigation_bar = bs.find(id='globalNavigation')

# `d` stays bound to the last descendant once the loop has finished
for d in navigation_bar.descendants:
    print("\ndescendants = \n", d)

['\n', <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>, '\n', <script type="text/javascript">var _sf_startpt=(new Date()).getTime()</script>, '\n', ' page-take_action.tpl.php ', '\n', <meta content="http://www.enoughproject.org/take_action" property="og:url"/>, '\n', <meta content="http://enoughproject.org/files/enough-project-exclamation.png" property="og:image"/>, '\n', <meta content="Take Action | Enough Project" property="og:title"/>, '\n', <title>Take Action | Enough</title>, '\n', <link href="/rss.xml" rel="alternate" title="Enough RSS" type="application/rss+xml"/>, '\n', <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>, '\n', <link href="/files/enough_favicon.ico" rel="shortcut icon" type="image/x-icon"/>, '\n', <link href="http://fast.fonts.com/cssapi/012e0441-b63e-4313-b1a0-88721eb354bb.css" rel="stylesheet" type="text/css"/>, '\n', <link href="/files/css/css_a6ad7da30f28b094b4ca39c50f794a6d.css" media="all" rel="stylesheet" type="text/

In [None]:
# `d` is not defined in this cell: it is whatever the most recent
# `for d in ...` loop left it bound to. .previous_siblings iterates over the
# nodes that come before `d` under the same parent.
print("\nsiblings = ")
for s in d.previous_siblings:
    print(s)


siblings = 
<ul>
<li id="navAbout"><a href="/about" title="About"><span></span>About</a></li>
<li id="navBlog"><a href="/blog" title="Blog"><span></span>Blog</a></li>
<li id="navConflicts"><a href="/conflicts" title="Conflicts"><span></span>Conflicts</a></li>
<li id="navReports"><a href="/reports" title="Reports"><span></span>Reports</a></li>
<li id="navTakeAction"><a class="selected" href="/take_action" title="Take Action"><span></span>Take Action</a></li>
<li id="navShop"><a href="/shop" title="Shop"><span></span>Shop</a></li>
<li id="navDonate"><a href="/donate" title="Donate"><span></span>Donate</a></li>
</ul>




__Scrape What You Want__  
_slide 30_

~~~ python
from urllib.request import urlopen
from bs4 import BeautifulSoup
from lxml import html

page = urlopen("file:../data/take_action_enough_project.html")
bs = BeautifulSoup(page, "html") # if not working, try "lxml"

ta_divs = bs.find_all("div", class_="views-row") #(1)
print("length of ta_divs = ", len(ta_divs)) #(2)
all_data = []

for ta in ta_divs:
    title = ta.h2 #(3)
    link = ta.a
    about = ta.find_all('p') #(4)
    print("title = ", title, "\n")
    print("link = ", link, "\n")
    print("about = ", about, "\n")
~~~

In [None]:
# Your code here
from urllib.request import urlopen
from bs4 import BeautifulSoup
from lxml import html

# Parse inside the `with` block so the file handle is closed afterwards.
with urlopen("file:/content/drive/MyDrive/1. Python, Data Processing/data/take_action_enough_project.html") as page:
    # Explicit parser name: the generic "html" feature makes Beautiful Soup
    # guess among the installed parsers and emit a warning about it.
    bs = BeautifulSoup(page, "lxml")

ta_divs = bs.find_all("div", class_="views-row") #(1)
print("length of ta_divs = ", len(ta_divs)) #(2)
all_data = []

for ta in ta_divs:
    title = ta.h2 #(3) first <h2> inside this div
    link = ta.a  # first <a> inside this div
    about = ta.find_all('p') #(4) every <p> inside this div
    print("title = ", title, "\n")
    print("link = ", link, "\n")
    print("about = ", about, "\n")

length of ta_divs =  4
title =  <h2><a href="https://ssl1.americanprogress.org/o/507/p/dia/action3/common/public/?action_KEY=391">South Sudan: On August 17th, Implement "Plan B" </a></h2> 

link =  <a href="https://ssl1.americanprogress.org/o/507/p/dia/action3/common/public/?action_KEY=391">South Sudan: On August 17th, Implement "Plan B" </a> 

about =  [<p>During President Obama's recent trip to Africa, the international community set a deadline of August 17 for a peace deal to be signed by South Sudan's warring parties. The President warned that if an agreement is not reached, it will be 'necessary for us to move forward with a different plan.'  With conflict raging since December 2013, the world can no longer sit by as they have while past agreements have been broken.</p>, <p> </p>, <p>Read our latest brief on the issue:<br/>
<a href="http://eno.ug/1TtgaLd">Beyond Deadlock: Recommendations for Obama's Plan B on South Sudan</a></p>, <p><strong>Tell President Obama that if there is no

__Scrape What You Want: Cleaner Results__  
_slide 32_

~~~ python
all_data = []
for ta in ta_divs:
    data_dict = {}
    data_dict['title'] = ta.h2.get_text()
    data_dict['link'] = ta.a.get('href')
    data_dict['about'] = [p.get_text() for p in ta.find_all('p')]
    all_data.append(data_dict)
print("\n")
print("title = ", all_data[0]['title'], "\n")
print("link = ", all_data[0]['link'], "\n")
print("about = ", all_data[0]['about'], "\n")
~~~

In [None]:
# Your code here
# One dict per "views-row" div, holding plain text instead of tag objects.
all_data = [
    {
        'title': ta.h2.get_text(),
        'link': ta.a.get('href'),
        'about': [p.get_text() for p in ta.find_all('p')],
    }
    for ta in ta_divs
]
print("\n")
first_entry = all_data[0]
print("title = ", first_entry['title'], "\n")
print("link = ", first_entry['link'], "\n")
print("about = ", first_entry['about'], "\n")



title =  South Sudan: On August 17th, Implement "Plan B"  

link =  https://ssl1.americanprogress.org/o/507/p/dia/action3/common/public/?action_KEY=391 

about =  ["During President Obama's recent trip to Africa, the international community set a deadline of August 17 for a peace deal to be signed by South Sudan's warring parties. The President warned that if an agreement is not reached, it will be 'necessary for us to move forward with a different\xa0plan.' \xa0With conflict raging since December 2013, the world can no longer sit by as they have while past agreements have been broken.", '\xa0', "Read our latest brief on the issue:\nBeyond Deadlock: Recommendations for Obama's Plan B on South Sudan", 'Tell President Obama that if there is no agreement by\xa0August 17 between the warring parties, to implement and enforce a strong "Plan\xa0B."'] 



---
### LXML Example  
_cf: [When you get "failed to load external entity."](https://stackoverflow.com/questions/21496857/how-to-prevent-lxml-error-failed-to-load-external-entity)_

__With CSS Select__  
_slide 35_

~~~ python
!pip install cssselect
from lxml import html
from urllib.request import urlopen

page_open = urlopen("file:../data/take_action_enough_project.html")
page = html.parse(page_open)
root = page.getroot()

ta_divs = root.cssselect('div.views-row')
print("ta_divs = ", ta_divs)

all_data = []

for ta in ta_divs:
    data_dict = {}
    title = ta.cssselect('h2')[0]
    data_dict['title'] = title.text_content()
    data_dict['link'] = title.find('a').get('href')
    data_dict['about'] = [p.text_content() for p in ta.cssselect('p')]
    all_data.append(data_dict)
print("\nall_data = ", all_data)
~~~

In [26]:
# Your code here
!pip install cssselect
from lxml import html
from urllib.request import urlopen

page_open = urlopen("file:/content/drive/MyDrive/1. Python, Data Processing/data/take_action_enough_project.html")
page = html.parse(page_open)
root = page.getroot()

ta_divs = root.cssselect('div.views-row')
print("ta_divs = ", ta_divs)

all_data = []

for ta in ta_divs:
    data_dict = {}
    title = ta.cssselect('h2')[0]
    data_dict['title'] = title.text_content()
    data_dict['link'] = title.find('a').get('href')
    data_dict['about'] = [p.text_content() for p in ta.cssselect('p')]
    all_data.append(data_dict)
print("\nall_data = ", all_data)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cssselect
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: cssselect
Successfully installed cssselect-1.2.0
ta_divs =  [<Element div at 0x7eff953facc0>, <Element div at 0x7eff953fac70>, <Element div at 0x7eff953fa9f0>, <Element div at 0x7eff953dbcc0>]

all_data =  [{'title': 'South Sudan: On August 17th, Implement "Plan B" ', 'link': 'https://ssl1.americanprogress.org/o/507/p/dia/action3/common/public/?action_KEY=391', 'about': ["During President Obama's recent trip to Africa, the international community set a deadline of August 17 for a peace deal to be signed by South Sudan's warring parties. The President warned that if an agreement is not reached, it will be 'necessary for us to move forward with a different\xa0plan.' \xa0With conflict raging since December 2013, the world can no longer sit by as they have while past agreements have 

~~~ python
print("root.find('div') = ", root.find('div'))
print("\nroot.find('head') = ", root.find('head'))
print("\nroot.find('head').findall('script') = ", root.find('head').findall('script'))
print("\nroot.cssselect('div') = ", root.cssselect('div'))
print("\nroot.cssselect('head script') = ", root.cssselect('head script'))
~~~

In [None]:
# Your code here
# find()/findall() with a bare tag name only look at the direct children of
# the element they are called on, which is why root.find('div') prints None
# while root.find('head') succeeds. cssselect() searches the whole subtree,
# so it returns every matching element at any depth.
print("root.find('div') = ", root.find('div'))
print("\nroot.find('head') = ", root.find('head'))
print("\nroot.find('head').findall('script') = ", root.find('head').findall('script'))
print("\nroot.cssselect('div') = ", root.cssselect('div'))
print("\nroot.cssselect('head script') = ", root.cssselect('head script'))

root.find('div') =  None

root.find('head') =  <Element head at 0x7fe7af59a540>

root.find('head').findall('script') =  [<Element script at 0x7fe7b8ce08b0>, <Element script at 0x7fe7af5cb3b0>, <Element script at 0x7fe7af5cbef0>, <Element script at 0x7fe7af5cbf90>, <Element script at 0x7fe7af5cb900>, <Element script at 0x7fe7af5cb950>]

root.cssselect('div') =  [<Element div at 0x7fe7af59a540>, <Element div at 0x7fe7af5cb3b0>, <Element div at 0x7fe7af5cbef0>, <Element div at 0x7fe7af5cbf90>, <Element div at 0x7fe7af5cb900>, <Element div at 0x7fe7af5cb950>, <Element div at 0x7fe7af5cbc20>, <Element div at 0x7fe7af5cb810>, <Element div at 0x7fe7af5cbf40>, <Element div at 0x7fe7af5cbb30>, <Element div at 0x7fe7af5cbb80>, <Element div at 0x7fe7af5cba40>, <Element div at 0x7fe7af5cbe50>, <Element div at 0x7fe7af5cba90>, <Element div at 0x7fe7af5cb4a0>, <Element div at 0x7fe7af5cb6d0>, <Element div at 0x7fe7af5cbe00>, <Element div at 0x7fe7af5b60e0>, <Element div at 0x7fe7af5b6090>, <Element 

_slide 38_  
~~~ python
from urllib.request import urlopen

page_open = urlopen('file:../data/emoji-cheat-sheet.html')
page = html.parse(page_open)

body = page.find('body') #(1)
top_header = body.find('h2')

print("top_header.text = ", top_header.text)

headers_and_lists = [sib for sib in top_header.itersiblings()] #(2)
print("\nheaders_and_lists = ", headers_and_lists)
proper_headers_and_lists = [s for s in top_header.itersiblings() if s.tag in ['ul', 'h2', 'h3']] #(3)

print("\nproper_headers_and_lists = ", proper_headers_and_lists)
~~~


In [None]:
# Your code here
from urllib.request import urlopen

from lxml import html  # imported here so this cell does not depend on an earlier cell

# Parse inside the `with` block so the file handle is closed afterwards.
with urlopen('file:/content/drive/MyDrive/1. Python, Data Processing/data/emoji-cheat-sheet.html') as page_open:
    page = html.parse(page_open)

body = page.find('body') #(1)
top_header = body.find('h2')  # first <h2> directly under <body>

print("top_header.text = ", top_header.text)

# every sibling element that follows the first <h2>
headers_and_lists = list(top_header.itersiblings()) #(2)
print("\nheaders_and_lists = ", headers_and_lists)
# keep only the list and heading elements among those siblings
proper_headers_and_lists = [s for s in top_header.itersiblings() if s.tag in ['ul', 'h2', 'h3']] #(3)

print("\nproper_headers_and_lists = ", proper_headers_and_lists)

top_header.text =  People

headers_and_lists =  [<Element ul at 0x7fe7af5cbf40>, <Element h2 at 0x7fe7af5cba40>, <Element ul at 0x7fe7af666ae0>, <Element h2 at 0x7fe7af6669a0>, <Element ul at 0x7fe7af566400>, <Element h2 at 0x7fe7af566590>, <Element ul at 0x7fe7af566630>, <Element h2 at 0x7fe7af5664f0>, <Element ul at 0x7fe7af566680>, <Element h3 at 0x7fe7af5666d0>, <Element ul at 0x7fe7af566720>, <Element h3 at 0x7fe7af566770>, <Element p at 0x7fe7af5667c0>, <Element p at 0x7fe7af566810>, <Element p at 0x7fe7af566860>, <Element p at 0x7fe7af5668b0>, <Element div at 0x7fe7af566900>, <Element script at 0x7fe7af566950>, <Element script at 0x7fe7af5669a0>, <Element script at 0x7fe7af5669f0>]

proper_headers_and_lists =  [<Element ul at 0x7fe7af5cbf40>, <Element h2 at 0x7fe7af5cba40>, <Element ul at 0x7fe7af666ae0>, <Element h2 at 0x7fe7af6669a0>, <Element ul at 0x7fe7af566400>, <Element h2 at 0x7fe7af566590>, <Element ul at 0x7fe7af566630>, <Element h2 at 0x7fe7af5664f0>, <Element ul at 0

##### With XPath
_slide 66_

~~~ python
from lxml import html
from urllib.request import urlopen

page_open = urlopen('file:../data/emoji-cheat-sheet.html')
page = html.parse(page_open)

proper_headers = page.xpath('//h2|//h3') #(1)
proper_lists = page.xpath('//ul') #(2)

print("proper_headers = ", proper_headers)
print("\nproper_lists = ", proper_lists)
~~~

In [None]:
# Your code here
from lxml import html
from urllib.request import urlopen

# Parse inside the `with` block so the file handle is closed afterwards.
with urlopen('file:/content/drive/MyDrive/1. Python, Data Processing/data/emoji-cheat-sheet.html') as page_open:
    page = html.parse(page_open)

# '//' selects matching elements anywhere in the document; '|' unions two paths
proper_headers = page.xpath('//h2|//h3') #(1)
proper_lists = page.xpath('//ul') #(2)

print("proper_headers = ", proper_headers)
print("\nproper_lists = ", proper_lists)

proper_headers =  [<Element h2 at 0x7fe7af5b6360>, <Element h2 at 0x7fe7af5b6b80>, <Element h2 at 0x7fe7af5b6c20>, <Element h2 at 0x7fe7af5b6b30>, <Element h2 at 0x7fe7af6667c0>, <Element h3 at 0x7fe7af666860>, <Element h3 at 0x7fe7af6660e0>]

proper_lists =  [<Element ul at 0x7fe7af566540>, <Element ul at 0x7fe7af566a40>, <Element ul at 0x7fe7af566a90>, <Element ul at 0x7fe7af566ae0>, <Element ul at 0x7fe7af566b30>, <Element ul at 0x7fe7af566b80>]


### Selenium 

_slide 71_

~~~ python
!pip install selenium
from selenium import webdriver
!apt-get update
!apt install chromium-chromedriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
browser = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

browser.get('http://sites.google.com/a/dblab.postech.ac.kr/db/')
browser.maximize_window()
~~~

In [27]:
# Your code here
!pip install selenium
from selenium import webdriver
!apt-get update
!apt install chromium-chromedriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
browser = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

browser.get('http://sites.google.com/a/dblab.postech.ac.kr/db/')
browser.maximize_window()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selenium
  Downloading selenium-4.8.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Collecting urllib3[socks]~=1.26
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.9/384.9 KB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.1.0-py3-none-any.whl (14 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecti

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Waiting for headers] [Waiting for headers] [1 InRelease 0 B/3,622 B 0%] [Wa0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f                                                                               Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
0% [Waiting for headers] [2 InRelease 8,192 B/114 kB 7%] [Waiting for headers]                                                                                Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
0% [Waiting for headers] [2 InRelease 14.2 kB/114 kB 12%] [Waiting for headers]                                                                               Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease [1,581 B]
0% [Waiting for headers] [2 InRelease 14.2 k

  browser = webdriver.Chrome('chromedriver',chrome_options=chrome_options)


WebDriverException: ignored

_slide 73_

~~~ python
from selenium.webdriver.common.by import By

# The find_element_by_* helpers were removed in Selenium 4; the locator
# strategy is now passed as the first argument of find_element().
content = browser.find_element(By.CSS_SELECTOR, 'div.sites-canvas-main')
print(content.text)
~~~

In [None]:
# Your code here


_slide 75_

~~~ python
from selenium.webdriver.common.by import By

# The find_elements_by_* helpers were removed in Selenium 4; the locator
# strategy is now passed as the first argument of find_elements().
links = browser.find_elements(By.XPATH, '//a')
for link in links:
    print(link.get_attribute("href"))
~~~

In [None]:
# Your code here


---
### Fuzzy Matching
_slide 98_
~~~ python
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz
~~~

~~~ python
print(fuzz.ratio('Barack Obama', 'Barack H. Obama'))
print(fuzz.partial_ratio('Barack Obama', 'Barack H. Obama'))

print(fuzz.ratio('ACME Factory', 'ACME Factory Inc'))
print(fuzz.partial_ratio('ACME Factory', 'ACME Factory Inc'))
~~~

In [28]:
# Your code here
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz
# 근사한 문자열 찾기 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0




In [29]:
# Your code here
print(fuzz.ratio('Barack Obama', 'Barack H. Obama'))
print(fuzz.partial_ratio('Barack Obama', 'Barack H. Obama'))

print(fuzz.ratio('ACME Factory', 'ACME Factory Inc'))
print(fuzz.partial_ratio('ACME Factory', 'ACME Factory Inc'))

89
75
86
100


_slide 99_

~~~ python
print(fuzz.token_sort_ratio('Barack Obama', 'Barack H. Obama'))
print(fuzz.token_set_ratio('Barack Obama', 'Barack H. Obama'))

print(fuzz.token_sort_ratio('Barack H Obama', 'Barack H. Obama'))
print(fuzz.token_set_ratio('Barack H Obama', 'Barack H. Obama'))
~~~

In [30]:
# Your code here
print(fuzz.token_sort_ratio('Barack Obama', 'Barack H. Obama'))
print(fuzz.token_set_ratio('Barack Obama', 'Barack H. Obama'))

print(fuzz.token_sort_ratio('Barack H Obama', 'Barack H. Obama'))
print(fuzz.token_set_ratio('Barack H Obama', 'Barack H. Obama'))

92
100
100
100


_slide 100_  

~~~ python
from fuzzywuzzy import process

choices = ['Yes', 'No', 'Maybe', 'N/A']
print(process.extract('ya', choices, limit = 2))
print(process.extractOne('ya', choices))
print(process.extract('nope', choices, limit = 2))
print(process.extractOne('nope', choices))
~~~

In [31]:
# Your code here
from fuzzywuzzy import process

choices = ['Yes', 'No', 'Maybe', 'N/A']
print(process.extract('ya', choices, limit = 2))
print(process.extractOne('ya', choices))
print(process.extract('nope', choices, limit = 2))
print(process.extractOne('nope', choices))

[('Yes', 45), ('Maybe', 45)]
('Yes', 45)
[('No', 90), ('Yes', 29)]
('No', 90)


---
### RegEx Matching
__Basic Methods__  
_slide 105_

~~~ python
import re

text = "adibijoiwefdblabijofeiwjfodblabijosjfojwa"
#       ...........dblab..........dblab..........

# Find the index of the 1st occurrence of "dblab"
match = re.search(r"dblab", text)
print(match.start())

# Does start of text match "dblab"?
print(re.match(r"dblab", text))

# Iterate over all matches for "dblab" in text
for match in re.finditer(r"dblab", text):
    print(match.start())
    
# Return all matches of "dblab" in the text
match = re.findall(r"dblab", text)   
print(match)
~~~

In [34]:
# Your code here
import re

text = "adibijoiwefdblabijofeiwjfodblabijosjfojwa"
#       ...........dblab..........dblab..........
pattern = r"dblab"

# re.search scans the whole string and returns the first match (or None)
first_hit = re.search(pattern, text)
print(first_hit)
print(first_hit.start())

# re.match only succeeds when the pattern matches at the very start of text
print(re.match(pattern, text))

# re.finditer returns a lazy iterator that yields one match object per hit
print(re.finditer(pattern, text))
for match in re.finditer(pattern, text):
    print(match.start())

# re.findall returns all matched substrings as a list
match = re.findall(pattern, text)
print(match)

<re.Match object; span=(11, 16), match='dblab'>
11
None
<callable_iterator object at 0x7eff9781e190>
11
26
['dblab', 'dblab']


__Compiled Regexes__  
_slide 106_  

~~~ python
# Compile the regular expression "dblab" 
regex = re.compile(r"dblab")

# Use it repeatedly to search for matches in text
print(regex.match(text))
print(regex.search(text))
print(regex.findall(text))
~~~

In [35]:
# Your code here
# Compile the regular expression "dblab" 
regex = re.compile(r"dblab")

# Use it repeatedly to search for matches in text
print(regex.match(text))
print(regex.search(text))
print(regex.findall(text))

None
<re.Match object; span=(11, 16), match='dblab'>
['dblab', 'dblab']


__Matching with multiple groups__  
_slide 107_  

~~~ python
# Raw strings: '\w' inside a plain string literal is an invalid escape sequence
name_regex = r'([A-Z]\w+) ([A-Z]\w+)'
names = "Barack Obama, Ronald Reagan, Nancy Drew"

name_match = re.match(name_regex, names)
name_match.group()
name_match.groups()

name_regex = r'(?P<first_name>[A-Z]\w+) (?P<last_name>[A-Z]\w+)'
for name in re.finditer(name_regex, names):
    print('Meet {}!'.format(name.group('first_name')))
~~~

In [37]:
import re  # imported here so this cell does not depend on an earlier cell

name_regex = r'([A-Z]\w+) ([A-Z]\w+)'
# \w matches one word character: a letter, a digit or an underscore. For str
# patterns this includes non-ASCII letters unless the re.ASCII flag is set.
names = "Barack Obama, Ronald Reagan, Nancy Drew"

# re.match anchors at the start of the string, so only the first pair is matched
name_match = re.match(name_regex, names)
print(name_match)
print(name_match.group())   # the whole matched text
print(name_match.groups())  # tuple of the captured groups

# (?P<name>...) gives a group a name that can be passed to .group()
name_regex = r'(?P<first_name>[A-Z]\w+) (?P<last_name>[A-Z]\w+)'
for name in re.finditer(name_regex, names):
    print(f"Meet {name.group('first_name')}!")

<re.Match object; span=(0, 12), match='Barack Obama'>
Barack Obama
('Barack', 'Obama')
Meet Barack!
Meet Ronald!
Meet Nancy!
