# This is a simple web scraping example

In [16]:
# the library used to query the website in question
import urllib.request

In [17]:
# We are going to scrape some information from the English Wikipedia article about Hokkaido, a location in Japan.
HOKKAIDO_URL = "https://en.wikipedia.org/wiki/Hokkaido"
# Wikimedia's User-Agent policy asks automated clients to identify themselves; requests that carry only a
# generic library default may be rejected, so we send a descriptive User-Agent header.
# TODO: put real contact details in the User-Agent string before running this at any scale.
hokkaido_request = urllib.request.Request(
    HOKKAIDO_URL,
    headers={"User-Agent": "hokkaido-scraping-example/1.0 (educational notebook)"},
)
# The timeout (in seconds) keeps the cell from blocking forever if the connection stalls.
# `hokkaido` is the open HTTP response; it is handed to the HTML parser in a later cell.
hokkaido = urllib.request.urlopen(hokkaido_request, timeout=30)

In [18]:
# We need to import BeautifulSoup
from bs4 import BeautifulSoup

In [19]:
# Parse the downloaded page into a BeautifulSoup tree, using Python's built-in "html.parser"
pagejp = BeautifulSoup(markup=hokkaido, features="html.parser")

In [23]:
# We can inspect the HTML we extracted from the page: prettify() returns it as an indented string
pretty_html = pagejp.prettify()
print(pretty_html)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Hokkaido - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Hokkaido","wgTitle":"Hokkaido","wgCurRevisionId":846868078,"wgRevisionId":846868078,"wgArticleId":58092,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Articles with Japanese-language external links","All articles with dead external links","Articles with dead external links from November 2017","Articles with permanently dead external links","Articles with short description","Articles containing Japanese-language text","Articles c

In [25]:
# Print the string held inside the page's <title> tag
page_title = pagejp.title.string
print(page_title)

Hokkaido - Wikipedia


In [26]:
# Find every <a> tag (i.e. every link) within the page; the resulting list is displayed as the cell output
pagejp.find_all(name="a")

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="mw-redirect" href="/wiki/Hokkaido_(dog)" title="Hokkaido (dog)">Hokkaido (dog)</a>,
 <a href="/wiki/Prefectures_of_Japan" title="Prefectures of Japan">Prefecture</a>,
 <a href="/wiki/Japanese_language" title="Japanese language">Japanese</a>,
 <a href="/wiki/Romanization_of_Japanese" title="Romanization of Japanese">Rōmaji</a>,
 <a href="/wiki/Ainu_language" title="Ainu language">Ainu</a>,
 <a href="/wiki/Ainu_language" title="Ainu language">Ainu</a>,
 <a class="image" href="/wiki/File:Flag_of_Hokkaido_Prefecture.svg" title="Flag of Hokkaido "><img alt="Flag of Hokkaido " class="thumbborder" data-file-height="800" data-file-width="1200" height="67" src="//upload.wikimedia.org/wikipedia/commons/thumb/2/22/Flag_of_Hokkaido_Prefecture.svg/100px-Flag_of_Hokkaido_Prefecture.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/

In [37]:
# Show only the titles of the links on the page, skipping links that have no title attribute
all_links = pagejp.find_all("a")
for link in all_links:
    # .get() returns None when the tag has no "title" attribute; look it up once and reuse the value
    title1 = link.get("title")
    if title1 is not None:  # identity check is the idiomatic way to compare against None
        print(title1)

Hokkaido (dog)
Prefectures of Japan
Japanese language
Romanization of Japanese
Ainu language
Ainu language
Flag of Hokkaido 
Official logo of Hokkaido 
Location of Hokkaido 
Japan
List of regions of Japan
List of islands of Japan
List of capitals in Japan
Sapporo
List of prefectural governors in Japan
Harumi Takahashi
List of Japanese prefectures by area
List of Japanese prefectures by population
ISO 3166
Districts of Japan
Municipalities of Japan
Rugosa rose
Jezo spruce
Red-crowned crane
Sparidae
Circuit (political division)
Help:IPA/Japanese
About this sound
Ja-hokkaido.ogg
List of islands of Japan
Japan
Prefectures of Japan
Tsugaru Strait
Honshu
Seikan Tunnel
Sapporo
Cities designated by government ordinance of Japan
Sakhalin
Russia
Kuril Islands dispute
Kuril Islands
Edit section: History
Nihon Shoki
Recorded history
Abe no Hirafu
Mishihase
Emishi
Ainu people
Nara period
Heian period
Dewa Province
History of Japan
Ezo
Muromachi period
Oshima Peninsula
Takeda Nobuhiro
Matsumae clan


In [39]:
# Next, try to extract the population of the region from the table on the right-hand side of the page.
# The class name below was found by inspecting the webpage in a browser.
infobox_tables = pagejp.find_all("table", class_="infobox geography vcard")
infobox_tables

[<table class="infobox geography vcard" style="width:22em;width:23em">
 <tr>
 <th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:1.25em; white-space:nowrap"><span class="fn org"><span style="position:relative; bottom:0.2em;">Hokkaido</span></span><br/>
 <span class="nickname"><span style="position: relative; top: 0.1em;"><span style="font-weight:normal;"><span lang="ja" title="Japanese language text"><span lang="ja" title="Japanese language text">北海道</span></span></span></span></span></th>
 </tr>
 <tr>
 <td colspan="2" style="text-align:center;background-color:#cddeff; font-weight:bold;"><span class="category"><a href="/wiki/Prefectures_of_Japan" title="Prefectures of Japan">Prefecture</a></span></td>
 </tr>
 <tr class="mergedtoprow">
 <th colspan="2" style="text-align:center;text-align:left">Japanese transcription(s)</th>
 </tr>
 <tr class="mergedrow">
 <th scope="row"> • <a href="/wiki/Japanese_language" title="Japanese language">Japanese</a></th>
 <td

In [48]:
# Dig deeper into the tables to get the data we want: collect every <td> cell on the page
# and print each cell's .string (cells for which .string is None print as "None")
tablecell = pagejp.find_all("td")
for cell in tablecell:
    print(cell.string)


Prefecture
北海道
Hokkaidō
None
Ainu-Mosir
None
None
None
None
Japan
Hokkaido
Hokkaido
Sapporo
Harumi Takahashi
None
1st
5,377,435
8th
None
JP-01
74
179
None
None
None
Sea bream
None
None
East Asia
None
Japanese archipelago
None
2,290 m (7,510 ft)
Mount Asahi
None
Hokkaido
None
5,377,435 (September 30, 2016)
None
None
None
知床
Akan National Park
阿寒
Kushiro-shitsugen National Park
釧路湿原
Daisetsuzan National Park
大雪山
Shikotsu-Tōya National Park
支笏洞爺
Rishiri-Rebun-Sarobetsu National Park
利尻礼文サロベツ
Abashiri Quasi-National Park
網走
Hidaka-sanmyaku Erimo Quasi-National Park
日高山脈襟裳
Niseko-Shakotan-Otaru Kaigan Quasi-National Park
ニセコ積丹小樽海岸
Ōnuma Quasi-National Park
大沼
Shokanbetsu-Teuri-Yagishiri Quasi-National Park
暑寒別天売焼尻
Kushiro Wetland
釧路湿原
1980-06-17
Lake Kutcharo
クッチャロ湖
1989-07-06
Lake Utonai
ウトナイ湖
1991-12-12
Kiritappu Wetland
霧多布湿原
1993-06-10
None
None
None
Miyajima Marsh
宮島沼
2002-11-18
Uryūnuma Wetland
雨竜沼湿原
2005-11-08
Sarobetsu plain
サロベツ原野
Lake Tōfutsu
濤沸湖
Lake Akan
阿寒湖
None
None
Lake Fūren

In [51]:
# The listing above shows the population in the 17th <td> tag, i.e. zero-based index 16, so isolate that cell
POPULATION_CELL_INDEX = 16
population = tablecell[POPULATION_CELL_INDEX]
population.string

'5,377,435'

In [63]:
# The page has another table with population by region in the Hokkaido area. It is the fourth entry
# (zero-based index 3) in the list of tables with class "wikitable", so pick that one out.
wikitables = pagejp.find_all("table", class_="wikitable")
wikitable = wikitables[3]
wikitable

<table class="wikitable">
<tr>
<td colspan="10" style="text-align: center; padding-left: 18%">
<div class="noresize" role="img" style="width: 500px; line-height: 1; text-align: center; background-color: #ffffff; position: relative; max-width: 100%; overflow-x: auto;"><a class="image" href="/wiki/File:Subprefectures_of_Hokkaido.svg"><img alt="" data-file-height="528" data-file-width="848" height="311" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Subprefectures_of_Hokkaido.svg/500px-Subprefectures_of_Hokkaido.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Subprefectures_of_Hokkaido.svg/750px-Subprefectures_of_Hokkaido.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Subprefectures_of_Hokkaido.svg/1000px-Subprefectures_of_Hokkaido.svg.png 2x" width="500"/></a>
<div style="position:absolute; left:146px; top:152px"><b>1</b></div>
<div style="position:absolute; left:122px; top:172px"><b>a</b></div>
<div style="position:absolute; left:86px; t

In [98]:
# Collect all the rows of the table, then walk the cells of each row and take the first text string
# found in each cell. The strings are gathered column by column into nine lists (A..I) that a later
# cell puts together to re-create the table.
EXPECTED_COLUMNS = 9
A, B, C, D, E, F, G, H, I = [], [], [], [], [], [], [], [], []
column_lists = (A, B, C, D, E, F, G, H, I)
# find_all is the current spelling of the deprecated findAll alias
rows = wikitable.find_all("tr")
for row in rows:
    cols = row.find_all("td")
    # Only keep rows with exactly nine cells, so rows that have a different number of cells are not included
    if len(cols) == EXPECTED_COLUMNS:
        for column_values, cell in zip(column_lists, cols):
            # string=True matches the first text node in the cell (None if the cell has no text);
            # it replaces the deprecated text=True spelling
            column_values.append(cell.find(string=True))
        



In [99]:
# Import pandas to turn the scraped lists into a data frame that re-creates the table
import pandas as pd
# Start the frame from the first list, then attach the remaining lists as columns in table order
df = pd.DataFrame(A, columns=['Subprefecture'])
remaining_columns = [
    ('Japanese', B),
    ('Capital', C),
    ('Largest City', D),
    ('Population', E),
    ('Area', F),
    ('Municipalities 1', G),
    ('Municipalities 2', H),
    ('Municipalities 3', I),
]
for column_name, column_values in remaining_columns:
    df[column_name] = column_values
df

Unnamed: 0,Subprefecture,Japanese,Capital,Largest City,Population,Area,Municipalities 1,Municipalities 2,Municipalities 3
0,Sorachi,空知総合振興局,Iwamizawa,Iwamizawa,338485,5791.19,10 cities,14 towns,
1,↳,石狩振興局,Sapporo,Sapporo,2324878,3539.86,6 cities,1 town,1 village
2,Shiribeshi,後志総合振興局,Kutchan,Otaru,234984,4305.83,1 city,13 towns,6 villages
3,Iburi,胆振総合振興局,Muroran,Tomakomai,419115,3698.0,4 cities,7 towns,
4,↳,日高振興局,Urakawa,Shinhidaka,76084,4811.97,,7 towns,
5,Oshima,渡島総合振興局,Hakodate,Hakodate,433475,3936.46,2 cities,9 towns,
6,↳,檜山振興局,Esashi,Setana,43210,2629.94,,7 towns,
7,Kamikawa,上川総合振興局,Asahikawa,Asahikawa,527575,10619.2,4 cities,17 towns,2 villages
8,↳,留萌振興局,Rumoi,Rumoi,53916,3445.75,1 city,6 towns,1 village
9,Sōya,宗谷総合振興局,Wakkanai,Wakkanai,71423,4625.09,1 city,8 towns,1 village
