In [None]:
# !pip install bs4
!pip install html5lib

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# Address of the page to scrape; per the notes below, it contains HTML tables
# with world population figures.
url = "https://en.wikipedia.org/wiki/World_population"

Before scraping a website, you need to examine its contents and the way the data is organized. Open the URL above in your browser and look at the tables on the page.

In [3]:
# Download the page. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [28]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=data, features="html.parser")

In [5]:
# Collect every <table> element found in the parsed page.
tables = soup.find_all(name="table")

Assume that we are looking for the table of the 10 most densely populated countries. We can look through the `tables` list and pick the right one based on the data in each table, or we can search for the table's name if it appears in the table markup, although this second option might not always work.

In [6]:
# Find the position of the table whose markup contains the caption text.
caption_text = "10 most densely populated countries"
matching_indices = [
    index for index, table in enumerate(tables) if caption_text in str(table)
]

# Fail with a clear message instead of leaving `table_index` undefined (or
# stale from an earlier run) when the page no longer contains the caption.
if not matching_indices:
    raise ValueError(f"No table containing {caption_text!r} was found")

# If several tables contain the text, use the last one in document order,
# which is what the original loop ended up with.
table_index = matching_indices[-1]
print(table_index)

5


See if you can locate the name of the table, "10 most densely populated countries", in the output below.

In [26]:
# tables[table_index].prettify()

In [8]:
# Turn the HTML table into a pandas DataFrame.
# The rows are gathered in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
column_names = ["Rank", "Country", "Population", "Area", "Density"]
records = []
for row in tables[table_index].tbody.find_all("tr"):
    cells = row.find_all("td")
    if not cells:
        # Rows without <td> cells (e.g. a header row) carry no data values.
        continue
    records.append(
        {
            "Rank": cells[0].text,
            "Country": cells[1].text,
            "Population": cells[2].text.strip(),
            "Area": cells[3].text.strip(),
            "Density": cells[4].text.strip(),
        }
    )

# Passing `columns` keeps the column order fixed and still yields an empty
# frame with the expected headers when no data rows were found.
population_data = pd.DataFrame(records, columns=column_names)
population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,171410000,143998,1190
2,3,Lebanon,6856000,10452,656
3,4,Taiwan,23604000,36193,652
4,5,South Korea,51781000,99538,520
5,6,Rwanda,12374000,26338,470
6,7,Haiti,11578000,27065,428
7,8,Netherlands,17640000,41526,425
8,9,Israel,9410000,22072,426
9,10,India,1382280000,3287240,420


### Scrape data from HTML tables into a DataFrame using BeautifulSoup and read_html

Using the same url, data, soup, and tables object as in the last section we can use the read_html function to create a DataFrame.

Remember the table we need is located in tables[table_index]

We can now call the pandas function `read_html`, passing it the string version of the table together with `flavor="bs4"`, which selects BeautifulSoup as the parsing engine.

In [13]:
# Only the last expression of a cell is displayed, so show just the index of
# the table we need within `tables`.
table_index

5

In [14]:
# read_html returns a list of DataFrames, one per table found in the input.
# The markup is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas releases.
pd.read_html(StringIO(str(tables[table_index])), flavor="bs4")

[   Rank      Country  Population  Area(km2)  Density(pop/km2)
 0     1    Singapore     5704000        710              8033
 1     2   Bangladesh   171410000     143998              1190
 2     3      Lebanon     6856000      10452               656
 3     4       Taiwan    23604000      36193               652
 4     5  South Korea    51781000      99538               520
 5     6       Rwanda    12374000      26338               470
 6     7        Haiti    11578000      27065               428
 7     8  Netherlands    17640000      41526               425
 8     9       Israel     9410000      22072               426
 9    10        India  1382280000    3287240               420]

In [17]:
# Parse the single table's markup and keep the first (only) DataFrame returned.
# StringIO is used because passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
table_markup = StringIO(str(tables[table_index]))
population_data_read_html = pd.read_html(table_markup, flavor="bs4")[0]
population_data_read_html

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,171410000,143998,1190
2,3,Lebanon,6856000,10452,656
3,4,Taiwan,23604000,36193,652
4,5,South Korea,51781000,99538,520
5,6,Rwanda,12374000,26338,470
6,7,Haiti,11578000,27065,428
7,8,Netherlands,17640000,41526,425
8,9,Israel,9410000,22072,426
9,10,India,1382280000,3287240,420


## Scrape data from HTML tables into a DataFrame using read_html
We can also use the read_html function to directly get DataFrames from a url.

In [24]:
# Let pandas fetch the page itself and parse every table it finds,
# then report how many DataFrames came back.
dataframe_list = pd.read_html(io=url, flavor="bs4")
len(dataframe_list)

26

In [20]:
# Pick the table at the position found earlier.
# NOTE(review): table_index was computed against the BeautifulSoup `tables`
# list, not against `dataframe_list`; this assumes both list the page's tables
# in the same order and without gaps — confirm, or select by `match` instead.
dataframe_list[table_index]

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,171410000,143998,1190
2,3,Lebanon,6856000,10452,656
3,4,Taiwan,23604000,36193,652
4,5,South Korea,51781000,99538,520
5,6,Rwanda,12374000,26338,470
6,7,Haiti,11578000,27065,428
7,8,Netherlands,17640000,41526,425
8,9,Israel,9410000,22072,426
9,10,India,1382280000,3287240,420


We can also use the match parameter to select the specific table we want. If the table contains a string matching the text it will be read.

In [23]:
# Ask read_html for only the tables containing the caption text,
# then keep the first one returned.
matching_tables = pd.read_html(
    url,
    match="10 most densely populated countries",
    flavor="bs4",
)
population_df = matching_tables[0]
population_df

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,171410000,143998,1190
2,3,Lebanon,6856000,10452,656
3,4,Taiwan,23604000,36193,652
4,5,South Korea,51781000,99538,520
5,6,Rwanda,12374000,26338,470
6,7,Haiti,11578000,27065,428
7,8,Netherlands,17640000,41526,425
8,9,Israel,9410000,22072,426
9,10,India,1382280000,3287240,420
