# Extracting table data from the web

In [0]:
# Setup
import json
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get a web page and convert it to a dataframe

In [0]:
# Download the Wikipedia page, parse it, and pick out the population table.
URL = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
res = requests.get(URL, timeout=30)
res.raise_for_status()

soup = BeautifulSoup(res.content, 'html.parser')
tables = soup.find_all('table')

# The page layout can change; fail with a clear message instead of a bare IndexError.
assert len(tables) > 1, f"expected at least 2 <table> elements, found {len(tables)}"
table = tables[1] # note that the first table (at index 0) is not relevant

# Print the first 250 characters of the html table
print(str(table)[:250])

<table class="wikitable sortable" style="text-align:right">
<tbody><tr>
<th data-sort-type="number">Rank</th>
<th>Country<br/>(or dependent territory)</th>
<th>Population</th>
<th>Date</th>
<th>% of world<br/>population</th>
<th class="unsortable">So


## Convert HTML to a pandas DataFrame

In [0]:
# pandas >= 2.1 deprecates passing literal HTML text to read_html, so wrap the
# markup in a file-like StringIO buffer (older pandas versions accept this too).
# header=0 takes the first table row as column names; read_html returns a list
# of DataFrames, and we keep the first one.
df = pd.read_html(StringIO(str(table)), header=0, flavor='html5lib')[0]
print(df.head(10).to_string())

  Rank Country(or dependent territory)  Population             Date % of worldpopulation                        Source
0    1                   China[Note 2]  1393240000    July 31, 2018                18.2%     Official population clock
1    2                   India[Note 3]  1335030000    July 31, 2018                17.5%     Official population clock
2    3           United States[Note 4]   327573000    July 31, 2018                4.29%     Official population clock
3    4                       Indonesia   265015300     July 1, 2018                3.47%    Official annual projection
4    5                        Pakistan   212423000    July 31, 2018                2.78%     Official population clock
5    6                          Brazil   209385000    July 31, 2018                2.74%     Official population clock
6    7                         Nigeria   193392517     July 1, 2016                2.53%      Annual official estimate
7    8                      Bangladesh   1649340

## Convert dataframe to json string

In [0]:
# Serialise the first ten rows as a JSON array holding one object per row.
top_rows = df.head(10)
str_json = top_rows.to_json(orient='records')
print(str_json)

[{"Rank":"1","Country(or dependent territory)":"China[Note 2]","Population":1393240000,"Date":"July 31, 2018","% of worldpopulation":"18.2%","Source":"Official population clock"},{"Rank":"2","Country(or dependent territory)":"India[Note 3]","Population":1335030000,"Date":"July 31, 2018","% of worldpopulation":"17.5%","Source":"Official population clock"},{"Rank":"3","Country(or dependent territory)":"United States[Note 4]","Population":327573000,"Date":"July 31, 2018","% of worldpopulation":"4.29%","Source":"Official population clock"},{"Rank":"4","Country(or dependent territory)":"Indonesia","Population":265015300,"Date":"July 1, 2018","% of worldpopulation":"3.47%","Source":"Official annual projection"},{"Rank":"5","Country(or dependent territory)":"Pakistan","Population":212423000,"Date":"July 31, 2018","% of worldpopulation":"2.78%","Source":"Official population clock"},{"Rank":"6","Country(or dependent territory)":"Brazil","Population":209385000,"Date":"July 31, 2018","% of worldp

## Convert the JSON string to a list of dictionaries using `json.loads`

In [0]:
# Parse the JSON text back into Python objects, then print every record
# next to its position in the list.
li_json = json.loads(str_json)
for idx, record in enumerate(li_json):
    print(idx, record)

0 {'Rank': '1', 'Country(or dependent territory)': 'China[Note 2]', 'Population': 1393240000, 'Date': 'July 31, 2018', '% of worldpopulation': '18.2%', 'Source': 'Official population clock'}
1 {'Rank': '2', 'Country(or dependent territory)': 'India[Note 3]', 'Population': 1335030000, 'Date': 'July 31, 2018', '% of worldpopulation': '17.5%', 'Source': 'Official population clock'}
2 {'Rank': '3', 'Country(or dependent territory)': 'United States[Note 4]', 'Population': 327573000, 'Date': 'July 31, 2018', '% of worldpopulation': '4.29%', 'Source': 'Official population clock'}
3 {'Rank': '4', 'Country(or dependent territory)': 'Indonesia', 'Population': 265015300, 'Date': 'July 1, 2018', '% of worldpopulation': '3.47%', 'Source': 'Official annual projection'}
4 {'Rank': '5', 'Country(or dependent territory)': 'Pakistan', 'Population': 212423000, 'Date': 'July 31, 2018', '% of worldpopulation': '2.78%', 'Source': 'Official population clock'}
5 {'Rank': '6', 'Country(or dependent territory)'