# Extracting table data from HTML

In [1]:
# Setup
import json
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get the web page and convert the table to a DataFrame

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"

# Fetch the page. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
res = requests.get(URL, timeout=30)
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[1] # note that the first table at index 0 is not relevant

# Recent pandas versions deprecate passing literal HTML text to read_html,
# so the markup is wrapped in a file-like object.
# No header row is specified, so the columns are numbered 0..6 and the table's
# own heading labels end up in row 0.
df = pd.read_html(StringIO(str(table)))[0]

# Print the first 11 rows (row 0 holds the column labels, row 1 the world total)
print(df.head(11).to_string())

       0                1                         2                         3                            4                            5                          6
0   Rank  Country or area  UN continental region[1]  UN statistical region[1]  Population (1 July 2016)[2]  Population (1 July 2017)[2]                     Change
1      —            World                         —                         —                   7466964280                   7550262101  7000111555135228260♠+1.1%
2      1         China[a]                      Asia              Eastern Asia                   1403500365                   1409517397  6999428716097982630♠+0.4%
3      2            India                      Asia             Southern Asia                   1324171354                   1339180127  7000113344643460699♠+1.1%
4      3    United States                  Americas          Northern America                    322179605                    324459463  6999707635730076710♠+0.7%
5      4        Indone

## Convert to a JSON string

In [3]:
# Serialise the first 11 rows as a JSON array holding one object per row
str_json = df.iloc[:11].to_json(orient="records")
print(str_json)

[{"0":"Rank","1":"Country or area","2":"UN continental region[1]","3":"UN statistical region[1]","4":"Population (1 July 2016)[2]","5":"Population (1 July 2017)[2]","6":"Change"},{"0":"\u2014","1":"World","2":"\u2014","3":"\u2014","4":"7466964280","5":"7550262101","6":"7000111555135228260\u2660+1.1%"},{"0":"1","1":"China[a]","2":"Asia","3":"Eastern Asia","4":"1403500365","5":"1409517397","6":"6999428716097982630\u2660+0.4%"},{"0":"2","1":"India","2":"Asia","3":"Southern Asia","4":"1324171354","5":"1339180127","6":"7000113344643460699\u2660+1.1%"},{"0":"3","1":"United States","2":"Americas","3":"Northern America","4":"322179605","5":"324459463","6":"6999707635730076710\u2660+0.7%"},{"0":"4","1":"Indonesia","2":"Asia","3":"South-Eastern Asia","4":"261115456","5":"263991379","6":"7000110139899186970\u2660+1.1%"},{"0":"5","1":"Brazil","2":"Americas","3":"South America","4":"207652865","5":"209288278","6":"6999787570641031120\u2660+0.8%"},{"0":"6","1":"Pakistan","2":"Asia","3":"Southern Asi

## Convert the JSON string to a list of dictionaries

In [4]:
# Parse the JSON text back into Python objects
li_json = json.loads(str_json)

# Show at most the first 11 records, each preceded by its position in the list
for i, v in enumerate(li_json[:11]):
    print(i, v)
    

0 {'6': 'Change', '3': 'UN statistical region[1]', '2': 'UN continental region[1]', '1': 'Country or area', '4': 'Population (1 July 2016)[2]', '5': 'Population (1 July 2017)[2]', '0': 'Rank'}
1 {'6': '7000111555135228260♠+1.1%', '3': '—', '2': '—', '1': 'World', '4': '7466964280', '5': '7550262101', '0': '—'}
2 {'6': '6999428716097982630♠+0.4%', '3': 'Eastern Asia', '2': 'Asia', '1': 'China[a]', '4': '1403500365', '5': '1409517397', '0': '1'}
3 {'6': '7000113344643460699♠+1.1%', '3': 'Southern Asia', '2': 'Asia', '1': 'India', '4': '1324171354', '5': '1339180127', '0': '2'}
4 {'6': '6999707635730076710♠+0.7%', '3': 'Northern America', '2': 'Americas', '1': 'United States', '4': '322179605', '5': '324459463', '0': '3'}
5 {'6': '7000110139899186970♠+1.1%', '3': 'South-Eastern Asia', '2': 'Asia', '1': 'Indonesia', '4': '261115456', '5': '263991379', '0': '4'}
6 {'6': '6999787570641031120♠+0.8%', '3': 'South America', '2': 'Americas', '1': 'Brazil', '4': '207652865', '5': '209288278', '0'