In [61]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [62]:
# Wikipedia article whose population table is scraped below.
page_to_scrape = (
    "https://en.wikipedia.org/wiki/"
    "List_of_countries_by_population_(United_Nations)"
)

In [63]:
# Fetch the page; the timeout keeps the cell from hanging on an unresponsive server.
page_response = requests.get(page_to_scrape, timeout=5)
# Fail here on a 4xx/5xx reply instead of parsing an error page further down.
page_response.raise_for_status()
page_response

<Response [200]>

In [64]:
# Parse the raw response bytes with the html5lib parser.
# (Call soup.prettify() to inspect the parsed markup while debugging.)
soup = BeautifulSoup(markup=page_response.content, features="html5lib")

## Built-ins

In [65]:
# Tag-name attribute access on the soup; the cell displays the page's <title> element.
soup.title

<title>List of countries by population (United Nations) - Wikipedia</title>

In [66]:
# .string gives the text inside the <title> element, without the surrounding tag.
soup.title.string

'List of countries by population (United Nations) - Wikipedia'

## Brass Tacks

In [67]:
# Locate the population table. A dict literal cannot hold two 'class' keys
# (the last one silently wins), so both classes are required through a CSS
# selector instead of an attrs dict.
table = soup.select_one("table.wikitable.sortable")
if table is None:
    # Surface a clear error here rather than an AttributeError in a later cell.
    raise ValueError("no table with classes 'wikitable' and 'sortable' found on the page")
type(table)

bs4.element.Tag

In [68]:
# Walk down to the first row of the table body and collect its <th> cells.
table_body = table.find('tbody')
header_row = table_body.find('tr')
col_headers = header_row.find_all('th')
type(col_headers)

bs4.element.ResultSet

In [69]:
# Whitespace-trimmed text of every header cell.
table_headers = list(map(lambda cell: cell.text.strip(), col_headers))
table_headers

['Rank',
 'Country or area',
 'UN continentalregion[2]',
 'UN statisticalregion[2]',
 'Population(1 July 2016)[3]',
 'Population(1 July 2017)[3]',
 'Change']

In [70]:
# Collect the trimmed cell text of every data row in the table body.
rows = table.find('tbody').find_all('tr')
clean_rows = []
for row in rows:
    # Rows without any <td> cells (presumably the header row) give an empty list.
    cols = [cell.text.strip() for cell in row.find_all('td')]
    # Require at least two cells before reading the country column, so a
    # one-cell row is skipped instead of raising IndexError. The 'World'
    # aggregate row is deliberately left out.
    if len(cols) > 1 and cols[1] != 'World':
        clean_rows.append(cols)

In [71]:
# Build the frame from the scraped rows and give the columns short names.
column_names = ['rank', 'country', 'continent', 'region', 'pop2016', 'population', 'change']
mydf = pd.DataFrame(clean_rows)
mydf.columns = column_names
# Drop the comma separators so the population strings can be cast to integers.
population_text = mydf['population'].str.replace(',', '')
mydf['population'] = population_text.astype('int')
mydf.head()

Unnamed: 0,rank,country,continent,region,pop2016,population,change
0,1,China[a],Asia,Eastern Asia,1403500365,1409517397,+0.4%
1,2,India,Asia,Southern Asia,1324171354,1339180127,+1.1%
2,3,United States,Americas,Northern America,322179605,324459463,+0.7%
3,4,Indonesia,Asia,South-eastern Asia,261115456,263991379,+1.1%
4,5,Brazil,Americas,South America,207652865,209288278,+0.8%


In [72]:
# Keep only the columns that are exported, on a fresh 0..n-1 index.
cols_to_keep = ['country', 'continent', 'population']
selected_columns = mydf[cols_to_keep]
world_pop = selected_columns.reset_index(drop=True)
world_pop.head()

Unnamed: 0,country,continent,population
0,China[a],Asia,1409517397
1,India,Asia,1339180127
2,United States,Americas,324459463
3,Indonesia,Asia,263991379
4,Brazil,Americas,209288278


In [74]:
# Write the result to the working directory, without the index column.
world_pop.to_csv(path_or_buf='world_pop.csv', index=False)