In [1]:
# import libraries
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the Wikipedia "World population" article as HTML text.
url = "https://en.wikipedia.org/wiki/World_population"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx so an error page is never parsed as if it were the article.
response.raise_for_status()
data = response.text # Page contents as decoded text.

In [3]:
# Parse the downloaded HTML into a navigable tree using the html5lib parser.
soup = BeautifulSoup(markup=data, features='html5lib')

In [4]:
# Collect every <table> element in the parsed page (HTML tables use the <table> tag).
tables = soup.find_all(name='table')

In [6]:
len(tables) # Number of <table> elements found on the page.

26

In [8]:
print(tables[1].prettify()) # .prettify() re-indents the selected table's HTML, one tag per line, so it is easier to read.

<table class="wikitable sortable">
 <caption>
  Population by continent (2020 estimates)
 </caption>
 <tbody>
  <tr>
   <th>
    Continent
   </th>
   <th>
    Density
    <br/>
    <small>
     (inhabitants/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th>
    Population
    <br/>
    <small>
     (millions)
    </small>
   </th>
   <th>
    Most populous country
   </th>
   <th>
    Most populous city (metropolitan area)
   </th>
  </tr>
  <tr>
   <td>
    Asia
   </td>
   <td style="text-align:right">
    104.1
   </td>
   <td style="text-align:right">
    4,641
   </td>
   <td>
    1,439,323,000
    <sup class="reference" id="cite_ref-19">
     <a href="#cite_note-19">
      [note 1]
     </a>
    </sup>
    –
    <span class="flagicon">
     <img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/23px-Flag_of_t

In [15]:
# Wrap the markup in StringIO: passing a literal HTML string to read_html is deprecated (pandas >= 2.1).
df = pd.read_html(StringIO(str(tables[1])), flavor='bs4')[0] # Parses the selected table into a Pandas dataframe using bs4.

In [16]:
df

Unnamed: 0,Continent,Density(inhabitants/km2),Population(millions),Most populous country,Most populous city (metropolitan area)
0,Asia,104.1,4641,"1,439,323,000[note 1] – China","37,393,000/13,929,000 – Greater Tokyo Area/Tok..."
1,Africa,44.4,1340,"0206,139,000 – Nigeria","20,900,000 – Cairo[17]"
2,Europe,73.4,747,"0145,934,000 – Russia;approx. 110 million in E...","16,855,000/12,537,000 – Moscow metropolitan ar..."
3,Latin America,24.1,653,"0212,559,000 – Brazil","22,043,000/12,176,000 – São Paulo Metro Area/S..."
4,Northern America[note 2],14.9,368,"0331,002,000 – United States","23,724,000/8,323,000 – New York metropolitan a..."
5,Oceania,5,42,"0025,499,000 – Australia","4,925,000 – Sydney"
6,Antarctica,~0,0.004[16],N/A[note 3],"1,258 – McMurdo Station"


# Another way to do it is with the `match` parameter of read_html.

In [23]:
# `match` keeps only the tables whose text matches the given string/regex; [0] takes the first hit.
pd.read_html(io=url, flavor='bs4', match="World historical and predicted populations")[0]

Unnamed: 0,Region,1500,1600,1700,1750,1800,1850,1900,1950,1999,2008,2010,2012,2050,2150
0,World,585,660,710,791,978,1262,1650,2521,6008,6707,6896,7052,9725,9746
1,Africa,86,114,106,106,107,111,133,221,783,973,1022,1052,2478,2308
2,Asia,282,350,411,502,635,809,947,1402,3700,4054,4164,4250,5267,5561
3,Europe,168,170,178,190,203,276,408,547,675,732,738,740,734,517
4,Latin America[Note 1],40,20,10,16,24,38,74,167,508,577,590,603,784,912
5,Northern America[Note 1],6,3,2,2,7,26,82,172,312,337,345,351,433,398
6,Oceania,3,3,3,2,2,2,6,13,30,34,37,38,57,51


# You can also use Pandas to read every HTML table on the page, then use an index to pick one.

In [17]:
# Parse every table on the page into a list of dataframes.
dataframe_list = pd.read_html(io=url, flavor='bs4')

In [19]:
dataframe_list[4] # Index into the list of dataframes to pick which table to display.

Unnamed: 0,Rank,Country,Population,% of world,Date,Source(official or UN)
0,1,China,1408723880,17.9%,7 Jul 2021,National population clock[89]
1,2,India,1379096407,17.5%,7 Jul 2021,National population clock[90]
2,3,United States,331972516,4.21%,7 Jul 2021,National population clock[91]
3,4,Indonesia,269603400,3.42%,1 Jul 2020,National annual projection[92]
4,5,Pakistan,220892331,2.80%,1 Jul 2020,UN Projection[93]
5,6,Brazil,213371106,2.71%,7 Jul 2021,National population clock[94]
6,7,Nigeria,206139587,2.62%,1 Jul 2020,UN Projection[93]
7,8,Bangladesh,170965506,2.17%,7 Jul 2021,National population clock[95]
8,9,Russia,146748590,1.86%,1 Jan 2020,National annual estimate[96]
9,10,Mexico,127792286,1.62%,1 Jul 2020,National annual projection[97]


In [25]:
dataframe_list[8] # You might have to view the webpage yourself to know which table you are referencing.

Unnamed: 0,Region,1500,1600,1700,1750,1800,1850,1900,1950,1999,2008,2010,2012,2050,2150
0,World,585,660,710,791,978,1262,1650,2521,6008,6707,6896,7052,9725,9746
1,Africa,86,114,106,106,107,111,133,221,783,973,1022,1052,2478,2308
2,Asia,282,350,411,502,635,809,947,1402,3700,4054,4164,4250,5267,5561
3,Europe,168,170,178,190,203,276,408,547,675,732,738,740,734,517
4,Latin America[Note 1],40,20,10,16,24,38,74,167,508,577,590,603,784,912
5,Northern America[Note 1],6,3,2,2,7,26,82,172,312,337,345,351,433,398
6,Oceania,3,3,3,2,2,2,6,13,30,34,37,38,57,51


In [39]:
HAP_pop = dataframe_list[8] # Keep the table at index 8 of dataframe_list under its own name.

In [40]:
AdjustedTable = HAP_pop.loc[0:5, 'Region':'2012'] # .loc slices by label and includes both endpoints: rows 0 through 5, columns 'Region' through '2012'.

In [41]:
AdjustedTable # Display the trimmed table.

Unnamed: 0,Region,1500,1600,1700,1750,1800,1850,1900,1950,1999,2008,2010,2012
0,World,585,660,710,791,978,1262,1650,2521,6008,6707,6896,7052
1,Africa,86,114,106,106,107,111,133,221,783,973,1022,1052
2,Asia,282,350,411,502,635,809,947,1402,3700,4054,4164,4250
3,Europe,168,170,178,190,203,276,408,547,675,732,738,740
4,Latin America[Note 1],40,20,10,16,24,38,74,167,508,577,590,603
5,Northern America[Note 1],6,3,2,2,7,26,82,172,312,337,345,351


In [46]:
# Selecting with a list of column labels keeps the result a one-column dataframe rather than a Series.
LastPop = AdjustedTable.loc[:, ['2012']]
LastPop

Unnamed: 0,2012
0,7052
1,1052
2,4250
3,740
4,603
5,351
