<h1>Practical Applications of Web Scraping</h1>

In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists countries and dependencies by population
response = requests.get('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population')

# Build a parse tree from the returned HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the table through its 'wikitable' class, then collect its <tr> rows
table = soup.find('table', {'class': 'wikitable'})
rows = table.find_all('tr')

# Print the stripped text of the <td> cells, one list per row.
# A row without any <td> cell (the header row uses <th>) prints as an empty list.
for row in rows:
    cols = [cell.text.strip() for cell in row.find_all('td')]
    print(cols)

[]
['–', 'World', '8,107,182,000', '100%', '17 May 2024', 'UN projection[3]', '']
['1/2  [b]', 'China', '1,409,670,000', '17.4%', '31 Dec 2023', 'Official estimate[5]', '[c]']
['India', '1,400,744,000', '17.3%', '1 Mar 2024', 'Official projection[6]', '[d]']
['3', 'United States', '335,893,238', '4.1%', '1 Jan 2024', 'Official estimate[7]', '[e]']
['4', 'Indonesia', '279,118,866', '3.4%', '1 Jul 2023', 'National annual projection[8]', '']
['5', 'Pakistan', '241,499,431', '3.0%', '1 Mar 2023', '2023 census result[9]', '[f]']
['6', 'Nigeria', '223,800,000', '2.8%', '1 Jul 2023', 'Official projection[10]', '']
['7', 'Brazil', '203,080,756', '2.5%', '1 Aug 2022', '2022 census result[11]', '']
['8', 'Bangladesh', '169,828,911', '2.1%', '14 Jun 2022', '2022 census result[12]', '']
['9', 'Russia', '146,150,789', '1.8%', '1 Jan 2024', 'Official estimate[13]', '[g]']
['10', 'Mexico', '129,625,968', '1.6%', '31 Dec 2023', 'National quarterly estimate[14]', '']
['11', 'Japan', '124,000,000', '1.5

In [2]:
def extract_data(url, timeout=10):
    """Scrape the 'wikitable' table of a page into a list of rows.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of list of str
        One inner list per <tr> row, holding the stripped text of its <td>
        cells. Rows without <td> cells (for example a header row built from
        <th> cells) are kept as empty lists, so row positions match the table.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no table with the 'wikitable' class.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on error responses instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'wikitable'})
    if table is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise ValueError(f"No table with class 'wikitable' found at {url}")

    return [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')
    ]
        
# NOTE: despite its name, `population_data` holds the page URL, not scraped data.
population_data = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'

# Scrape the page, then display only the first five rows as a preview
preview_rows = extract_data(population_data)
preview_rows[:5]

[[],
 ['–',
  'World',
  '8,107,182,000',
  '100%',
  '17 May 2024',
  'UN projection[3]',
  ''],
 ['1/2  [b]',
  'China',
  '1,409,670,000',
  '17.4%',
  '31 Dec 2023',
  'Official estimate[5]',
  '[c]'],
 ['India',
  '1,400,744,000',
  '17.3%',
  '1 Mar 2024',
  'Official projection[6]',
  '[d]'],
 ['3',
  'United States',
  '335,893,238',
  '4.1%',
  '1 Jan 2024',
  'Official estimate[7]',
  '[e]']]

<h2>Extracting Data from Web Pages</h2>

In [3]:
# Fetch and parse the page again (`population_data` holds the page URL)
response = requests.get(population_data)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'class': 'wikitable'})
rows = table.find_all('tr')

# Skip the header row (index 0) and the 'World' total row (index 1),
# then keep the next 100 table rows
top_100_countries = rows[2:102]
for row in top_100_countries:
    cols = [cell.text.strip() for cell in row.find_all('td')]
    print(cols)

['1/2  [b]', 'China', '1,409,670,000', '17.4%', '31 Dec 2023', 'Official estimate[5]', '[c]']
['India', '1,400,744,000', '17.3%', '1 Mar 2024', 'Official projection[6]', '[d]']
['3', 'United States', '335,893,238', '4.1%', '1 Jan 2024', 'Official estimate[7]', '[e]']
['4', 'Indonesia', '279,118,866', '3.4%', '1 Jul 2023', 'National annual projection[8]', '']
['5', 'Pakistan', '241,499,431', '3.0%', '1 Mar 2023', '2023 census result[9]', '[f]']
['6', 'Nigeria', '223,800,000', '2.8%', '1 Jul 2023', 'Official projection[10]', '']
['7', 'Brazil', '203,080,756', '2.5%', '1 Aug 2022', '2022 census result[11]', '']
['8', 'Bangladesh', '169,828,911', '2.1%', '14 Jun 2022', '2022 census result[12]', '']
['9', 'Russia', '146,150,789', '1.8%', '1 Jan 2024', 'Official estimate[13]', '[g]']
['10', 'Mexico', '129,625,968', '1.6%', '31 Dec 2023', 'National quarterly estimate[14]', '']
['11', 'Japan', '124,000,000', '1.5%', '1 Apr 2024', 'Official estimate[15]', '']
['12', 'Philippines', '112,892,781'

<h2>Handling Errors and Exceptions in Web Scraping</h2>

In [4]:
from requests.exceptions import RequestException, HTTPError

# HTTPError is a subclass of RequestException, so it must be caught first;
# otherwise the broader handler swallows it and the HTTPError branch never runs.
try:
    # The timeout makes a stalled connection fail instead of hanging the cell.
    response = requests.get('https://www.dataquest.io', timeout=10)
    response.raise_for_status()  # Raise an HTTPError if the status is 4xx, 5xx
except HTTPError as e:
    print(f"HTTP error occurred: {e}")
except RequestException as e:
    # Connection problems, timeouts, invalid URLs, and other request failures
    print(f"There was an issue with your request: {e}")

In [5]:
# HTTPError is a subclass of RequestException, so it must be caught first;
# otherwise the broader handler swallows it and the HTTPError branch never runs.
try:
    # The timeout makes a stalled connection fail instead of hanging the cell.
    response = requests.get(population_data, timeout=10)
    response.raise_for_status()
except HTTPError as e:
    print(f"HTTP error occurred: {e}")
except RequestException as e:
    print(f"There was an issue with your request: {e}")
else:
    # Parse only when the request succeeded, so a failed request is never
    # followed by parsing a stale or undefined `response`.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'wikitable'})
    rows = table.find_all('tr')
    # Skip the header row (index 0) and the 'World' total row (index 1)
    top_20_countries = rows[2:22]
    for row in top_20_countries:
        cols = [cell.text.strip() for cell in row.find_all('td')]
        print(cols)


['1/2  [b]', 'China', '1,409,670,000', '17.4%', '31 Dec 2023', 'Official estimate[5]', '[c]']
['India', '1,400,744,000', '17.3%', '1 Mar 2024', 'Official projection[6]', '[d]']
['3', 'United States', '335,893,238', '4.1%', '1 Jan 2024', 'Official estimate[7]', '[e]']
['4', 'Indonesia', '279,118,866', '3.4%', '1 Jul 2023', 'National annual projection[8]', '']
['5', 'Pakistan', '241,499,431', '3.0%', '1 Mar 2023', '2023 census result[9]', '[f]']
['6', 'Nigeria', '223,800,000', '2.8%', '1 Jul 2023', 'Official projection[10]', '']
['7', 'Brazil', '203,080,756', '2.5%', '1 Aug 2022', '2022 census result[11]', '']
['8', 'Bangladesh', '169,828,911', '2.1%', '14 Jun 2022', '2022 census result[12]', '']
['9', 'Russia', '146,150,789', '1.8%', '1 Jan 2024', 'Official estimate[13]', '[g]']
['10', 'Mexico', '129,625,968', '1.6%', '31 Dec 2023', 'National quarterly estimate[14]', '']
['11', 'Japan', '124,000,000', '1.5%', '1 Apr 2024', 'Official estimate[15]', '']
['12', 'Philippines', '112,892,781'

<h2>Understanding HTML Elements, IDs, and Classes</h2>

In [7]:
# To look up an element by its id with BeautifulSoup, pass find() an attribute
# dictionary keyed by 'id'. 'unique_id' is presumably a placeholder for illustration.
element = soup.find('div', attrs={'id': 'unique_id'})

In [8]:
# HTTPError is a subclass of RequestException, so it must be caught first;
# otherwise the broader handler swallows it and the HTTPError branch never runs.
try:
    # The timeout makes a stalled connection fail instead of hanging the cell.
    response = requests.get('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population', timeout=10)
    response.raise_for_status()
except HTTPError as e:
    print(f"HTTP error occurred: {e}")
except RequestException as e:
    print(f"There was an issue with your request: {e}")
else:
    # Parse only when the request succeeded, so a failed request is never
    # followed by parsing a stale or undefined `response`.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'wikitable'})
    # Show the table's raw HTML to inspect its tags, classes and attributes
    print(table)

<table class="wikitable sortable sticky-header sort-under mw-datatable col2left col6left" style="text-align:right">
<caption>List of countries and territories by total population
</caption>
<tbody><tr>
<th>
</th>
<th>Location
</th>
<th>Population
</th>
<th style="width:2em">% of<br/>world
</th>
<th>Date
</th>
<th><span class="nowrap">Source (official or from</span><br/>the <a href="/wiki/United_Nations" title="United Nations">United Nations</a>)
</th>
<th class="unsortable">
</th></tr>
<tr>
<td><span data-sort-value="5000000000000000000♠" style="display:none"></span> –
</td>
<td><b><span class="flagicon" style="padding-left:25px;"> </span>World</b>
</td>
<td>8,107,182,000</td>
<td><div class="center" style="width:auto; margin-left:auto; margin-right:auto;">100%</div></td>
<td><span data-sort-value="000000002024-05-17-0000" style="white-space:nowrap">17 May 2024</span>
</td>
<td>UN projection<sup class="reference" id="cite_ref-unpop_4-0"><a href="#cite_note-unpop-4">[3]</a></sup></td>
<

<h2>Applying CSS Selectors for Targeted Data Extraction</h2>

In [10]:
# The CSS type selector 'p' matches every <p> element in the document
paragraphs = soup.select('p')

# Preview the first few matches instead of dumping every paragraph of the page
paragraphs[:3]

[<p class="mw-empty-elt">
 </p>,
 <p class="mw-empty-elt">
 </p>,
 <p>This is a <b>list of countries and dependencies by population</b>. It includes <a href="/wiki/Sovereign_state" title="Sovereign state">sovereign states</a>, inhabited <a href="/wiki/Dependent_territory" title="Dependent territory">dependent territories</a> and, in some cases, constituent countries of sovereign states, with inclusion within the list being primarily based on the <a href="/wiki/International_Organization_for_Standardization" title="International Organization for Standardization">ISO</a> standard <a href="/wiki/ISO_3166-1" title="ISO 3166-1">ISO 3166-1</a>. For instance, the <a href="/wiki/United_Kingdom" title="United Kingdom">United Kingdom</a> is considered a single entity, while the constituent countries of the <a href="/wiki/Kingdom_of_the_Netherlands" title="Kingdom of the Netherlands">Kingdom of the Netherlands</a> are considered separately. In addition, this list includes certain <a href="/wiki/L

In [11]:
# Class selector: select elements with the class 'history'
# ('history' is presumably a placeholder class used for illustration)
history_elements = soup.select('.history')

# Id selector: select by the id 'book_123'. Note that select() returns a list of
# matches rather than a single element, so `book_123` is a list here.
# ('book_123' is presumably a placeholder id used for illustration)
book_123 = soup.select('#book_123')

In [12]:
# Descendant selector: select all `p` elements located inside `div` elements
div_paragraphs = soup.select('div p')

# Preview the first few matches instead of dumping every paragraph of the page
div_paragraphs[:3]

[<p class="mw-empty-elt">
 </p>,
 <p class="mw-empty-elt">
 </p>,
 <p>This is a <b>list of countries and dependencies by population</b>. It includes <a href="/wiki/Sovereign_state" title="Sovereign state">sovereign states</a>, inhabited <a href="/wiki/Dependent_territory" title="Dependent territory">dependent territories</a> and, in some cases, constituent countries of sovereign states, with inclusion within the list being primarily based on the <a href="/wiki/International_Organization_for_Standardization" title="International Organization for Standardization">ISO</a> standard <a href="/wiki/ISO_3166-1" title="ISO 3166-1">ISO 3166-1</a>. For instance, the <a href="/wiki/United_Kingdom" title="United Kingdom">United Kingdom</a> is considered a single entity, while the constituent countries of the <a href="/wiki/Kingdom_of_the_Netherlands" title="Kingdom of the Netherlands">Kingdom of the Netherlands</a> are considered separately. In addition, this list includes certain <a href="/wiki/L

# Print the 'Date' cell of each table row from a flat list of <td> cells.
# NOTE(review): `td_elements` is defined outside the visible part of this cell
# (presumably the result of soup.select('tr td')) — confirm.
# Drop the first 13 cells so that the 7-cells-per-row grouping below lines up.
# NOTE(review): the later population cell uses the same offset and its output
# starts at India's figure, so China's row appears to be skipped — verify.
td_elements = td_elements[13:]

# Walk the flattened cells, treating every 7 consecutive cells as one table row
for i in range(len(td_elements)):
    # Position 4 within each group of 7 cells is taken to be the 'Date' column
    if i % 7 == 4:
        # Extract the raw (unstripped) text of the cell and print it
        date = td_elements[i].text
        print(date)

In [21]:
# HTTPError is a subclass of RequestException, so it must be caught first;
# otherwise the broader handler swallows it and the HTTPError branch never runs.
try:
    # The timeout makes a stalled connection fail instead of hanging the cell.
    response = requests.get(population_data, timeout=10)
    response.raise_for_status()
except HTTPError as e:
    print(f"HTTP error occurred: {e}")
except RequestException as e:
    print(f"There was an issue with your request: {e}")
else:
    # Parse only when the request succeeded, so a failed request is never
    # followed by parsing a stale or undefined `response`.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Flatten every <td> that sits inside a <tr>, then drop the first 13 cells
    # so that each following group of 7 cells is treated as one table row.
    # NOTE(review): with this offset the list starts at India's population, so
    # China's row is skipped (India's row has one cell fewer than the others).
    td_elements = soup.select('tr td')[13:]

    # Position 2 within each group of 7 cells is taken to be the population
    population_list = [cell.text for cell in td_elements[2::7]]

    print(population_list[:10])

['1,400,744,000', '335,893,238', '279,118,866', '241,499,431', '223,800,000', '203,080,756', '169,828,911', '146,150,789', '129,625,968', '124,000,000']


<h2>Handling Different Data Types in Web Scraping</h2>

In [22]:
# Population figures use commas as thousands separators; strip them before
# converting the text to an integer.

raw_population = '1,411,750,000'
population = int(raw_population.replace(',', ''))
print(population)

1411750000


In [23]:
# Percentages carry a trailing '%' sign; remove it before converting the text
# to a float.

raw_percentage = '17.5%'
percentage = float(raw_percentage.replace('%', ''))
print(percentage)

17.5


In [24]:
# For the date, we can use the datetime module to convert the string to a datetime object:

from datetime import datetime

# Format: day of month, abbreviated month name, four-digit year
date_text = '31 Dec 2022'
date = datetime.strptime(date_text, '%d %b %Y')
print(date)

2022-12-31 00:00:00


In [27]:
# A value with stray letters cannot be converted: int() raises ValueError,
# `population` keeps its original text, and a message is printed instead.
population = '1,411,750,000abc'
without_commas = population.replace(',', '')
try:
    population = int(without_commas)
except ValueError:
    print(f"Could not convert {population} to an integer.")

Could not convert 1,411,750,000abc to an integer.


In [28]:
# HTTPError is a subclass of RequestException, so it must be caught first;
# otherwise the broader handler swallows it and the HTTPError branch never runs.
try:
    # The timeout makes a stalled connection fail instead of hanging the cell.
    response = requests.get(population_data, timeout=10)
    response.raise_for_status()
except HTTPError as e:
    print(f"HTTP error occurred {e}")
except RequestException as e:
    print(f"There was an issue with your request: {e}")
else:
    # Parse only when the request succeeded, so a failed request is never
    # followed by parsing a stale or undefined `response`.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Flatten every <td> that sits inside a <tr>, then drop the first 13 cells
    # so that each following group of 7 cells is treated as one table row.
    # NOTE(review): with this offset the output starts at India's row, so
    # China's row is skipped (India's row has one cell fewer than the others).
    td_elements = soup.select('tr td')[13:]

    first_30_rows = 0  # number of 7-cell groups started so far
    for i, cell in enumerate(td_elements):
        position = i % 7  # position of this cell inside its 7-cell group
        if position == 0:
            first_30_rows += 1
        if first_30_rows > 30:
            # Stop once the 31st group begins: only 30 rows are converted
            break
        if position == 2:
            # Population: remove thousands separators, convert to int
            population = int(cell.text.replace(',', ''))
            print(population)
        elif position == 3:
            # Share of world population: remove the '%' sign, convert to float
            percentage = float(cell.text.replace('%', ''))
            print(percentage)
        elif position == 4:
            # Date: strip surrounding whitespace, then parse 'day month-abbr year'
            date = cell.text.strip()
            try:
                date = datetime.strptime(date, '%d %b %Y')
                print(date)
            except ValueError:
                # The cell text does not match the expected date format
                print("Date not found")

1400744000
17.3
2024-03-01 00:00:00
335893238
4.1
2024-01-01 00:00:00
279118866
3.4
2023-07-01 00:00:00
241499431
3.0
2023-03-01 00:00:00
223800000
2.8
2023-07-01 00:00:00
203080756
2.5
2022-08-01 00:00:00
169828911
2.1
2022-06-14 00:00:00
146150789
1.8
2024-01-01 00:00:00
129625968
1.6
2023-12-31 00:00:00
124000000
1.5
2024-04-01 00:00:00
112892781
1.4
2023-07-01 00:00:00
107334000
1.3
2023-07-01 00:00:00
104462545
1.3
2023-01-01 00:00:00
100300000
1.2
Date not found
95370000
1.2
2019-07-01 00:00:00
85372377
1.1
2023-12-31 00:00:00
84607016
1.0
2023-09-30 00:00:00
84055000
1.0
2022-03-20 00:00:00
68394000
0.8
2024-03-01 00:00:00
67596281
0.8
2022-06-30 00:00:00
66090475
0.8
2022-07-01 00:00:00
62027503
0.8
2022-02-02 00:00:00
61741120
0.8
2022-08-23 00:00:00
58972268
0.7
2024-01-31 00:00:00
55770232
0.7
2022-07-01 00:00:00
52695952
0.6
2024-01-01 00:00:00
51526000
0.6
2023-01-01 00:00:00
51293934
0.6
2024-03-31 00:00:00
48692804
0.6
2024-04-01 00:00:00
46654581
0.6
2023-07-01 00:00:00

<h2>Storing and Structuring Scraped Data</h2>

In [31]:
import pandas as pd
# Sample rows; each inner list follows the order of `columns` below
data = [
    ['China', '1,411,750,000', '17.5%', '31 Dec 2022', 'Official estimate[4]', '[b]'],
    ['India', '1,392,329,000', '17.3%', '1 Mar 2023', 'Official projection[5]', '[c]'],
    ['United States', '335,495,000', '4.2%', '11 Oct 2023', 'National population clock[7]', '[d]'],
]

# Column labels, one per value in each row
columns = ['Country/Dependency', 'Population', '% of World', 'Date', 'Source', 'Notes']

# Assemble the rows into a DataFrame and show it
df = pd.DataFrame(data, columns=columns)
print(df)

# Save the table as CSV; index=False leaves the row index out of the file
df.to_csv('population_data.csv', index=False)

  Country/Dependency     Population % of World         Date   
0              China  1,411,750,000      17.5%  31 Dec 2022  \
1              India  1,392,329,000      17.3%   1 Mar 2023   
2      United States    335,495,000       4.2%  11 Oct 2023   

                         Source Notes  
0          Official estimate[4]   [b]  
1        Official projection[5]   [c]  
2  National population clock[7]   [d]  
