In [16]:
#The objective is to automate the extraction of HTML content, article titles, text,
#and internal links from Wikipedia pages into a consolidated function that accepts 
#any Wikipedia URL for efficient data retrieval and processing.


import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [18]:
url = "https://en.wikipedia.org/wiki/Lagos"

In [20]:
soup = BeautifulSoup(url,"html.parser")

In [34]:
# function to Extract article title

def extract_wikipedia_title(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve page, status code: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find(id="firstHeading")
    return title.text if title else "No Title Found"
    url = "https://en.wikipedia.org/wiki/Lagos"
print(extract_wikipedia_title(url))

Lagos


In [36]:
#alternative method to extract article title

def extract_wikipedia_title(url):
    response = requests.get(url)
    response.raise_for_status()  # Raises an HTTPError for bad responses
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find(id="firstHeading").get_text("No Title Found")

# Example usage:
url = "https://en.wikipedia.org/wiki/Lagos"
print(extract_wikipedia_title(url))

Lagos


In [38]:
def extract_headings_paragraphs(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find(id="mw-content-text")
    result, current_heading = {}, 'Introduction'
    for tag in content.find_all(['h2', 'p']):
        if tag.name == 'h2':
            current_heading = tag.get_text(strip=True)
        elif tag.name == 'p':
            result.setdefault(current_heading, []).append(tag.get_text(strip=True))
    return result

# Example usage:
url = "https://en.wikipedia.org/wiki/Lagos"
headings_paragraphs = extract_headings_paragraphs(url)
for heading, paragraphs in headings_paragraphs.items():
    print(f"Heading: {heading}")
    for para in paragraphs:
        print(f"  Paragraph: {para[:100]}...")  # Print a preview of each paragraph

Heading: Introduction
  Paragraph: ...
  Paragraph: Lagos(/ˈleɪɡɒs/LAY-goss;[10][11]alsoUS:/ˈlɑːɡoʊs/LAH-gohss;[11][12]Yoruba:Èkó), orLagos City, is a l...
  Paragraph: Lagos emerged as a home to theAworisubgroup of theYorubaofWest Africain the 15th century, which are ...
  Paragraph: However, the state capital was later moved toIkejain 1976,[37]and the federal capital moved toAbujai...
  Paragraph: The population of Metropolitan Lagos is disputed.[44]In the 2006 federal census data, the conurbatio...
  Paragraph: TheUniversity of Lagosis one of thefirst generation universitiesof Nigeria. The business district of...
Heading: Etymology
  Paragraph: Lagosis derived from the Portuguese word for "lakes". The pronunciation/ˈleɪɡɒs/(LAY-goss) is typica...
Heading: Administration
  Paragraph: Lagos was formerly the capital city ofNigeria, but it has since been replaced byAbuja. Abuja officia...
  Paragraph: In terms of administration, Lagos is not a singlemunicipalityand therefore has no over

In [40]:
def extract_wikipedia_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    base_url = 'https://en.wikipedia.org'
    return [base_url + a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith('/wiki/') and ':' not in a['href']]

# Example usage:
url = "https://en.wikipedia.org/wiki/Lagos"
links = extract_wikipedia_links(url)
for link in links:
    print(link)

https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Lagos
https://en.wikipedia.org/wiki/Lagos
https://en.wikipedia.org/wiki/Lagos
https://en.wikipedia.org/wiki/Geographic_coordinate_system
https://en.wikipedia.org/wiki/Lagos_(disambiguation)
https://en.wikipedia.org/wiki/%C3%88k%C3%B3_(song)
https://en.wikipedia.org/wiki/Lego_(disambiguation)
https://en.wikipedia.org/wiki/Laos
https://en.wikipedia.org/wiki/Yoruba_language
https://en.wikipedia.org/wiki/Metropolis
https://en.wikipedia.org/wiki/Lagos_Island
https://en.wikipedia.org/wiki/Civic_Tower_(Lagos)
https://en.wikipedia.org/wiki/Tinubu_Square
https://en.wikipedia.org/wiki/Lekki%E2%80%93Epe_Expressway
https://en.wikipedia.org/wiki/National_Arts_Theatre
https://en.wikipedia.org/wiki/Lekki-Ikoyi_Link_Bridge
https://en.wikipedia.org/wiki/Cathedral_Church_of_Christ,_Lagos
https://en.wikipedia.org/wiki/Lagos_State
https://en.wikipedia.org/wiki/Nigeria
https://en.wikipedia.org/wiki

In [42]:
#Wraping all the previous functions into a single function that takes as parameters a Wikipedia link

def extract_wikipedia_data(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    base_url = 'https://en.wikipedia.org'

    # Extract title
    title = soup.find(id="firstHeading").get_text(strip=True)

    # Extract headings and paragraphs
    content = soup.find(id="mw-content-text")
    sections = {}
    current_heading = 'Introduction'
    sections[current_heading] = []

    for tag in content.find_all(['h2', 'p']):
        if tag.name == 'h2':
            current_heading = tag.get_text(strip=True)
            sections[current_heading] = []
        elif tag.name == 'p':
            sections[current_heading].append(tag.get_text(strip=True))

    # Extract internal links
    internal_links = [base_url + a['href'] for a in soup.find_all('a', href=True)
                      if a['href'].startswith('/wiki/') and ':' not in a['href']]

    return {
        'title': title,
        'sections': sections,
        'internal_links': internal_links
    }

# Example usage:
url = "https://en.wikipedia.org/wiki/Lagos"
data = extract_wikipedia_data(url)

# Output example
print(f"Title: {data['title']}\n")
for heading, paragraphs in data['sections'].items():
    print(f"Heading: {heading}")
    for para in paragraphs:
        print(f"  Paragraph: {para[:100]}...")  # Print a preview of each paragraph
    print()
print("Internal Links:")
for link in data['internal_links'][:10]:  # Print first 10 links as a sample
    print(link)

Title: Lagos

Heading: Introduction
  Paragraph: ...
  Paragraph: Lagos(/ˈleɪɡɒs/LAY-goss;[10][11]alsoUS:/ˈlɑːɡoʊs/LAH-gohss;[11][12]Yoruba:Èkó), orLagos City, is a l...
  Paragraph: Lagos emerged as a home to theAworisubgroup of theYorubaofWest Africain the 15th century, which are ...
  Paragraph: However, the state capital was later moved toIkejain 1976,[37]and the federal capital moved toAbujai...
  Paragraph: The population of Metropolitan Lagos is disputed.[44]In the 2006 federal census data, the conurbatio...
  Paragraph: TheUniversity of Lagosis one of thefirst generation universitiesof Nigeria. The business district of...

Heading: Etymology
  Paragraph: Lagosis derived from the Portuguese word for "lakes". The pronunciation/ˈleɪɡɒs/(LAY-goss) is typica...

Heading: History

Heading: Administration
  Paragraph: Lagos was formerly the capital city ofNigeria, but it has since been replaced byAbuja. Abuja officia...
  Paragraph: In terms of administration, Lagos is not a singlemun