Scrape data on MPs from http://www.parliament.go.ke/the-national-assembly/mps

In [24]:
import json
import re
from urllib.parse import parse_qs, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Base address of the parliament website; photo paths scraped from the MPs
# table are resolved against it to build absolute image URLs.
root_url = "http://www.parliament.go.ke"

# Upper bound, in seconds, on how long a single page request may block.
_REQUEST_TIMEOUT_SECONDS = 30


def _parse_name(raw_name):
  """
  Split an MP's display name into its individual name tokens.

  Drops the "HON." / "Hon." honorific, every comma, and any title written in
  round or square brackets, then splits what is left on whitespace.

  Args:
      raw_name: The text of the name cell.

  Returns:
      A list of name tokens; empty when nothing but honorifics/titles remain.
  """
  without_hon = raw_name.replace("HON.", "").replace("Hon.", "").replace(",", "")
  without_bracket_titles = re.sub(r'\s*[\(\[].*?[\)\]]', '', without_hon)
  return without_bracket_titles.split()


def _join_hyphenated(text):
  """Split text on hyphens, trim each piece, and rejoin the pieces with spaces."""
  return " ".join(part.strip() for part in text.strip().split("-"))


def _last_page_index(soup):
  """
  Read the index of the last results page from the pager of a listing page.

  Args:
      soup: The parsed listing page.

  Returns:
      The non-negative integer in the `page` query parameter of the link in
      the pager's last <li>, or None when the pager, the link, or a usable
      `page` value is missing.
  """
  pager_nav = soup.find('nav', class_='pager', role='navigation')
  if not pager_nav:
    return None

  ul_tag = pager_nav.find('ul')
  if not ul_tag:
    return None

  li_elements = ul_tag.find_all('li')
  if not li_elements:
    print("No 'li' elements found in the ul")
    return None

  # The last <li> may be plain text rather than a link, so guard the anchor.
  last_anchor = li_elements[-1].find('a')
  href = last_anchor.get('href') if last_anchor else None
  if not href:
    return None
  print('last pagination: ', href)

  # Look the parameter up by name instead of assuming it is the final one.
  page_values = parse_qs(urlparse(href).query).get('page')
  if not page_values:
    return None
  try:
    last_page = int(page_values[-1])
  except ValueError:
    return None
  return last_page if last_page >= 0 else None


def _parse_row(row):
  """
  Convert one `tr.mp` table row into an MP dictionary.

  Args:
      row: The <tr> element of a single MP.

  Returns:
      The MP dictionary, or None when the row has fewer than five cells or
      its name cell yields no name tokens.
  """
  cells = row.find_all("td")
  # Name, photo, county, constituency and party are all read by position.
  if len(cells) < 5:
    return None

  # Name cell (1st cell)
  names = _parse_name(cells[0].text.strip())
  if not names:
    return None
  mp_data = {'lastName': names[0].title()}
  if len(names) > 1:
    mp_data['firstName'] = names[1].title()
  if len(names) > 2:
    mp_data['otherName'] = names[2].title()

  # Photo cell (2nd cell): the image is looked up inside the cell's link.
  anchor_tag = cells[1].find('a')
  img_tag = anchor_tag.find('img') if anchor_tag else None
  image_url = img_tag.get('src') if img_tag else None
  # urljoin leaves an already-absolute src alone and resolves a relative one
  # against the site root; a missing photo is recorded as None.
  mp_data['photoUrl'] = urljoin(root_url, image_url) if image_url else None

  # County cell (3rd cell)
  mp_data['county'] = _join_hyphenated(cells[2].text)

  # Constituency cell (4th cell)
  mp_data['constituency'] = _join_hyphenated(cells[3].text)

  # Party cell (5th cell)
  mp_data['party'] = cells[4].text.strip()

  # Status cell (6th cell) is optional
  if len(cells) >= 6:
    mp_data['appointment'] = cells[5].text.strip().lower()

  return mp_data


def _extract_rows(soup):
  """Return the MP dictionaries for every usable `tr.mp` row in the page's <tbody>."""
  tbody = soup.find('tbody')
  if not tbody:
    return []
  parsed_rows = (_parse_row(row) for row in tbody.find_all('tr', class_='mp'))
  return [mp for mp in parsed_rows if mp is not None]


def extract_mp_data(url):
  """
  Extracts data for each Member of Parliament (MP) from a paginated webpage.

  The page at `url` is fetched first to discover the last page index from its
  pager; every page from 0 up to that index is then fetched and parsed. When
  no usable pager is found, the rows of the page already fetched are returned
  instead of an empty list.

  The pager link and the list of page URLs are printed as progress output.

  Args:
      url: The URL of the webpage containing the table.

  Returns:
      A list of dictionaries, where each dictionary represents an MP with the
      keys 'lastName', 'photoUrl', 'county', 'constituency' and 'party', plus
      'firstName', 'otherName' and 'appointment' when available. 'photoUrl'
      is None when the row has no image.
  """
  response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
  soup = BeautifulSoup(response.content, 'html.parser')

  last_page = _last_page_index(soup)
  if last_page is None:
    # Single-page listing: everything is on the page we already have.
    return _extract_rows(soup)

  # Append to an existing query string rather than starting a second one.
  separator = '&' if '?' in url else '?'
  table_pages = [f"{url}{separator}page={i}" for i in range(last_page + 1)]
  print("pages: ", table_pages)

  data = []
  for link in table_pages:
    page_response = requests.get(link, timeout=_REQUEST_TIMEOUT_SECONDS)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    data.extend(_extract_rows(page_soup))

  return data

if __name__ == "__main__":
  # Script entry point: scrape the members list, serialise the result, then
  # report the number of MPs found followed by the pretty-printed JSON.
  url = "http://www.parliament.go.ke/the-national-assembly/mps"
  mp_data = extract_mp_data(url)
  json_data = json.dumps(mp_data, indent=4)

  print("MPs count: ", len(mp_data))
  print("JSON: ", json_data)


    

last pagination:  ?field_name_value=%20&field_parliament_value=2022&field_employment_history_value=&page=35
pages:  ['http://www.parliament.go.ke/the-national-assembly/mps?page=0', 'http://www.parliament.go.ke/the-national-assembly/mps?page=1', 'http://www.parliament.go.ke/the-national-assembly/mps?page=2', 'http://www.parliament.go.ke/the-national-assembly/mps?page=3', 'http://www.parliament.go.ke/the-national-assembly/mps?page=4', 'http://www.parliament.go.ke/the-national-assembly/mps?page=5', 'http://www.parliament.go.ke/the-national-assembly/mps?page=6', 'http://www.parliament.go.ke/the-national-assembly/mps?page=7', 'http://www.parliament.go.ke/the-national-assembly/mps?page=8', 'http://www.parliament.go.ke/the-national-assembly/mps?page=9', 'http://www.parliament.go.ke/the-national-assembly/mps?page=10', 'http://www.parliament.go.ke/the-national-assembly/mps?page=11', 'http://www.parliament.go.ke/the-national-assembly/mps?page=12', 'http://www.parliament.go.ke/the-national-assemb