In [None]:
# pip install pandas requests beautifulsoup4

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
from datetime import date
import unicodedata

In [3]:
def clean_text(text):
    """Return an ASCII-only version of ``text``.

    The string is decomposed with Unicode NFKD normalization so that accented
    letters split into a base letter plus combining marks; every character
    that still cannot be represented in ASCII is then dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string containing only ASCII characters.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently discards anything outside ASCII (e.g. combining marks).
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
# Root of the site being scraped.
base_url = 'https://opportunitiesforyoungkenyans.co.ke'

# Sample URLs: the archive for one specific date, and the third page of it.
pagination_date_pattern = f'{base_url}/2024/06/13'
pagination_pattern = f'{pagination_date_pattern}/page/3'

# Request headers carrying a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

In [4]:
def get_post_body(url, headers, timeout=30):
  """Fetch a single post page and return the text of its content container.

  The text is taken from the first ``div.clearfix`` inside
  ``article.post-content`` inside ``section.main-content``.

  Args:
    url: Address of the post page.
    headers: HTTP headers passed to ``requests.get``.
    timeout: Seconds to wait for the server before ``requests`` raises a
      timeout error (without it a stalled connection blocks forever).

  Returns:
    The container's text, or ``None`` when the response status is not 200 or
    any of the expected elements is missing from the page.
  """
  response = requests.get(url, headers=headers, timeout=timeout)
  if response.status_code != 200:
    return None

  soup = BeautifulSoup(response.content, 'html.parser')

  section = soup.find('section', class_='main-content')
  if section is None:
    return None

  post_article = section.find('article', class_='post-content')
  if post_article is None:
    return None

  # Guard the last lookup as well: calling .text on a missing div used to
  # raise AttributeError instead of reporting "no body".
  post_container = post_article.find('div', class_='clearfix')
  if post_container is None:
    return None

  return post_container.text

In [5]:
def extract_date_from_url(url):
  """Build a ``year-month-day`` string from the first three path segments of ``url``.

  Args:
    url: A URL whose path starts with ``/<year>/<month>/<day>``.

  Returns:
    The three segments joined with hyphens, exactly as they appear in the URL.

  Raises:
    IndexError: If the path has fewer than three segments.
  """
  # Splitting '/2024/06/13/slug' yields a leading '' at index 0, so the
  # date components live at indexes 1..3.
  segments = urlparse(url).path.split('/')
  yyyy, mm, dd = segments[1], segments[2], segments[3]
  return f'{yyyy}-{mm}-{dd}'

In [6]:
def get_job_postings(url, headers, timeout=30):
  """Scrape one listing page and return the job posts found on it.

  Each ``div.post-classic`` card inside ``section.main-content`` /
  ``div.col-lg-8`` contributes one dict. For every card the linked post is
  fetched with ``get_post_body`` and its date is read from the link with
  ``extract_date_from_url``.

  Args:
    url: Address of the listing page.
    headers: HTTP headers passed to ``requests.get`` and ``get_post_body``.
    timeout: Seconds to wait for the listing page before ``requests`` raises
      a timeout error.

  Returns:
    A list of dicts with keys ``title``, ``link``, ``content`` and ``date``.
    The list is empty when the status is not 200 or the expected containers
    are missing; this function never returns ``None``, so callers can always
    iterate over or extend with the result.
  """
  response = requests.get(url, headers=headers, timeout=timeout)
  if response.status_code != 200:
    return []

  # The raw bytes in response.content are parsed, so assigning
  # response.encoding (which only influences response.text) had no effect
  # and has been dropped.
  soup = BeautifulSoup(response.content, 'html.parser')

  section = soup.find('section', class_='main-content')
  post_container = section.find('div', class_='col-lg-8') if section is not None else None
  if post_container is None:
    return []

  job_posts = []
  for post in post_container.find_all('div', class_='post-classic'):
    heading = post.find('h5')
    anchor = heading.find('a') if heading is not None else None
    link = anchor.get('href') if anchor is not None else None
    if not link:
      # A card without a title link cannot be followed; skip it rather than
      # aborting the whole page with an AttributeError.
      continue

    job_posts.append({
      'title': clean_text(anchor.text.strip()),
      'link': link,
      'content': get_post_body(link, headers),
      'date': extract_date_from_url(link),
    })

  return job_posts

# all_jobs = get_job_postings(pagination_date_pattern, headers)
# df = pd.DataFrame(all_jobs)
# df.to_csv('job_postings.csv', index=False)

In [None]:
def _is_error404_page(html):
  """Return True when the page's <body> tag carries the 'error404' class."""
  body = BeautifulSoup(html, 'html.parser').body
  return body is not None and 'error404' in body.get('class', [])


def _collect_jobs_for_date(date_url, headers, max_pages=999, timeout=30):
  """Gather postings from date_url/page/1, /page/2, ... until a page is missing."""
  jobs = []

  for page_count in range(1, max_pages + 1):
    page_url = f'{date_url}/page/{page_count}'
    page_url_response = requests.get(page_url, headers=headers, timeout=timeout)

    if page_url_response.status_code != 200:
      print(f'Failed to retrieve {page_url}. Status code: {page_url_response.status_code}')
      break  # Stop searching further pages on HTTP error

    if _is_error404_page(page_url_response.content):
      print(f'Error 404 found on {page_url}. Stopping search.')
      break  # A missing page means there are no more pages for this date.

    # `or []` keeps the merge safe even if get_job_postings yields None.
    jobs.extend(get_job_postings(page_url, headers) or [])
    print(f'Found {page_url}')

  return jobs


def aggregate_job_postings(start_date=None, end_date=None):
  """Scrape every daily archive in a date range and return all postings found.

  For each calendar day from ``start_date`` to ``end_date`` (both inclusive)
  the archive URL ``{base_url}/{year}/{month}/{day}`` is requested. Days whose
  request fails or whose page is marked with the ``error404`` body class are
  skipped; otherwise the day's numbered pages are walked and their postings
  collected through ``get_job_postings``.

  Note: each listing page is requested twice, once here to detect a missing
  page and once inside ``get_job_postings``.

  Args:
    start_date: First day to scrape, an object with ``year``, ``month`` and
      ``day`` attributes. Defaults to ``date(2018, 1, 1)``.
    end_date: Last day to scrape, same kind of object. Defaults to
      ``date(2018, 1, 31)``; pass ``date.today()`` to scrape up to today.

  Returns:
    A flat list of the posting dicts returned by ``get_job_postings``.
  """
  import calendar  # only needed here, to look up real month lengths

  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  }
  base_url = 'https://opportunitiesforyoungkenyans.co.ke'

  if start_date is None:
    start_date = date(2018, 1, 1)
  if end_date is None:
    end_date = date(2018, 1, 31)

  merged_jobs = []

  for year in range(start_date.year, end_date.year + 1):
    # The start/end month and day only constrain the first/last period.
    first_month = start_date.month if year == start_date.year else 1
    last_month = end_date.month if year == end_date.year else 12

    for month in range(first_month, last_month + 1):
      in_start_month = year == start_date.year and month == start_date.month
      in_end_month = year == end_date.year and month == end_date.month

      first_day = start_date.day if in_start_month else 1
      # Use the month's true length so impossible dates such as
      # February 30 are never requested.
      last_day = end_date.day if in_end_month else calendar.monthrange(year, month)[1]

      for day in range(first_day, last_day + 1):
        current_date_url = f'{base_url}/{year}/{month}/{day}'
        current_date_url_response = requests.get(current_date_url, headers=headers, timeout=30)

        if current_date_url_response.status_code != 200:
          print(f'Failed to retrieve {current_date_url}. Status code: {current_date_url_response.status_code}')
          continue  # Move on to the next day

        if _is_error404_page(current_date_url_response.content):
          # Report the date URL itself: no page URL exists yet at this point.
          print(f'No posts found at {current_date_url}. Skipping.')
          continue

        merged_jobs.extend(_collect_jobs_for_date(current_date_url, headers))
        print(f'Day {day} over.')

      print(f'Month {month} over.')
    print(f'Year {year} over.')

  count_of_merged_jobs = len(merged_jobs)

  print(f'Count of merged jobs: {count_of_merged_jobs}')
  return merged_jobs

# Keep the scraped postings in a variable instead of leaving the bare call as
# the cell's last expression, which rendered the entire list as cell output
# and then discarded it.
all_jobs = aggregate_job_postings()
# df = pd.DataFrame(all_jobs)
# df.to_csv('job_postings.csv', index=False)