In [None]:
# pip install pandas requests beautifulsoup4

In [7]:
import requests
import unicodedata
import os
import pandas as pd

from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import date
from pathlib import Path

In [2]:
def clean_text(text):
    """Return an ASCII-only version of ``text``.

    The string is first decomposed with Unicode NFKD normalization, so that
    accented letters and compatibility characters are split into a base
    character plus combining marks. Anything that still cannot be encoded as
    ASCII after that step is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' discards every code point that has no ASCII representation.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [3]:
def get_post_body(url, headers, timeout=30, max_length=10000):
  """Download a single post page and return its cleaned body text.

  The text is taken from ``section.main-content > article.post-content >
  div.clearfix`` and passed through ``clean_text``.

  Args:
    url: Address of the post page.
    headers: HTTP headers forwarded to ``requests.get``.
    timeout: Seconds to wait for the server before ``requests`` raises.
    max_length: Maximum number of characters kept from the body.

  Returns:
    The cleaned body text (at most ``max_length`` characters), or ``None``
    when the response status is not 200 or any of the expected elements is
    missing from the page.
  """
  response = requests.get(url, headers=headers, timeout=timeout)
  if response.status_code != 200:
    return None

  soup = BeautifulSoup(response.content, 'html.parser')

  section = soup.find('section', class_='main-content')
  if not section:
    return None

  post_article = section.find('article', class_='post-content')
  if not post_article:
    return None

  post_container = post_article.find('div', class_='clearfix')
  if not post_container:
    return None

  post_container_text = clean_text(post_container.get_text(separator=" ", strip=True))

  # Over-long bodies are truncated so the return value is always text;
  # returning the character count instead would mix ints into the
  # 'content' column built by the caller.
  return post_container_text[:max_length]

In [4]:
def extract_date_from_url(url):
  """Build a ``year-month-day`` string from the first three path segments of ``url``.

  The path is split on '/', so segment 0 is the empty string before the
  leading slash and segments 1-3 are used as year, month and day.
  Raises ``IndexError`` when the path has fewer than three segments.
  """
  segments = urlparse(url).path.split('/')
  year, month, day = segments[1], segments[2], segments[3]
  return '-'.join((year, month, day))

In [5]:
def get_job_postings(url, headers, timeout=30):
  """Scrape one listing page and return the job posts found on it.

  Every ``div.post-classic`` inside ``section.main-content > div.col-lg-8``
  is treated as one post. For each post the linked page is downloaded with
  ``get_post_body`` and the date is derived from the link with
  ``extract_date_from_url``.

  Args:
    url: Address of the listing page.
    headers: HTTP headers forwarded to ``requests.get`` and ``get_post_body``.
    timeout: Seconds to wait for the listing page before ``requests`` raises.

  Returns:
    A list of dicts with the keys 'title', 'link', 'content' and 'date'.
    The list is empty when the response status is not 200 or the expected
    containers are missing, so callers can always iterate over the result.
  """
  response = requests.get(url, headers=headers, timeout=timeout)
  if response.status_code != 200:
    return []

  # The raw bytes are parsed, so BeautifulSoup detects the encoding itself;
  # setting response.encoding would only influence response.text.
  soup = BeautifulSoup(response.content, 'html.parser')

  section = soup.find('section', class_='main-content')
  post_container = section.find('div', class_='col-lg-8') if section else None
  if not post_container:
    return []

  job_posts = []

  for post in post_container.find_all('div', class_='post-classic'):
    title_heading_element = post.find('h5')
    anchor = title_heading_element.find('a') if title_heading_element else None
    link = anchor.get("href") if anchor else None

    # Skip entries without a usable title link instead of failing the whole page.
    if not link:
      continue

    title = clean_text(anchor.text.strip())
    content = get_post_body(link, headers)
    # Named posted_date so the imported datetime.date is not shadowed.
    posted_date = extract_date_from_url(link)

    job_posts.append({
      'title': title,
      'link': link,
      'content': content,
      'date': posted_date
    })

  return job_posts

In [8]:
def _collect_month_jobs(month_url, headers, timeout=30, max_pages=999):
  """Walk the numbered pages under ``month_url`` and return all jobs found."""
  month_jobs = []

  for page_count in range(1, max_pages + 1):
    page_url = f'{month_url}/page/{page_count}'
    page_url_response = requests.get(page_url, headers=headers, timeout=timeout)

    if page_url_response.status_code != 200:
      print(f'Failed to retrieve {page_url}. Status code: {page_url_response.status_code}')
      break  # Stop searching further pages on HTTP error

    page_url_body_tag = BeautifulSoup(page_url_response.content, 'html.parser').body

    # A page answered with status 200 but carrying an 'error404' class on
    # <body> is treated as a missing page, which ends pagination for the month.
    if page_url_body_tag and 'error404' in page_url_body_tag.get('class', []):
      print(f'Error 404 found on {page_url}. Stopping search.')
      break

    # get_job_postings requests the page again itself; `or []` protects
    # against a None result so the accumulation below cannot fail.
    month_jobs.extend(get_job_postings(page_url, headers) or [])
    print(f'Found {page_url}')

  return month_jobs


def aggregate_job_postings():
  """Scrape every month from January 2018 up to the current month.

  For each month the archive URL ``<base_url>/<year>/<month>`` is requested
  first. If it answers with status 200, its numbered pages are scraped and
  the collected posts are written to ``data/job_postings_<month>-<year>.csv``.
  Months whose archive URL does not answer with 200 are skipped and no file
  is written for them.

  Progress is reported with ``print``. Nothing is returned.
  """
  headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  }
  request_timeout = 30  # seconds; without it a stalled connection blocks forever

  data_folder = Path('data')
  data_folder.mkdir(parents=True, exist_ok=True)

  first_year = 2018
  first_month = 1

  date_today = date.today()
  year_today = date_today.year
  month_today = date_today.month

  base_url = 'https://opportunitiesforyoungkenyans.co.ke'

  for year in range(first_year, year_today + 1):
    # The current year is only scraped up to the current month.
    last_month = month_today if year == year_today else 12

    for month in range(first_month, last_month + 1):
      current_month_url = f'{base_url}/{year}/{month}'
      current_month_url_response = requests.get(
        current_month_url, headers=headers, timeout=request_timeout
      )
      print(current_month_url)

      if current_month_url_response.status_code != 200:
        print(f'Failed to retrieve {current_month_url}. Status code: {current_month_url_response.status_code}')
        continue  # Skip this month entirely; move on to the next one

      month_jobs = _collect_month_jobs(current_month_url, headers, timeout=request_timeout)

      print(f'Month {month} over.')

      file_name = f'job_postings_{month}-{year}.csv'
      file_path = data_folder / file_name

      pd.DataFrame(month_jobs).to_csv(file_path, index=False)

      print(f'{month}-{year} spreadsheet exported')

    print(f'Year {year} over.')

  print('Scraping finished.')

# Run the full scrape; each scraped month is exported as a CSV file in the data folder.
aggregate_job_postings()

https://opportunitiesforyoungkenyans.co.ke/2018/1
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/1
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/2
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/3
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/4
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/5
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/6
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/7
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/8
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/9
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/10
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/11
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/12
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/13
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/14
Found https://opportunitiesforyoungkenyans.co.ke/2018/1/page/15

KeyboardInterrupt: 

In [12]:
def combine_files():
    """Concatenate the monthly CSV exports into one file.

    Looks for ``data/job_postings_<month>-<year>.csv`` for every month from
    January 2018 up to the current month, in chronological order, and writes
    the concatenation to ``data/complete_unlabeled_data.csv``.

    Missing files, files with no content and files without any data rows
    are skipped. Progress is reported with ``print``. Nothing is returned.
    """
    data_folder = Path('data')
    data_folder.mkdir(parents=True, exist_ok=True)

    first_year = 2018
    first_month = 1

    date_today = date.today()
    year_today = date_today.year
    month_today = date_today.month

    all_dataframes = []

    for year in range(first_year, year_today + 1):
        # The current year only has files up to the current month.
        last_month = month_today if year == year_today else 12

        for month in range(first_month, last_month + 1):
            file_name = f'job_postings_{month}-{year}.csv'
            file_path = data_folder / file_name

            print(f'Reading file: {file_path}')

            try:
                df = pd.read_csv(file_path)
            except FileNotFoundError:
                print(f'File not found: {file_path}')
                continue
            except pd.errors.EmptyDataError:
                # Raised by read_csv when the file has no columns to parse.
                print(f'File is empty: {file_path}')
                continue

            # A frame without rows adds nothing and could only contribute
            # stray columns to the combined result.
            if df.empty:
                continue

            all_dataframes.append(df)

    if all_dataframes:
        file_name = 'complete_unlabeled_data.csv'
        file_path = data_folder / file_name

        combined_df = pd.concat(all_dataframes, ignore_index=True)
        combined_df.to_csv(file_path, index=False)
        print(f'All files have been combined and saved to {file_path}')
    else:
        print('No files were found to combine.')

# Merge the per-month CSV files into data/complete_unlabeled_data.csv.
combine_files()

Reading file: data\job_postings_1-2018.csv
Reading file: data\job_postings_2-2018.csv
Reading file: data\job_postings_3-2018.csv
Reading file: data\job_postings_4-2018.csv
Reading file: data\job_postings_5-2018.csv
Reading file: data\job_postings_6-2018.csv
Reading file: data\job_postings_7-2018.csv
Reading file: data\job_postings_8-2018.csv
Reading file: data\job_postings_9-2018.csv
Reading file: data\job_postings_10-2018.csv
Reading file: data\job_postings_11-2018.csv
Reading file: data\job_postings_12-2018.csv
Reading file: data\job_postings_1-2019.csv
Reading file: data\job_postings_2-2019.csv
Reading file: data\job_postings_3-2019.csv
Reading file: data\job_postings_4-2019.csv
Reading file: data\job_postings_5-2019.csv
Reading file: data\job_postings_6-2019.csv
Reading file: data\job_postings_7-2019.csv
Reading file: data\job_postings_8-2019.csv
Reading file: data\job_postings_9-2019.csv
Reading file: data\job_postings_10-2019.csv
Reading file: data\job_postings_11-2019.csv
Readin