# LinkedIn Scraper

#### 1. Imports and constants

In [None]:
import random  # Random numbers.
import time  # Time-related functions.
from datetime import datetime  # Timestamps for scraped records and output filenames.
from typing import List, Dict, Optional  # Enables type hinting for improved code readability and maintainability.
from urllib.parse import quote, urlencode, urljoin  # Safe URL construction.

import pandas as pd  # Data manipulation.
import requests  # Web requests.
from bs4 import BeautifulSoup  # HTML/XML parsing.

# Default request headers. The fetch functions in this file build their own
# per-request headers with a rotated User-Agent instead of using this dict.
headers = {"User-Agent": "Mozilla/5.0"}

# Pool of browser User-Agent strings; one is picked at random per request.
USER_AGENTS = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    ),
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Search locations. Each city is immediately followed by its country, so the
# flattened list alternates city, country, city, country, ...
AFRICAN_LOCATIONS = [
    place
    for city_and_country in (
        ("Nairobi", "Kenya"),
        ("Lagos", "Nigeria"),
        ("Johannesburg", "South Africa"),
        ("Cairo", "Egypt"),
        ("Accra", "Ghana"),
        ("Kampala", "Uganda"),
        ("Dar es Salaam", "Tanzania"),
    )
    for place in city_and_country
]

#### 2. Rate Limiting & User Agent Rotation

In [None]:
def get_random_delay(min_seconds: float = 1.0, max_seconds: float = 3.0) -> float:
    """Return a random pause length, in seconds, used to space out requests.

    Args:
        min_seconds: Lower bound of the delay (default 1 second).
        max_seconds: Upper bound of the delay (default 3 seconds).

    Returns:
        A float drawn uniformly from ``[min_seconds, max_seconds]``.

    Raises:
        ValueError: If a bound is negative or ``min_seconds > max_seconds``.
    """
    if min_seconds < 0 or max_seconds < min_seconds:
        raise ValueError(
            f"Invalid delay range: min_seconds={min_seconds}, max_seconds={max_seconds}"
        )
    return random.uniform(min_seconds, max_seconds)

def get_random_user_agent() -> str:
    """Pick one entry from the ``USER_AGENTS`` pool at random.

    Returns:
        The chosen User-Agent string, for use as a request header value.
    """
    chosen_agent = random.choice(USER_AGENTS)
    return chosen_agent


#### 3. URL Construction & Page Fetching

In [None]:
def construct_search_url(title: str, location: str, start: int = 0) -> str:
    """Construct the LinkedIn guest job-search URL for one page of results.

    Args:
        title: Job title / keywords to search for.
        location: Location filter passed to the search endpoint.
        start: Zero-based offset of the first result to return.

    Returns:
        The fully encoded search URL.
    """
    endpoint = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
    # Percent-encode every value so characters such as '&', '#' or '=' inside a
    # title or location cannot split or truncate the query string. Spaces are
    # emitted as %20 (quote_via=quote) rather than '+'.
    query = urlencode(
        {"keywords": title, "location": location, "start": start},
        quote_via=quote,
    )
    return f"{endpoint}?{query}"

def fetch_job_listings(url: str) -> Optional[BeautifulSoup]:
    """Fetch one job-listings page, pausing first and rotating the User-Agent.

    Args:
        url: The search URL to request.

    Returns:
        The parsed page, or ``None`` if the request failed (connection error,
        timeout, or a non-2xx HTTP status). Failures are printed, not raised.
    """
    request_headers = {'User-Agent': get_random_user_agent()}
    try:
        # Pause before every request to reduce the chance of being rate limited.
        time.sleep(get_random_delay())
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=request_headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching job listings: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")

#### 4. Extract Job IDs from Listing Page

In [None]:
def extract_job_ids(soup: BeautifulSoup) -> List[str]:
    """Collect the job IDs found on a parsed listings page.

    Every ``<li>`` is searched for a ``div.base-card``; when that div carries a
    ``data-entity-urn`` attribute, the fourth colon-separated field of the
    attribute value is taken as the job ID.

    Args:
        soup: Parsed listings page.

    Returns:
        Job IDs in document order.
    """
    job_ids = []
    for list_item in soup.find_all("li"):
        card = list_item.find("div", {"class": "base-card"})
        # Skip list items that are not job cards or carry no URN.
        if not card or "data-entity-urn" not in card.attrs:
            continue
        urn_parts = card.get("data-entity-urn").split(":")
        job_ids.append(urn_parts[3])
    return job_ids


#### 5. Fetch and Parse Individual Job Postings

In [None]:
def fetch_job_details(job_id: str, location: Optional[str] = None) -> Optional[Dict]:
    """Fetch and parse the detail page of a single LinkedIn job posting.

    Args:
        job_id: LinkedIn job posting ID.
        location: Search location to record in ``scraped_location``. When
            omitted, a module-level variable named ``location`` is used if one
            exists (this is what the function implicitly did before the
            parameter was added); otherwise ``None`` is recorded.

    Returns:
        A dict of extracted fields (each ``None`` when the element is missing),
        or ``None`` if the request failed. Failures are printed, not raised.
    """
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    request_headers = {'User-Agent': get_random_user_agent()}

    if location is None:
        # Legacy fallback: earlier versions read an undeclared global here,
        # which raised NameError when no such global existed. Looking it up
        # explicitly keeps old callers working without the crash.
        location = globals().get("location")

    try:
        # Pause before every request to reduce the chance of being rate limited.
        time.sleep(get_random_delay())
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(job_url, headers=request_headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching job details for ID {job_id}: {e}")
        return None

    job_soup = BeautifulSoup(response.text, "html.parser")
    # All four job-criteria values share one CSS class and are told apart only
    # by their position on the page.
    criteria = {"class": "description__job-criteria-text"}
    return {
        "job_title": extract_text(job_soup, "h2", {"class": "top-card-layout__title"}),
        "company_name": extract_text(job_soup, "a", {"class": "topcard__org-name-link"}),
        "location": extract_text(job_soup, "span", {"class": "topcard__flavor--bullet"}),
        "time_posted": extract_text(job_soup, "span", {"class": "posted-time-ago__text"}),
        "num_applicants": extract_text(job_soup, "span", {"class": "num-applicants__caption"}),
        "job_description": extract_text(job_soup, "div", {"class": "description__text"}),
        "employment_type": extract_text(job_soup, "span", criteria, 1),
        "seniority_level": extract_text(job_soup, "span", criteria, 0),
        "job_function": extract_text(job_soup, "span", criteria, 2),
        "industries": extract_text(job_soup, "span", criteria, 3),
        "scraped_location": location,
        "scraped_at": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    }


#### 6. Utility Function for Safe Text Extraction

In [None]:
def extract_text(soup: "BeautifulSoup", tag: str, attrs: Dict, index: Optional[int] = None) -> Optional[str]:
    """Safely extract the stripped text of one matching HTML element.

    Args:
        soup: Parsed document (or element) to search.
        tag: Tag name to look for.
        attrs: Attribute filter passed to ``find_all``.
        index: Position of the wanted match. ``None`` means the first match.

    Returns:
        The stripped text of the selected element, or ``None`` when nothing
        matches, when ``index`` is out of range, or when ``soup`` cannot be
        searched.
    """
    try:
        elements = soup.find_all(tag, attrs)
        if not elements:
            return None
        if index is None:
            return elements[0].text.strip()
        # An out-of-range index must NOT fall back to the first match: that
        # would silently put one field's value into another field.
        if -len(elements) <= index < len(elements):
            return elements[index].text.strip()
    except (AttributeError, TypeError):
        # Best-effort helper: e.g. soup is None or an element has no text.
        return None
    return None

#### 7. Full Scraping Routine per Location



In [None]:
def scrape_linkedin_jobs(title: str, location: str, pages: int = 3, page_size: int = 25) -> pd.DataFrame:
    """Scrape LinkedIn job postings for one title/location pair.

    Args:
        title: Job title / keywords to search for.
        location: Location to search in; also recorded on every row as
            ``scraped_location``.
        pages: Number of result pages to request.
        page_size: Offset step between consecutive pages (default 25).
            NOTE(review): confirm the endpoint's real page size; if it returns
            fewer cards per page than this step, results between pages are
            skipped.

    Returns:
        One row per successfully fetched job posting (empty if none).
    """
    all_jobs = []
    for page in range(pages):
        url = construct_search_url(title, location, page * page_size)
        soup = fetch_job_listings(url)
        if soup is None:
            # The fetch failed and was already reported; try the next page.
            continue
        job_ids = extract_job_ids(soup)
        print(f"Found {len(job_ids)} jobs on page {page + 1}")
        for job_id in job_ids:
            job_details = fetch_job_details(job_id)
            if job_details:
                # Record the location this call actually searched, so the
                # value never depends on whatever module-level variable
                # happens to be in scope when the detail fetch runs.
                job_details["scraped_location"] = location
                all_jobs.append(job_details)
    return pd.DataFrame(all_jobs)

#### 8. Main Execution and Data Processing

In [None]:
if __name__ == "__main__":
    # Scrape every target location, enrich the combined table with keyword
    # flags, and export it to a timestamped CSV file.
    job_title = "Data Analyst"
    location_frames = []

    for location in AFRICAN_LOCATIONS:
        print(f"\nScraping {job_title} jobs in {location}...")
        jobs_df = scrape_linkedin_jobs(job_title, location)
        if not jobs_df.empty:
            location_frames.append(jobs_df)

    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    if location_frames:
        all_african_jobs = pd.concat(location_frames, ignore_index=True)
    else:
        all_african_jobs = pd.DataFrame()

    if not all_african_jobs.empty:
        # APP platform-specific enrichment.
        # na=False makes a missing description count as "no match", so the flag
        # columns stay strictly boolean instead of containing NaN.
        descriptions = all_african_jobs['job_description']
        all_african_jobs['is_remote'] = descriptions.str.contains('remote|work from home', case=False, regex=True, na=False)
        all_african_jobs['requires_sql'] = descriptions.str.contains('sql|mysql|postgresql', case=False, regex=True, na=False)
        all_african_jobs['requires_python'] = descriptions.str.contains('python', case=False, na=False)
        all_african_jobs['requires_powerbi'] = descriptions.str.contains('powerbi|power bi', case=False, regex=True, na=False)

        # Save to CSV with timestamp
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        filename = f"APP_Data_Analyst_Jobs_Africa_{timestamp}.csv"
        all_african_jobs.to_csv(filename, index=False)
        print(f"\nSuccessfully saved {len(all_african_jobs)} jobs to {filename}")
    else:
        print("\nNo jobs found for the given criteria.")



Scraping Data Analyst jobs in Nairobi...
Found 9 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping Data Analyst jobs in Kenya...
Found 9 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping Data Analyst jobs in Lagos...
Found 10 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping Data Analyst jobs in Nigeria...
Found 10 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping Data Analyst jobs in Johannesburg...
Found 10 jobs on page 1
Found 0 jobs on page 2
Found 0 jobs on page 3

Scraping Data Analyst jobs in South Africa...
Found 10 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping Data Analyst jobs in Cairo...
Found 10 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping Data Analyst jobs in Egypt...
Found 10 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping Data Analyst jobs in Accra...
Found 10 jobs on page 1
Found 10 jobs on page 2
Fo

# BrighterMonday Jobs

In [None]:
# BrighterMonday scraper, followed by the get_text helper it uses to safely extract text
def scrape_brightermonday(job_title: str, max_pages: int = 3) -> pd.DataFrame:
    """Scrape job listings matching ``job_title`` from BrighterMonday Kenya.

    For every result card the listing fields are read from the search page;
    the job's own page is then fetched (best effort) for the description,
    criteria fields and keyword-based skill flags.

    Args:
        job_title: Search keywords.
        max_pages: Maximum number of result pages to request.

    Returns:
        One row per job card. Detail columns and skill flags are missing for
        jobs whose detail page could not be fetched. Paging stops at the first
        page with no cards or at the first page-level error (which is printed).
    """
    base_url = "https://www.brightermonday.co.ke"
    # urlencode escapes reserved characters and encodes spaces as '+'.
    search_url = f"{base_url}/jobs?{urlencode({'q': job_title})}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout, requests waits indefinitely on a stalled server.
    request_timeout = 30

    all_jobs = []

    for page in range(1, max_pages + 1):
        try:
            time.sleep(random.uniform(1, 3))
            url = f"{search_url}&page={page}" if page > 1 else search_url
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            job_cards = soup.find_all('article', class_='search-result')

            if not job_cards:
                break

            for card in job_cards:
                link = card.find('a')
                href = link.get('href') if link else None
                job = {
                    'source': 'BrighterMonday',
                    'scraped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'job_title': get_text(card, 'h3', 'search-result__job-title'),
                    'company': get_text(card, 'div', 'search-result__job-meta'),
                    'location': get_text(card, 'div', 'search-result__location'),
                    'date_posted': get_text(card, 'div', 'search-result__job-age'),
                    # urljoin handles both site-relative and absolute hrefs;
                    # plain concatenation would corrupt an absolute link.
                    'job_url': urljoin(base_url, href) if href else None,
                }

                # Get additional details from job page
                if job['job_url']:
                    try:
                        time.sleep(random.uniform(1, 2))
                        detail_response = requests.get(job['job_url'], headers=headers, timeout=request_timeout)
                        # Do not parse an error page as if it were a job page.
                        detail_response.raise_for_status()
                        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

                        job['job_description'] = get_text(detail_soup, 'div', 'description__content')
                        job['employment_type'] = get_text(detail_soup, 'span', 'description__job-criteria-text', 0)
                        job['experience_level'] = get_text(detail_soup, 'span', 'description__job-criteria-text', 1)
                        job['salary'] = get_text(detail_soup, 'span', 'description__job-criteria-text', 2)

                        # Skill detection. get_text returns None when the
                        # description is missing, so coalesce to '' before
                        # lower-casing (dict.get's default does not apply to a
                        # key that exists with value None).
                        desc = (job.get('job_description') or '').lower()
                        job['is_remote'] = 'remote' in desc or 'work from home' in desc
                        job['requires_sql'] = 'sql' in desc or 'mysql' in desc or 'postgresql' in desc
                        job['requires_python'] = 'python' in desc
                        job['requires_powerbi'] = 'power bi' in desc or 'powerbi' in desc
                        job['requires_excel'] = 'excel' in desc

                    except Exception as e:
                        # Best effort: keep the listing-level fields even when
                        # the detail page cannot be scraped.
                        print(f"Error scraping job details: {e}")

                all_jobs.append(job)

            print(f"Scraped page {page} of BrighterMonday")

        except Exception as e:
            print(f"Error scraping BrighterMonday page {page}: {e}")
            break

    return pd.DataFrame(all_jobs)

def get_text(soup, tag: str, class_name: str, index: Optional[int] = None) -> Optional[str]:
    """Safely extract the stripped text of one element matched by tag and class.

    Args:
        soup: Parsed document or element exposing ``find_all``.
        tag: Tag name to look for.
        class_name: CSS class the element must have.
        index: Position of the wanted match. ``None`` means the first match.

    Returns:
        The stripped text of the selected element, or ``None`` when nothing
        matches or ``index`` is out of range.
    """
    elements = soup.find_all(tag, class_=class_name)
    if not elements:
        return None
    if index is None:
        return elements[0].get_text(strip=True)
    # An out-of-range index must NOT fall back to the first match: that would
    # silently put one field's value into another field.
    if -len(elements) <= index < len(elements):
        return elements[index].get_text(strip=True)
    return None

# Fuzu Scraper

In [None]:
def scrape_fuzu(job_title: str, max_pages: int = 3) -> pd.DataFrame:
    """Scrape job listings matching ``job_title`` from Fuzu Kenya.

    For every result card the listing fields are read from the search page;
    the job's own page is then fetched (best effort) for the description,
    requirements, salary and keyword-based skill flags.

    Args:
        job_title: Search keywords.
        max_pages: Maximum number of result pages to request.

    Returns:
        One row per job card. Detail columns and skill flags are missing for
        jobs whose detail page could not be fetched. Paging stops at the first
        page with no cards or at the first page-level error (which is printed).
    """
    base_url = "https://www.fuzu.com"
    # urlencode escapes reserved characters and encodes spaces as '+'.
    search_url = f"{base_url}/kenya/job?{urlencode({'q': job_title})}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # Without a timeout, requests waits indefinitely on a stalled server.
    request_timeout = 30

    all_jobs = []

    for page in range(1, max_pages + 1):
        try:
            time.sleep(random.uniform(1, 3))
            url = f"{search_url}&page={page}" if page > 1 else search_url
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            job_cards = soup.find_all('div', class_='job-card')

            if not job_cards:
                break

            for card in job_cards:
                link = card.find('a')
                href = link.get('href') if link else None
                job = {
                    'source': 'Fuzu',
                    'scraped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'job_title': get_text(card, 'h3', 'job-card__title'),
                    'company': get_text(card, 'div', 'job-card__company'),
                    'location': get_text(card, 'div', 'job-card__location'),
                    'date_posted': get_text(card, 'div', 'job-card__date'),
                    # urljoin handles both site-relative and absolute hrefs;
                    # plain concatenation would corrupt an absolute link.
                    'job_url': urljoin(base_url, href) if href else None,
                }

                # Get additional details from job page
                if job['job_url']:
                    try:
                        time.sleep(random.uniform(1, 2))
                        detail_response = requests.get(job['job_url'], headers=headers, timeout=request_timeout)
                        # Do not parse an error page as if it were a job page.
                        detail_response.raise_for_status()
                        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

                        job['job_description'] = get_text(detail_soup, 'div', 'job-description')
                        job['requirements'] = get_text(detail_soup, 'div', 'job-requirements')

                        # Extract salary if available
                        salary_div = detail_soup.find('div', class_='salary-range')
                        if salary_div:
                            job['salary'] = salary_div.get_text(strip=True).replace('Salary:', '').strip()

                        # Skill detection. Either field may be None when its
                        # element is missing, so drop the missing parts before
                        # joining; the space keeps words at the seam apart.
                        text_parts = (job.get('job_description'), job.get('requirements'))
                        desc = ' '.join(part for part in text_parts if part).lower()
                        job['is_remote'] = 'remote' in desc or 'work from home' in desc
                        job['requires_sql'] = 'sql' in desc or 'mysql' in desc or 'postgresql' in desc
                        job['requires_python'] = 'python' in desc
                        job['requires_powerbi'] = 'power bi' in desc or 'powerbi' in desc
                        job['requires_excel'] = 'excel' in desc

                    except Exception as e:
                        # Best effort: keep the listing-level fields even when
                        # the detail page cannot be scraped.
                        print(f"Error scraping job details: {e}")

                all_jobs.append(job)

            print(f"Scraped page {page} of Fuzu")

        except Exception as e:
            print(f"Error scraping Fuzu page {page}: {e}")
            break

    return pd.DataFrame(all_jobs)

# Combined Execution for BrighterMonday, Fuzu and LinkedIn (Kenya)

In [None]:
from datetime import datetime

def scrape_all_kenyan_job_sites(job_title: str = "Data Analyst") -> pd.DataFrame:
    """Scrape BrighterMonday, Fuzu and LinkedIn (Kenya) and combine the results.

    The combined table is also written to a timestamped CSV file in the
    current working directory.

    Args:
        job_title: Search keywords used on every platform. Spaces are replaced
            by underscores when building the output filename.

    Returns:
        The rows from all three scrapers, concatenated with a fresh index.
    """
    print("Starting BrighterMonday scrape...")
    bm_df = scrape_brightermonday(job_title)

    print("\nStarting Fuzu scrape...")
    fuzu_df = scrape_fuzu(job_title)

    print("\nStarting LinkedIn Kenya scrape...")
    linkedin_df = scrape_linkedin_jobs(job_title, "Kenya")

    # The scrapers return different column sets; concat aligns them by name
    # and leaves missing cells empty.
    combined_df = pd.concat([bm_df, fuzu_df, linkedin_df], ignore_index=True)

    # Save to CSV
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    title_slug = job_title.replace(' ', '_')
    filename = f"APP_Kenya_{title_slug}_Jobs_{timestamp}.csv"
    combined_df.to_csv(filename, index=False)

    print(f"\nScraping complete! Saved {len(combined_df)} jobs to {filename}")
    return combined_df

# Entry point: run all three Kenyan scrapers, then preview the first rows of
# the combined table.
if __name__ == "__main__":
    df = scrape_all_kenyan_job_sites()
    print(df.head(n=5))

Starting BrighterMonday scrape...

Starting Fuzu scrape...
Error scraping Fuzu page 1: 403 Client Error: Forbidden for url: https://www.fuzu.com/kenya/job?q=Data+Analyst

Starting LinkedIn Kenya scrape...
Found 9 jobs on page 1
Found 10 jobs on page 2
Found 10 jobs on page 3

Scraping complete! Saved 29 jobs to APP_Kenya_Data_Analyst_Jobs_20250519_081402.csv
                              job_title                  company_name  \
0                      SQL Data Analyst                 Emma of Torre   
1                Marketing Data Analyst                    3Commas.io   
2                        Data Scientist  ENGIE Energy Access (Africa)   
3                   Junior Game Analyst                Gunzilla Games   
4  Business Intelligence Analyst - 1570                  In All Media   

                                       location   time_posted num_applicants  \
0                                         Kenya    3 days ago           None   
1  Nzalae/ Nzawa locations, Kitui County