<a href="https://colab.research.google.com/github/gathungugabriel/WEB-CRAWLING-CRAIGSLIST-JOB-SCRAPER/blob/main/WEB_CRAWLING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Extract the cities and their corresponding href within the original url

In [2]:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

def scrape_craigslist_geo(driver_path):
    cities_dict = {}
    url = 'https://geo.craigslist.org/iso/us'
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.binary_location = driver_path  # Set Chrome binary location
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        select_tag = driver.find_element('css selector', '.geo-site-list')
        if select_tag:
            li_tags = select_tag.find_elements('tag name', 'li')
            for li_tag in li_tags:
                a_tag = li_tag.find_element('tag name', 'a')
                city = a_tag.text
                href = a_tag.get_attribute('href')
                cities_dict[city] = href
        driver.quit()
    except WebDriverException as e:
        print(f"An error occurred while fetching {url}: {e}")
    return cities_dict

# Specify the Chrome WebDriver path
driver_path = r'C:\Users\Gabriel\Desktop\apps\chrome-win64\chrome-win64\chrome.exe'

# Call the function to scrape cities listed under "choose the site nearest you"
cities_dict = scrape_craigslist_geo(driver_path)

# Print the extracted cities and their href anchors
for city, href in cities_dict.items():
    print(f"{city}: {href}")


abilene, TX: https://abilene.craigslist.org/
akron / canton: https://akroncanton.craigslist.org/
albany, GA: https://albanyga.craigslist.org/
albany, NY: https://albany.craigslist.org/
albuquerque: https://albuquerque.craigslist.org/
altoona-johnstown: https://altoona.craigslist.org/
amarillo, TX: https://amarillo.craigslist.org/
ames, IA: https://ames.craigslist.org/
anchorage / mat-su: https://anchorage.craigslist.org/
annapolis, MD: https://annapolis.craigslist.org/
ann arbor, MI: https://annarbor.craigslist.org/
appleton-oshkosh-FDL: https://appleton.craigslist.org/
asheville, NC: https://asheville.craigslist.org/
ashtabula, OH: https://ashtabula.craigslist.org/
athens, GA: https://athensga.craigslist.org/
athens, OH: https://athensohio.craigslist.org/
atlanta, GA: https://atlanta.craigslist.org/
auburn, AL: https://auburn.craigslist.org/
augusta, GA: https://augusta.craigslist.org/
austin, TX: https://austin.craigslist.org/
bakersfield, CA: https://bakersfield.craigslist.org/
balt

### Construct today's job urls for all the cities

In [3]:
# Initialize the list to store today's job URLs
todays_job_urls = []

# Iterate over the cities_dict and construct the URLs
for city, href in cities_dict.items():
    city_url = href + 'search/ggg?postedToday=1#search=1~thumb~0~0'
    todays_job_urls.append(city_url)

# Print the constructed URLs
print(todays_job_urls)


['https://abilene.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://akroncanton.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://albanyga.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://albany.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://albuquerque.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://altoona.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://amarillo.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://ames.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://anchorage.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://annapolis.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://annarbor.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://appleton.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0', 'https://asheville.craigslist.org/search/ggg?posted

### Extract todays jobs and their Href

In [4]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initialize the Chrome WebDriver
driver = webdriver.Chrome()

# Create an empty dictionary to store job postings and their href links
job_dict = {}

# Iterate through the URLs
for url in todays_job_urls:
    print(f"Processing URL: {url}")  # Debugging statement
    
    try:
        # Open the webpage
        driver.get(url)
        
        # Wait for the job listings to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ol > li[data-pid]")))
        
        # Find all the job list items
        job_list_items = driver.find_elements(By.CSS_SELECTOR, "ol > li[data-pid]")
        
        print(f"Number of job items found: {len(job_list_items)}")
        
        # Iterate through each job list item
        for job_item in job_list_items:
            try:
                # Extract data-pid attribute
                data_pid = job_item.get_attribute("data-pid")
                
                # Extract job title
                title_element = job_item.find_element(By.CSS_SELECTOR, ".posting-title > span.label")
                title = title_element.text if title_element else "Title not found"
                
                # Extract href link
                href_element = job_item.find_element(By.CSS_SELECTOR, ".cl-app-anchor")
                href = href_element.get_attribute("href") if href_element else "Href not found"
                
                # Add the job title and href link to the dictionary
                job_dict[data_pid] = {"title": title, "href": href}
            except Exception as e:
                print(f"Error extracting job details: {e}")
    except Exception as e:
        print(f"Error loading job listings for URL {url}: {e}")

# Close the WebDriver
driver.quit()

# Print the dictionary
for data_pid, job_details in job_dict.items():
    print(f"Data PID: {data_pid}")
    print(f"Job Title: {job_details['title']}")
    print(f"Href Link: {job_details['href']}")
    print()


Processing URL: https://abilene.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 60
Processing URL: https://akroncanton.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 60
Processing URL: https://albanyga.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 60
Processing URL: https://albany.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 60
Processing URL: https://albuquerque.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 21
Processing URL: https://altoona.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 60
Processing URL: https://amarillo.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 13
Processing URL: https://ames.craigslist.org/search/ggg?postedToday=1#search=1~thumb~0~0
Number of job items found: 55
Processing URL: https://an

In [6]:
len(job_dict)

1618

### Remove job duplicates

In [18]:
# Create an empty dictionary to store unique job postings
unique_jobs = {}

# Create a set to keep track of unique titles
seen_titles = set()

# Iterate through the job_dict
for data_pid, job_details in job_dict.items():
    # Get the title of the job
    title = job_details['title']
    
    # Check if the title is already seen
    if title not in seen_titles:
        # Add the title to the set of seen titles
        seen_titles.add(title)
        
        # Add the job details to the unique_jobs dictionary
        unique_jobs[data_pid] = job_details

# Update job_dict with unique job postings
job_dict = unique_jobs

# Print the updated job_dict
len(job_dict)


907

Job postings saved to C:\Users\Gabriel\Desktop\Coding\WEB-CRAWLING-CRAIGSLIST-JOB-SCRAPER\job_postings.csv


### Filter the in person jobs from job_dict

In [12]:
# import nltk
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...


True

In [19]:
import re
from nltk.stem import WordNetLemmatizer

# List of words to be eradicated
words_to_eradicate = ['till', 'garden', 'repair', 'delivery', 'housekeeping', 'movers', 'cashiers',
                      'door', 'model', 'shopper', 'videographer', 'photographer', 'mechanic',
                      'carpenter', 'cleaner', 'salgate']

# Lemmatize the words in the list
wordnet_lemmatizer = WordNetLemmatizer()
words_to_eradicate = [wordnet_lemmatizer.lemmatize(word) for word in words_to_eradicate]

# Create a regex pattern to match any form of the words in the list
pattern = re.compile(r'\b(?:' + '|'.join(words_to_eradicate) + r')\b', flags=re.IGNORECASE)

# Create a new dictionary to store filtered job postings
filtered_job_dict = {}

# Iterate over the job_dict
for data_pid, job_details in job_dict.items():
    # Check if the job title contains any form of the words to be eradicated
    if not re.search(pattern, job_details['title']):
        # If the job title doesn't contain any form of the words to be eradicated, add it to the filtered dictionary
        filtered_job_dict[data_pid] = job_details

# Print the filtered job postings
for data_pid, job_details in filtered_job_dict.items():
    print(f"Data PID: {data_pid}")
    print(f"Job Title: {job_details['title']}")
    print(f"Href Link: {job_details['href']}")
    print()


Data PID: 7741921742
Job Title: Need Moving Help TODAY! Monday, 4/29!!!
Href Link: https://austin.craigslist.org/lbg/d/round-rock-need-moving-help-today/7741921742.html
Data PID: 7741915872
Job Title: * Deliver with DoorDash *
Href Link: https://austin.craigslist.org/lbg/d/austin-deliver-with-doordash/7741915872.html
Data PID: 7741917480
Job Title: Food Preparation Worker - San Antonio, TX
Href Link: https://sanantonio.craigslist.org/lbg/d/san-antonio-food-preparation-worker-san/7741917480.html
Data PID: 7741909932
Job Title: Remote Travel Agent
Href Link: https://amarillo.craigslist.org/cpg/d/amarillo-remote-travel-agent/7741909932.html
Data PID: 7741907517
Job Title: Warehouse Countersales - Odessa, TX
Href Link: https://odessa.craigslist.org/lbg/d/odessa-warehouse-countersales-odessa-tx/7741907517.html
Data PID: 7741906087
Job Title: NEED LAWN CARE WORKER FOR TASKS - at least $23/hr as a Lawn Care Worke
Href Link: https://sanmarcos.craigslist.org/tlg/d/san-antonio-need-lawn-care-wor

In [20]:
len(filtered_job_dict)

786

### Saving the final jobs list into a csv

In [22]:
import csv

# Define the filename for the CSV file
csv_filename = r"C:\Users\Gabriel\Desktop\Coding\WEB-CRAWLING-CRAIGSLIST-JOB-SCRAPER\job_postings.csv"

# Open the CSV file in write mode
with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
    # Define the fieldnames for the CSV file
    fieldnames = ['Data PID', 'Job Title', 'Href Link']
    
    # Create a CSV writer object
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    
    # Write the header row
    writer.writeheader()
    
    # Write each job posting to the CSV file
    for data_pid, job_details in filtered_job_dict.items():
        writer.writerow({'Data PID': data_pid, 'Job Title': job_details['title'], 'Href Link': job_details['href']})

print(f"Job postings saved to {csv_filename}")


Job postings saved to C:\Users\Gabriel\Desktop\Coding\WEB-CRAWLING-CRAIGSLIST-JOB-SCRAPER\job_postings.csv
