In [32]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import unicodedata


In [33]:
def generate_url(position, location):
    """Build the RemoteOK listing URL for a job position.

    Args:
        position: Free-text job title such as "data scientist". Words are
            joined with single hyphens; leading/trailing whitespace and
            repeated whitespace are collapsed so they cannot produce stray or
            doubled hyphens in the URL.
        location: Kept so existing callers keep working. It is not used when
            building the URL.

    Returns:
        The listing URL as a string, e.g.
        "https://remoteok.io/remote-data-scientist-jobs".
    """
    base_url = "https://remoteok.io/remote"
    # split() with no argument drops empty pieces, so "  a   b " -> "a-b".
    slug = "-".join(position.split())
    return f"{base_url}-{slug}-jobs"


In [37]:


# Function to extract job data from a job card
def extract_job_data(job_card):
    """Pull the fields of interest out of a single job card.

    Every field is extracted on a best-effort basis: when the expected element
    or attribute is missing, a placeholder is stored instead of raising, so a
    single malformed card cannot abort a whole scrape.

    Args:
        job_card: An object exposing BeautifulSoup's ``find`` API (``main``
            passes the ``<tr class="job">`` elements it finds on the page).

    Returns:
        dict with the keys "Job Title", "Company Name", "Location",
        "Date Posted" and "Job URL". Missing values are "N/A", except a
        missing location, which defaults to "Remote".

    Note:
        Relies on a ``clean_text`` helper that is not defined in the visible
        cells -- TODO confirm it is defined before this cell runs.
    """
    # find() returns None when nothing matches; chaining .find/.text on None
    # raises AttributeError, which is what the first three lookups guard.

    # Extract job title
    try:
        job_title = clean_text(job_card.find("a", itemprop="url").find("h2", itemprop="title").text)
    except AttributeError:
        job_title = "N/A"

    # Extract company name
    try:
        company_name = clean_text(job_card.find("span", itemprop="hiringOrganization").find("h3", itemprop="name").text)
    except AttributeError:
        company_name = "N/A"

    # Extract location
    try:
        location = clean_text(job_card.find("div", class_="location").text)
    except AttributeError:
        location = "Remote"

    # Extract date posted
    # Subscripting None (tag not found) raises TypeError, not AttributeError or
    # KeyError, so TypeError has to be caught as well.
    try:
        date_posted = job_card.find("time")["datetime"]
    except (AttributeError, KeyError, TypeError):
        date_posted = "N/A"

    # Extract job URL
    try:
        job_url = "https://remoteok.io" + job_card.find("a", class_="preventLink")["href"]
    except (AttributeError, KeyError, TypeError):
        job_url = "N/A"

    return {
        "Job Title": job_title,
        "Company Name": company_name,
        "Location": location,
        "Date Posted": date_posted,
        "Job URL": job_url
    }

In [38]:
# Helper for main(): keep user-supplied text from breaking the output path
def _filename_part(text):
    """Return ``text`` with characters other than letters, digits, space, "-" and "_" replaced by "_"."""
    return "".join(ch if ch.isalnum() or ch in " -_" else "_" for ch in str(text))


# Main function to scrape jobs
def main(position, location, timeout=30):
    """Scrape one RemoteOK listing page and save the jobs to a timestamped CSV.

    Args:
        position: Job title to search for, e.g. "developer".
        location: Location label; passed to ``generate_url`` and used in the
            output file name.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``; on success a
        CSV named ``remoteok_<position>_<location>_<timestamp>.csv`` is written
        to the current working directory.
    """
    url = generate_url(position, location)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    # Send HTTP request to the URL; report network failures the same way as a
    # bad status code instead of letting the exception escape the cell.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page: {exc}")
        return

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve the page, status code: {response.status_code}")
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all job cards on the page and extract one record per card
    job_cards = soup.find_all("tr", class_="job")
    job_data = [extract_job_data(job_card) for job_card in job_cards]

    if not job_data:
        # Still write the header-only file below, but make the empty result visible.
        print(f"Warning: no job cards found at {url}")

    # Write the extracted job data to a CSV file. Path separators and other
    # unsafe characters in the search terms are replaced so open() cannot fail
    # or write outside the working directory.
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    filename = f"remoteok_{_filename_part(position)}_{_filename_part(location)}_{timestamp}.csv"
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Job Title", "Company Name", "Location", "Date Posted", "Job URL"])
        writer.writeheader()
        writer.writerows(job_data)

    print(f"Scraping complete. Data saved to {filename}")

# Kick off one example scrape when this cell/script is executed directly
if __name__ == "__main__":
    # Example: Scrape for remote developer jobs
    main(position="developer", location="remote")

Scraping complete. Data saved to remoteok_developer_remote_20250125183921.csv


In [40]:

# Exercise the scraper with several position/location combinations
test_queries = [
    ('developer', 'remote'),             # Test for remote developers
    ('designer', 'remote'),              # Test for remote designers
    ('data scientist', 'new york'),      # Test for data scientists in New York
    ('backend engineer', 'california'),  # Test for backend engineers in California
]
for test_position, test_location in test_queries:
    main(test_position, test_location)


Scraping complete. Data saved to remoteok_developer_remote_20250125184411.csv
Scraping complete. Data saved to remoteok_designer_remote_20250125184412.csv
Scraping complete. Data saved to remoteok_data scientist_new york_20250125184413.csv
Scraping complete. Data saved to remoteok_backend engineer_california_20250125184413.csv


In [42]:
import pandas as pd
import os

# Load the CSV file generated by the scraping process
def load_scraped_data(file_path):
    """Read a scraped-jobs CSV into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file does not exist or
        contains no data (a message is printed in either case).
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: {file_path} not found.")
        return None
    except pd.errors.EmptyDataError:
        print("Error: No data found in the CSV file.")
        return None
    else:
        print(f"Data loaded successfully from {file_path}")
        return frame

# Check for missing values in the dataset
def check_missing_values(data):
    """Print and return the number of missing entries in each column.

    Args:
        data: DataFrame to inspect.

    Returns:
        Series mapping each column name to its count of missing values.
    """
    null_counts = data.isna().sum()
    print("\nMissing Values:", null_counts, sep="\n")
    return null_counts

# Check for duplicates in the dataset
def check_duplicates(data):
    """Print and return how many rows repeat an earlier row exactly.

    Args:
        data: DataFrame to inspect.

    Returns:
        The number of rows flagged by ``DataFrame.duplicated()``.
    """
    repeated_mask = data.duplicated()
    total_repeated = repeated_mask.sum()
    print("\nDuplicate Rows:")
    print(f"Total duplicates found: {total_repeated}")
    return total_repeated

# Evaluate the overall structure of the data (columns, rows, types)
def evaluate_data_structure(data):
    """Print the row count, column count, column names and dtypes of ``data``.

    Args:
        data: DataFrame to describe.

    Returns:
        None; the summary is only printed.
    """
    print("\nData Structure Evaluation:")
    row_count = data.shape[0]
    print(f"Number of rows: {row_count}")
    columns = data.columns
    print(f"Number of columns: {len(columns)}")
    print(f"Columns: {columns.tolist()}")
    print("\nData Types:", data.dtypes, sep="\n")

# Check for any outliers or unusual values in key columns (e.g., dates or salary ranges)
def evaluate_outliers(data):
    """Print rows whose posting date or salary looks suspicious.

    Two independent checks run, each only when its column is present:

    * 'Date Posted': rows whose value cannot be parsed as a date (including
      rows where it is missing) are printed.
    * 'Salary Range': every number found in the salary text is considered, so
      ranges such as "$50,000 - $70,000" are checked on both ends. A row is
      printed when its lowest figure is below 1,000 or its highest figure is
      above 1,000,000.

    The input frame is never modified.

    Args:
        data: DataFrame of scraped jobs.

    Returns:
        None; findings are only printed.
    """
    # Check if the 'Date Posted' column has any unrealistic dates
    if 'Date Posted' in data.columns:
        # Parse into a separate Series: assigning back into ``data`` would
        # silently change the caller's column for every later step.
        parsed_dates = pd.to_datetime(data['Date Posted'], errors='coerce')
        report_columns = [col for col in ('Job Title', 'Date Posted') if col in data.columns]
        out_of_range_dates = data.loc[parsed_dates.isnull(), report_columns]
        print("\nOut of Range Dates:")
        print(out_of_range_dates)

    # Check if salary ranges are available and reasonable
    if 'Salary Range' in data.columns:
        # astype(str) keeps the .str accessor usable for numeric or all-empty columns.
        salary_text = data['Salary Range'].dropna().astype(str)
        # Drop thousands separators, then collect every number in the text.
        # Stripping non-digits and converting the remainder in one go turns
        # "50000-70000" into NaN, so ranges would never be checked.
        number_lists = salary_text.str.replace(',', '', regex=False).str.findall(r'\d+(?:\.\d+)?')
        not_a_number = float('nan')
        lowest = number_lists.map(lambda nums: min(map(float, nums), default=not_a_number)).astype(float)
        highest = number_lists.map(lambda nums: max(map(float, nums), default=not_a_number)).astype(float)
        # Rows without any number compare False on both sides and are not flagged.
        out_of_range_salaries = salary_text[(lowest < 1000) | (highest > 1000000)]
        print("\nOut of Range Salaries:")
        print(out_of_range_salaries)

# Evaluate job titles and company names for consistency
def evaluate_job_titles_and_companies(data):
    """Print the ten most frequent job titles and company names.

    Args:
        data: DataFrame with 'Job Title' and 'Company Name' columns.

    Returns:
        None; the frequency tables are only printed.
    """
    print("\nJob Titles and Company Evaluation:")
    sections = (
        ('Job Title', "Top 10 Most Common Job Titles:"),
        ('Company Name', "\nTop 10 Most Common Companies:"),
    )
    for column, heading in sections:
        top_counts = data[column].value_counts().head(10)
        print(heading)
        print(top_counts)

# Main evaluation function
def evaluate_scraped_data(file_path):
    """Load a scraped CSV and run every quality check on it in order.

    Args:
        file_path: Path of the CSV file to evaluate.

    Returns:
        None. Nothing further happens when the file cannot be loaded.
    """
    data = load_scraped_data(file_path)
    if data is None:
        return

    checks = (
        check_missing_values,
        check_duplicates,
        evaluate_data_structure,
        evaluate_outliers,
        evaluate_job_titles_and_companies,
    )
    for run_check in checks:
        run_check(data)

# Example: evaluate the most recent developer/remote CSV written by main().
# main() saves into the current working directory with a timestamp suffix, so a
# hard-coded file name goes stale on every re-run. The timestamp format
# (YYYYmmddHHMMSS) sorts chronologically, so the last name is the newest file.
csv_prefix = "remoteok_developer_remote_"
csv_candidates = sorted(
    name for name in os.listdir(".")
    if name.startswith(csv_prefix) and name.endswith(".csv")
)
# Fall back to the originally recorded path when no matching file is found, so
# load_scraped_data() reports the missing file instead of this cell raising.
file_path = csv_candidates[-1] if csv_candidates else "/content/remoteok_developer_remote_20250125184411.csv"
evaluate_scraped_data(file_path)


Data loaded successfully from /content/remoteok_developer_remote_20250125184411.csv

Missing Values:
Job Title       0
Company Name    0
Location        0
Date Posted     0
Job URL         0
dtype: int64

Duplicate Rows:
Total duplicates found: 0

Data Structure Evaluation:
Number of rows: 19
Number of columns: 5
Columns: ['Job Title', 'Company Name', 'Location', 'Date Posted', 'Job URL']

Data Types:
Job Title       object
Company Name    object
Location        object
Date Posted     object
Job URL         object
dtype: object

Out of Range Dates:
Empty DataFrame
Columns: [Job Title, Date Posted]
Index: []

Job Titles and Company Evaluation:
Top 10 Most Common Job Titles:
Job Title
Backend Engineer                  2
Senior Backend Golang Engineer    1
React Frontend developer          1
Software Engineer Backend         1
Wordpress Developer               1
Front End lead developer          1
Intermediate Angular Developer    1
Software Engineer                 1
Full Stack React Eng