## This script contains Web Scraping logic to scrape all the Data Engineer jobs from Internet.

- The data scraped will be
  - Date, Job Name, Company, Link to apply,
- We will be using BeautifulSoup for this.

In [1]:
import html

from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import pandas as pd
import matplotlib
import os


In [2]:
url = "https://weworkremotely.com/remote-jobs"
response = requests.get(url)
print("Status: ", response.status_code)
print(response.text[:100])

Status:  200
<!DOCTYPE html><html><head><meta charset=utf-8 /><script>window.NREUM||(NREUM={});NREUM.info={"beaco


In [3]:

def parse_relative_date(date_string):
    """
    In weworkremotely site the instead of date posted
    how many days ago the job posted was used, so
    this function is used to convert back to date
    :param date_string: how many days ago the job posted
    :return: date (%Y-%m-%d)
    """
    now = datetime.now()
    if "d" in date_string:
        days = int(date_string.replace("d", ""))
        return (now - timedelta(days=days)).strftime("%Y-%m-%d")
    else:
        return now.strftime("%Y-%m-%d")


In [84]:

def scraper_main_page(url):
    """
    Scrape wework site and return a pandas dataframe
    :param url: url to scrape
    :return: dataframe
    """
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    if response.status_code == 200:
        print("Successfully retrieved page")
        soup = BeautifulSoup(response.text, "html.parser")

        # new jobs are wrapped inside the new-listing-container
        jobs = soup.find_all("li",
                             class_="new-listing-container")

    else:
        print("Failed to scrape")
        print("status code: " + str(response.status_code))
        return None

    job_data = []

    for job in jobs:
        title = job.find("h3", class_="new-listing__header__title")
        company = job.find("p", class_="new-listing__company-name")
        location = job.find("p", class_=
        "new-listing__company-headquarters")
        links = job.find_all("a", href=True)
        date = (job.find("p",
                         class_="new-listing__header__icons__date")
                .text.strip())
        date_new = parse_relative_date(date)

        if title or company or links:
            job_data.append({
                "title": title.text.strip(),
                "company": company.text.strip(),
                "location": location.text.strip() if location else None,
                "date": date_new,
                "url": "https://www.weworkremotely.com"
                       + links[1]["href"] if links[1] else None,
                "date_posted": date_new
            })
    print("Successfully scraped, Number of jobs: " +
          str(len(job_data)))
    return pd.DataFrame(job_data)


In [5]:
url = "https://weworkremotely.com/remote-jobs"
df = scraper_main_page(url)
df.head()

Successfully retrieved page
Successfully scraped, Number of jobs: 165


Unnamed: 0,title,company,location,date,url,date_posted
0,Senior Full-Stack Engineer — CRM Platform (Rem...,Tasty LLC,Tasty LLC,2025-09-12,https://www.weworkremotely.com/remote-jobs/tas...,2025-09-12
1,Senior Fullstack Developer,Zencastr,Zencastr,2025-09-08,https://www.weworkremotely.com/remote-jobs/zen...,2025-09-08
2,Full-Stack Developer (TypeScript/Next.js • Cloud),VidRush,UK,2025-09-05,https://www.weworkremotely.com/remote-jobs/vid...,2025-09-05
3,QA Engineer Automation OR manual,CloudDevs,San Francisco,2025-08-31,https://www.weworkremotely.com/remote-jobs/clo...,2025-08-31
4,Coding bootcamp - Job Guaranteed,Metana,Learn job-ready skills on your schedule with 1...,2025-09-12,https://www.weworkremotely.comhttps://metana.i...,2025-09-12


In [6]:
sample_url = df['url'][0]

In [7]:
sample_url_scrape = requests.get(sample_url)
print(sample_url_scrape.status_code)

403


In [8]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.example.com",  # use the listing page URL here
}

In [9]:
sample_url_scrape = requests.get(sample_url, headers=headers)
print(sample_url_scrape.status_code)

200


In [10]:
print(sample_url_scrape.text)

<!DOCTYPE html><html><head><meta charset=utf-8 /><script>window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"f7ae79e7ca","applicationID":"192262830","transactionName":"d1gPFhEMXVVWQxwMDVZEThELDEY=","queueTime":1,"applicationTime":73,"agent":""}</script><script>(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"f7ae79e7ca",applicationID:"192262830"};;/*! For license information please see nr-loader-rum-1.297.0.min.js.LICENSE.txt */
    "@context" : "http://schema.org/",
    "@type" : "JobPosting",
    "title" : "Senior Full-Stack Engineer — CRM Platform (Remote, High Ownership)",
       "image": "https://we-work-remotely.imgix.net/logos/0171/1721/logo.gif?ixlib=rails-4.0.0&w=50&h=50&dpr=2&fit=fill&auto=compress",
    "description" : "&lt;h1 dir=&quot;ltr&quot;&gt;About the role&lt;/h1&gt;
&lt;p dir=&quot;ltr&quot;&gt;We&amp;rsquo;re hiring Senior Full-Stac

In [12]:
soup = BeautifulSoup(sample_url_scrape.text, "html.parser")
soup

<!DOCTYPE html>
<html><head><meta charset="utf-8"/><script>window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"f7ae79e7ca","applicationID":"192262830","transactionName":"d1gPFhEMXVVWQxwMDVZEThELDEY=","queueTime":1,"applicationTime":73,"agent":""}</script><script>(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"f7ae79e7ca",applicationID:"192262830"};;/*! For license information please see nr-loader-rum-1.297.0.min.js.LICENSE.txt */
    "@context" : "http://schema.org/",
    "@type" : "JobPosting",
    "title" : "Senior Full-Stack Engineer — CRM Platform (Remote, High Ownership)",
       "image": "https://we-work-remotely.imgix.net/logos/0171/1721/logo.gif?ixlib=rails-4.0.0&w=50&h=50&dpr=2&fit=fill&auto=compress",
    "description" : "&lt;h1 dir=&quot;ltr&quot;&gt;About the role&lt;/h1&gt;
&lt;p dir=&quot;ltr&quot;&gt;We&amp;rsquo;re hiring Senior Full-Sta

In [13]:
import json
script = soup.find("script", type="application/ld+json")

In [18]:
soup

<!DOCTYPE html>
<html><head><meta charset="utf-8"/><script>window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"f7ae79e7ca","applicationID":"192262830","transactionName":"d1gPFhEMXVVWQxwMDVZEThELDEY=","queueTime":1,"applicationTime":73,"agent":""}</script><script>(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"f7ae79e7ca",applicationID:"192262830"};;/*! For license information please see nr-loader-rum-1.297.0.min.js.LICENSE.txt */
    "@context" : "http://schema.org/",
    "@type" : "JobPosting",
    "title" : "Senior Full-Stack Engineer — CRM Platform (Remote, High Ownership)",
       "image": "https://we-work-remotely.imgix.net/logos/0171/1721/logo.gif?ixlib=rails-4.0.0&w=50&h=50&dpr=2&fit=fill&auto=compress",
    "description" : "&lt;h1 dir=&quot;ltr&quot;&gt;About the role&lt;/h1&gt;
&lt;p dir=&quot;ltr&quot;&gt;We&amp;rsquo;re hiring Senior Full-Sta

In [41]:
import re, html, json

script = soup.find("script", type="application/ld+json")
if script and script.string:
    raw = html.unescape(script.string)

    # Escape HTML tags inside the description
    raw = re.sub(
        r'("description"\s*:\s*")([\s\S]*?)(",$)',
        lambda m: m.group(1) + m.group(2).replace('"', '\\"').replace("\n", " ") + m.group(3),
        raw,
        flags=re.MULTILINE
    )

    # Remove control characters
    raw = re.sub(r'[\x00-\x1f\x7f]', '', raw)

    try:
        job_data = json.loads(raw)
        print(job_data["title"], job_data["hiringOrganization"]["name"])
    except json.JSONDecodeError as e:
        print("Still invalid JSON:", e)
else:
    print("No JSON-LD found")

Senior Full-Stack Engineer — CRM Platform (Remote, High Ownership) Tasty LLC


In [42]:
soup.text

"Remote Senior Full-Stack Engineer — CRM Platform (Remote, High Ownership) at Tasty LLCJoin the World's Largest Remote Community. We're waiting for you!  Connect and Grow Today! Jobs  Top Trending Remote Jobs New!All JobsProgramming Full-Stack ProgrammingFront-End ProgrammingBack-End ProgrammingDesignDevops and SysadminManagement and FinanceProductCustomer SupportSales and MarketingAll Other JobsFull-TimeContractCommunity BlogPodcastRemote Work TrendsRemote Work Community New!Find work Top 100 Remote CompaniesTop Remote Job SearchesView All Jobs Posted New! Automate your Job Search New!FAQ - Job SeekersContact SupportRSS FeedRemote Work ResourcesCareer Services Resume Review Service New! Automate your Job Search New!Find talent Remote Hiring GuideHiring Remote Software DevelopersWhy Choose WWR?FAQ - Employers Save with Bundles  Sign in Post a job Sign in  Jobs Top Trending Remote JobsNew!All Jobs Programming Full-Stack ProgrammingFront-End ProgrammingBack-End ProgrammingDesignDevops an

In [43]:
raw

' {    "@context" : "http://schema.org/",    "@type" : "JobPosting",    "title" : "Senior Full-Stack Engineer — CRM Platform (Remote, High Ownership)",       "image": "https://we-work-remotely.imgix.net/logos/0171/1721/logo.gif?ixlib=rails-4.0.0&w=50&h=50&dpr=2&fit=fill&auto=compress",    "description" : "<h1 dir=\\"ltr\\">About the role</h1> <p dir=\\"ltr\\">We&rsquo;re hiring Senior Full-Stack Engineers at Tasty to build the internal CRM platform that powers our teams and growth. You&rsquo;ll ship real product, own meaningful slices of the stack, and help define the architecture as we scale.<br>Compensation: We tailor packages (salary/equity/bonus) to impact. The more value and revenue you drive, the more you earn.</p> <h1 dir=\\"ltr\\">What you&rsquo;ll do</h1> <ul> <li dir=\\"ltr\\" aria-level=\\"1\\"> <p dir=\\"ltr\\" role=\\"presentation\\">Build &amp; evolve our CRM: Design, implement, and maintain features that organize teams, workflows, and data across the company.<br><br></p>

In [44]:
json.loads(raw)

{'@context': 'http://schema.org/',
 '@type': 'JobPosting',
 'title': 'Senior Full-Stack Engineer — CRM Platform (Remote, High Ownership)',
 'image': 'https://we-work-remotely.imgix.net/logos/0171/1721/logo.gif?ixlib=rails-4.0.0&w=50&h=50&dpr=2&fit=fill&auto=compress',
 'description': '<h1 dir="ltr">About the role</h1> <p dir="ltr">We&rsquo;re hiring Senior Full-Stack Engineers at Tasty to build the internal CRM platform that powers our teams and growth. You&rsquo;ll ship real product, own meaningful slices of the stack, and help define the architecture as we scale.<br>Compensation: We tailor packages (salary/equity/bonus) to impact. The more value and revenue you drive, the more you earn.</p> <h1 dir="ltr">What you&rsquo;ll do</h1> <ul> <li dir="ltr" aria-level="1"> <p dir="ltr" role="presentation">Build &amp; evolve our CRM: Design, implement, and maintain features that organize teams, workflows, and data across the company.<br><br></p> </li> <li dir="ltr" aria-level="1"> <p dir="ltr"

In [46]:
job_data.keys()

dict_keys(['@context', '@type', 'title', 'image', 'description', 'datePosted', 'validThrough', 'employmentType', 'directApply', 'occupationalCategory', 'url', 'jobLocationType', 'baseSalary', 'applicantLocationRequirements', 'hiringOrganization', 'identifier'])

In [64]:
job_data["hiringOrganization"]

{'@type': 'Organization',
 'name': 'Tasty LLC',
 'address': 'Tasty LLC',
 'sameAs': 'https://weworkremotely.com',
 'logo': 'https://we-work-remotely.imgix.net/logos/0171/1721/logo.gif?ixlib=rails-4.0.0&w=50&h=50&dpr=2&fit=fill&auto=compress'}

In [73]:
job_data["baseSalary"].get("value", {}).get("minValue")

'50000'

In [75]:
job_data["baseSalary"]["value"]["minValue"]

'50000'

In [76]:
from urllib.parse import quote

In [77]:

search_terms = [
    "software engineer",
    "data engineer",
    "python developer",
    "machine learning",
    "backend developer"
]

In [78]:

base_url = "https://weworkremotely.com/remote-jobs/search?search_uuid=&sort=&term={}&categories_chosen=&countries_chosen=&chosen-salary_range=&skills_chosen="

base_url.format(quote("Software Engineer"))

'https://weworkremotely.com/remote-jobs/search?search_uuid=&sort=&term=Software%20Engineer&categories_chosen=&countries_chosen=&chosen-salary_range=&skills_chosen='

In [79]:
format(quote("Software Engineer"))

'Software%20Engineer'

In [80]:
string = "dinesh is king {}"
string.format('dinesh')

'dinesh is king dinesh'

In [85]:
for i in search_terms:
    url = base_url.format(quote(i))
    scraper_main_page(url)

Successfully retrieved page
Successfully scraped, Number of jobs: 21
Successfully retrieved page
Successfully scraped, Number of jobs: 4
Successfully retrieved page
Successfully scraped, Number of jobs: 1
Successfully retrieved page
Successfully scraped, Number of jobs: 2
Successfully retrieved page
Successfully scraped, Number of jobs: 1
