In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [10]:
#Search term
job_title = "data+scientist"
location = "California"

In [12]:
#Construct the search URL
url = f"https://www.indeed.com/jobs?q={job_title}&l={location}"

In [13]:
# Send GET request to the Indeed page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers)

In [14]:
#Parse HTML using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

In [17]:
#Extract job cards
job_cards = soup.find_all("div", class_="job_seen_beacon")
job_cards

[]

In [18]:
#Prepare empty list to hold job data
job_list = []

In [19]:
#Loop through each job posting and extract data
for job in job_cards:
    title = job.find("h2", class_="jobTitle")
    company = job.find("span", class_="companyName")
    location = job.find("div", class_="companyLocation")
    summary = job.find("div", class_="job-snippet")
    post_date = job.find("span", class_="date")
    salary = job.find("div", class_="attribute_snippet")

    job_dict = {
        "Title": title.text.strip() if title else None,
        "Company": company.text.strip() if company else None,
        "Location": location.text.strip() if location else None,
        "Summary": summary.text.strip() if summary else None,
        "Posted": post_date.text.strip() if post_date else None,
        "Salary": salary.text.strip() if salary else "Not disclosed"
    }

    job_list.append(job_dict)

In [23]:
#Convert list to DataFrame
df = pd.DataFrame(job_list)

In [24]:
#Save to CSV
df.to_csv("indeed_jobs.csv", index=False)

In [25]:
print(df.head())

Empty DataFrame
Columns: []
Index: []


In [26]:
def scrape_indeed(job_title, location, pages=1):
    job_title = job_title.replace(" ", "+")
    job_list = []

    headers = {"User-Agent": "Mozilla/5.0"}

    for page in range(0, pages * 10, 10):
        url = f"https://www.indeed.com/jobs?q={job_title}&l={location}&start={page}"
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, "html.parser")
        jobs = soup.find_all("div", class_="job_seen_beacon")

        for job in jobs:
            title = job.find("h2", class_="jobTitle")
            company = job.find("span", class_="companyName")
            loc = job.find("div", class_="companyLocation")
            summary = job.find("div", class_="job-snippet")
            date = job.find("span", class_="date")
            salary = job.find("div", class_="attribute_snippet")

            job_list.append({
                "Title": title.text.strip() if title else None,
                "Company": company.text.strip() if company else None,
                "Location": loc.text.strip() if loc else None,
                "Summary": summary.text.strip() if summary else None,
                "Posted": date.text.strip() if date else None,
                "Salary": salary.text.strip() if salary else "Not disclosed"
            })

    return pd.DataFrame(job_list)




In [30]:
df_jobs = scrape_indeed("cyber security", "Florida", pages=2)
df_jobs.to_csv("cyber_jobs_florida.csv", index=False)

In [32]:
# List of all 50 US states in abbreviated form
states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
          'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
          'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
          'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
          'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

job_title = "cybersecurity"
base_url = "https://www.indeed.com/jobs?q={}&l={}&remotejob=1"

headers = {
    "User-Agent": "Mozilla/5.0"
}

all_jobs = []

# Loop through each state
for state in states:
    url = base_url.format(job_title, state)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    for job_card in soup.find_all('a', class_='tapItem'):
        title = job_card.find('h2', class_='jobTitle').text.strip() if job_card.find('h2') else ''
        company = job_card.find('span', class_='companyName').text.strip() if job_card.find('span', class_='companyName') else ''
        location = job_card.find('div', class_='companyLocation').text.strip() if job_card.find('div', class_='companyLocation') else ''
        summary = job_card.find('div', class_='job-snippet').text.strip().replace('\n', ' ') if job_card.find('div', class_='job-snippet') else ''
        link = "https://www.indeed.com" + job_card.get('href')

        all_jobs.append({
            "State": state,
            "Title": title,
            "Company": company,
            "Location": location,
            "Short Description": summary,
            "Detail Link": link
        })
# Save to CSV
df = pd.DataFrame(all_jobs)
df.to_csv("remote_cybersecurity_jobs_us.csv", index=False)

# Download to local system
from google.colab import files
files.download("remote_cybersecurity_jobs_us.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>