In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
from openai import OpenAI
import os
from tqdm import tqdm

class JobScraper:
    def __init__(self, api_key):
        self.client = OpenAI(api_key=api_key)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
    def extract_job_details_with_llm(self, job_description):
        """
        Use OpenAI to extract structured data from job descriptions
        """
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that extracts job information from text. Extract the following fields: job title, location, preferred qualifications, and minimum qualifications."},
                    {"role": "user", "content": f"Extract the job details from the following job description. Return a JSON object with these fields: 'job_title', 'location', 'preferred_qualifications' (as a list), 'minimum_qualifications' (as a list).\n\nJob Description:\n{job_description}"}
                ],
                temperature=0.1,
                max_tokens=1000
            )
            
            result = response.choices[0].message.content
            
            # Try to extract the JSON part if the AI added explanations
            import json
            try:
                # First try to parse directly
                parsed_data = json.loads(result)
                return parsed_data
            except:
                # If direct parsing fails, try to extract JSON part with regex
                json_pattern = r'```json\n(.*?)\n```'
                json_match = re.search(json_pattern, result, re.DOTALL)
                
                if json_match:
                    json_str = json_match.group(1)
                    return json.loads(json_str)
                
                # If that fails, try another approach
                json_pattern = r'\{[\s\S]*\}'
                json_match = re.search(json_pattern, result)
                
                if json_match:
                    json_str = json_match.group(0)
                    return json.loads(json_str)
                
                # If all parsing attempts fail
                return {
                    "job_title": "Parsing Error",
                    "location": "Parsing Error", 
                    "preferred_qualifications": ["Parsing Error"],
                    "minimum_qualifications": ["Parsing Error"]
                }
                
        except Exception as e:
            print(f"Error with OpenAI API: {e}")
            return {
                "job_title": "API Error",
                "location": "API Error", 
                "preferred_qualifications": ["API Error"],
                "minimum_qualifications": ["API Error"]
            }
    
    def scrape_indeed(self, query, location, num_pages=1):
        """
        Scrape job listings from Indeed
        """
        jobs_data = []
        
        for page in range(num_pages):
            start = page * 10  # Indeed uses increments of 10 for pagination
            url = f"https://www.indeed.com/jobs?q={query.replace(' ', '+')}&l={location.replace(' ', '+')}&start={start}"
            
            try:
                response = requests.get(url, headers=self.headers)
                soup = BeautifulSoup(response.text, 'html.parser')
                
                job_cards = soup.find_all('div', class_='job_seen_beacon')
                
                for job in tqdm(job_cards, desc=f"Scraping page {page+1} jobs"):
                    try:
                        # Extract job title
                        title_elem = job.find('h2', class_='jobTitle')
                        if title_elem:
                            job_title = title_elem.get_text().strip()
                        else:
                            continue  # Skip if no title found
                        
                        # Extract job URL
                        job_link_elem = job.find('a', id=lambda x: x and x.startswith('job_'))
                        if job_link_elem:
                            job_url = "https://www.indeed.com" + job_link_elem.get('href', '')
                        else:
                            job_url = ""
                        
                        # Extract company
                        company_elem = job.find('span', class_='companyName')
                        company = company_elem.get_text().strip() if company_elem else "Not specified"
                        
                        # Extract location
                        location_elem = job.find('div', class_='companyLocation')
                        job_location = location_elem.get_text().strip() if location_elem else "Not specified"
                        
                        # Get full job description
                        job_desc = self.get_job_description(job_url)
                        
                        # Extract detailed information using LLM
                        details = self.extract_job_details_with_llm(job_desc)
                        
                        # Combine all job information
                        job_data = {
                            'job_title': details.get('job_title', job_title),
                            'company': company,
                            'location': details.get('location', job_location),
                            'preferred_qualifications': details.get('preferred_qualifications', []),
                            'minimum_qualifications': details.get('minimum_qualifications', []),
                            'url': job_url
                        }
                        
                        jobs_data.append(job_data)
                        
                        # Sleep to avoid rate limiting
                        time.sleep(random.uniform(1, 3))
                        
                    except Exception as e:
                        print(f"Error processing job listing: {e}")
                        continue
                
                # Sleep between pages
                time.sleep(random.uniform(2, 5))
                
            except Exception as e:
                print(f"Error scraping page {page+1}: {e}")
                continue
                
        return pd.DataFrame(jobs_data)
    
    def scrape_linkedin(self, query, location, num_pages=1):
        """
        Scrape job listings from LinkedIn
        """
        jobs_data = []
        
        for page in range(num_pages):
            start = page * 25  # LinkedIn uses increments of 25 for pagination
            url = f"https://www.linkedin.com/jobs/search/?keywords={query.replace(' ', '%20')}&location={location.replace(' ', '%20')}&start={start}"
            
            try:
                response = requests.get(url, headers=self.headers)
                soup = BeautifulSoup(response.text, 'html.parser')
                
                job_cards = soup.find_all('div', class_='base-card')
                
                for job in tqdm(job_cards, desc=f"Scraping LinkedIn page {page+1} jobs"):
                    try:
                        # Extract job title
                        title_elem = job.find('h3', class_='base-search-card__title')
                        if title_elem:
                            job_title = title_elem.get_text().strip()
                        else:
                            continue  # Skip if no title found
                        
                        # Extract job URL
                        job_link_elem = job.find('a', class_='base-card__full-link')
                        if job_link_elem:
                            job_url = job_link_elem.get('href', '').split('?')[0]  # Remove tracking parameters
                        else:
                            job_url = ""
                        
                        # Extract company
                        company_elem = job.find('h4', class_='base-search-card__subtitle')
                        company = company_elem.get_text().strip() if company_elem else "Not specified"
                        
                        # Extract location
                        location_elem = job.find('span', class_='job-search-card__location')
                        job_location = location_elem.get_text().strip() if location_elem else "Not specified"
                        
                        # Get full job description
                        job_desc = self.get_job_description(job_url)
                        
                        # Extract detailed information using LLM
                        details = self.extract_job_details_with_llm(job_desc)
                        
                        # Combine all job information
                        job_data = {
                            'job_title': details.get('job_title', job_title),
                            'company': company,
                            'location': details.get('location', job_location),
                            'preferred_qualifications': details.get('preferred_qualifications', []),
                            'minimum_qualifications': details.get('minimum_qualifications', []),
                            'url': job_url
                        }
                        
                        jobs_data.append(job_data)
                        
                        # Sleep to avoid rate limiting
                        time.sleep(random.uniform(1, 3))
                        
                    except Exception as e:
                        print(f"Error processing LinkedIn job listing: {e}")
                        continue
                
                # Sleep between pages
                time.sleep(random.uniform(2, 5))
                
            except Exception as e:
                print(f"Error scraping LinkedIn page {page+1}: {e}")
                continue
                
        return pd.DataFrame(jobs_data)
    
    def get_job_description(self, url):
        """
        Fetch the full job description from the job URL
        """
        try:
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Try different selectors for job descriptions (varies by site)
            job_desc_elem = None
            
            # Indeed
            if 'indeed.com' in url:
                job_desc_elem = soup.find('div', id='jobDescriptionText')
            # LinkedIn
            elif 'linkedin.com' in url:
                job_desc_elem = soup.find('div', class_='show-more-less-html__markup')
            # Generic attempt if specific selectors fail
            if not job_desc_elem:
                job_desc_elem = (
                    soup.find('div', class_=lambda x: x and ('description' in x.lower() or 'details' in x.lower())) or
                    soup.find('section', class_=lambda x: x and ('description' in x.lower() or 'details' in x.lower()))
                )
            
            if job_desc_elem:
                return job_desc_elem.get_text(separator='\n').strip()
            else:
                # Try to get all text from the page as a fallback
                main_content = soup.find('main') or soup.find('div', id='main-content') or soup.body
                if main_content:
                    return main_content.get_text(separator='\n').strip()
                return "No job description found"
                
        except Exception as e:
            print(f"Error fetching job description: {e}")
            return "Error fetching job description"
    
    def save_to_csv(self, df, filename='job_data.csv'):
        """
        Save the DataFrame to a CSV file
        """
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    
    def save_to_excel(self, df, filename='job_data.xlsx'):
        """
        Save the DataFrame to an Excel file
        """
        df.to_excel(filename, index=False)
        print(f"Data saved to {filename}")


def main():
    # Get OpenAI API key
    api_key = input("Please enter your OpenAI API key: ")
    
    # Initialize scraper
    scraper = JobScraper(api_key)
    
    # Get search parameters
    job_title = input("Enter job title to search for (e.g., 'software engineer'): ")
    location = input("Enter location (e.g., 'San Francisco, CA'): ")
    num_pages = int(input("Enter number of pages to scrape (1-5 recommended): "))
    source = input("Enter job source to scrape (indeed/linkedin/both): ").lower()
    
    all_jobs = pd.DataFrame()
    
    # Scrape Indeed
    if source in ['indeed', 'both']:
        print("\nScraping Indeed jobs...")
        indeed_jobs = scraper.scrape_indeed(job_title, location, num_pages)
        if not indeed_jobs.empty:
            indeed_jobs['source'] = 'Indeed'
            all_jobs = pd.concat([all_jobs, indeed_jobs])
            print(f"Found {len(indeed_jobs)} jobs on Indeed")
    
    # Scrape LinkedIn
    if source in ['linkedin', 'both']:
        print("\nScraping LinkedIn jobs...")
        linkedin_jobs = scraper.scrape_linkedin(job_title, location, num_pages)
        if not linkedin_jobs.empty:
            linkedin_jobs['source'] = 'LinkedIn'
            all_jobs = pd.concat([all_jobs, linkedin_jobs])
            print(f"Found {len(linkedin_jobs)} jobs on LinkedIn")
    
    # Save results
    if not all_jobs.empty:
        print(f"\nTotal jobs found: {len(all_jobs)}")
        save_format = input("Save as CSV or Excel? (csv/excel): ").lower()
        
        if save_format == 'csv':
            filename = input("Enter filename (default: job_data.csv): ") or "job_data.csv"
            scraper.save_to_csv(all_jobs, filename)
        else:
            filename = input("Enter filename (default: job_data.xlsx): ") or "job_data.xlsx"
            scraper.save_to_excel(all_jobs, filename)
            
        print("\nJob data preview:")
        print(all_jobs[['job_title', 'company', 'location', 'url']].head())
    else:
        print("No jobs found matching your criteria.")


if __name__ == "__main__":
    main()

Please enter your OpenAI API key:  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
Enter job title to search for (e.g., 'software engineer'):  Data Scientist
Enter location (e.g., 'San Francisco, CA'):  San Francisco, CA
Enter number of pages to scrape (1-5 recommended):  3
Enter job source to scrape (indeed/linkedin/both):  both



Scraping Indeed jobs...


Scraping page 1 jobs: 0it [00:00, ?it/s]
Scraping page 2 jobs: 0it [00:00, ?it/s]
Scraping page 3 jobs: 0it [00:00, ?it/s]



Scraping LinkedIn jobs...


Scraping LinkedIn page 1 jobs:   0%|                     | 0/60 [00:00<?, ?it/s]2025-03-22 10:23:00,594 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Scraping LinkedIn page 1 jobs:   2%|▏            | 1/60 [00:04<04:48,  4.89s/it]2025-03-22 10:23:04,688 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Scraping LinkedIn page 1 jobs:   3%|▍            | 2/60 [00:10<04:53,  5.05s/it]2025-03-22 10:23:10,442 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Scraping LinkedIn page 1 jobs:   5%|▋            | 3/60 [00:14<04:20,  4.58s/it]2025-03-22 10:23:14,452 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Scraping LinkedIn page 1 jobs:   7%|▊            | 4/60 [00:19<04:30,  4.82s/it]2025-03-22 10:23:20,148 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Scraping LinkedIn page 1 jobs:   8%

Found 180 jobs on LinkedIn

Total jobs found: 180


Save as CSV or Excel? (csv/excel):  CSV
Enter filename (default: job_data.csv):  Data_Scientist.csv


Data saved to Data_Scientist.csv

Job data preview:
                            job_title company                        location  \
0               Growth Data Scientist  Notion      Notion (in person company)   
1              Product Data Scientist  Notion  San Francisco or New York City   
2  Data Scientist - Product Analytics    Brex                   San Francisco   
3           Data Scientist II, Credit    Brex                   San Francisco   
4                      AI/ML Engineer  Ikigai                     Ikigai Labs   

                                                 url  
0  https://www.linkedin.com/jobs/view/data-scient...  
1  https://www.linkedin.com/jobs/view/data-scient...  
2  https://www.linkedin.com/jobs/view/data-scient...  
3  https://www.linkedin.com/jobs/view/data-scient...  
4  https://www.linkedin.com/jobs/view/machine-lea...  


In [3]:
import pandas as pd 
df=pd.read_csv('Data_Scientist.csv')
df

Unnamed: 0,job_title,company,location,preferred_qualifications,minimum_qualifications,url,source
0,Growth Data Scientist,Notion,Notion (in person company),"['Experience working on a growth team', 'Exper...",['Meaningful experience as a data scientist pa...,https://www.linkedin.com/jobs/view/data-scient...,LinkedIn
1,Product Data Scientist,Notion,San Francisco or New York City,['Expertise in SQL and at least one scripting ...,['Meaningful experience as a data scientist pa...,https://www.linkedin.com/jobs/view/data-scient...,LinkedIn
2,Data Scientist - Product Analytics,Brex,San Francisco,"['Experience working in B2B SaaS or fintech', ...","['Master’s degree or Ph.D. in Statistics, Comp...",https://www.linkedin.com/jobs/view/data-scient...,LinkedIn
3,"Data Scientist II, Credit",Brex,San Francisco,"['Experience working in B2B SaaS or fintech, p...","['Master’s degree or Ph.D. in Statistics, Econ...",https://www.linkedin.com/jobs/view/data-scient...,LinkedIn
4,AI/ML Engineer,Ikigai,Ikigai Labs,"[""Master's degree in Computer Science, Math, E...","['Bachelor’s degree in Computer Science, Math,...",https://www.linkedin.com/jobs/view/machine-lea...,LinkedIn
...,...,...,...,...,...,...,...
175,ML Solutions Engineer,SuperAnnotate,Partial remote working,"[""Master's degree in Computer Science, Machine...","['1+ years of customer-facing experience', 'Fa...",https://www.linkedin.com/jobs/view/ml-solution...,LinkedIn
176,Data Scientist - Financial Forecasting,Brex,San Francisco,['Experience building and maintaining financia...,"[""Master's degree or Ph.D. in Finance, Statist...",https://www.linkedin.com/jobs/view/senior-data...,LinkedIn
177,Autonomy Pipeline Designer and Builder,Weave Robotics,Weave,"['Experience in vision language models, reinfo...",[],https://www.linkedin.com/jobs/view/ml-research...,LinkedIn
178,Data Scientist,Pinterest,US based applicants only,['Experience using machine learning and deep l...,['4+ years of experience analyzing data in a f...,https://www.linkedin.com/jobs/view/data-scient...,LinkedIn
