In [None]:
import json
import ssl
import requests
from urllib.request import Request,urlopen
import pandas as pd
import numpy as np
import re
import string
import nltk
from bs4 import BeautifulSoup
import time
import logging

# Scraping for Data Scientist Jobs in New York City

In [None]:
# Root logger setup: emit INFO-and-above records as "timestamp - level - message"
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)


In [None]:
# Define the base URL template with a placeholder for the start parameter
base_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=New%2BYork%2C%2BNew%2BYork%2C%2BUnited%2BStates&start={}"

# Job IDs collected from the search pages, in first-seen order and without duplicates
id_list = []
seen_job_ids = set()

# Upper bound on the number of result pages to request (the offset advances by 25 per page)
num_pages = 250

# Stop early after this many consecutive pages that fail or contain no job cards,
# instead of requesting every remaining page once the results have run out.
max_empty_pages = 3
empty_pages_in_a_row = 0

# Loop through each page
for page in range(num_pages):
    start = page * 25  # Offset of the first posting on this page
    url = base_url.format(start)

    page_cards = []
    try:
        # The timeout keeps a stalled connection from hanging the cell indefinitely
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            list_soup = BeautifulSoup(response.text, "html.parser")
            for job in list_soup.find_all("li"):
                base_card_div = job.find("div", {"class": "base-card"})
                if base_card_div:  # Only <li> items that wrap a job card are of interest
                    page_cards.append(base_card_div)
        else:
            logging.warning(f"Failed to fetch search page at start={start}: HTTP {response.status_code}")
    except requests.RequestException as e:
        logging.error(f"Error fetching search page at start={start}: {e}")
    time.sleep(1)  # Pause between page requests to reduce the chance of rate limiting

    if not page_cards:
        empty_pages_in_a_row += 1
        if empty_pages_in_a_row >= max_empty_pages:
            logging.info(f"Stopping at start={start}: {max_empty_pages} consecutive pages without job cards")
            break
        continue
    empty_pages_in_a_row = 0

    for base_card_div in page_cards:
        # The job ID is the 4th ":"-separated field of the card's URN attribute;
        # skip cards where the attribute is missing or has fewer fields.
        urn_parts = (base_card_div.get('data-entity-urn') or "").split(':')
        if len(urn_parts) < 4:
            continue
        job_id = urn_parts[3]
        if job_id not in seen_job_ids:
            seen_job_ids.add(job_id)
            id_list.append(job_id)

In [None]:
job_list = []


def _text_or_none(soup, tag, css_class):
    """Return the stripped text of the first `tag` element matching `css_class`, or None if there is none."""
    element = soup.find(tag, {'class': css_class})
    return element.text.strip() if element is not None else None


def _parse_job_post(html, job_id):
    """Build the job record dict (fixed key order) from a job-posting HTML fragment."""
    job_soup = BeautifulSoup(html, "html.parser")

    # Applicant count: prefer the <span> variant; fall back to the <figcaption>
    # variant when the span is missing or its text is empty.
    number_applicants = _text_or_none(
        job_soup, 'span',
        "num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet")
    if not number_applicants:
        number_applicants = _text_or_none(job_soup, 'figcaption', "num-applicants__caption")

    return {
        'job_title': _text_or_none(job_soup, 'h2', "top-card-layout__title"),
        # The link is derived from the ID alone, so it is always present
        'job_link': f'https://www.linkedin.com/jobs/collections/recommended/?currentJobId={job_id}',
        'company_name': _text_or_none(job_soup, 'a', "topcard__org-name-link"),
        'time_posted': _text_or_none(job_soup, 'span', "posted-time-ago__text"),
        'number_applicants': number_applicants,
        'salary': _text_or_none(job_soup, 'div', "salary"),
        # Holds the posting's description text; the key name is kept as-is so
        # existing column names downstream do not change.
        'company_description': _text_or_none(
            job_soup, 'div',
            "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden"),
    }


# Function to fetch job details for a given job ID - iterate through list of IDs scraped
def fetch_job_details(job_id):
    """Fetch and parse one job posting.

    Args:
        job_id: Posting ID to insert into the job-posting URL.

    Returns:
        A dict with the keys job_title, job_link, company_name, time_posted,
        number_applicants, salary and company_description (a field is None
        when its element is not found in the page), or None when the posting
        could not be fetched.

    The request is attempted up to 3 times with a 5 second pause between
    attempts. Request errors and non-200 responses are logged. HTTP 404/410
    responses are not retried, and there is no pause after the last attempt.
    """
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            job_response = requests.get(job_url, timeout=10)
        except requests.RequestException as e:
            logging.error(f"Error fetching job ID {job_id}: {e}")
        else:
            if job_response.status_code == 200:
                return _parse_job_post(job_response.text, job_id)
            logging.warning(f"Failed to fetch job ID {job_id}: HTTP {job_response.status_code}")
            if job_response.status_code in (404, 410):
                # The posting no longer exists; retrying cannot succeed
                return None
        if attempt < max_attempts:
            time.sleep(5)  # Delay before retrying
    return None

In [None]:
# Fetch the details for every collected job ID.
# Start from an empty list so that re-running this cell does not append duplicate rows.
job_list = []
for job_id in id_list:
    logging.info(f"Fetching details for job ID {job_id}")
    job_details = fetch_job_details(job_id)
    if job_details:  # None means the posting could not be fetched
        job_list.append(job_details)
    time.sleep(2)  # Delay between requests to avoid rate limiting

In [None]:
# Tabulate the scraped job records: one row per posting, one column per field
df = pd.DataFrame(data=job_list)

In [None]:
# Show the DataFrame built from job_list as this cell's output
df

In [None]:
#df.to_csv('linkedin_job_scrape_only_data_scientist_roles.csv')

# Scraping for Additional Similar Job Titles

In [None]:
# Accumulators for the follow-up scrape: posting IDs and their parsed detail records
id_list_2, job_list_2 = [], []

# Search keywords for roles similar to Data Scientist
job_titles = [
    "Machine Learning Engineer", "Data Analyst", "Software Engineer", "Data Engineer",
    "Business Intelligence Analyst", "AI Specialist", "Big Data Engineer", "Quantitative Analyst",
    "Research Scientist", "Data Architect", "Statistician", "Predictive Modeler",
    "Operations Research Analyst", "Decision Scientist", "Data Visualization Specialist",
    "Analytics Consultant",
]

def format_job_title(title):
    """Return `title` with every space replaced by "%2B" for use in the search URL.

    NOTE(review): "%2B" is the percent-encoding of a literal "+", not of a
    space; confirm the search endpoint treats it as a word separator.
    """
    words = title.split(" ")
    return "%2B".join(words)

# Define the base URL with placeholders for the job title and the start parameter
base_url_template = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={}&location=New%2BYork%2C%2BNew%2BYork%2C%2BUnited%2BStates&start={}"

# Upper bound on the number of result pages to request per job title
# (the offset advances by 25 per page)
num_pages_2 = 250

# Stop paging a title after this many consecutive pages that fail or contain no
# job cards, instead of requesting every remaining page once results run out.
max_empty_pages_2 = 3

# IDs already collected, so a posting returned by several pages or titles is stored once
seen_job_ids_2 = set(id_list_2)

# Loop through each job title
for job_title in job_titles:
    formatted_title = format_job_title(job_title)
    empty_pages_in_a_row = 0
    for page in range(num_pages_2):
        start_2 = page * 25  # Offset of the first posting on this page
        url_2 = base_url_template.format(formatted_title, start_2)

        page_cards_2 = []
        try:
            # The timeout keeps a stalled connection from hanging the cell indefinitely
            response_2 = requests.get(url_2, timeout=10)
            if response_2.status_code == 200:
                list_soup_2 = BeautifulSoup(response_2.text, "html.parser")
                for job in list_soup_2.find_all("li"):
                    base_card_div = job.find("div", {"class": "base-card"})
                    if base_card_div:  # Only <li> items that wrap a job card are of interest
                        page_cards_2.append(base_card_div)
            else:
                logging.warning(f"Failed to fetch search page for '{job_title}' at start={start_2}: HTTP {response_2.status_code}")
        except requests.RequestException as e:
            logging.error(f"Error fetching search page for '{job_title}' at start={start_2}: {e}")
        time.sleep(1)  # Pause between page requests to reduce the chance of rate limiting

        if not page_cards_2:
            empty_pages_in_a_row += 1
            if empty_pages_in_a_row >= max_empty_pages_2:
                logging.info(f"Stopping '{job_title}' at start={start_2}: {max_empty_pages_2} consecutive pages without job cards")
                break
            continue
        empty_pages_in_a_row = 0

        for base_card_div in page_cards_2:
            # The job ID is the 4th ":"-separated field of the card's URN attribute;
            # skip cards where the attribute is missing or has fewer fields.
            urn_parts = (base_card_div.get('data-entity-urn') or "").split(':')
            if len(urn_parts) < 4:
                continue
            job_id_2 = urn_parts[3]
            if job_id_2 not in seen_job_ids_2:
                seen_job_ids_2.add(job_id_2)
                id_list_2.append(job_id_2)

In [None]:
# Fetch the details for every job ID collected for the additional titles.
# Start from an empty list so that re-running this cell does not append duplicate rows.
job_list_2 = []
for job_id in id_list_2:
    logging.info(f"Fetching details for job ID {job_id}")
    job_details_2 = fetch_job_details(job_id)
    if job_details_2:  # None means the posting could not be fetched
        job_list_2.append(job_details_2)
    time.sleep(2)  # Delay between requests to avoid rate limiting

In [None]:
# Tabulate the detail records gathered for the additional job titles
df1 = pd.DataFrame(data=job_list_2)

In [None]:
# Show the DataFrame built from job_list_2 as this cell's output
df1

In [None]:
# Stack the two DataFrames into one to analyze. ignore_index renumbers the rows
# 0..n-1 so the combined frame has no repeated index labels from its two inputs.
df_final = pd.concat([df, df1], ignore_index=True)

In [None]:
# Show the combined DataFrame as this cell's output
df_final

In [None]:
# Write the combined postings to a CSV file in the working directory
output_csv = 'linkedin_job_scrape_all_roles.csv'
df_final.to_csv(output_csv)