## Scrape jobs from LinkedIn

In [1]:
import csv
import requests 
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Aim: Retrieve all current data jobs in Germany

url = 'https://de.linkedin.com/jobs/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0'

# Without an explicit timeout, requests waits indefinitely on an unresponsive server
response = requests.get(url, timeout=30)
response

<Response [200]>

In [3]:
# Parse the response body with Python's built-in HTML parser; `soup` is used below to query the page
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [4]:
# Sanity-check the parsed page: show the text of the job-count element from the results header
soup.find('span', attrs={'class': 'results-context-header__job-count'}).text.strip()

'53.000'

In [5]:
# Try each field selector on the page; find() returns the first match in the document for each one
job_title = soup.find('h3', attrs={'class': 'base-search-card__title'}).text.strip()
job_company = soup.find('h4', attrs={'class': 'base-search-card__subtitle'}).text.strip()
job_location = soup.find('span', attrs={'class': 'job-search-card__location'}).text.strip()
job_url = soup.find('a', attrs={'class': 'base-card__full-link'})['href']

# Show the four extracted values on a single line
' - '.join((job_title, job_company, job_location, job_url))

'(Junior) Data Analyst (f/m/d) - receeve - Hamburg - https://de.linkedin.com/jobs/view/junior-data-analyst-f-m-d-at-receeve-3419057929?refId=cRKg7f7vgVZ7oIOp9V%2FZ8w%3D%3D&trackingId=kd6XRMbCa7YXFqoZi9n6Uw%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card'

After experimenting with the URL to understand its parameters and how the jobs that load dynamically on scroll are paginated, the following can be concluded:
- position: is not relevant
- start: indicates the index of the first post to show (the offset). Since each call retrieves 25 jobs, calls should be made in jumps of step=25.
- pageNum: indicates the pagination, which changes every 1000 jobs. Therefore it changes every step=1000, and start is reset back to 0.

The URL follows this pattern:
https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0&start=0

In [6]:
# Endpoint that serves the job cards; get_jobs appends the pageNum and start parameters to it,
# which is why the string ends with a trailing '&'
url = (
    'https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search'
    '?keywords=data'
    '&location=Deutschland'
    '&geoId=101282230'
    '&trk=guest_homepage-basic_jobs-search-bar_search-submit'
    '&position=1&'
)

def _parse_job_card(card):
    """Extract title, company, location and link from a single job card tag.

    A field whose element (or href attribute) is missing from the card is
    stored as None instead of raising, so one malformed card does not abort
    the whole scrape.
    """
    def text_of(tag_name, css_class):
        tag = card.find(tag_name, class_=css_class)
        return tag.text.strip() if tag is not None else None

    link_tag = card.find('a', class_='base-card__full-link')
    return {
        'Title': text_of('h3', 'base-search-card__title'),
        'Company': text_of('h4', 'base-search-card__subtitle'),
        'Location': text_of('span', 'job-search-card__location'),
        'Link': link_tag.get('href') if link_tag is not None else None,
    }


def get_jobs(url, step=25, restart=1000, timeout=30):
    """Collect job postings by walking the paginated endpoint until it runs dry.

    Parameters
    ----------
    url : str
        Base URL ending in '&' (or '?'); 'pageNum=<n>&start=<m>' is appended to it.
    step : int, default 25
        Amount added to `start` after every request.
    restart : int, default 1000
        Once `start` reaches this value it is reset to 0 and `pageNum` is incremented.
    timeout : float, default 30
        Seconds passed to requests.get as its timeout.

    Returns
    -------
    list of dict
        One dict per job card with the keys 'Title', 'Company', 'Location' and 'Link'.
        Whatever was collected before the loop stopped is returned.

    The loop stops when a request raises, when the status code is anything other
    than 200, or when a page contains no job cards. Stopping only on 404 would
    loop forever on other error statuses or on empty pages.
    """
    jobs = []
    start_num = 0
    page_num = 0

    while True:
        page = url + f'pageNum={page_num}&start={start_num}'
        print(page)

        try:
            response = requests.get(page, timeout=timeout)
        except requests.RequestException as error:
            # Keep what has been scraped so far instead of losing it to a network error
            print(f'Request failed, stopping: {error}')
            break

        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE: a class_ value containing spaces is compared against the complete
        # class attribute string, so this only matches cards with exactly these classes
        page_jobs = soup.find_all('div', class_='base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card')
        if not page_jobs:
            # An empty page means there is nothing further to fetch
            break

        jobs.extend(_parse_job_card(card) for card in page_jobs)

        # Advance the offset; roll over to the next pageNum once the limit is reached
        start_num += step
        if start_num >= restart:
            start_num = 0
            page_num += 1

    return jobs

# Run the scraper against the paginated endpoint defined above in this cell
jobs = get_jobs(url=url)

https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0&start=0
https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0&start=25
https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0&start=50
https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0&start=75
https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&p

In [None]:
# Build a DataFrame from the list of job dicts (one row per job, one column per key)
df = pd.DataFrame(jobs)
df

Unnamed: 0,Title,Company,Location,Link
0,Data Scientist,Data Scientist_company,Data Scientist_loc,Data Scientist_link
1,Data Analyst,Data Analyst_company,Data Analyst_loc,Data Analyst_link
2,Data Architect,Data Architect_company,Data Architect_loc,Data Architect_link
3,ML Engineer,ML Engineer_company,ML Engineer_loc,ML Engineer_link


In [None]:
# Persist the scraped jobs as a gzip-compressed parquet file
df.to_parquet(
    path='../data/processed/linkedin_jobs.parquet.gzip',
    compression='gzip',
)