## Scrape jobs from LinkedIn

In [1]:
import csv
import datetime
import time
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Aim: retrieve all current data jobs in Germany.
# Both searches are kept as named constants so that choosing the time window is an
# explicit decision instead of one `url` assignment silently overwriting the other.
URL_PAST_MONTH = 'https://de.linkedin.com/jobs/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0'
URL_PAST_24H = 'https://de.linkedin.com/jobs/search?keywords=data&location=Deutschland&geoId=101282230&sortBy=R&f_TPR=r86400&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0'
url = URL_PAST_24H

# requests has no default timeout; without one this cell could block forever.
response = requests.get(url, timeout=30)
response

<Response [200]>

In [3]:
# Parse the response body with the 'html.parser' backend. `soup` is the parsed
# document that the following cells query for tags.
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
# Test soup and get the number of jobs found.
# Only the decimal digits of the counter are kept, so a thousands separator
# ('.' or ',') or any extra character (e.g. a trailing '+', should the page render
# one) cannot break the integer conversion.
count_text = soup.find('span', class_='results-context-header__job-count').text.strip()
nresults = int(''.join(ch for ch in count_text if ch.isdecimal()))
nresults

1000

In [5]:
# Sanity-check the selectors: each lookup returns the first matching tag of the
# page, i.e. the fields of the first job card.
job_title = soup.find('h3', attrs={'class': 'base-search-card__title'}).get_text().strip()
job_company = soup.find('h4', attrs={'class': 'base-search-card__subtitle'}).get_text().strip()
job_location = soup.find('span', attrs={'class': 'job-search-card__location'}).get_text().strip()
job_url = soup.find('a', attrs={'class': 'base-card__full-link'}).attrs['href']

# One-line summary of the card, shown as the cell output
' - '.join([job_title, job_company, job_location, job_url])

'Healthcare Data Analyst - vitagroup - Mannheim - https://de.linkedin.com/jobs/view/healthcare-data-analyst-at-vitagroup-3467828706?refId=pR5QXAPn%2Bgl%2FHrDVKB9OCA%3D%3D&trackingId=P3MWghXLlqwcZ9wcw7ULcg%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card'

After playing with the url to figure out the parameters and how to retrieve the dynamic jobs loaded when scrolling and pagination, it can be concluded:
- position: is not relevant
- start: indicates the index of the first post to show (the offset). Since each call retrieves 25 jobs, successive calls should increase start by 25.
- pageNum: indicates the pagination, which changes every 1000 jobs. It therefore goes up by one each time start reaches 1000, at which point start is reset to 0.

The url follows the next pattern:
https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0&start=0

In [6]:
# Modified url found in the API calls made while scrolling the results.
# get_jobs adds `pageNum` and `start` for every request, so the base url must NOT
# already carry them: a base ending in 'pageNum=0&start=0' would yield a malformed
# query with duplicated parameters. Both time windows are kept as named constants
# and the active one is chosen explicitly.
API_URL_PAST_MONTH = 'https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&'
API_URL_PAST_24H = 'https://de.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data&location=Deutschland&geoId=101282230&sortBy=R&f_TPR=r86400&trk=guest_homepage-basic_jobs-search-bar_search-submit&position=1&'
url = API_URL_PAST_24H

def _page_url(url, page_num, start_num):
    """Return `url` with its `pageNum` and `start` query parameters set.

    Any `pageNum`/`start` already present in `url` are dropped first, so the base
    url may carry them, omit them, or end with a dangling '&'.
    """
    parts = urlsplit(url)
    query = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key not in ('pageNum', 'start')
    ]
    query += [('pageNum', page_num), ('start', start_num)]
    return urlunsplit(parts._replace(query=urlencode(query)))


def get_jobs(url, nresults, max_empty_pages=3):
    """Collect job cards from the paginated search endpoint.

    Pages are requested 25 results at a time until at least `nresults` cards have
    been gathered, a request answers 404, or `max_empty_pages` consecutive pages
    contain no job card.

    Args:
        url: Base search url. Its `pageNum`/`start` parameters (if any) are
            replaced for every request.
        nresults: Number of job cards to collect before stopping.
        max_empty_pages: Consecutive pages without any job card tolerated before
            giving up. Guarantees termination when fewer than `nresults` jobs
            can actually be retrieved.

    Returns:
        list: The job-card elements returned by `find_all`, in retrieval order.
    """
    # find_all with a multi-class string matches the exact class attribute value.
    card_class = 'base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card'

    jobs = []
    page_num = 0
    start_num = 0
    empty_pages = 0

    while len(jobs) < nresults:
        page = _page_url(url, page_num, start_num)
        # Timeout so a stalled connection cannot block the loop forever.
        response = requests.get(page, timeout=30)

        if response.status_code == 404:
            break

        # Get current page jobs (step=25)
        soup = BeautifulSoup(response.content, 'html.parser')
        page_jobs = soup.find_all('div', class_=card_class)

        if page_jobs:
            empty_pages = 0
            jobs.extend(page_jobs)
        else:
            # A page without cards (end of results, throttling, error page...)
            # is skipped; several in a row mean there is nothing left to fetch.
            empty_pages += 1
            if empty_pages >= max_empty_pages:
                break

        # Update params
        start_num += 25
        if start_num == 1000:
            start_num = 0
            page_num += 1

            print(len(jobs))

        time.sleep(2) # too many calls in a short time provoked soup not finding the tags or not parsing well.

    return jobs

# Fetch the job cards for the search, paging until `nresults` cards are collected.
jobs = get_jobs(url, nresults)

986


In [7]:
def process_jobs(jobs):
    """Extract title, company, location and link from each job card.

    Args:
        jobs: Iterable of job-card elements exposing `find(tag, class_=...)`.

    Returns:
        tuple: `(data, error)` where `data` is a list of dicts with the keys
        'Title', 'Company', 'Location' and 'Link', and `error` is the list of
        cards that could not be processed (e.g. an expected tag is missing).
    """
    data = []       # jobs
    error = []      # jobs unable to process

    for i, job in enumerate(jobs):
        try:
            data.append({
                'Title': job.find('h3', class_='base-search-card__title').text.strip(),
                'Company': job.find('h4', class_='base-search-card__subtitle').text.strip(),
                'Location': job.find('span', class_='job-search-card__location').text.strip(),
                'Link': job.find('a', class_='base-card__full-link')['href']
            })
        except Exception:
            # Best effort: a card that fails to parse is set aside, not fatal.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the run can still be interrupted.
            error.append(job)
            print(f'There was a problem scrapping job {i}')

    return data, error

# The second return value (cards that failed to process) is discarded on purpose:
# it has always come back empty so far, and skipping such a job is acceptable.
data, _ = process_jobs(jobs)

In [8]:
# Create dataframe with records
df = pd.DataFrame(data)
df

Unnamed: 0,Title,Company,Location,Link
0,Healthcare Data Analyst,vitagroup,Mannheim,https://de.linkedin.com/jobs/view/healthcare-d...
1,Data Engineer,Staffbase,Dresden,https://de.linkedin.com/jobs/view/data-enginee...
2,Data Engineer,Staffbase,Chemnitz,https://de.linkedin.com/jobs/view/data-enginee...
3,Data Engineer,Staffbase,Leipzig,https://de.linkedin.com/jobs/view/data-enginee...
4,Data Scientist,aparkado,Deutschland,https://de.linkedin.com/jobs/view/data-scienti...
...,...,...,...,...
1006,Red Bull HQ Graduate Programme (based in Salzb...,Red Bull,"Berlin, Deutschland",https://de.linkedin.com/jobs/view/red-bull-hq-...
1007,Data Engineer - Marketing,Cititec,"Berlin, Deutschland",https://de.linkedin.com/jobs/view/data-enginee...
1008,Business Analyst (m/w/d),Arctic Wolf,Frankfurt am Main,https://de.linkedin.com/jobs/view/business-ana...
1009,Web / Digital Analyst Business Intelligence (m...,"zipmend GmbH üöõüí® Express, LTL & FTL Trans...",Hamburg,https://de.linkedin.com/jobs/view/web-digital-...


In [19]:
# Show one complete link; the dataframe display truncates long values with '...'
print(df.loc[1, 'Link'])

https://de.linkedin.com/jobs/view/data-engineer-at-staffbase-3473357475?refId=vttpEyVwQApXk9Dhk8V2PQ%3D%3D&trackingId=EUFWDd2MpvdqtroF9e1kOQ%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card


Bad pipe message: %s [b'\r\xb1\xa7\xfbhWA\xf3\xfa\x9b\xa9Ur\xd7\xb1:\t\xfe T\x9bu\xde\xacV`\x7f\x00r]\x9b\x9c\xdb\\\xa6\x1f\xf4\xdb\x1c\x12OW\xa6g\x031;\xa3\x1c']
Bad pipe message: %s [b'i\xc6B\x90BG\xef{\xb8"\xec\xea\xa4\x93%\x16\xa8=\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0\'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00']
Bad pipe message: %s [b'\xff\x01\x00\x00']
Bad pipe message: %s [b'\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x

### Scrape job post text

In [13]:
# URLs of the individual job posts (the dataframe's 'Link' column)
links = df.Link

def scrape_text(links, limit=1):
    """Download job-post pages and return their raw HTML.

    Args:
        links: Iterable of job-post urls.
        limit: Maximum number of links to request. Defaults to 1, which keeps the
            single-request probe this function started as; pass None to request
            every link.

    Returns:
        list: The response body (`response.text`) of each requested page, in the
        order of `links`.

    TODO: extract the job description from the HTML once its selector is known.
    """
    texts = []

    for fetched, link in enumerate(links):
        if limit is not None and fetched >= limit:
            break

        if fetched:
            # Same pause as between search pages, to avoid hammering the site.
            time.sleep(2)

        # Request the current link (the original referenced an undefined `page`).
        response = requests.get(link, timeout=30)
        print(response)
        texts.append(response.text)

    return texts

In [None]:
# Persist the dataframe as gzip-compressed parquet; the file name carries
# today's date in ISO format (YYYY-MM-DD).
dt = datetime.datetime.now()
stamp = dt.date().isoformat()
df.to_parquet(
    f'../data/processed/linkedin_jobs_{stamp}.parquet.gzip',
    compression='gzip',
)