In [None]:
# default_exp data_collection_linkedin_job_list

# data_collection_linkedin_job_list

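> Scrape LinkedIn job-search result pages with headless Chrome (Selenium), collect each job's title, publish date and URL, and store them in a CSV file.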

In [None]:
#hide
# from nbdev.showdoc import *

In [None]:
import re, time, requests
from pathlib import Path

import pandas as pd
from parsel import Selector
from selenium import webdriver
from selenium.webdriver import *
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import config

In [None]:
def get_driver(url):
    """Start a headless Chrome session and navigate it to ``url``.

    Args:
        url: Address of the page to load.

    Returns:
        The ``webdriver.Chrome`` instance with ``url`` loaded. The caller owns
        the driver and should call ``driver.quit()`` when finished with it.

    Raises:
        Whatever ``driver.get`` raises; the browser is shut down before the
        exception propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        executable_path=config.PATH_CHROME_DRIVER,
        options=chrome_options)
    try:
        driver.get(url)
    except Exception:
        # Navigation failed: do not leave an orphaned browser process behind.
        driver.quit()
        raise
    return driver

def get_ref_id(job_url):
    """Return the raw (still URL-encoded) ``refId`` query value of a job URL.

    The value is everything after ``refId=`` up to the next ``&`` or ``#``,
    so the result no longer depends on ``trackingId`` being the parameter
    that immediately follows it.

    Args:
        job_url: Job URL string. Non-string values (e.g. ``None`` or a NaN
            read back from a CSV file) are treated as "no refId".

    Returns:
        The ``refId`` value exactly as it appears in the URL, or ``''`` when
        the URL has no ``refId`` parameter.
    """
    if not isinstance(job_url, str):
        return ''
    # Anchor on a parameter boundary so names merely ending in "refId"
    # (e.g. "xrefId") are not picked up.
    match = re.search(r'(?:^|[?&])refId=([^&#]*)', job_url)
    return match.group(1) if match else ''

def get_job_info(selector):
    """Pull the title, publish date and link out of one job card.

    Args:
        selector: Selector built from a single job card's HTML; it must
            support ``css(...)`` / ``xpath(...)`` returning an object with
            ``get()``.

    Returns:
        ``[title, published, job_url]``. ``title`` is the stripped text of
        the first ``<h3>``, ``published`` the first ``<time datetime>``
        attribute and ``job_url`` the first ``<a href>`` attribute. Any
        field whose element is missing is ``None``.
    """
    raw_title = selector.css('h3::text').get()
    # ``get()`` yields None when nothing matches; guard before stripping so a
    # card without a heading does not abort the whole crawl.
    title = raw_title.strip() if raw_title is not None else None
    published = selector.xpath("//time/@datetime").get()
    job_url = selector.xpath("//a/@href").get()
    return [title, published, job_url]
    
def get_job_list(driver):
    """Yield ``[title, published, url]`` for every job card on the loaded page.

    This is a generator: the page is only queried once iteration starts, so
    it must be consumed while ``driver`` is still open.

    Args:
        driver: Selenium WebDriver that has already loaded a job-search
            results page.

    Yields:
        The list produced by ``get_job_info`` for each ``<li>`` under the
        ``jobs-search__results-list`` element.
    """
    # ``find_elements_by_xpath`` was removed in Selenium 4.3;
    # ``find_elements(By.XPATH, ...)`` is available in Selenium 3 and 4.
    job_cards = driver.find_elements(
        By.XPATH, "//ul[contains(@class,'jobs-search__results-list')]/li")
    for job in job_cards:
        # Short pause between cards — presumably throttling; TODO confirm it
        # is still required.
        time.sleep(0.2)
        selector = Selector(text=job.get_attribute("outerHTML"))
        yield get_job_info(selector)

### Crawl the job list and append it to a CSV file

In [None]:
# Crawl every configured search URL and append its jobs to the job-list CSV.
joblist_path = Path(config.LINKEDIN_JOBLIST)
for url in config.LINKEDIN_URLS:
    driver = get_driver(url)
    try:
        # Materialise the generator while the browser is still open.
        job_list = list(get_job_list(driver))
    finally:
        # Always release the Chrome process, even if scraping fails.
        driver.quit()
    jobs = pd.DataFrame(job_list, columns=['title', 'published', 'url'])
    # Write the header only when starting a new/empty file; otherwise every
    # append would insert another header line in the middle of the data.
    write_header = not joblist_path.exists() or joblist_path.stat().st_size == 0
    jobs.to_csv(joblist_path, mode='a', index=False, header=write_header)

### Cleaning the job list file

In [None]:
# Load the crawl results back from the job-list CSV and check their size.
jobs_df = pd.read_csv(config.LINKEDIN_JOBLIST)
jobs_df.shape

(113, 3)

In [None]:
# Display the loaded frame for a visual sanity check.
jobs_df

Unnamed: 0,title,published,url
0,Data Scientist,2021-05-07,https://au.linkedin.com/jobs/view/data-scienti...
1,Data Scientist,2021-05-06,https://au.linkedin.com/jobs/view/data-scienti...
2,Data Scientist - Artificial Intelligence/Machi...,2021-05-07,https://au.linkedin.com/jobs/view/data-scienti...
3,Data Scientist,2021-05-03,https://au.linkedin.com/jobs/view/data-scienti...
4,Entry level Data Scientist / Risk Analyst oppo...,2021-05-07,https://au.linkedin.com/jobs/view/entry-level-...
...,...,...,...
108,ML / Data Engineer,2021-05-01,https://au.linkedin.com/jobs/view/ml-data-engi...
109,Data Engineer,2021-05-07,https://au.linkedin.com/jobs/view/data-enginee...
110,Robotics Software Engineer - SLAM,2021-05-07,https://au.linkedin.com/jobs/view/robotics-sof...
111,Data engineer,2021-05-04,https://au.linkedin.com/jobs/view/data-enginee...


In [None]:
# Drop stray header rows: appending to the CSV can repeat the header line
# mid-file, and read_csv loads each repeat as a data row whose `published`
# value is the literal string "published".
jobs_df = jobs_df.loc[jobs_df["published"] != "published"]
jobs_df.shape

(109, 3)

In [None]:
# Remove rows that are exact duplicates across all columns.
# Reassign instead of using inplace=True: jobs_df is itself the result of a
# filter, and in-place mutation of such a frame is prone to
# SettingWithCopyWarning and makes the cell harder to reason about.
# NOTE(review): the URLs carry per-search parameters (refId, trackingId,
# position), so the same posting reached through different searches may not
# compare equal here — confirm whether de-duplicating on the URL path is wanted.
jobs_df = jobs_df.drop_duplicates()
jobs_df.shape

(109, 3)

In [None]:
# Overwrite the job-list CSV with the cleaned rows (index column omitted).
jobs_df.to_csv(config.LINKEDIN_JOBLIST, index=False)

In [None]:
# Show every cleaned job URL as a plain Python list.
jobs_df["url"].tolist()

['https://au.linkedin.com/jobs/view/data-scientist-at-johnson-johnson-2523010666?refId=pDIOHoTw%2BlhRhzcfmg0U8g%3D%3D&trackingId=chmlqIbbuE9Ha5oZbb2a2A%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://au.linkedin.com/jobs/view/data-scientist-at-hays-2520456145?refId=pDIOHoTw%2BlhRhzcfmg0U8g%3D%3D&trackingId=SSSrO0KQTpYhDUfcfiCWDA%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://au.linkedin.com/jobs/view/data-scientist-artificial-intelligence-machine-learning-at-systemize-consulting-2511444090?refId=pDIOHoTw%2BlhRhzcfmg0U8g%3D%3D&trackingId=o5kmsfnDL6cmouVkmQnuUw%3D%3D&position=3&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://au.linkedin.com/jobs/view/data-scientist-at-kpmg-australia-2434288087?refId=pDIOHoTw%2BlhRhzcfmg0U8g%3D%3D&trackingId=t2EmevW8cSSLYyJakAAI3g%3D%3D&position=4&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://au.linkedin.com/jobs/view/entry-level-data-scientist-risk-analyst-op