In [None]:
# default_exp data_collection_linkedin_job_list

# data_collection_linkedin_job_list

> Collect LinkedIn job listings with Selenium and maintain a de-duplicated full job list.

In [None]:
#hide
# from nbdev.showdoc import *

In [None]:
#export
import datetime, os
import re, time, requests
import pandas as pd
from parsel import Selector
from selenium import webdriver
from selenium.webdriver import *
from selenium.webdriver.chrome.options import Options
import config

In [None]:
#export
def get_driver(url):
    """
    Start a headless Chrome session and navigate it to `url`.

    The chromedriver binary is located through `config.PATH_CHROME_DRIVER`.

    Parameters
    ----------
    url : str
        Page to load once the browser has started.

    Returns
    -------
    The `webdriver.Chrome` instance, after `driver.get(url)` has returned.
    The caller owns it and is responsible for calling `driver.quit()`.

    Raises
    ------
    Whatever `driver.get(url)` raises; the browser is shut down first.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        executable_path=config.PATH_CHROME_DRIVER,
        options=chrome_options)
    try:
        driver.get(url)
    except Exception:
        # Navigation failed, so the caller never receives the driver: quit it
        # here rather than leaking a headless Chrome process.
        driver.quit()
        raise
    return driver

def get_job_info(selector):
    """
    Extract the title, publication date and link of one job posting.

    Parameters
    ----------
    selector : object
        Exposes `css()` and `xpath()` whose results have a `.get()` method
        (built with `Selector(text=...)` by `get_job_list`).

    Returns
    -------
    list
        `[title, published, job_url]`, taken from the first `h3` text node,
        the first `time/@datetime` attribute and the first `a/@href`
        attribute. `title` is whitespace-stripped when present.
    """
    title = selector.css('h3::text').get()
    # Guard the case where `.get()` yields None (no <h3> text found): calling
    # `.strip()` on it would raise AttributeError and abort the whole crawl.
    if title is not None:
        title = title.strip()
    published = selector.xpath("//time/@datetime").get()
    job_url = selector.xpath("//a/@href").get()
    return [title, published, job_url]
    
def get_job_list(driver):
    """
    Lazily yield one `get_job_info` result per item of the search results list.

    Parameters
    ----------
    driver : object
        Browser session exposing `find_elements_by_xpath`; it is queried for
        the `<li>` children of the `<ul>` whose class contains
        `jobs-search__results-list`.

    Yields
    ------
    list
        The value returned by `get_job_info` for each item's outer HTML.
    """
    results_xpath = "//ul[contains(@class,'jobs-search__results-list')]/li"
    for item in driver.find_elements_by_xpath(results_xpath):
        # Short pause before reading each item.
        time.sleep(0.2)
        item_html = item.get_attribute("outerHTML")
        yield get_job_info(Selector(text=item_html))

### Crawl the job list and append it to a temporary CSV file

In [None]:
#export

def get_temp_filename():
    """
    Build the path of today's temporary job-list CSV.

    Returns
    -------
    str
        `config.LINKEDIN_JOBLIST_TEMP` followed by `_YYYYMMDD.csv`, where the
        date is the current local date.
    """
    # The date must be computed here: no `today` name is defined in this
    # function's scope, so relying on one raises NameError on a fresh kernel.
    today = datetime.datetime.now()
    return config.LINKEDIN_JOBLIST_TEMP + "_" + today.strftime('%Y%m%d') + ".csv"

def crawl_new_job_list():
    """
    Crawl every URL in `config.LINKEDIN_URLS` and append the jobs found to
    today's temporary CSV file (see `get_temp_filename`).

    Returns
    -------
    bool
        Always True.
    """
    temp_file = get_temp_filename()

    for url in config.LINKEDIN_URLS:
        driver = get_driver(url)
        try:
            # Materialise the generator while the browser is still open.
            job_list = list(get_job_list(driver))
        finally:
            # One browser is started per URL; always shut it down so headless
            # Chrome processes do not pile up, even when scraping fails.
            driver.quit()
        jobs = pd.DataFrame(job_list, columns=['title','published', 'url'])
        # Append mode writes a header row for every URL; clean_new_job_list
        # later drops the rows whose `published` value is the text "published".
        jobs.to_csv(temp_file, mode='a', index=False)
    return True

In [None]:
# crawl_new_job_list()

### Read the temp file and clean it

In [None]:
#export
def clean_new_job_list():
    """
    Clean today's temporary job list and write it back to the same file.

    Steps: drop rows whose `published` value is the literal text "published"
    (header rows repeated by appending), drop duplicate rows, convert
    `published` with `pd.to_datetime`, and overwrite the CSV without the index.

    Returns
    -------
    bool
        Always True.
    """
    temp_path = get_temp_filename()
    raw_jobs = pd.read_csv(temp_path)
    # Rows that are really repeated header lines.
    is_header_row = raw_jobs.published == "published"
    cleaned = raw_jobs[~is_header_row].drop_duplicates()
    cleaned = cleaned.assign(published=pd.to_datetime(cleaned.published))
    cleaned.to_csv(temp_path, index=False)
    return True

(872, 3)

In [None]:
# clean_new_job_list()

### Append the temp job list to the full job list

In [None]:
#export
def get_cut_date(n_days=3):
    """
    Return the cut date: the latest stored job date minus `n_days` days.

    The latest date is read as `YYYYMMDD` text from the first line of the file
    at `config.LINKEDIN_JOBLIST_LATEST_DATE`. When that file does not exist or
    its first line is blank, the current date and time is used instead.

    Parameters
    ----------
    n_days : int, default 3
        Number of days to subtract from the latest date.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If the first line is non-blank but not in `YYYYMMDD` format.
    """
    latest_date = None
    date_file = config.LINKEDIN_JOBLIST_LATEST_DATE
    if os.path.isfile(date_file):
        with open(date_file, 'r') as file:
            # Strip whitespace: a trailing newline would make strptime fail
            # with "unconverted data remains".
            raw_date = file.readline().strip()
        if raw_date:
            latest_date = datetime.datetime.strptime(raw_date, "%Y%m%d")
    # A datetime is always truthy, so the fallback has to test for None.
    if latest_date is None:
        latest_date = datetime.datetime.now()
    cut_date = latest_date - datetime.timedelta(days=n_days)
    return cut_date

def update_latest_job_date(new_jobs):
    """
    Store the most recent `published` date of `new_jobs` in a text file.

    The date is written as `YYYYMMDD` to `config.LINKEDIN_JOBLIST_LATEST_DATE`,
    replacing the file's previous content. Nothing is written (the stored date
    is left untouched) when `new_jobs` is empty, has no `published` column, or
    contains no valid date.

    Parameters
    ----------
    new_jobs : pandas.DataFrame
        Jobs with a `published` column of datetimes or date strings.
    """
    if new_jobs.empty or "published" not in new_jobs.columns:
        return
    # Accept both datetime values and their text form, and ignore missing
    # dates (strftime on NaT would raise).
    published = pd.to_datetime(new_jobs["published"]).dropna()
    if published.empty:
        return
    latest_date = published.max().strftime("%Y%m%d")
    with open(config.LINKEDIN_JOBLIST_LATEST_DATE, 'w') as file:
        file.write(latest_date)
        
def in_full_jobs_list(job, full_jobs=None):
    """
    Check whether a job is already in the full jobs list, matching on `url`.

    Parameters
    ----------
    job : object with a `url` attribute
        Typically one row (a pandas Series) of a jobs DataFrame.
    full_jobs : pandas.DataFrame, optional
        Already-loaded full jobs list with a `url` column. When omitted, the
        list is read from `config.LINKEDIN_JOBLIST`; a missing file counts as
        an empty list.

    Returns
    -------
    bool
        True if some row of the full list has the same `url` as `job`.
    """
    if full_jobs is None:
        jobs_file = config.LINKEDIN_JOBLIST
        if not os.path.isfile(jobs_file):
            return False
        full_jobs = pd.read_csv(jobs_file)
    # Compare against the column's values: `x in series` tests the index
    # labels, not the stored URLs.
    return bool((full_jobs.url == job.url).any())

def get_temp_jobs():
    """Load today's temporary job-list CSV (see `get_temp_filename`) into a DataFrame."""
    return pd.read_csv(get_temp_filename())

In [None]:
#export
def filter_new_jobs():
    """
    Select the jobs from the temporary list that are recent and not yet in the
    full jobs list.

    A job is kept when its `published` date is after the cut date returned by
    `get_cut_date()` and `in_full_jobs_list` reports it as absent. The result
    gets a `collected` column set to 0. When at least one job is kept, the
    latest job date is stored via `update_latest_job_date`.

    Returns
    -------
    pandas.DataFrame
        The new jobs; empty (with the same columns) when there are none.
    """
    jobs = get_temp_jobs()
    # Dates come back from the CSV as text; convert them so they can be
    # compared with the datetime cut date.
    jobs = jobs.assign(published=pd.to_datetime(jobs.published))
    # Only keep the recent jobs published after the cut date.
    cut_date = get_cut_date()
    jobs = jobs[jobs.published > cut_date]
    # Flag the jobs that are not in the full jobs list yet.
    is_new = pd.Series(
        [not in_full_jobs_list(job) for _, job in jobs.iterrows()],
        index=jobs.index, dtype=bool)
    # Selecting with a mask keeps the columns and dtypes even when no row
    # survives; copy so adding a column does not touch `jobs`.
    new_jobs = jobs[is_new].copy()
    new_jobs["collected"] = 0
    # Keep the stored latest date unchanged when nothing new was found.
    if not new_jobs.empty:
        update_latest_job_date(new_jobs)
    return new_jobs

In [None]:
#export
def append_to_full_jobs():
    """
    Append the jobs returned by `filter_new_jobs()` to the full jobs list CSV
    at `config.LINKEDIN_JOBLIST`, creating the file when it does not exist.

    The file is rewritten without the index and a confirmation message is
    printed.
    """
    jobs_file = config.LINKEDIN_JOBLIST
    new_jobs = filter_new_jobs()
    if os.path.isfile(jobs_file):
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        jobs_full = pd.concat([pd.read_csv(jobs_file), new_jobs])
    else:
        jobs_full = new_jobs

    jobs_full.to_csv(jobs_file, index=False)
    print("Success append into the full jobs list")

In [None]:
# append_to_full_jobs()