In [7]:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import logging
import pickle
import os


class LinkedInBot:
    def __init__(self, delay=5):
        if not os.path.exists("data"):
            os.makedirs("data")
        log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        logging.basicConfig(level=logging.INFO, format=log_fmt)
        self.delay=delay
        logging.info("Starting driver")
        self.driver = webdriver.Firefox(executable_path=r"C:\Users\admin\geckodriver.exe")

    def login(self, email, password):
        """Go to linkedin and login"""
        # go to linkedin:
        logging.info("Logging in")
        self.driver.maximize_window()
        self.driver.get('https://www.linkedin.com/login')
        time.sleep(self.delay)

        self.driver.find_element_by_id('username').send_keys(email)
        self.driver.find_element_by_id('password').send_keys(password)

        self.driver.find_element_by_id('password').send_keys(Keys.RETURN)
        time.sleep(self.delay)

    def save_cookie(self, path):
        with open(path, 'wb') as filehandler:
            pickle.dump(self.driver.get_cookies(), filehandler)

    def load_cookie(self, path):
        with open(path, 'rb') as cookiesfile:
            cookies = pickle.load(cookiesfile)
            for cookie in cookies:
                self.driver.add_cookie(cookie)

    def search_linkedin(self, keywords, location):
        """Enter keywords into search bar
        """
        logging.info("Searching jobs page")
        self.driver.get("https://www.linkedin.com/jobs/")
        # search based on keywords and location and hit enter
        self.wait_for_element_ready(By.CLASS_NAME, 'jobs-search-box__text-input')
        time.sleep(self.delay)
        search_bars = self.driver.find_elements_by_class_name('jobs-search-box__text-input')
        search_keywords = search_bars[0]
        search_keywords.send_keys(keywords)
        search_location = search_bars[2]
        search_location.send_keys(location)
        time.sleep(self.delay)
        search_location.send_keys(Keys.RETURN)
        logging.info("Keyword search successful")
        time.sleep(self.delay)
    
    def wait(self, t_delay=None):
        """Just easier to build this in here.
        Parameters
        ----------
        t_delay [optional] : int
            seconds to wait.
        """
        delay = self.delay if t_delay == None else t_delay
        time.sleep(delay)

    def scroll_to(self, job_list_item):
        """Just a function that will scroll to the list item in the column 
        """
        self.driver.execute_script("arguments[0].scrollIntoView();", job_list_item)
        job_list_item.click()
        time.sleep(self.delay)
    
    def get_position_data(self, job):
        """Gets the position data for a posting.
        Parameters
        ----------
        job : Selenium webelement
        Returns
        -------
        list of strings : [position, company, location, details]
        """
        [position, company, location] = job.text.split('\n')[:3]
        details = self.driver.find_element_by_id("job-details").text
        return [position, company, location, details]

    def wait_for_element_ready(self, by, text):
        try:
            WebDriverWait(self.driver, self.delay).until(EC.presence_of_element_located((by, text)))
        except TimeoutException:
            logging.debug("wait_for_element_ready TimeoutException")
            pass

    def close_session(self):
        """This function closes the actual session"""
        logging.info("Closing session")
        self.driver.close()

    def run(self, email, password, keywords, location):
        if os.path.exists("data/cookies.txt"):
            self.driver.get("https://www.linkedin.com/")
            self.load_cookie("data/cookies.txt")
            self.driver.get("https://www.linkedin.com/")
        else:
            self.login(
                email=email,
                password=password
            )
            self.save_cookie("data/cookies.txt")

        logging.info("Begin linkedin keyword search")
        self.search_linkedin(keywords, location)
        self.wait()

        # scrape pages,only do first 8 pages since after that the data isn't 
        # well suited for me anyways:  
        for page in range(2, 8):
            # get the jobs list items to scroll through:
            jobs = self.driver.find_elements_by_class_name("occludable-update")
            for job in jobs:
                self.scroll_to(job)
                [position, company, location, details] = self.get_position_data(job)

                # do something with the data...

            # go to next page:
            bot.driver.find_element_by_xpath(f"//button[@aria-label='Page {page}']").click()
            bot.wait()
        logging.info("Done scraping.")
        logging.info("Closing DB connection.")
        bot.close_session()


if __name__ == "__main__":
    email = "buituankiet132000@gmail.com"
    password = "pass:1+1*2=3?"
    bot = LinkedInBot()
    bot.run(email, password, "Data Scientist", "Canada")

2022-01-14 15:43:20,092 - root - INFO - Starting driver
  self.driver = webdriver.Firefox(executable_path=r"C:\Users\admin\geckodriver.exe")
2022-01-14 15:43:26,353 - root - INFO - Logging in
  self.driver.find_element_by_id('username').send_keys(email)
  self.driver.find_element_by_id('password').send_keys(password)
  self.driver.find_element_by_id('password').send_keys(Keys.RETURN)
2022-01-14 15:43:37,806 - root - INFO - Begin linkedin keyword search
2022-01-14 15:43:37,807 - root - INFO - Searching jobs page
  search_bars = self.driver.find_elements_by_class_name('jobs-search-box__text-input')


ElementNotInteractableException: Message: Element <input class="jobs-search-box__text-input jobs-search-box__keyboard-text-input jobs-search-box__keyboard-text-input--reflowed jobs-search-box__ghost-text-input" type="text"> is not reachable by keyboard
Stacktrace:
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:183:5
ElementNotInteractableError@chrome://remote/content/shared/webdriver/Errors.jsm:293:5
webdriverSendKeysToElement@chrome://remote/content/marionette/interaction.js:624:13
interaction.sendKeysToElement@chrome://remote/content/marionette/interaction.js:600:11
sendKeysToElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.jsm:497:24
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.jsm:151:31


In [2]:
import requests

headers = {
    'authority': 'www.linkedin.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-US,en;q=0.9,vi-VN;q=0.8,vi;q=0.7',
    'cookie': 'li_sugr=05aa68b2-22d9-4c7d-9b7f-aa41eae2118a; bcookie="v=2&891b4103-b2d5-4df3-8f0b-425f204b6641"; bscookie="v=1&202110081052036bf43da7-2ad6-4724-8be5-c55f5499a589AQFHs9tYX_34qpYx4eFF10hh7BVl-8GK"; li_rm=AQHiQpNjrsbosAAAAXxqJzFDgfRzsFBmxtaHMM2xcR1yq58Slv2VKEv_uZRTIN2e7VmPZEVpvUUBfKWR_uD_ubMRPkqJEFkFuIcoRF6m-xu2ZlLJuiPD1F7L; G_ENABLED_IDPS=google; aam_uuid=19433570266972993770576466524582462183; _gcl_au=1.1.298244104.1633868541; liap=true; JSESSIONID="ajax:6896546257217579181"; timezone=Asia/Bangkok; li_at=AQEDATgXfzkBtFIcAAABfIjOxfkAAAF-WVfDmFYAns2RNusd0yvr-vRJ_Ya15nFy_bshMAdhknY9bh_5l0yi3SlBQEQBJH9jHHAysdaNvA3E53NtcOIcfRELqqgoJZb-B2a0s9IZxnKHh8HwV5bv7W2E; _guid=873e6f88-b4aa-40cc-b7b3-e026e4ee1bcc; AnalyticsSyncHistory=AQLDurMGQnLc6gAAAX5XZn-JKdChmfsA3ydWFGxnU0lj750CAb4kaZQMujKsruc7XFm0tG_OHZZpPJevbeC1lw; lms_ads=AQFSxq2FnVDOdwAAAX5XZoHTiTdu4xSsi11s_DSIMqb-gBCnUpaPaXfDPaARAcxMl6mc9NqWl-hQ5QHWiFfkrgCzaZepjDKj; lms_analytics=AQFSxq2FnVDOdwAAAX5XZoHTiTdu4xSsi11s_DSIMqb-gBCnUpaPaXfDPaARAcxMl6mc9NqWl-hQ5QHWiFfkrgCzaZepjDKj; lang=v=2&lang=en-us; AMCVS_14215E3D5995C57C0A495C55%40AdobeOrg=1; AMCV_14215E3D5995C57C0A495C55%40AdobeOrg=-637568504%7CMCIDTS%7C19006%7CMCMID%7C18890135806913880260634772266102993196%7CMCAAMLH-1642748753%7C3%7CMCAAMB-1642748753%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1642151153s%7CNONE%7CvVersion%7C5.1.1%7CMCCIDH%7C1862651472; sdsc=22%3A1%2C1642144319462%7EJOBS%2C0%2FvFpR5YcSDCFm2flf6AXCum8qbI%3D; UserMatchHistory=AQJ1cmJVsxniQwAAAX5Xc69A0C7fyTBRzpYK5CPEPYcnivEV4tm8Mc4eheQDuSETeNWG2XqfOcyJ5NfaBRm26Y03H0SrGNrBb28LZgoMGDqQ0ok_nDX-fzHwmmUVmd4gIB0c4PAfHhP6SwSze058u0Y3H9Oee96XHS8EI1Yh9pG6u96dcTYjg9I_NllfYRWAxu9g3bf5Shhvqs7tyAhPTPTerDp-1cX8D3fg5e5KLVkX8h18IXH51FyGvqzhyfvnt6cyr0XqTLqVFRT9sJFn4uDSjCfvg0kBqLMmjf8; lidc="b=OB93:s=O:r=O:a=O:p=O:g=2321:u=55:x=1:i=1642144727:t=1642180920:v=2:sig=AQHvw1pcDtjVHEyLOE5fMRz_huDPH_SA"',
}

params = (
    ('geoId', '104195383'),
    ('keywords', 'data analyst'),
    ('location', 'Vietnam'),
)



#NB. Original query string below. It seems impossible to parse and
#reproduce query strings 100% accurately so the one below is given
#in case the reproduced version is not "correct".
# response = requests.get('https://www.linkedin.com/jobs/search/?geoId=104195383&keywords=data%20analyst&location=Vietnam', headers=headers)

In [3]:
def cleanSelector(obj, xpath = './text()'):
    clean_list = []
    for i in range(len(obj)):
        if obj[i].xpath(xpath).extract():
            clean_list.append(obj[i].xpath(xpath).extract()[0]) #wtheck is this :)) 
        else:
            clean_list.append(None)
    return clean_list

def getChecked(values, checks):
    output = []
    for value, check in zip(values, checks):
        if check == 'checked':
            output.append(value)
    if output:
        return output
    else:
        output = [None]
        return output

def list2text(thelist):
    text = thelist[0]
    if len(thelist) == 1:
        return text
    else:
        for i in range(1,len(thelist)):
            text = text + ', ' + thelist[i]
        return text

In [8]:
response = requests.get('https://www.linkedin.com/jobs/search/', headers=headers, params=params).content
# sel = scrapy.Selector(text = response)

In [9]:
response


b'<!DOCTYPE html><html lang="en" class="theme theme--mercado"><head>\n    <script type="application/javascript">!function(i,n){void 0!==i.addEventListener&&void 0!==i.hidden&&(n.liVisibilityChangeListener=function(){i.hidden&&(n.liHasWindowHidden=!0)},i.addEventListener("visibilitychange",n.liVisibilityChangeListener))}(document,window);</script>\n\n    <title>LinkedIn</title>\n\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="asset-url" class="mercado-icons-sprite" id="artdeco-icons/static/images/sprite-asset" content="https://static-exp1.licdn.com/sc/h/7438dbnn8galtczp2gk2s4bgb">\n    <meta name="description" content="">\n    <meta name="google" content="notranslate">\n    <meta name="service" content="voyager-web">\n    <meta name="theme-color" content="#ffffff" id="theme-color-meta-tag">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0">\n    <meta name="baseCDNUrl" content="https://s