# Libraries we'll use

In [None]:
#Run this to install all needed libraries
!pip install pandas selenium textract scikit-learn spacy nltk


In [None]:
#Download english model for spacy
!python -m spacy download en

 # Import libraries

In [5]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import textract # Library to read PDFs
import time # PreInstalled library to add sleeping times
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import string
import spacy
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
all_stopwords = stopwords.words('english')
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
all_stopwords = stopwords.words('english')+stopwords.words('spanish')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erichhohenstein/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erichhohenstein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Functions definitions

In [6]:

def cleanText(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text_tokens = word_tokenize(text)
    tokens_without_sw= [word for word in text_tokens if not word in all_stopwords]
    text = ' '.join(tokens_without_sw)
    return text

def resumeReader(pdf):
    # Read Resume
    with open(pdf, "rb") as pdf_file:
        resume = textract.process(pdf).decode("utf-8").replace('\n',' ')
    return resume

def score(resume,jobDescription):
    cv = CountVectorizer()
    resume = ' '.join(set(cleanText(resume).split(' ')))
    jobDescription = ' '.join(set(cleanText(jobDescription).split(' ')))
    text = [resume,jobDescription]
    count_matrix = cv.fit_transform(text)
    return round(cosine_similarity(count_matrix)[0][1],2)
    
def getJobInfo(browser):
    title = ''
    company = ''
    companyLink = ''
    location = ''
    jobLink = ''
    posterName = ''
    posterProfileLink = ''
    jobDescription = ''
    try:
        title = browser.find_element(By.CLASS_NAME,'jobs-unified-top-card__content--two-pane').text.split('\n')[0]
    except:
        pass
    try:
        company = browser.find_element(By.CLASS_NAME,'jobs-unified-top-card__company-name').text
    except:
        pass
    try:
        companyLink = browser.find_element(By.CLASS_NAME,'jobs-unified-top-card__company-name').find_element(By.TAG_NAME,'a').get_attribute('href')
    except:
        pass
    try:
        location = browser.find_element(By.CLASS_NAME,'jobs-unified-top-card__bullet').text
    except:
        pass
    try:
        jobLink = browser.find_element(By.CLASS_NAME,'jobs-unified-top-card__content--two-pane').find_element(By.TAG_NAME,'a').get_attribute('href')
    except:
        pass
    try:
        posterName = browser.find_element(By.CLASS_NAME,'jobs-poster__name').text
    except:
        pass
    try:
        posterProfileLink = browser.find_element(By.CLASS_NAME,'jobs-poster__name-link').get_attribute('href')
    except:
        pass
    try:
        jobDescription = browser.find_element(By.CLASS_NAME,'jobs-box__html-content').find_element(By.TAG_NAME,'span').text.replace('\n',' ')
    except:
        pass
    return [title,company,companyLink,location,jobLink,posterName,posterProfileLink,jobDescription]

def loginLinkedIn(browser,usr,psw):
    # Go to LinkedIn Login page
    browser.get('https://www.linkedin.com/login/es?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')
    time.sleep(4)
    browser.find_element(By.ID,'username').send_keys(usr)
    time.sleep(1)
    browser.find_element(By.ID,'password').send_keys(psw)
    time.sleep(1)
    browser.find_element(By.CLASS_NAME,'btn__primary--large').click()
    return browser



def getLinkedinJobs(browser,keyword,jobType,maxPageNumSearch):
    actions = ActionChains(browser)
    jobtype = str(jobType) #2 = Remote
    keyword = keyword.replace(' ','%20') # %20 = space
    jobData = [['title','company','companyLink','location','jobLink','posterName','posterProfileLink','jobDescription']]
    for p in range(maxPageNumSearch):
        page = str(p*25)
        #print(page)
        browser.get('https://www.linkedin.com/jobs/search/?f_WT='+jobtype+'&geoId=92000000&keywords='+keyword+'&location=Worldwide&sortBy=R&start='+page)
        time.sleep(3)
        #Search for all job postings shown
        jobs = browser.find_elements(By.CLASS_NAME,'jobs-search-results__list-item')
        i=0
        for j in jobs:
            #print(i)
            i+=1
            time.sleep(2)
            actions.move_to_element(j).perform()
            j.find_element(By.TAG_NAME,'img').click() #Click on the image so it doesnt misclick a URL
            jobData.append(getJobInfo(browser))
    df = pd.DataFrame(jobData[1:],columns=jobData[0])
    #Drop any duplicate
    df.drop_duplicates(subset = ['jobLink'],inplace=True)
    return df


# Open Chrome browser and login to LinkedIn

In [7]:
# Download Chromedriver from: https://chromedriver.chromium.org/downloads
#Create browser instance by giving the address to chrome webdriver
browser = webdriver.Chrome('/Users/erichhohenstein/Documents/ChromeDriver/chromedriver')
browser = loginLinkedIn(browser,'email','password')


  browser = webdriver.Chrome('/Users/erichhohenstein/Documents/ChromeDriver/chromedriver')


# Search and Scrape LinkedIn Jobs

In [165]:
df = getLinkedinJobs(browser,'data science','2',40)

# Score jobs agains your Resume

In [166]:
pdf = "./Erich Hohenstein - Resume.pdf"
resume = resumeReader(pdf)
df['score'] = df['jobDescription'].apply(lambda x: score(resume,x))
df.sort_values(by=['score'],ascending=False,inplace=True)

# Save data to CSV

In [168]:
#df = pd.read_csv('jobs.csv')
df.to_csv('jobs.csv', index=False)

In [190]:
df.head(5)

Unnamed: 0,title,company,companyLink,location,jobLink,posterName,posterProfileLink,jobDescription,score
599,Data Scientist,Dice,https://www.linkedin.com/jobs/search/?currentJ...,United States,https://www.linkedin.com/jobs/view/3054398721/...,,,Dice is the leading career destination for tec...,0.22
154,Senior Data Scientist,INDIAEXCITE,https://www.linkedin.com/jobs/search/?currentJ...,India,https://www.linkedin.com/jobs/view/3059645658/...,,,The ideal candidate's favorite words are learn...,0.21
688,"Data Scientitst, Healthcare",Blue Health Intelligence,https://www.linkedin.com/jobs/search/?currentJ...,United States,https://www.linkedin.com/jobs/view/3016714422/...,,,Position Summary Working with the nation’s lar...,0.21
116,Data Analyst - OpenData - APAC,Veeva Systems,https://www.linkedin.com/jobs/search/?currentJ...,Mumbai Metropolitan Region,https://www.linkedin.com/jobs/view/3050280196/...,,,Veeva [NYSE: VEEV] is the leader in cloud-base...,0.21
47,Data Analyst - OpenData - APAC,Veeva Systems,https://www.linkedin.com/jobs/search/?currentJ...,"Singapore, Singapore",https://www.linkedin.com/jobs/view/3050279263/...,,,Veeva [NYSE: VEEV] is the leader in cloud-base...,0.21
