In [1]:
# Import the time module to use for delays (e.g., waiting for a page to load)
import time

# Import numpy, a fundamental package for numerical computations in Python
import numpy as np

# Import pandas, a powerful data manipulation and analysis library for Python
import pandas as pd

# Import webdriver from selenium, a tool for automating web browser interaction
from selenium import webdriver

# Import By from selenium.webdriver.common.by, used to locate items on a web page
from selenium.webdriver.common.by import By

# Import Keys from selenium.webdriver.common.keys, allows sending keys to web elements (like keyboard inputs)
from selenium.webdriver.common.keys import Keys

# Import warnings library to manage warnings during runtime
import warnings

# Command to suppress all warnings, making the output cleaner and less cluttered
warnings.filterwarnings('ignore')


In [2]:
# Prompt the user to enter job titles separated by commas and store the input as a string
user_input = input('Enter Job Titles (comma-separated):')

# Split the string into a list at each comma to handle multiple job titles
user_input_job_title = user_input.split(',')

# Strip any leading/trailing whitespace from each job title
user_input_job_title = [title.strip() for title in user_input_job_title]

# Print the list of job titles to verify the input was processed correctly
print(user_input_job_title)


Enter Job Titles (comma-separated):data scientist
['data scientist']


In [21]:
user_input_job_title

['data scientist']

In [22]:
# Initialize an empty list to store formatted job titles
formatted_job_titles = []

# Iterate through each job title in the user input list
for title in user_input_job_title:
    # Split the job title into individual words
    words = title.split()
    
    # Join the words with '%20' to replace spaces
    formatted_title = '%20'.join(words)
    
    # Append the formatted job title to the list
    formatted_job_titles.append(formatted_title)

# Join the formatted job titles with '%2C%20' to separate them in the final string
# '%2C' represents the URL-encoded comma (,) and '%20' represents the URL-encoded space ( )
final_job_title_string = '%2C%20'.join(formatted_job_titles)

# Print the final formatted job title string
print(final_job_title_string)


data%20scientist


In [23]:
link = f"https://www.linkedin.com/jobs/search/?currentJobId=3910773395&f_TPR=r604800&keywords={user_input_job_title}&location=United%20Kingdom&originalSubdomain=uk"
print(link)

https://www.linkedin.com/jobs/search/?currentJobId=3910773395&f_TPR=r604800&keywords=['data scientist']&location=United%20Kingdom&originalSubdomain=uk


In [24]:
# Create a new instance of the Chrome WebDriver, which controls the Chrome browser
driver = webdriver.Chrome()

# Maximize the browser window to ensure full visibility
driver.maximize_window()

# Open the provided link in the Chrome browser
driver.get(link)

# Set an implicit wait time of 10 seconds to wait for elements to become available
driver.implicitly_wait(10)


In [25]:
# Loop through the range from 0 to 1 (2 times)
for i in range(2):
    # Execute JavaScript to scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    # Pause script execution for 5 seconds to allow content to load
    time.sleep(5)

    try:
        # Find and click the "See more jobs" button using CSS selector
        x = driver.find_element(by=By.CSS_SELECTOR, value="button[aria-label='See more jobs']").click()
        
        # Pause script execution for 3 seconds after clicking the button
        time.sleep(3)
    except:
        # If the button is not found, do nothing and continue
        pass


In [26]:
# Initialize an empty list to store company names
company_name = []

# Try block to catch any potential exceptions
try:
    # Find all elements with CSS selector matching company names
    companies = driver.find_elements(by=By.CSS_SELECTOR, value='h4[class="base-search-card__subtitle"]')
    
    # Iterate through each found element
    for company in companies:
        # Append the text of each element to the company_name list
        company_name.append(company.text)

# Exception handling block to handle any errors
except:
    # If an exception occurs, do nothing and continue
    pass

# Print the list of company names
print(company_name)


['BBC', 'Dyson', 'Torbay Pharma', 'Ralph Lauren', 'PEP Health', 'GRAYCE', 'Gopuff', 'InterEx Group', 'Kantar Media', 'Acquire Me', 'Omnis Partners', 'Energy Jobline', 'Nicholson Glover', 'Meta', 'CricViz', 'Tech Kinect', 'Meta', 'DHL Supply Chain', 'Morgan McKinley', 'GRAYCE', 'QuanTech Partners', 'Match Digital', 'Northern Powergrid', 'VRS Recruitment', 'Aviva', 'Klickstarters', 'Hunter Bond', 'Lifelancer', 'Vp plc', 'Understanding Recruitment', 'DHL Supply Chain', 'Tiro Partners Limited', 'Stanford Black Limited', 'Sixty Eight People', 'Dexter Talent', 'Houseful', 'WYK Digital', 'Energy Jobline', 'Hunter Bond', 'Ingenii Search', 'Novo Nordisk', 'NielsenIQ', 'Energy Jobline', 'Energy Jobline', 'Hunter Bond', 'Energy Jobline', 'DataCareers', 'Stott and May', 'Hunter Bond', 'Deel', 'the LEGO Group', 'Energy Jobline', 'Bamboo Crowd', 'Malaberg', 'Andretti Global', 'Nestlé', 'Lawrence Harvey', 'Elton Recruitment', 'Energy Jobline', 'KDR Talent Solutions', 'Klickstarters', 'Hunter Bond', '

In [27]:
# Initialize an empty list to store job titles
job_title = []

# Try block to catch any potential exceptions
try:
    # Find all elements with CSS selector matching job titles
    titles = driver.find_elements(by=By.CSS_SELECTOR, value='h3[class="base-search-card__title"]')
    
    # Iterate through each found element
    for title in titles:
        # Append the text of each element to the job_title list
        job_title.append(title.text)

# Exception handling block to handle any errors
except:
    # If an exception occurs, do nothing and continue
    pass

# Print the list of job titles
print(job_title)


['Data Analyst', 'Graduate Data Analyst', 'Data Analyst - Generic Medicine', 'Junior Machine Learning Engineer', 'Data Scientist', 'Graduate Data Analyst - Manchester', 'Data Analyst, In-Stock Management', 'Data Scientist', 'Data Scientist', 'Data Engineer Intern', 'Analyst', 'Graduate Software Engineer', 'Data Scientist | Behavioural | PR and Comms', 'Data Scientist, Product Analytics', 'Data Analyst', 'Data Scientist', 'Enterprise Engineer (University Grad)', 'Data Scientist', 'Data Scientist', 'Graduate Software Developer - Manchester', 'Quantitative Researcher [Python Developer] – London', 'Data Analyst', 'Junior Data Analyst', 'Data Scientist', 'Junior Data Scientist', 'Junior Software Engineer', 'Junior Python Developer – Elite Tech Firm (up to £80K + Bonus + Hybrid)', 'Data Analyst, Life Sciences', 'Graduate Group Data Analyst', 'Data Scientist', 'Data Scientist', 'Data Analyst', 'Python Developer', 'Data Analyst', 'Junior Data Analyst', 'Data Analyst', 'Data Analytics Trainee (

In [28]:
# Initialize an empty list to store company locations
company_location = []

# Try block to catch any potential exceptions
try:
    # Find all elements with CSS selector matching company locations
    locations = driver.find_elements(by=By.CSS_SELECTOR, value='span[class="job-search-card__location"]')
    
    # Iterate through each found element
    for location in locations:
        # Append the text of each element to the company_location list
        company_location.append(location.text)

# Exception handling block to handle any errors
except:
    # If an exception occurs, do nothing and continue
    pass

# Print the list of company locations
print(company_location)


['London, England, United Kingdom', 'Malmesbury, England, United Kingdom', 'United Kingdom', 'London, England, United Kingdom', 'United Kingdom', 'Manchester, England, United Kingdom', 'London, England, United Kingdom', 'Greater London, England, United Kingdom', 'London Area, United Kingdom', 'Greater London, England, United Kingdom', 'Oxfordshire, England, United Kingdom', 'London, England, United Kingdom', 'London Area, United Kingdom', 'London, England, United Kingdom', 'London Area, United Kingdom', 'London Area, United Kingdom', 'London, England, United Kingdom', 'Greater Portsmouth Area', 'London Area, United Kingdom', 'Manchester, England, United Kingdom', 'London Area, United Kingdom', 'London Area, United Kingdom', 'Dewsbury, England, United Kingdom', 'Slough, England, United Kingdom', 'Greater Bristol Area, United Kingdom', 'Birmingham, England, United Kingdom', 'Greater London, England, United Kingdom', 'London, England, United Kingdom', 'Harrogate, England, United Kingdom',

In [29]:
# Initialize an empty list to store job URLs
job_url = []

# Try block to catch any potential exceptions
try:
    # Find all elements with XPath containing URLs of job listings
    urls = driver.find_elements(by=By.XPATH, value='//a[contains(@href, "/jobs/")]')
    
    # Iterate through each found element
    for url in urls:
        # Get the 'href' attribute of each element and append it to the job_url list
        job_url.append(url.get_attribute('href'))

# Exception handling block to handle any errors
except:
    # If an exception occurs, do nothing and continue
    pass

# Print the list of job URLs
print(job_url)


['https://uk.linkedin.com/jobs/view/data-analyst-at-bbc-3912068735?position=1&pageNum=0&refId=y01c0%2FNnL5Xfno9b1leqeQ%3D%3D&trackingId=eBzRbXDgr%2BzMJweoCt%2BfaQ%3D%3D&trk=public_jobs_jserp-result_search-card', 'https://uk.linkedin.com/jobs/view/graduate-data-analyst-at-dyson-3909010865?position=2&pageNum=0&refId=y01c0%2FNnL5Xfno9b1leqeQ%3D%3D&trackingId=HiHMbwNPe5QjN9JqIVCvyg%3D%3D&trk=public_jobs_jserp-result_search-card', 'https://uk.linkedin.com/jobs/view/data-analyst-generic-medicine-at-torbay-pharma-3904240943?position=3&pageNum=0&refId=y01c0%2FNnL5Xfno9b1leqeQ%3D%3D&trackingId=XfkxymGw9B8mNSOMFHLbBg%3D%3D&trk=public_jobs_jserp-result_search-card', 'https://uk.linkedin.com/jobs/view/junior-machine-learning-engineer-at-ralph-lauren-3853758485?position=4&pageNum=0&refId=y01c0%2FNnL5Xfno9b1leqeQ%3D%3D&trackingId=wVuLAJ7reO7biaICYDDvfg%3D%3D&trk=public_jobs_jserp-result_search-card', 'https://uk.linkedin.com/jobs/view/data-scientist-at-pep-health-3905997394?position=5&pageNum=0&refI

In [30]:
df = pd.DataFrame(company_name, columns=['Company Name'])
df['Job Title'] = pd.DataFrame(job_title)
df['Location'] = pd.DataFrame(company_location)
df['Website URL'] = pd.DataFrame(job_url)
df

Unnamed: 0,Company Name,Job Title,Location,Website URL
0,BBC,Data Analyst,"London, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-analyst...
1,Dyson,Graduate Data Analyst,"Malmesbury, England, United Kingdom",https://uk.linkedin.com/jobs/view/graduate-dat...
2,Torbay Pharma,Data Analyst - Generic Medicine,United Kingdom,https://uk.linkedin.com/jobs/view/data-analyst...
3,Ralph Lauren,Junior Machine Learning Engineer,"London, England, United Kingdom",https://uk.linkedin.com/jobs/view/junior-machi...
4,PEP Health,Data Scientist,United Kingdom,https://uk.linkedin.com/jobs/view/data-scienti...
...,...,...,...,...
75,Nestlé,Data Scientist,"Crawley, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
76,Lawrence Harvey,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
77,Elton Recruitment,Data Analyst,"Stevenage, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-analyst...
78,Energy Jobline,Data Scientist - Remote,"Liverpool, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...


In [31]:
df.to_csv('data_scientist.csv')

In [32]:
def job_title_filter(x, user_input_job_title):
    # Convert user input job titles to lowercase and split into individual words
    suggestion = [word.lower() for title in user_input_job_title for word in title.split()]
    
    # Convert job title x to lowercase and split into individual words
    title_words = x.split()
    a = [word.lower() for word in title_words]
    
    # Find the intersection of words between user input job titles and job title x
    intersection = list(set(suggestion).intersection(set(a)))
    
    # Return job title x if the intersection contains more than one word, otherwise return NaN
    return x if len(intersection) > 1 else np.nan


In [33]:
df['Job Title'] = df['Job Title'].apply(lambda x: job_title_filter(x, user_input_job_title))
df=df.dropna()
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Company Name,Job Title,Location,Website URL
0,PEP Health,Data Scientist,United Kingdom,https://uk.linkedin.com/jobs/view/data-scienti...
1,InterEx Group,Data Scientist,"Greater London, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
2,Kantar Media,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
3,Nicholson Glover,Data Scientist | Behavioural | PR and Comms,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
4,Tech Kinect,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
5,DHL Supply Chain,Data Scientist,Greater Portsmouth Area,https://uk.linkedin.com/jobs/view/data-scienti...
6,Morgan McKinley,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
7,VRS Recruitment,Data Scientist,"Slough, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
8,Aviva,Junior Data Scientist,"Greater Bristol Area, United Kingdom",https://uk.linkedin.com/jobs/view/junior-data-...
9,Understanding Recruitment,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...


In [34]:
df=df.iloc[:10,:]
df

Unnamed: 0,Company Name,Job Title,Location,Website URL
0,PEP Health,Data Scientist,United Kingdom,https://uk.linkedin.com/jobs/view/data-scienti...
1,InterEx Group,Data Scientist,"Greater London, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
2,Kantar Media,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
3,Nicholson Glover,Data Scientist | Behavioural | PR and Comms,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
4,Tech Kinect,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
5,DHL Supply Chain,Data Scientist,Greater Portsmouth Area,https://uk.linkedin.com/jobs/view/data-scienti...
6,Morgan McKinley,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
7,VRS Recruitment,Data Scientist,"Slough, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...
8,Aviva,Junior Data Scientist,"Greater Bristol Area, United Kingdom",https://uk.linkedin.com/jobs/view/junior-data-...
9,Understanding Recruitment,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...


In [35]:
def description(link):
    # Open the provided link in the web driver
    driver.get(link)
    time.sleep(3)  # Adding a delay to ensure page loads completely
    
    # Click on the button to show more job descriptions
    driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
    time.sleep(2)  # Adding a delay to allow the additional content to load
    
    # Find all elements containing job descriptions
    descriptions = driver.find_elements(by=By.CSS_SELECTOR, value='div[class="show-more-less-html__markup relative overflow-hidden"]')
    driver.implicitly_wait(4)  # Setting implicit wait time
    
    for description in descriptions:
        # Return the text of the first description found
        return description.text


In [36]:
# Extract website URLs from the DataFrame and convert them to a list
website_url = df['Website URL'].tolist()

# Initialize an empty list to store job descriptions
job_description = []

# Iterate through each website URL
for url in website_url:
    # Get the description data from the URL using the description function
    data = description(url)
    
    # Check if the data is not None and has non-zero length after stripping whitespace
    if data is not None and len(data.strip()) > 0:
        # Append the job description to the list if available
        job_description.append(data)
    else:
        # If description is not available, append a placeholder string
        job_description.append('Description Not Available')


In [37]:
df['Job Description'] = pd.DataFrame(job_description, columns=['Description'])
df

Unnamed: 0,Company Name,Job Title,Location,Website URL,Job Description
0,PEP Health,Data Scientist,United Kingdom,https://uk.linkedin.com/jobs/view/data-scienti...,Data Scientist | Healthtech | PEP Health\nLoca...
1,InterEx Group,Data Scientist,"Greater London, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...,The ideal candidate's favourite words are lear...
2,Kantar Media,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...,As people increasingly move across channels an...
3,Nicholson Glover,Data Scientist | Behavioural | PR and Comms,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...,🎉 Join a team committed to making a real impac...
4,Tech Kinect,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...,Data Scientist Job Description:\n\nAnalyze lar...
5,DHL Supply Chain,Data Scientist,Greater Portsmouth Area,https://uk.linkedin.com/jobs/view/data-scienti...,RCS Grade: J\n\nContract Type: Permanent\n\nLo...
6,Morgan McKinley,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...,Exciting opportunity to work with a Big4 Tech ...
7,VRS Recruitment,Data Scientist,"Slough, England, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...,"Data Scientist | Slough, Hybrid | £55,000 - £6..."
8,Aviva,Junior Data Scientist,"Greater Bristol Area, United Kingdom",https://uk.linkedin.com/jobs/view/junior-data-...,"Junior Data Scientist\nSalary: Circa £45,000\n..."
9,Understanding Recruitment,Data Scientist,"London Area, United Kingdom",https://uk.linkedin.com/jobs/view/data-scienti...,Data Scientist | Junior - Mid level | London\n...


In [38]:
l = len(df['Company Name'])
for i in range(0,l):
    print(f"Company Name : {df.iloc[i,0]}")
    print(f"Job Title    : {df.iloc[i,1]}")
    print(f"Location     : {df.iloc[i,2]}")
    print(f"Website URL  : {df.iloc[i,3]}")
    print(f"Description  : {df.iloc[i,4]}")


Company Name : PEP Health
Job Title    : Data Scientist
Location     : United Kingdom
Website URL  : https://uk.linkedin.com/jobs/view/data-scientist-at-pep-health-3905997394?position=5&pageNum=0&refId=y01c0%2FNnL5Xfno9b1leqeQ%3D%3D&trackingId=n%2FhaDqnj83JDCJZK3CY%2B8Q%3D%3D&trk=public_jobs_jserp-result_search-card
Description  : Data Scientist | Healthtech | PEP Health
Location: Remote within the UK
We are looking for a Data Scientist to join our world-class tech team and grow with us as we scale our services in the UK, US and globally, making a genuine impact on the quality of healthcare.
 Why is it challenging & interesting?
At PEP Health, we’re on a mission to transform healthcare through a stronger and more empowered patient voice. PEP Health listens to and makes sense of the millions of digital comments made by patients about their care to understand trends and create real-time insights. We have recently completed significant funding to grow the team and ensure our impact is int