In [1]:
# Import required modules
import pandas as pd
from bs4 import BeautifulSoup
import requests
from concurrent.futures import ProcessPoolExecutor
from itertools import chain

In [2]:
# Define the HTTP request headers: a browser-like User-Agent that the scraping
# functions below pass to requests.get via headers=HEADERS
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}

# Function to generate cover pages
def generate_cover_pages(num_pages=25):
    """Build the list of search-result ("cover") page URLs to scrape.

    Parameters
    ----------
    num_pages : int, default 25
        Number of result pages to generate, numbered from 1. A value of 0
        or less yields an empty list.

    Returns
    -------
    list of str
        One reed.co.uk data-analyst search URL per page, in page order.
    """
    base_url = "https://www.reed.co.uk/jobs/data-analyst-jobs"
    return [f"{base_url}?pageno={page}" for page in range(1, num_pages + 1)]


# Function to scrape the individual job links from a cover page
def scrape_job_link(url, timeout=30):
    """Collect the individual job-advert links listed on one cover page.

    Parameters
    ----------
    url : str
        Cover (search-result) page URL.
    timeout : float, default 30
        Seconds to wait for the server before giving up, so that a single
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    list of str
        Absolute job links in page order. Result containers that have no
        anchor, or whose anchor has no usable href, are skipped.

    Raises
    ------
    requests.RequestException
        If the request fails or times out.
    """
    # Making requests
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Each search result sits in an element with the "job-result" class
    job_link = []
    for cont in s.find_all(class_="job-result"):
        anchor = cont.a  # None when the container holds no <a> tag
        href = anchor.get("href") if anchor is not None else None
        # Skip unusable entries explicitly instead of hiding them behind a
        # bare except and a placeholder that is filtered out afterwards.
        if isinstance(href, str):
            job_link.append("https://www.reed.co.uk" + href)
    return job_link


# Function to scrape job info
def scrape_job_info(url, timeout=30):
    """Scrape the details of a single job advert into a one-row DataFrame.

    Parameters
    ----------
    url : str
        Individual job link.
    timeout : float, default 30
        Seconds to wait for the server before giving up, so that a single
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns job_title, job_link, date_posted,
        employer, salary, location, job_type and whole_body. Any field that
        cannot be found on the page is stored as the string "na".

    Raises
    ------
    requests.RequestException
        If the request fails or times out.
    """
    # Making requests
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Every field is located through its itemprop attribute; the lambdas defer
    # the lookup so the helper can turn a missing element into "na".
    job_title = _extract_or_na(lambda: s.find(itemprop="title").get("content"))
    date_posted = _extract_or_na(lambda: s.find(itemprop="datePosted").get("content"))
    employer = _extract_or_na(
        lambda: s.find(itemprop="hiringOrganization").find(itemprop="name").text.strip()
    )
    salary = _extract_or_na(lambda: s.find(itemprop="baseSalary").span.text.strip())
    location = _extract_or_na(lambda: s.find(itemprop="addressRegion").get("content"))
    job_type = _extract_or_na(lambda: s.find(itemprop="employmentType").get("content"))
    # The description is lower-cased before being stored
    whole_body = _extract_or_na(lambda: s.find(itemprop="description").text.lower().strip())

    # Create a one-row df from the scraped variables
    return pd.DataFrame({
        "job_title": job_title,
        "job_link": url,
        "date_posted": date_posted,
        "employer": employer,
        "salary": salary,
        "location": location,
        "job_type": job_type,
        "whole_body": whole_body
    }, index=[0])


def _extract_or_na(extract):
    """Call one field extractor and return "na" when the element is missing."""
    try:
        value = extract()
    except (AttributeError, TypeError):
        # find() (and .span) give None for an absent tag, so the chained
        # attribute access fails; only that expected failure is absorbed.
        return "na"
    # A tag that exists but lacks the requested attribute yields None from
    # .get(); map it to the same placeholder so downstream filters see it.
    return "na" if value is None else value

In [3]:
# Wrap all the functions inside main
def main(max_workers=4):
    """Run the whole scrape: cover pages -> job links -> job details.

    Parameters
    ----------
    max_workers : int, default 4
        Number of worker processes used to scrape the individual job pages.

    Returns
    -------
    pandas.DataFrame
        One row per scraped job link with a fresh 0..n-1 index. When no job
        links are found, an empty frame with the expected columns is returned.
    """
    # Generate cover pages
    cover_pages = generate_cover_pages()

    # Scrape job links page by page and flatten the per-page lists
    job_links = list(chain.from_iterable(map(scrape_job_link, cover_pages)))

    # pd.concat raises ValueError on an empty sequence, so hand back an empty
    # frame with the usual columns when nothing was found.
    if not job_links:
        return pd.DataFrame(columns=[
            "job_title", "job_link", "date_posted", "employer",
            "salary", "location", "job_type", "whole_body",
        ])

    # Scrape job info. Use multiprocessing to speed up the scraping process;
    # ex.map returns the results in the same order as job_links.
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        job_frames = list(ex.map(scrape_job_info, job_links))

    return pd.concat(job_frames).reset_index(drop=True)

In [4]:
%%time
# Run the full scrape and keep the combined result in job_df
job_df = main()
# Last expression of the cell: show the first 10 scraped rows
job_df.head(10)

CPU times: user 3.01 s, sys: 143 ms, total: 3.15 s
Wall time: 5min


Unnamed: 0,job_title,job_link,date_posted,employer,salary,location,job_type,whole_body
0,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/43851...,2021-08-31,Percepta UK Limited,Competitive salary,East Midlands,FULL_TIME,position summary the data analyst is responsib...
1,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/44005...,2021-09-10,Howden Group Holdings,Salary negotiable,South East England,FULL_TIME,data analyst at hx - we offer a great opportun...
2,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/43916...,2021-09-03,Hays Specialist Recruitment Limited,"£25,000 - £32,000 per annum",South Humberside,FULL_TIME,data analyst / excel skills / sql / dashboards...
3,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/44111...,2021-09-20,X4 Group,Salary negotiable,South East England,"FULL_TIME, CONTRACTOR",there is a brand new opportunity for an experi...
4,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/44111...,2021-09-20,X4 Group,Competitive salary,South East England,"FULL_TIME, CONTRACTOR",there is a brand new opportunity for an experi...
5,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/44210...,2021-09-28,Brook Street,£11.27 per hour,West Midlands,"FULL_TIME, TEMPORARY",we are working with a national energy supplier...
6,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/44227...,2021-09-29,Advantage Resourcing,£250.00 - £300.00 per day,South East England,"FULL_TIME, CONTRACTOR","data analyst 3 months initially, outside ir35...."
7,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/42648...,2021-07-27,Remit Resources,"£35,000 - £55,000 per annum",Lancashire,FULL_TIME,data analysts needed for this large profession...
8,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/43520...,2021-08-23,REED Education,"£35,000 - £45,000 per annum, inc benefits",South West England,FULL_TIME,role: data analystlocation: salisbury/nationwi...
9,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/44251...,2021-10-01,Concept Personnel,"£35,000 - £55,000 per annum",South East England,FULL_TIME,job title: data analyst location: harmondswort...


In [6]:
# Export the scraped jobs to CSV, leaving out rows whose job title is the
# "na" placeholder; the DataFrame index is not written.
job_df.loc[job_df["job_title"] != "na"].to_csv(
    "reed_data_analyst_jobs_data_07_Oct_21.csv",
    index=None,
)