In [1]:
# Import required modules
import pandas as pd
from bs4 import BeautifulSoup
import requests
from concurrent.futures import ProcessPoolExecutor

In [2]:
# Generate cover pages
cover_page = []
for pg in range(1, 56):
    cover_page.append(f"https://www.reed.co.uk/jobs/full-time?pageno={pg}&sortby=DisplayDate&datecreatedoffset=LastWeek")

In [3]:
# This function scrapes 
def get_job_info(url):
    
    # Make requests
    r = requests.get(url)
    s = BeautifulSoup(r.text, "lxml")
    
    # List of empty variables to be scraped
    job_title = []
    job_link = []
    employer = []
    location = []
    salary = []
    posted_at = []
    
    # Main container
    cont = s.find_all(class_="col-sm-12 col-md-9 col-lg-9 details")
    for c in cont:
        
        # Scrape job title
        try:
            job_title.append(c.find(class_="title").text.strip())
        except:
            job_title.append("na")
        
        # Scrape job link
        try:
            job_link.append("https://www.reed.co.uk" + c.a.get("href"))
        except:
            job_link.append("na")
            
        
        # Srrape employer
        try:
            employer.append(c.find(class_="posted-by").a.text.strip())
        except:
            employer.append("na")
        
        # Scrape job posting day
        try:
            posted_at.append(c.find(class_="posted-by").text.strip().split("by")[0])
        except:
            posted_at.append("na")
        
        # Srape location
        try:
            location.append(c.find(class_="location").text.strip().split("\r")[0])
        except:
            location.append("na")
        
        # Scrape salary
        try:
            salary.append(c.find(class_="salary").text.strip())
        except:
            salary.append("na")
    
    # Scrape a temporary df
    temp_df = pd.DataFrame({
        "job_title":job_title,
        "job_link":job_link,
        "employer":employer,
        "location":location,
        "posted_at":posted_at
        
    })
    return temp_df

In [4]:
%%time
with ProcessPoolExecutor(4) as ex:
    df = pd.concat(list(ex.map(get_job_info, cover_page)))

CPU times: user 54.8 ms, sys: 18.4 ms, total: 73.2 ms
Wall time: 18.7 s


In [5]:
# Create a csv file
df.to_csv("trending_jobs.csv", index=None)

## Find unique job titles

In [6]:
# Read in the data back
df = pd.read_csv("trending_jobs.csv")
df.job_title = df.job_title.str.lower().str.strip()
df.head()

Unnamed: 0,job_title,job_link,employer,location,posted_at
0,"electrical engineer hv ap training, progressio...",https://www.reed.co.uk/jobs/electrical-enginee...,Rise Technical Recruitment Limited,Manchester,Posted Today
1,strategy and planning manager,https://www.reed.co.uk/jobs/strategy-and-plann...,Hays Specialist Recruitment Limited,London,Posted Today
2,12 weeks to christmas - sales assistant,https://www.reed.co.uk/jobs/12-weeks-to-christ...,SLS Recruitment,"Dalston, London",Posted Today
3,cleaner 10 to 15 hours,https://www.reed.co.uk/jobs/cleaner-10-to-15-h...,Lidl GB,Heacham,Posted Today
4,import administrator,https://www.reed.co.uk/jobs/import-administrat...,Lidl GB,London,Posted Today


In [9]:
# Find title frequency
df.job_title.value_counts().reset_index().rename(columns={"job_title":"frequency", "index":"job_title"}).head(10)

Unnamed: 0,job_title,frequency
0,warehouse operative,50
1,care assistant - care home,18
2,trainee driving instructor,17
3,real estate broker in dubai,14
4,band 7 radiographer,14
5,band 6 radiographer,10
6,credit controller,9
7,management accountant,9
8,picker packer,8
9,residential conveyancer - remote,7
