<a href="https://colab.research.google.com/github/futureCodersSE/data-roles/blob/main/Data_Roles_job_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries and defining the base job-site URLs

In [78]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'  ##<- this is really annoying, I WANT to copy the DF.
import numpy as np
import datetime
import matplotlib.pyplot as plt


def get_html(url, timeout=30):
    """Download *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The ``BeautifulSoup`` object built from the response body with the
        built-in ``html.parser``.
    """
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')
# This cleans the dataframes when they have been created.
def general_clean(df, site):
    """Return a cleaned copy of a scraped jobs DataFrame.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Frame with at least the columns ``job_title``, ``salary`` and
            ``company``.
        site: Value written to the ``advertised_on`` column of every row.

    Returns:
        A new DataFrame without duplicate rows, restricted to rows whose
        ``job_title`` contains "Data", with missing salaries replaced by an
        empty string, company names stripped of surrounding spaces, and the
        extra columns ``date_found`` (today's date) and ``advertised_on``.
    """
    cleaned = df.drop_duplicates()

    # na=False: a missing job title counts as "no match" instead of putting
    # NaN into the boolean mask, which cannot be used for indexing.
    is_data_role = cleaned["job_title"].str.contains("Data", na=False)
    # Explicit copy so the column assignments below never write into a view.
    cleaned = cleaned[is_data_role].copy()

    cleaned["salary"] = cleaned["salary"].fillna("")  # missing salary info -> ''
    cleaned["company"] = cleaned["company"].str.strip(" ")  # excess spaces around names

    cleaned["date_found"] = datetime.date.today()
    cleaned["advertised_on"] = site
    return cleaned


# Search-result URLs that the two scrapers start from.
reed_url = "https://www.reed.co.uk/jobs/data-jobs-in-ashford-kent?proximity=30"
jobsinkent_url = "https://jobsinkent.com/search?q=Data&pl=1"

max_job_call = 2000 #reed has a limit of 2000 job search api calls per hour.
# Date of this run; used further down to build the CSV file names.
date_today = datetime.date.today()

#Reed Job Search
---

This function collects the job IDs of all jobs listed on a single reed.co.uk results page.


In [42]:
def find__reed_jobs_iterate(page_url):
    """Return the Reed job IDs listed on one search-results page.

    Args:
        page_url: URL of a reed.co.uk results page.

    Returns:
        A 1-D integer numpy array with one job ID per job card found on the
        page; empty when the page has no results container.
    """
    site_html = get_html(page_url)
    results = site_html.find(class_="col-sm-8 col-md-9 results-container")
    if results is None:
        # No results container in the page: report it and return nothing
        # instead of failing with AttributeError on None.
        print("no results container found on", page_url)
        return np.array([], dtype=np.int64)

    job_cards = results.find_all(class_="job-result-card")

    # Each card's id attribute looks like "jobSection48529572"; the digits
    # after the prefix are the job ID.
    job_ids = [int(job["id"].split("jobSection")[1]) for job in job_cards]

    # Built once from a list (no per-item np.append copy) and kept as integers
    # so the IDs can be turned into API URLs without a ".0" suffix.
    return np.array(job_ids, dtype=np.int64)

This finds the total number of jobs returned by the search and how many result pages need to be scraped.

In [44]:
def find_total_jobs(site_html, max_job_call, jobs_per_page=25):
    """Read the total job count from a results page and derive the page count.

    Args:
        site_html: Parsed first results page; its ``page-title`` element
            holds text such as "1,537 Data Jobs near Ashford".
        max_job_call: Maximum number of jobs that may be requested; caps the
            number of pages returned.
        jobs_per_page: Number of jobs shown on one results page.

    Returns:
        ``(total_jobs, total_pages)`` where ``total_pages`` is already
        limited so that ``total_pages * jobs_per_page <= max_job_call``.

    Raises:
        ValueError: If the title element is missing or contains no number.
    """
    import re  # local import: only this function needs it

    max_page_requests = int(max_job_call // jobs_per_page)

    title = site_html.find(class_="col-sm-11 col-xs-12 page-title")
    if title is None:
        raise ValueError("could not find the page-title element holding the job count")

    # Take the first number in the title (thousands separators removed) rather
    # than splitting on the word "Data", so other search terms also work.
    count_match = re.search(r"\d+", title.text.replace(",", ""))
    if count_match is None:
        raise ValueError(f"could not read a job count from page title {title.text!r}")

    total_jobs = int(count_match.group())
    total_pages = -(-total_jobs // jobs_per_page)  # ceiling division

    print("total jobs found: ", total_jobs)
    print("total pages: ", total_pages)
    # limits the requests to at most max_job_call jobs
    total_pages = min(total_pages, max_page_requests)

    print("pages to search: ", total_pages)
    return total_jobs, total_pages
#find_total_jobs(get_html(reed_url),max_job_call)



The function that actually calls everything else.
---


In [None]:
def reed_scrape(base_url, max_job_call):
    """Collect the job IDs from every results page of a Reed search.

    Args:
        base_url: URL of the first results page; further pages are requested
            by appending ``&pageno=<n>``.
        max_job_call: Upper bound on the number of jobs to collect, passed on
            to ``find_total_jobs`` to limit the number of pages.

    Returns:
        A 1-D numpy array holding the job IDs of all scraped pages, in page
        order.
    """
    # NOTE(review): page 1 is downloaded twice (once here for the job count,
    # once inside find__reed_jobs_iterate for its IDs).
    soup = get_html(base_url)

    # scrapes job id's from page 1
    page_id_arrays = [find__reed_jobs_iterate(base_url)]

    # finds out how many more pages to scrape
    total_jobs, total_pages = find_total_jobs(soup, max_job_call)

    print("page 1")
    # starts from page 2 because page 1 has already been scraped above
    for page_no in range(2, total_pages + 1):
        page_url = base_url + "&pageno=" + str(page_no)
        page_id_arrays.append(find__reed_jobs_iterate(page_url))
        print("page ", page_no)

    # Join once at the end instead of re-copying the array for every page.
    return np.concatenate(page_id_arrays)

# Run the Reed scrape, then cast the collected IDs to integers before they are used to build API URLs.
reed_id_list = reed_scrape(reed_url,max_job_call)
reed_id_list = reed_id_list.astype("int")
#print(id_list)

This calls the reed API to return job information based on the jobID

In [48]:
def call_api(id_list):
    """Fetch the Reed API record of every job ID and return them as one frame.

    Args:
        id_list: Iterable of Reed job IDs.

    Returns:
        A DataFrame with one row per job ID, built from the flattened JSON
        response of each request; an empty DataFrame when ``id_list`` is
        empty.
    """
    base_url = "https://www.reed.co.uk/api/1.0/jobs/"
    # NOTE(review): the API key is hard-coded in the notebook. It should be
    # loaded from a secret or environment variable and the exposed key rotated.
    api_auth = ("96a7ec49-549c-4529-b9d2-fa3059a437b3", "")

    job_frames = []
    for job_id in id_list:
        # One GET request per job; the timeout keeps a stalled request from
        # hanging the whole run.
        response = requests.get(base_url + str(job_id), auth=api_auth, timeout=30)
        job_frames.append(pd.json_normalize(response.json()))

    if not job_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenating once is also
    # linear instead of re-copying the frame for every job.
    return pd.concat(job_frames, ignore_index=True)
# Fetch the full API record for every scraped job ID.
full_reed_df = call_api(reed_id_list)
# DataFrame.info() prints its summary itself and returns None, so display() additionally shows "None".
display(full_reed_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1537 entries, 0 to 1536
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   employerId           1537 non-null   int64 
 1   employerName         1537 non-null   object
 2   jobId                1537 non-null   int64 
 3   jobTitle             1537 non-null   object
 4   locationName         1537 non-null   object
 5   minimumSalary        1269 non-null   object
 6   maximumSalary        1269 non-null   object
 7   yearlyMinimumSalary  1269 non-null   object
 8   yearlyMaximumSalary  1269 non-null   object
 9   currency             1269 non-null   object
 10  salaryType           1537 non-null   object
 11  salary               1269 non-null   object
 12  datePosted           1537 non-null   object
 13  expirationDate       1537 non-null   object
 14  externalUrl          295 non-null    object
 15  jobUrl               1537 non-null   object
 16  partTi

None

Cleaning the Reed DF

In [79]:
def clean_reed(df_in):
    """Select the columns shared by both job sites and give them common names.

    Args:
        df_in: Raw Reed API frame containing the columns ``jobTitle``,
            ``employerName``, ``salary``, ``contractType``, ``locationName``
            and ``jobUrl``.

    Returns:
        A new DataFrame with those six columns renamed to ``job_title``,
        ``company``, ``salary``, ``contract``, ``location`` and ``job_url``
        so that the general cleaning function can be applied to it.
    """
    # Insertion order of this mapping is the column order of the result.
    column_names = {
        "jobTitle": "job_title",
        "employerName": "company",
        "salary": "salary",
        "contractType": "contract",
        "locationName": "location",
        "jobUrl": "job_url",
    }
    selected = df_in[list(column_names)]
    return selected.rename(columns=column_names)


# Reduce the raw API frame to the shared columns, then apply the common cleaning step.
reed_df = clean_reed(full_reed_df)
reed_df = general_clean(reed_df,"https://www.reed.co.uk")
display(reed_df)

Unnamed: 0,job_title,company,salary,contract,location,job_url,date_found,advertised_on
0,Data Architect,Veolia UK,"£65,000 - £84,600 per annum, inc benefits",Permanent,United Kingdom,https://www.reed.co.uk/jobs/data-architect/486...,2022-10-31,https://www.reed.co.uk
2,Data Analyst,Commercial Services Interim & Executive Search,"£41,216 - £47,199 per annum",Permanent,West Malling,https://www.reed.co.uk/jobs/data-analyst/48529572,2022-10-31,https://www.reed.co.uk
3,Data Analyst,Morgan Sindall Property Services,"£35,000 - £40,000 per annum, inc benefits",Permanent,Dartford,https://www.reed.co.uk/jobs/data-analyst/48211407,2022-10-31,https://www.reed.co.uk
4,Data Administrator,Braundton Consulting Limited,"£22,194 per annum",Permanent,Sidcup,https://www.reed.co.uk/jobs/data-administrator...,2022-10-31,https://www.reed.co.uk
5,Data Administrator,Braundton Consulting Limited,"£22,194 per annum",Permanent,Orpington,https://www.reed.co.uk/jobs/data-administrator...,2022-10-31,https://www.reed.co.uk
6,Data Analyst,Hays Specialist Recruitment Limited,"£35,000 - £40,000 per annum",Permanent,Dartford,https://www.reed.co.uk/jobs/data-analyst/48641144,2022-10-31,https://www.reed.co.uk
7,Data Analyst,Premier IT,"£30,000 - £35,000 per annum",Permanent,Basildon,https://www.reed.co.uk/jobs/data-analyst/48631725,2022-10-31,https://www.reed.co.uk
8,Data Analyst,Morgan Sindall Property Services,"£26,000 - £34,000 per annum, inc benefits",Permanent,Billericay,https://www.reed.co.uk/jobs/data-analyst/48223150,2022-10-31,https://www.reed.co.uk
9,Data Analyst,IronmongeryDirect & ElectricalDirect,"£32,000 - £35,000 per annum",Permanent,Basildon,https://www.reed.co.uk/jobs/data-analyst/48320718,2022-10-31,https://www.reed.co.uk
10,Data Analyst,OnetoOne Personnel,"£33,000 - £50,000 per annum",Permanent,Leigh-on-Sea,https://www.reed.co.uk/jobs/data-analyst/48465441,2022-10-31,https://www.reed.co.uk


#JobsInKent Job Search
---


In [62]:
def find_jobsinkent_iterate(url, job_limit):
    """Scrape the job cards of one jobsinkent.com results page.

    Args:
        url: URL of the results page.
        job_limit: Number of leading job cards to read. Needed because the
            page HTML contains further, non-relevant cards with the same
            class after the jobs that are actually shown on the site.

    Returns:
        A DataFrame with the columns ``job_title``, ``company``, ``salary``,
        ``contract``, ``location`` and ``job_url``, one row per job read.
        The frame is also displayed.
    """
    columns = ["job_title", "company", "salary", "contract", "location", "job_url"]
    site_html = get_html(url)
    results = site_html.find(class_="mt-2 lg:mt-0")

    # this class uniquely identifies every job card of the site.
    job_cards = results.find_all(class_="flex mb-2 border-black-900 rounded-md border-x-4 sm:rounded-lg")
    print("found jobs", len(job_cards))

    rows = []
    for job in job_cards[:job_limit]:
        # Second box of the job card (the first box is a blank spacer).
        job_title_url = job.find("a", class_="text-blue-700 visited:text-grey-200 hover:underline")
        # Third box of the job card.
        company_name = job.find(class_="mt-0 mb-2 max-w-2xl text-sm text-black").text

        # The fourth box holds salary, contract and location together; they
        # are separated here and cleaned individually.
        info_list = job.find_all(class_="col-span-12")

        rows.append({
            "job_title": job_title_url.text,
            "company": company_name,
            "salary": clean_bloat(info_list[0].text),
            "contract": clean_bloat(info_list[1].text),
            "location": clean_bloat(info_list[2].text),
            "job_url": job_title_url["href"],
        })

    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0 and re-copied the frame on every call.
    jobs_df = pd.DataFrame(rows, columns=columns)
    display(jobs_df)
    return jobs_df


In [63]:
def clean_bloat(text):
    """Remove newlines and non-breaking spaces, then trim surrounding spaces.

    Args:
        text: Raw text taken from a job card.

    Returns:
        ``text`` without any ``\\n`` or ``\\xa0`` characters and without
        leading or trailing space characters (other whitespace is kept).
    """
    # Mapping a code point to None deletes that character in translate().
    unwanted = {ord("\n"): None, ord("\xa0"): None}
    return text.translate(unwanted).strip(" ")

def return_jobs_pages(input_str):
    """Parse a results banner such as "Showing 1 to 20 of 54 Jobs".

    Args:
        input_str: Banner text containing "... to <yy> of <xx> ...".

    Returns:
        ``(total_jobs, jobs_per_page, total_pages)`` where ``jobs_per_page``
        is the number after "to", ``total_jobs`` the number after "of" and
        ``total_pages`` the rounded-up quotient of the two (0 when the
        banner reports zero jobs per page).

    Raises:
        ValueError: If the banner does not contain the expected pattern.
    """
    import re  # local import: only this function needs it

    # Numbers may carry thousands separators ("1,054"); the word after the
    # total is not matched, so "Job" and "Jobs" both work.
    match = re.search(r"to\s*(\d[\d,]*)\s*of\s*(\d[\d,]*)", input_str)
    if match is None:
        raise ValueError(f"unrecognised results banner: {input_str!r}")

    jobs_per_page = int(match.group(1).replace(",", ""))
    total_jobs = int(match.group(2).replace(",", ""))

    if jobs_per_page == 0:
        # Empty result set: avoid dividing by zero, there is nothing to page through.
        return total_jobs, jobs_per_page, 0

    total_pages = -(-total_jobs // jobs_per_page)  # ceiling division

    return total_jobs, jobs_per_page, total_pages

In [64]:
def main_scrape(base_url):
    """Scrape every results page of a jobsinkent.com search.

    Args:
        base_url: URL of the search; individual pages are requested by
            appending ``&page=<n>``.

    Returns:
        A DataFrame with the columns ``job_title``, ``company``, ``salary``,
        ``contract``, ``location`` and ``job_url`` holding the jobs of all
        pages, re-indexed from 0.
    """
    columns = ["job_title", "company", "salary", "contract", "location", "job_url"]

    # returns ALL the html from the site
    site_html = get_html(base_url)

    banner = site_html.find("p", class_="text-sm text-gray-700 leading-5")
    showing = banner.text  # Showing 1 to yy of xx Jobs

    total_jobs, jobs_per_page, total_pages = return_jobs_pages(showing)

    page_frames = []
    for page_no in range(1, total_pages + 1):
        print("page_no ", page_no)
        # The page HTML holds more job cards than are visibly listed, so each
        # page is limited to the number of jobs actually shown on it.
        jobs_limit = jobs_per_page
        if page_no == total_pages:
            # The last page shows the remainder. A remainder of 0 means the
            # last page is completely full (it is not empty), which is also
            # the case for every search that fits on a single page.
            jobs_limit = total_jobs % jobs_per_page or jobs_per_page
        print("jobs_limit ", jobs_limit)

        page_url = base_url + "&page=" + str(page_no)  # the url for each page to be scraped
        page_frames.append(find_jobsinkent_iterate(page_url, jobs_limit))

    if not page_frames:
        return pd.DataFrame(data=[], columns=columns)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(page_frames, ignore_index=True)
  

# Scrape every jobsinkent.com result page into a single DataFrame.
jobsinkent_df = main_scrape(jobsinkent_url)

page_no  1
jobs_limit  20
found jobs 40


Unnamed: 0,job_title,company,salary,contract,location,job_url
0,Senior Data Analyst,Clarion Housing Group Ltd,£43264 - £59488 per annum,"permanent,full-time","Central London, Greater London",https://jobsinkent.com/job/2423530
1,Data Analyst - Global Insurance Markets,MW Appointments,Negotiable,"permanent,full-time","City of London, Greater London",https://jobsinkent.com/job/2422053
2,Data Analyst,Pearson Whiffin Recruitment,£23000 - £28000 per annum + DOE,"permanent,full-time","West Malling, Kent",https://jobsinkent.com/job/2331144
3,Data Lead (Digital Marketing),Recruitment Solutions Folkestone Ltd,"£30-40,000 dep on exp","permanent,full-time","Ashford, Kent",https://jobsinkent.com/job/2277249
4,Data & GDPR Officer,Pearson Whiffin Recruitment,"Up to £35,000 per annum","permanent,full-time","Kings Hill, Kent",https://jobsinkent.com/job/2424610
5,Data Cable Engineer - City of London,PW Construction,Negotiable,"temporary,full-time","City of London, Greater London",https://jobsinkent.com/job/2421623
6,Data Assistant / Analyst / Manager,KHR - Recruitment Specialists,+ Benefits,"permanent,full-time","Sevenoaks, Kent",https://jobsinkent.com/job/2424450
7,Data Analyst,Hays Specialist Recruitment Ltd,£35000.00 - £40000.00 per annum,"permanent,full-time","Dartford, Kent",https://jobsinkent.com/job/2424239
8,Azure Data Engineer,Pearson Whiffin Recruitment,£55000 - £60000 per annum,"permanent,full-time","Central London, Greater London",https://jobsinkent.com/job/2362033
9,Digital Analyst/ Data Visualisation Executive,Recruitment Solutions South East Ltd,£35000 + benefits,"permanent,full-time",Work from home,https://jobsinkent.com/job/2424211


page_no  2
jobs_limit  20
found jobs 40


Unnamed: 0,job_title,company,salary,contract,location,job_url
0,"Senior Azure Specialists WANTED - Native, Data...",Jump IT Recruitment Solutions Limited,£65K-£90K+Bens+Opportunity+Equity,"permanent,full-time",Work from home,https://jobsinkent.com/job/2145576
1,Data Analyst,Hays Specialist Recruitment Ltd,£45000.00 - £50000.00 per annum,"permanent,full-time","Dartford, Kent",https://jobsinkent.com/job/2145570
2,Senior Data Governance Manager,MW Appointments,£90000 - £110000 per annum + + package,"permanent,full-time","City of London, Greater London",https://jobsinkent.com/job/2053074
3,Azure Data Engineer,Pearson Whiffin Recruitment,£50000 - £60000 per annum,"permanent,full-time","Central London, Greater London",https://jobsinkent.com/job/2263413
4,Junior Data Analyst - MS SQL & Excel,Pearson Whiffin Recruitment,£22000 - £27500 per annum + Excellent Benefits,"permanent,full-time","Dover, Kent",https://jobsinkent.com/job/2052907
5,Data Protection & Governance Manager,Atlas Recruitment Group,£50000 - £55000 per annum + Bonus,"permanent,full-time","Hastings, Sussex",https://jobsinkent.com/job/2261948
6,"Data Scientist - AI, Python, Machine Learning,...",Jump IT Recruitment Solutions Limited,£35K-£90K+Bens+,"permanent,full-time",Work from home,https://jobsinkent.com/job/2261649
7,Financial Data Entry,Huntress,Negotiable,"temporary,full-time","West Malling, Kent",https://jobsinkent.com/job/2180856
8,Data Analyst,MW Appointments,Up to £35000.00 per annum,"permanent,full-time","City of London, Greater London",https://jobsinkent.com/job/2142849
9,Finance Data & Systems Lead,Hays Specialist Recruitment Ltd,"£60000.00 - £65000.00 per annum + parking, fle...","permanent,full-time",South West London,https://jobsinkent.com/job/2309432


page_no  3
jobs_limit  14
found jobs 34


Unnamed: 0,job_title,company,salary,contract,location,job_url
0,Junior Data Analyst - Office based,Pearson Whiffin Recruitment,£22000 - £28000 per annum,"permanent,full-time","West Malling, Kent",https://jobsinkent.com/job/2332912
1,Immediate Start - Data Entry,Huntress,Up to £11 per hour,"temporary,full-time","Ashford, Kent",https://jobsinkent.com/job/2251154
2,Data & M.I. Business Partner (Insurance),MW Appointments,Negotiable,"permanent,full-time","City of London, Greater London",https://jobsinkent.com/job/2049769
3,Data and Business Services Assistant,Barker Munro Recruitment Limited,"£20,000 - 30,000","permanent,full-time","Tunbridge Wells, Kent",https://jobsinkent.com/job/2133378
4,Graduate Data Analyst,Huntress,Up to £23000 per annum,"permanent,full-time","Tenterden, Kent",https://jobsinkent.com/job/2422248
5,Immediate Start - Data Entry,Huntress,Up to £20000 per annum,"temporary,full-time","Chatham, Kent",https://jobsinkent.com/job/2132885
6,Data Entry Clerk (finance),Connect2Staff,£8.91 - £9.00 per hour,"temporary,full-time","Aylesford, Tonbridge and Malling, Kent",https://jobsinkent.com/job/2132825
7,Data Entry Clerk,Connect2Staff,Up to £10 per hour,"temporary,full-time","West Malling, Kent",https://jobsinkent.com/job/2164411
8,Junior Data Analyst - SQL - Excel - VBA - Remo...,Pearson Whiffin Recruitment,£20000 - £28000 per annum,"permanent,full-time","Central London, Greater London",https://jobsinkent.com/job/2082056
9,Data Analyst,Pearson Whiffin Recruitment,£25000 - £30000 per annum + DOE,"permanent,full-time","Gillingham, Kent",https://jobsinkent.com/job/2132332


Cleans the jobsinkent_df to be used later.

In [65]:
# Apply the common cleaning step; the second argument is stored in the "advertised_on" column.
jobsinkent_df = general_clean(jobsinkent_df,"jobsinkent.com/search")

#Combining both results
---


In [None]:
# Stack the cleaned results of both sites into one frame with a fresh 0..n-1 index.
# (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.)
data_jobs_df = pd.concat([jobsinkent_df, reed_df], ignore_index=True)
display(data_jobs_df)

This returns **unique** lists of the recruiters and companies who are advertising data roles

In [72]:
def find_company_lists(jobs_df=None):
    """Split the advertisers of data roles into employers and recruiters.

    An advertiser counts as a recruiter when its name contains "Recruit" or
    when it is one of the hand-maintained names listed below.

    Args:
        jobs_df: Frame with a ``company`` column. Defaults to the notebook's
            combined ``data_jobs_df``.

    Returns:
        ``(companies_list, recruiters_list)``: two Series of unique company
        names. ``companies_list`` keeps the index labels of ``jobs_df``;
        ``recruiters_list`` is re-indexed from 0, with the "Recruit" matches
        first and the hand-listed names after them.
    """
    if jobs_df is None:
        jobs_df = data_jobs_df

    additional_recruiter_list = ["Reed","OnetoOne Personnel","Huntress","MW Appointments","P3 Search & Selection","Manpower - Ashford","Morgan McKinley","Brook Street","McGregor Boyall","SAGA","Office Angels","REED",
                            "Commercial Services Interim & Executive Search","Harnham","Connect2Staff","Senitor Associates","Zorba Consulting Limited","GerrardWhite","Lorien","Morgan Law", "Academics"]

    unique_names = jobs_df.drop_duplicates(subset="company")["company"]

    # na=False: a missing company name is not a recruiter, so it can no longer
    # end up in the recruiter list as well.
    named_recruiter = unique_names.str.contains("Recruit", na=False)
    listed_recruiter = unique_names.isin(additional_recruiter_list)

    companies_list = unique_names[~(named_recruiter | listed_recruiter)]

    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    recruiters_list = pd.concat(
        [unique_names[named_recruiter], unique_names[listed_recruiter]],
        ignore_index=True,
    )

    return companies_list, recruiters_list
# Build the two name lists once; find_company_df() below reads them as globals.
companies_list, recruiters_list = find_company_lists()

This takes the lists of unique companies and recruiters and returns two DataFrames containing **all** the roles they have advertised.

In [None]:
def find_company_df(jobs_df=None, recruiters=None, companies=None):
    """Split the combined jobs frame into employer roles and recruiter roles.

    Args:
        jobs_df: Frame with a ``company`` column. Defaults to the notebook's
            combined ``data_jobs_df``.
        recruiters: Names treated as recruiters. Defaults to the notebook's
            ``recruiters_list``.
        companies: Names treated as direct employers. Defaults to the
            notebook's ``companies_list``.

    Returns:
        ``(company_df, recruiters_df)``: the rows whose company is not in
        ``recruiters`` and the rows whose company is not in ``companies``,
        each re-indexed from 0.
    """
    if jobs_df is None:
        jobs_df = data_jobs_df
    if recruiters is None:
        recruiters = recruiters_list
    if companies is None:
        companies = companies_list

    # Boolean masks select by row position, so the result stays correct even
    # when the frame's index labels are not unique (a label-based drop would
    # remove every row sharing a dropped label).
    is_recruiter = jobs_df["company"].isin(recruiters)
    is_company = jobs_df["company"].isin(companies)

    company_df = jobs_df[~is_recruiter].reset_index(drop=True)  # drop if company is a recruiter
    recruiters_df = jobs_df[~is_company].reset_index(drop=True)  # drop if company is a direct employer

    return company_df, recruiters_df
# Build the two result frames; save() further down writes them to CSV.
company_df, recruiters_df = find_company_df()
  

To download and save the DataFrames, uncomment save() and download() and run the cell

In [88]:
from google.colab import files

# File names for the saved search results, prefixed with the current date.
recruiters_fn = f"{date_today}_recruitment_company_df.csv"
company_fn = f"{date_today}_data_employeer_df.csv"

# Creates the .csv files in the Colab session. NB: this does not save them to your PC; download them afterwards.
def save():
    """Write the recruiter and employer DataFrames to their dated CSV files."""
    outputs = ((recruiters_df, recruiters_fn), (company_df, company_fn))
    for frame, filename in outputs:
        frame.to_csv(filename)

# Downloads the saved CSV files to the local PC.
def download():
    """Download both CSV files through the Colab ``files`` helper."""
    for filename in (recruiters_fn, company_fn):
        files.download(filename)
###
#Uncomment to run, commented by default.
####
#save()
#download()
