Objective: Create a beautifulsoup based scraper.

Website to scrape: www.indeed.com.my

What to scrape: extract the job posting details like position name, salary and location.

In [1]:
# required libraries
import requests
import time
from bs4 import BeautifulSoup as soup
import urllib.request
import re
import pandas as pd

In [2]:
# create list of cities in Malaysia from https://en.wikipedia.org/wiki/List_of_cities_in_Malaysia
city = ['George+Town', 'Kuala+Lumpur', 'Ipoh', 'Kuching', 'Johor+Bahru', 'Kota+Kinabalu', 'Shah+Alam', 'Melaka', 'Alor+Setar',
       'Miri', 'Petaling Jaya', 'Kuala Terengganu', 'Iskandar Puteri', 'Seremban']
state_list = ['Penang','Federal Territory','Perak', 'Sarawak', 'Johor', 'Sabah', 'Selangor', 'Melaka', 'Kedah', 'Sarawak', 'Selangor',
        'Terengganu', 'Johor', 'Negeri Sembilan']

max_results_per_city = 800 
df = pd.DataFrame()

page_url = "https://www.indeed.com.my/jobs?q=data+scientist&l="
results = []

In [4]:
#conducting a request of the stated URL above:
page = requests.get(page_url)

In [8]:
page_text = soup(page.text, "html.parser")

In [9]:
#printing soup in a more structured tree format that makes for easier reading
print(page_text.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <script src="/s/92dc942/en_MY.js" type="text/javascript">
  </script>
  <link href="/s/970d98c/jobsearch_all.css" rel="stylesheet" type="text/css"/>
  <link href="http://www.indeed.com.my/rss?q=data+scientist" rel="alternate" title="Data Scientist Jobs, Jawatan Kosong" type="application/rss+xml"/>
  <link href="/m/jobs?q=data+scientist" media="only screen and (max-width: 640px)" rel="alternate"/>
  <link href="/m/jobs?q=data+scientist" media="handheld" rel="alternate"/>
  <script type="text/javascript">
   if (typeof window['closureReadyCallbacks'] == 'undefined') {
        window['closureReadyCallbacks'] = [];
    }

    function call_when_jsall_loaded(cb) {
        if (window['closureReady']) {
            cb();
        } else {
            window['closureReadyCallbacks'].push(cb);
        }
    }
  </script>
  <meta content="1" name="ppstriptst"/>
  <script src=

Now, we know that our variable “soup” has all of the information housed in our page of interest. It is now a matter of writing code to iterate through the various tags (and nested tags therein) to capture the information we want.

In [13]:
def extract_job_title_from_result(page_text):
    jobs = []
    for div in page_text.find_all(name="div", attrs={"class":"row"}):
        for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
            jobs.append(a["title"])
    return(jobs)

extract_job_title_from_result(page_text)

['Lead Data Scientist',
 'Data Scientist',
 'Environmental Enginner',
 'DATA SCIENTIST',
 'Business & Operational Analytics/Data Scientist)',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Manager - (Business & Operational Analytics/Data Scientist)',
 'Chief Data And Analytics Officer',
 'Data analytics intern',
 'Data Scientist',
 'Data Scientist ($100k/yr) - Online Hiring Event - Remote Work']

Now, lets extract the company names

In [14]:
def extract_company_from_result(page_text):
    companies = []
    for div in page_text.find_all(name="div", attrs={"class":"row"}):
        company = div.find_all(name="span", attrs={"class":"company"})
        if len(company) > 0:
            for b in company:
                companies.append(b.text.strip())
        else:
            sec_try = div.find_all(name="span", attrs={"class":"result-link-source"})
            for span in sec_try:
                companies.append(span.text.strip())
    return(companies)
 
extract_company_from_result(page_text)

['Kaishi Partners, EA Licence No: 16C8316',
 'Manpower Staffing Services (Malaysia) Sdn Bhd',
 'CHEMKIMIA SDN BHD',
 'Statworks (M) Sdn Bhd',
 'TELE - TEMPS ASIA PASIFIC',
 'Ctrl/Shift',
 'BAE Systems Applied Intelligence',
 'GBGroup',
 'Intellect-Minds',
 'TTSearch Job Agency',
 '123RF Technology',
 'iflix',
 'iflix',
 'Crossover Markets DMCC']

Lets extract the locations now

In [15]:
# define the function
def extract_location_from_result(page_text):
    locations = []
    spans = page_text.findAll( name="span", attrs={"class": "location"})
    for span in spans:
        locations.append(span.text)
    return(locations)

# execute the function
extract_location_from_result(page_text)

['Petaling Jaya',
 'Sepang',
 'Kuala Lumpur',
 'Kuala Lumpur',
 'Kuala Lumpur',
 'Kuala Lumpur',
 'Kuala Lumpur',
 'Petaling Jaya',
 'Kuala Lumpur',
 'Kuala Lumpur']

Now, extracting the salary

In [16]:
# define the function
def extract_salary_from_result(page_text):
    salaries = []
    for div in page_text.find_all(name="div", attrs={"class":"row"}):
        try:
            salaries.append(div.find(name= "nobr").text)
        except:
            try:
                div_two = div.find(name="div", attrs={"class":"sjcl"})
                div_three = div_two.find(name="div")
                salaries.append(div_three.text.strip())
            except:
                salaries.append("Nothing found")
    return(salaries)

# execute the function
extract_salary_from_result(page_text)

['Kaishi Partners, EA Licence No: 16C8316',
 'Manpower Staffing Services (Malaysia) Sdn Bhd',
 'CHEMKIMIA SDN BHD',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Nothing found',
 'Crossover Markets DMCC']

Finally, extracting the job summaries

In [19]:
# define the function
def extract_summary_from_result(page_text):
    summaries = []
    spans = page_text.findAll(name= 'span', attrs={'class': 'summary'})
    for span in spans:
        summaries.append(span.text.strip())
    return(summaries)

# execute the function
extract_summary_from_result(page_text)

[]