Objective: Create a BeautifulSoup-based scraper.

Website to scrape: www.indeed.com.my

What to scrape: job posting details such as position name, salary and location.

In [1]:
# required libraries
import requests
import time
from bs4 import BeautifulSoup as soup
import urllib.request
import re
import pandas as pd

In [2]:
# Cities in Malaysia, taken from https://en.wikipedia.org/wiki/List_of_cities_in_Malaysia
# Multi-word names use '+' in place of spaces so that every entry can be appended
# as-is to the `l=` query parameter at the end of `page_url`.
city = ['George+Town', 'Kuala+Lumpur', 'Ipoh', 'Kuching', 'Johor+Bahru', 'Kota+Kinabalu', 'Shah+Alam', 'Melaka', 'Alor+Setar',
        'Miri', 'Petaling+Jaya', 'Kuala+Terengganu', 'Iskandar+Puteri', 'Seremban']
# State of each city above, matched by position (state_list[i] belongs to city[i]).
state_list = ['Penang', 'Federal Territory', 'Perak', 'Sarawak', 'Johor', 'Sabah', 'Selangor', 'Melaka', 'Kedah', 'Sarawak', 'Selangor',
              'Terengganu', 'Johor', 'Negeri Sembilan']

# NOTE(review): presumably a cap on results collected per city; not used in the cells shown here.
max_results_per_city = 800
df = pd.DataFrame()

# Search URL for "data scientist" postings; the location parameter `l=` is left empty.
page_url = "https://www.indeed.com.my/jobs?q=data+scientist&l="
results = []

In [3]:
# Request the search page defined above. The timeout keeps the cell from waiting
# indefinitely on an unresponsive server, and raise_for_status() stops here on an
# HTTP error (4xx/5xx) instead of letting later cells parse an error page.
page = requests.get(page_url, timeout=30)
page.raise_for_status()

In [4]:
# Parse the response HTML with Python's "html.parser"; `soup` is the
# BeautifulSoup class as aliased in the import cell.
page_text = soup(page.text, "html.parser")

In [5]:
# Preview the parsed document as an indented tree for easier reading. Only the
# first 2,000 characters are printed so the whole page is not dumped into the
# notebook output.
print(page_text.prettify()[:2000])

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <script src="/s/24156e1/en_MY.js" type="text/javascript">
  </script>
  <script>
   window.indeed=window.indeed||{};indeed.crashtext=indeed.crashtext||{};indeed.crashtext.populate=indeed.crashtext.populate||function(c,g){var d=window[g];if(d){var e=d[""]||{},h=e.salt;if(e.hasOwnProperty("salt"))for(var b in c){for(var a=void 0,a=h,f=b.length;f;)a=33*a^b.charCodeAt(--f);a>>>=0;e.hasOwnProperty("id_length")&&(a=String(a).substring(0,e.id_length));d[a]=c[b]}else for(b in c)d[b]=[null].concat(c[b])}};indeed.crashtext.populate({"indeedapply_serp_label":["Express apply"]}, 'indeed.i18n.localeData')
  </script>
  <link href="/s/684a333/jobsearch_all.css" rel="stylesheet" type="text/css"/>
  <link href="http://www.indeed.com.my/rss?q=data+scientist" rel="alternate" title="Data Scientist Jobs, Jawatan Kosong" type="application/rss+xml"/>
  <link href="/m/jobs?q=data+scienti

Now we know that our variable `page_text` (the parsed BeautifulSoup object) holds all of the information on our page of interest. It is now a matter of writing code that iterates through the various tags (and the tags nested inside them) to capture the information we want.

In [6]:
def extract_job_title_from_result(page_text):
    """Collect the job titles listed on a parsed search-results page.

    Parameters
    ----------
    page_text : parsed page (BeautifulSoup object)
        Searched for ``<div class="row">`` elements.

    Returns
    -------
    list of str
        One entry per ``<a data-tn-element="jobTitle">`` link found inside
        those divs, in document order.
    """
    jobs = []
    for div in page_text.find_all(name="div", attrs={"class": "row"}):
        for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
            # Prefer the link's ``title`` attribute; when a link has none, use
            # its visible text instead of failing with KeyError.
            title = a.get("title")
            jobs.append(title if title is not None else a.text.strip())
    return jobs

extract_job_title_from_result(page_text)

['Junior Data Scientist',
 'Data Scientist',
 'DATA SCIENTIST II',
 'Data Scientist',
 'Data Scientist',
 'Associate RWE Data Scientist',
 'Data Scientist',
 'Data Engineer',
 'Data Scientist (TIS)',
 'PRINCIPAL DATA SCIENTIST']

Now, let's extract the company names.

In [7]:
def extract_company_from_result(page_text):
    """Collect company names from a parsed search-results page.

    Each ``<div class="row">`` is searched for ``<span class="company">``
    elements. When a div has none, its ``<span class="result-link-source">``
    elements are used instead.

    Parameters
    ----------
    page_text : parsed page (BeautifulSoup object)

    Returns
    -------
    list of str
        Whitespace-stripped text of every matching span, in document order.
    """
    names = []
    for card in page_text.find_all(name="div", attrs={"class": "row"}):
        # An empty result is falsy, so the second lookup only runs when the
        # div contains no company span at all.
        matches = (
            card.find_all(name="span", attrs={"class": "company"})
            or card.find_all(name="span", attrs={"class": "result-link-source"})
        )
        names.extend(node.text.strip() for node in matches)
    return names
 
extract_company_from_result(page_text)

['ParaCell',
 'BioQuest Advisory',
 'Celcom Axiata Berhad',
 'AirAsia',
 'Axiata Analytics Center',
 'Novartis',
 'MoneyLion',
 'Ensoft Consulting',
 'Grab Taxi',
 'Celcom Axiata Berhad']

Let's extract the locations now.

In [8]:
# define the function
def extract_location_from_result(page_text):
    """Collect the job locations listed on a parsed search-results page.

    Parameters
    ----------
    page_text : parsed page (BeautifulSoup object)

    Returns
    -------
    list of str
        Whitespace-stripped text of every ``<span class="location">`` on the
        page, in document order.
    """
    # find_all replaces the legacy findAll alias, and the text is stripped,
    # so this matches the other extractors in this notebook.
    location_spans = page_text.find_all(name="span", attrs={"class": "location"})
    return [span.text.strip() for span in location_spans]

# execute the function
extract_location_from_result(page_text)

['Shah Alam',
 'Kuala Lumpur',
 'Kuala Lumpur',
 'Kuala Lumpur',
 'Kuala Lumpur',
 'Petaling Jaya',
 'Kuala Lumpur',
 'Ara Damansara',
 'Petaling Jaya',
 'Petaling Jaya']

Now, let's extract the salary.

In [9]:
# define the function
def extract_salary_from_result(page_text):
    """Collect one salary entry per ``<div class="row">`` on a parsed page.

    For each row the lookup order is:

    1. the text of the first ``<nobr>`` element (returned unstripped);
    2. otherwise the stripped text of the first ``<div>`` nested inside the
       row's ``<div class="sjcl">``;
    3. otherwise the placeholder ``"Nothing found"``.

    NOTE(review): in the run recorded in this notebook, step 2 returned
    company names and review counts rather than salaries, so the ``sjcl``
    fallback probably needs revisiting against the current page markup.

    Parameters
    ----------
    page_text : parsed page (BeautifulSoup object)

    Returns
    -------
    list of str
        Exactly one entry per row div, in document order.
    """
    salaries = []
    for div in page_text.find_all(name="div", attrs={"class": "row"}):
        nobr = div.find(name="nobr")
        if nobr is not None:
            salaries.append(nobr.text)
            continue

        # Explicit None checks replace the former bare ``except:`` clauses, so
        # a missing element still yields the placeholder while unrelated
        # errors (and KeyboardInterrupt) are no longer swallowed.
        sjcl = div.find(name="div", attrs={"class": "sjcl"})
        inner = sjcl.find(name="div") if sjcl is not None else None
        salaries.append(inner.text.strip() if inner is not None else "Nothing found")
    return salaries

# execute the function
extract_salary_from_result(page_text)

['ParaCell',
 'BioQuest Advisory',
 'Celcom Axiata Berhad\n\n\n66 reviews',
 'AirAsia\n\n\n219 reviews',
 'Axiata Analytics Center',
 'Novartis\n\n\n3,191 reviews',
 'MoneyLion\n\n\n14 reviews',
 'Ensoft Consulting',
 'Grab Taxi\n\n\n203 reviews',
 'Celcom Axiata Berhad\n\n\n66 reviews']

In [25]:
#Finally, extracting the job summaries
# define the function
def extract_summary_from_result(page_text):
    """Collect the job summaries listed on a parsed search-results page.

    Parameters
    ----------
    page_text : parsed page (BeautifulSoup object)

    Returns
    -------
    list of str
        Whitespace-stripped text of every ``<div class="summary">`` on the
        page, in document order.
    """
    # find_all replaces the legacy findAll alias, matching the other extractors.
    summary_divs = page_text.find_all(name="div", attrs={"class": "summary"})
    return [div.text.strip() for div in summary_divs]

# execute the function
extract_summary_from_result(page_text)

['Junior Data Scientist. Experience with large data sets and knowledge of both structured and unstructured data. Passion for data science and innovation....',
 'NLP) and Big Data. Journey leveraging on machine learning and data analytics to develop data science solutions focused on delivering competitive business...',
 'Fair knowledge of data management and maintenance through tools such as Data warehousing solutions, ETL, SAP Data Management, SSIS, Cloud technologies etc....',
 'Data Scientist - Job Description. Data sourcing, etl, ensuring data integrity and data handling. As a Data Scientist you will be developing SQL queries,...',
 'The Data Scientist will be part of the Analytics CoE team of Axiata Intelligence Unit, reporting to Senior Data Scientists and Principal Data Scientists....',
 'Data sources include medical and pharmacy claims data, hospital data, electronic medical record data, and prospective observational study data....',
 'MoneyLion is looking for a data scientist t

In [30]:
# define the function
def extract_reviews_from_result(page_text):
    """Collect the review-count labels shown on a parsed search-results page.

    The spans are gathered page-wide, not per job, so the result is not
    aligned with the other extractors' lists. In the run recorded in this
    notebook it held 6 entries for 10 jobs.

    Parameters
    ----------
    page_text : parsed page (BeautifulSoup object)

    Returns
    -------
    list of str
        Whitespace-stripped text of every ``<span class="slNoUnderline">`` on
        the page, in document order.
    """
    # find_all replaces the legacy findAll alias, matching the other extractors.
    review_spans = page_text.find_all(name="span", attrs={"class": "slNoUnderline"})
    return [span.text.strip() for span in review_spans]

# execute the function
extract_reviews_from_result(page_text)

['66 reviews',
 '219 reviews',
 '3,191 reviews',
 '14 reviews',
 '203 reviews',
 '66 reviews']

In [43]:
# get the job advert posting date
def extract_advertpostdate_from_result(page_text):
    """Collect the posting-age labels (e.g. "25 days ago") from a parsed page.

    Parameters
    ----------
    page_text : parsed page (BeautifulSoup object)

    Returns
    -------
    list of str
        Whitespace-stripped text of every ``<span class="date">`` on the
        page, in document order.
    """
    # find_all replaces the legacy findAll alias, matching the other extractors.
    date_spans = page_text.find_all(name="span", attrs={"class": "date"})
    return [span.text.strip() for span in date_spans]

# execute the function
extract_advertpostdate_from_result(page_text)

['25 days ago',
 '15 days ago',
 '17 days ago',
 '30+ days ago',
 '30+ days ago',
 '30+ days ago',
 '30+ days ago',
 '5 days ago',
 '30+ days ago',
 '17 days ago']

Now we call all of these functions, save each result in its own variable, and then combine those variables into a DataFrame.

In [44]:
# Run each extractor on the same parsed page; every call returns a plain list.
job_title = extract_job_title_from_result(page_text)
company_info = extract_company_from_result(page_text)
company_loc = extract_location_from_result(page_text)
jobDescr = extract_summary_from_result(page_text)
# Salary stays disabled: in the run recorded above, the salary extractor
# returned company names and review counts rather than salaries.
#salary = extract_salary_from_result(page_text)
# Collected but not placed in the DataFrame below: the recorded run produced
# 6 review entries for 10 jobs, so this list does not line up with the others.
reviews = extract_reviews_from_result(page_text)
advertpostdate = extract_advertpostdate_from_result(page_text)

In [45]:
# One column per extracted list. pandas requires every list to have the same
# length, which is why the shorter reviews list is left out.
jobs_df = pd.DataFrame({
    'JobTitle': job_title,
    'Company': company_info,
    'Location': company_loc,
    'Job Description': jobDescr,
    'Date': advertpostdate,
    # 'Reviews': reviews,
})

In [46]:
jobs_df

Unnamed: 0,JobTitle,Company,Location,Job Description,Date
0,Junior Data Scientist,ParaCell,Shah Alam,Junior Data Scientist. Experience with large d...,25 days ago
1,Data Scientist,BioQuest Advisory,Kuala Lumpur,NLP) and Big Data. Journey leveraging on machi...,15 days ago
2,DATA SCIENTIST II,Celcom Axiata Berhad,Kuala Lumpur,Fair knowledge of data management and maintena...,17 days ago
3,Data Scientist,AirAsia,Kuala Lumpur,Data Scientist - Job Description. Data sourcin...,30+ days ago
4,Data Scientist,Axiata Analytics Center,Kuala Lumpur,The Data Scientist will be part of the Analyti...,30+ days ago
5,Associate RWE Data Scientist,Novartis,Petaling Jaya,Data sources include medical and pharmacy clai...,30+ days ago
6,Data Scientist,MoneyLion,Kuala Lumpur,MoneyLion is looking for a data scientist to l...,30+ days ago
7,Data Engineer,Ensoft Consulting,Ara Damansara,Create data tools for analytics and data scien...,5 days ago
8,Data Scientist (TIS),Grab Taxi,Petaling Jaya,"As a safety data scientist, you will get expos...",30+ days ago
9,PRINCIPAL DATA SCIENTIST,Celcom Axiata Berhad,Petaling Jaya,"Control, oversee and accountable of the qualit...",17 days ago


#### Further improvement options

1. Scrape multiple pages

Note: The base URL is `https://www.indeed.com.my/jobs?q=data+scientist&l=`. The URL of the next page is `https://www.indeed.com.my/jobs?q=data+scientist&start=10`; after that, the only part that changes from page to page is the value of `start`. So the URL of the third page is `https://www.indeed.com.my/jobs?q=data+scientist&start=20`.
