> 
# Job titles to search for:
* data science
* data scientist
* data analyst
* data architect
* data engineer
* statistician
* database administrator
* business analyst
* data analytics manager
>
# Countries to search in:
* Singapore


In [None]:
# Each page has 15 job descriptions, 5 advertised jobs
# url = 'https://www.indeed.com.sg/jobs?q=data+scientist&l=Singapore&start='
# Assumption: all job postings returned by these searches are data-related

In [3]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [124]:
# Search configuration: the countries to query and the job-title search terms.
country_set = ['singapore']
search_string = [
    'data science',
    'data scientist',
    'data analyst',
    'data architect',
    'data engineer',
    'statistician',
    'database administrator',
    'business analyst',
    'data analytics manager',
]

# Column names for the dataframe built from the scraped postings.
columns = [
    "job_category",
    "job_title",
    "company_name",
    "location",
    "summary",
    "salary",
]

In [130]:
# Scrape Indeed search results for every (country, query) pair and collect them in `df`.
# Relies on `country_set`, `search_string` and `columns` defined in the configuration cell.

# Seconds to wait for a response; requests has no default timeout and would otherwise
# block forever on a stalled connection.
_REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Download `url` and return it parsed with BeautifulSoup, or None on failure.

    Sleeps one second before each request to throttle the scraper. Network
    errors and non-2xx responses are reported and turned into None so that a
    single bad page does not abort the whole scrape.
    """
    time.sleep(1)
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print('Request failed for {}: {}'.format(url, error))
        return None
    if not response.ok:
        print('Skipping {} (HTTP {})'.format(url, response.status_code))
        return None
    return BeautifulSoup(response.text, 'lxml')


def _parse_job_count(count_text):
    """Return the total number of jobs found in the search-count text.

    Assumes the total is the last number in the text (the original parsing
    relied on the last whitespace-separated token being that number).
    Thousands separators are dropped; returns 0 when the text has no number.
    """
    numbers = re.findall(r'\d[\d,.]*', count_text)
    if not numbers:
        return 0
    return int(re.sub(r'\D', '', numbers[-1]))


def _clean_text(tag):
    """Return the tag's text without newlines or surrounding whitespace, NaN if the tag is missing."""
    if tag is None:
        return np.nan
    return tag.get_text().replace('\n', '').strip()


# Initialize container to store all job postings
jobs_list = []

# Any <div> whose class attribute contains "row" is treated as a job posting
job_card_class = re.compile('.*row.*')

# Iterate through search parameters and store one record per job posting
for country in country_set:
    for query in search_string:
        job_category = '_'.join(query.split())
        url = 'https://www.indeed.com.sg/jobs?q=' + '+'.join(query.split()) + '&l=' + country + '&start='

        # The first request is only used to read the total number of results
        soup = _fetch_soup(url)
        if soup is None:
            continue
        count_tag = soup.find(name='div', attrs={'id': 'searchCount'})
        if count_tag is None:
            # No result counter on the page (e.g. no matches): skip instead of raising IndexError
            print('No result count found for "{}" in {}; skipping'.format(query, country))
            continue

        # Get maximum number of jobs to iterate over all pages
        max_jobs = _parse_job_count(count_tag.get_text())

        # Result pages are addressed through `start` in steps of 10
        # (presumably 10 organic results per page - confirm against the site)
        for start_number in range(0, max_jobs, 10):
            soup = _fetch_soup(url + str(start_number))
            if soup is None:
                continue

            for job in soup.find_all(name='div', attrs={'class': job_card_class}):
                # Missing fields become NaN; salary is kept as raw text
                jobs_list.append([
                    job_category,
                    _clean_text(job.find(name='a', attrs={'data-tn-element': 'jobTitle'})),
                    _clean_text(job.find(name='span', attrs={'class': 'company'})),
                    _clean_text(job.find(name='span', attrs={'class': 'location'})),
                    _clean_text(job.find(name='span', attrs={'class': 'summary'})),
                    _clean_text(job.find(name='span', attrs={'class': 'no-wrap'})),
                ])

# Convert jobs list to dataframe, dropping postings whose summary was already seen
df = (
    pd.DataFrame(jobs_list, columns=columns)
    .drop_duplicates(subset=['summary'])
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3700 entries, 0 to 3699
Data columns (total 6 columns):
job_category    3700 non-null object
job_title       3700 non-null object
company_name    3017 non-null object
location        3700 non-null object
summary         3699 non-null object
salary          124 non-null object
dtypes: object(6)
memory usage: 173.5+ KB


In [8]:
# Save the dataframe as a pickle to avoid wasting time scraping again
# df.to_pickle('indeed_data_related_jobs')

In [5]:
# Reload a pickled dataframe from disk - presumably the scrape result written by the
# commented-out `df.to_pickle('indeed_data_related_jobs')` call (confirm) - so the
# scraper does not have to be re-run.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you trust.
df1 = pd.read_pickle('./indeed_data_related_jobs')

In [7]:
# Summarise only the postings whose salary field is populated
df1.loc[df1['salary'].notnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 5 to 3667
Data columns (total 6 columns):
job_category    124 non-null object
job_title       124 non-null object
company_name    124 non-null object
location        124 non-null object
summary         124 non-null object
salary          124 non-null object
dtypes: object(6)
memory usage: 6.8+ KB
