> 
# Job titles to search for:
* data scientist
* data analyst
* data architect
* data engineer
* statistician
* database administrator
* business analyst
* data analytics manager
>
# Countries to search in:
* Singapore
* Malaysia


In [None]:
# Each page has 15 job descriptions, 5 advertised jobs
# url = 'https://www.indeed.com.sg/jobs?q=data+scientist&l=Singapore&start='
# Assumption: all job postings gathered are data-related

In [80]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Use seaborn's 'whitegrid' style for the plots in this notebook.
sns.set_style('whitegrid')

# IPython display magics: request the 'retina' figure format and render
# matplotlib figures inline in the notebook output.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [81]:
# Search configuration used by the scraping cell:
#   country_set   - Indeed country sub-domain suffixes to crawl
#   search_string - job-title queries issued against each country
#   columns       - column layout of the resulting dataframe
country_set = ['sg', 'my']

search_string = [
    'data scientist',
    'data analyst',
    'data architect',
    'data engineer',
    'statistician',
    'database administrator',
    'business analyst',
    'data analytics manager',
]

columns = [
    "job_category",
    "job_title",
    "company_name",
    "location",
    "summary",
    "salary",
]

In [82]:
# Scrape Indeed search results for every (country, query) pair, collecting one
# row per job card in `jobs_list`, then build the de-duplicated dataframe `df`.

# Seconds to wait for a response before giving up, so that a stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30


def _clean_text(tag):
    """Return the text of a BeautifulSoup tag without newlines or surrounding whitespace.

    Returns np.nan when `tag` is None, i.e. the element was not found in the job card.
    """
    if tag is None:
        return np.nan
    return tag.get_text().replace('\n', '').strip()


def _total_job_count(soup):
    """Return the total number of jobs announced in the page's 'searchCount' div.

    The last number in the counter text is taken as the total (thousands
    separators are removed) -- NOTE(review): assumes the total is the final
    number in the counter text; confirm against the live page.
    Returns 0 when the div is missing or contains no number, so the caller
    simply skips that query instead of crashing.
    """
    counter = soup.find(name='div', attrs={'id': 'searchCount'})
    if counter is None:
        return 0
    numbers = re.findall(r'\d[\d,]*', counter.get_text())
    if not numbers:
        return 0
    return int(numbers[-1].replace(',', ''))


# Initialize container to store all job postings
jobs_list = []

# Job cards are the divs whose class attribute matches this pattern.
job_card_class = re.compile('.*row.*')

# Iterate through search parameters and store one row per job card
for country in country_set:
    for query in search_string:

        # Same for every posting of this query, so compute it once.
        job_category = '_'.join(query.split())

        url = 'https://www.indeed.com.' + country + '/jobs?q=' + '+'.join(query.split()) + '&start='
        time.sleep(1)
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.text, 'lxml')

        # Get maximum number of jobs to iterate over all pages
        # (0 when the counter is absent, which makes the page loop a no-op).
        max_jobs = _total_job_count(soup)

        # The `start=` offset advances in steps of 10 results.
        for start_number in range(0, max_jobs, 10):
            time.sleep(1)
            url_page = url + str(start_number)
            page = requests.get(url_page, timeout=REQUEST_TIMEOUT)
            soup = BeautifulSoup(page.text, 'lxml')

            # Get all advertised job descriptions
            jobs = soup.find_all(name='div', attrs={'class': job_card_class})

            for job in jobs:
                # Missing elements become NaN via _clean_text.
                jobs_list.append([
                    job_category,
                    _clean_text(job.find(name='a', attrs={'data-tn-element': 'jobTitle'})),
                    _clean_text(job.find(name='span', attrs={'class': 'company'})),
                    _clean_text(job.find(name='span', attrs={'class': 'location'})),
                    _clean_text(job.find(name='span', attrs={'class': 'summary'})),
                    _clean_text(job.find(name='span', attrs={'class': 'no-wrap'})),
                ])

# Convert jobs list to dataframe
df = pd.DataFrame(jobs_list, columns=columns)
# Drop all duplicated job postings based on summary. Missing summaries compare
# equal to each other here, so at most one row without a summary is kept.
df.drop_duplicates(subset=['summary'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7388 entries, 0 to 7387
Data columns (total 6 columns):
job_category    7388 non-null object
job_title       7388 non-null object
company_name    6245 non-null object
location        7388 non-null object
summary         6647 non-null object
salary          332 non-null object
dtypes: object(6)
memory usage: 346.4+ KB


In [83]:
# Cache the scraped postings on disk so the slow scrape does not have to be
# repeated. Postings sharing a summary are collapsed to their first occurrence
# before saving.
df.drop_duplicates(subset='summary', keep='first', inplace=True)
df.to_pickle(path='indeed_data_related_jobs')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5815 entries, 0 to 7387
Data columns (total 6 columns):
job_category    5815 non-null object
job_title       5815 non-null object
company_name    4945 non-null object
location        5815 non-null object
summary         5814 non-null object
salary          284 non-null object
dtypes: object(6)
memory usage: 318.0+ KB


In [72]:
# Load the cached postings written by the scraping step.
df1 = pd.read_pickle(filepath_or_buffer='./indeed_data_related_jobs')

In [51]:
# Discard postings that lack a company name or a summary.
df1.dropna(axis=0, how='any', subset=['company_name', 'summary'], inplace=True)

In [52]:
from string import digits

# Reload the cached postings and keep only rows that have both a company
# name and a summary.
df1 = pd.read_pickle('./indeed_data_related_jobs')
df1.dropna(subset=['company_name', 'summary'], inplace=True)


def _lower_if_str(value):
    """Lower-case string cells; return any other value (e.g. NaN) untouched."""
    if isinstance(value, str):
        return value.lower()
    return value


def _strip_licence(name):
    """Cut a company name at the first ', ea licence' marker, if one is present."""
    return name.partition(', ea licence')[0]


# Normalise case across the whole frame.
df1 = df1.applymap(_lower_if_str)

# Remove every digit from the job titles, then trim surrounding whitespace.
digit_remover = str.maketrans('', '', digits)
df1.job_title = df1.job_title.map(lambda title: title.translate(digit_remover).strip())

# Remove business licence numbers trailing the company names.
df1.company_name = df1.company_name.map(_strip_licence)

In [61]:
# Count the job titles that mention none of the data-related keywords.
title_keywords = ('data', 'business analyst', 'business intelligence')
df1.job_title.map(lambda title: not any(keyword in title for keyword in title_keywords)).sum()

2200

In [65]:
# Distinct search categories present in the cleaned postings.
df1['job_category'].unique()


array(['data_scientist', 'data_analyst', 'data_architect', 'data_engineer',
       'statistician', 'database_administrator', 'business_analyst',
       'data_analytics_manager'], dtype=object)

In [None]:
# Question 1:
# Use a bag-of-words model to classify high vs. low salary:
# - gather all words associated with low salary
# - gather all words associated with high salary
# - measure their significance by calculating their frequency
# - fit logistic regression and decision tree classifiers