In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import os

# The path to your downloaded geckodriver binary.
# Set the GECKODRIVER_PATH environment variable to point at your own copy;
# the original hard-coded location is kept as the fallback.
web_driver_path = os.environ.get('GECKODRIVER_PATH', '/Users/gary/Downloads/geckodriver')

def get_info_from_url(url, pages):
    """Scrape full-time job postings from a paginated job listing.

    For every entry in ``pages`` the browser loads ``url + str(page)``, the
    rendered HTML is parsed with BeautifulSoup, and every job card whose
    job-type label is '正職' (i.e. not an intern / part-time posting)
    contributes one entry to each returned list.

    Args:
        url: Listing URL prefix; the page number is appended to it.
        pages: Iterable of page numbers to visit.

    Returns:
        A ``(titles, companies, salaries)`` tuple of three lists of strings.
        The lists always have the same length and are aligned by position.
    """
    pages = list(pages)
    titles = []
    companies = []
    salaries = []
    if not pages:
        # Nothing to fetch, so do not launch a browser at all.
        return titles, companies, salaries

    # One browser session is reused for all pages instead of starting a new
    # Firefox per page.
    driver = webdriver.Firefox(executable_path=web_driver_path)
    try:
        for page in pages:
            # Get the info from this url
            driver.get(url + str(page))
            # Use beautifulsoup to parse the rendered page
            soup = BeautifulSoup(driver.page_source, 'lxml')
            for block in soup.find_all('div', attrs={'class': 'y-card y-card-horizontal'}):
                # Skip the intern and part time job (and cards with no job-type label)
                icon_row = block.find('div', attrs={'class': 'y-card-content-icon-row'})
                job_type = icon_row.find('a') if icon_row is not None else None
                if job_type is None or job_type.text.strip() != '正職':
                    continue

                title = block.find('div', attrs={'class': 'y-card-content-title'})
                subtitle = block.find('div', attrs={'class': 'y-card-content-subtitle'})
                company = subtitle.find('a') if subtitle is not None else None
                salary = block.find('span', attrs={'class': 'salary-description'})
                # Skip incomplete cards instead of raising AttributeError on None;
                # appending all three together keeps the lists aligned.
                if title is None or company is None or salary is None:
                    continue

                titles.append(title.text)
                companies.append(company.text)
                salaries.append(salary.text)
    finally:
        # quit() ends the whole browser session, even when a page fails to
        # load or parse.
        driver.quit()
    return titles, companies, salaries

In [2]:
# Scrape the first three result pages of the category-8 job listing.
listing_url = 'https://www.yourator.co/jobs?category[]=8&page='
titles, companies, salaries = get_info_from_url(listing_url, (1, 2, 3))

In [3]:
# Display the raw salary strings returned by the scrape.
salaries

['面議（經常性薪資達4萬元）',
 'NT$ 35,000 - 50,000 (月薪)',
 'NT$ 40,000 - 60,000 (月薪)',
 '面議（經常性薪資達4萬元）',
 'NT$ 38,000 - 42,000 (月薪)',
 'NT$ 1,200,000 - 2,000,000 (年薪)',
 'NT$ 830,000 - 1,600,000 (年薪)',
 '面議（經常性薪資達4萬元）',
 'NT$ 900,000 - 1,350,000 (年薪)',
 'NT$ 45,000 - 65,000 (月薪)',
 'NT$ 840,000 -  (年薪)',
 'NT$ 900,000 - 1,500,000 (年薪)',
 'NT$ 80,000 - 150,000 (月薪)',
 'NT$ 1,000,000 - 2,000,000 (年薪)',
 'NT$ 1,000,000 - 1,500,000 (年薪)',
 'NT$ 35,000 - 60,000 (月薪)',
 '面議（經常性薪資達4萬元）',
 'NT$ 800,000 - 1,500,000 (年薪)',
 'NT$ 40,000 - 100,000 (月薪)',
 'NT$ 600,000 - 1,000,000 (年薪)',
 '面議（經常性薪資達4萬元）',
 'NT$ 1,200,000 - 1,800,000 (年薪)',
 'NT$ 80,000 - 120,000 (月薪)',
 '面議（經常性薪資達4萬元）',
 '面議（經常性薪資達4萬元）',
 '面議（經常性薪資達4萬元）',
 'NT$ 70,000 - 120,000 (月薪)',
 'NT$ 80,000 - 180,000 (月薪)',
 '面議（經常性薪資達4萬元）',
 'NT$ 1,000,000 - 2,000,000 (年薪)',
 'NT$ 1,000,000 - 2,000,000 (年薪)',
 'NT$ 115,000 - 184,000 (月薪)',
 'NT$ 115,000 - 184,000 (月薪)',
 'NT$ 115,000 - 184,000 (月薪)',
 'NT$ 115,000 - 184,000 (月薪)',
 'NT$ 115,000 - 184,

In [4]:
import pandas as pd
# Construct a dataframe from the data we crawled.
# Column names are passed to the constructor so that an empty scrape still
# produces a three-column frame; renaming the columns afterwards raises a
# length mismatch when there are no rows.
df = pd.DataFrame(
    list(zip(titles, companies, salaries)),
    columns=['job_title', 'company_name', 'salary'],
)
# Save the first five rows as a small demo file, without the index column.
df.head(5).to_csv('selenium_demo.csv', index=False)

In [5]:
import re
# Collect every run of three or more digit/comma characters from the salary
# text, e.g. '35,000' and '50,000'; shorter runs such as a lone '4' are ignored.
salary_number_pattern = re.compile(r'([\d\,]{3,})')
df['salary_range'] = df['salary'].apply(salary_number_pattern.findall)

In [6]:
# Keep only the rows where at least one salary figure was extracted.
# .copy() gives an independent frame, so adding columns to it later does not
# write onto a slice of the original frame.
df = df[df['salary_range'].apply(len) > 0].copy()
# Get the lower bound and upper bound of each salary.
# Iterating the 'salary_range' column directly (rather than unpacking whole
# rows) keeps this cell re-runnable after more columns have been added.
# When only one figure was found it serves as both bounds.
lowers = [found[0] for found in df['salary_range']]
uppers = [found[1] if len(found) > 1 else found[0] for found in df['salary_range']]

In [7]:
# Build the derived columns on a new frame with assign() instead of writing
# column-by-column into df, which at this point is a filtered selection.
df = df.assign(
    # Bounds as digit-only strings (thousands separators removed)
    salary_low=[amount.replace(',', '') for amount in lowers],
    salary_high=[amount.replace(',', '') for amount in uppers],
    # To know whether the salary is annual/monthly: the third character from
    # the end of the text, e.g. '年' in '(年薪)' or '月' in '(月薪)'
    month_or_annual=df['salary'].str[-3:-2],
)

In [8]:
def _to_yearly_amount(row, column):
    """Return row[column] as an int, multiplied by 12 unless the row is marked '年'."""
    amount = int(row[column])
    if row['month_or_annual'] == '年':
        return amount
    return amount * 12


# NOTE: despite the 'month_' prefix, these columns hold yearly totals:
# rows marked '年' are kept as-is and every other row is multiplied by 12.
df['month_salary_low'] = df.apply(_to_yearly_amount, axis=1, args=('salary_low',))
df['month_salary_high'] = df.apply(_to_yearly_amount, axis=1, args=('salary_high',))

In [9]:
# Render floats in displayed frames with two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)
# Keep the jobs whose upper bound is at least 1,000,000 TWD per year
offers_one_million = df['month_salary_high'] >= 1000000
df_high = df[offers_one_million]
df_high

Unnamed: 0,job_title,company_name,salary,salary_range,salary_low,salary_high,month_or_annual,month_salary_low,month_salary_high
5,Data Analyst (based in Japan),Vpon,"NT$ 1,200,000 - 2,000,000 (年薪)","[1,200,000, 2,000,000]",1200000,2000000,年,1200000,2000000
6,Junior Data Project Manager (based in Japan),Vpon,"NT$ 830,000 - 1,600,000 (年薪)","[830,000, 1,600,000]",830000,1600000,年,830000,1600000
8,Senior Data Engineer,Vpon,"NT$ 900,000 - 1,350,000 (年薪)","[900,000, 1,350,000]",900000,1350000,年,900000,1350000
11,Video/ Image Processing Software Engineer,PicCollage 拼貼趣,"NT$ 900,000 - 1,500,000 (年薪)","[900,000, 1,500,000]",900000,1500000,年,900000,1500000
12,Computer Vision Engineer (電腦視覺工程師),XNEX - Flexible Robotics,"NT$ 80,000 - 150,000 (月薪)","[80,000, 150,000]",80000,150000,月,960000,1800000
13,深度 / 機器學習,PEZZALoan by GDP,"NT$ 1,000,000 - 2,000,000 (年薪)","[1,000,000, 2,000,000]",1000000,2000000,年,1000000,2000000
14,風控模型算法工程師,PEZZALoan by GDP,"NT$ 1,000,000 - 1,500,000 (年薪)","[1,000,000, 1,500,000]",1000000,1500000,年,1000000,1500000
17,Senior Data Scientist,WeMo Scooter,"NT$ 800,000 - 1,500,000 (年薪)","[800,000, 1,500,000]",800000,1500000,年,800000,1500000
18,電子商務資料工程師 Data Engineer (月薪40K-100K),生活市集_創業家兄弟股份有限公司,"NT$ 40,000 - 100,000 (月薪)","[40,000, 100,000]",40000,100000,月,480000,1200000
19,金融數據工程師/ Data Engineer,大拇哥投顧 TAROBO,"NT$ 600,000 - 1,000,000 (年薪)","[600,000, 1,000,000]",600000,1000000,年,600000,1000000


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# salary columns of the high-salary jobs
df_high.describe()

Unnamed: 0,month_salary_low,month_salary_high
count,25.0,25.0
mean,978960.0,1753999.52
std,276643.65,413316.53
min,360000.0,1000000.0
25%,830000.0,1440000.0
50%,960000.0,1800000.0
75%,1200000.0,2160000.0
max,1380000.0,2208000.0


In [33]:
# Write the summary statistics to disk; the file is tab-separated even though
# it carries a .csv extension.
salary_summary = df_high.describe()
salary_summary.to_csv('salary_describe.csv', sep='\t')

In [31]:
# Write the high-salary table to disk without the index column; the file is
# tab-separated even though it carries a .csv extension.
df_high.to_csv(
    'selenium_high.csv',
    sep='\t',
    index=False,
)