In [None]:
# Average-salary threshold that a listed/OTC company must exceed to be kept.
# Adjust it here; the unit is thousands (千元).
salary_criteria = 1200

In [None]:
import difflib
import re
import time
import winsound
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Wall-clock timestamp taken before the scraping work begins; the final cells
# subtract it from the end time to report the total runtime.
start = time.time()

In [None]:
# Load the listed/OTC salary ranking and keep only the high-paying companies.
# Head count is ignored, so holding companies end up in the list as well.
salary = pd.read_excel('salary_data.xlsx', sheet_name='上市櫃5年平均薪資')
is_high_salary = salary['平均員工薪資費用-111年度(仟元/人)'] > salary_criteria
salary_high = salary.loc[is_high_salary]
high_company_list = salary_high['公司全名'].tolist()

In [None]:
# Drop missing company names.
# pd.notna rejects NaN as well as None/NaT; the former str(x) != 'nan' test let
# None through, which then failed on the slice below.
# high_company_list_original must never be overwritten: later cells compare the
# search results against these full names.
# NOTE: this cell rebinds high_company_list, so re-running it without re-running
# the cell that loads the spreadsheet would rebuild the "original" list from
# already truncated names.
high_company_list_original = [x for x in high_company_list if pd.notna(x)]

# Search with only the first 5 characters of each name. A fuzzy keyword finds the
# company more reliably: the full name "日月光投資控股股份有限公司" returns no match.
high_company_list = [i[:5] for i in high_company_list_original]

# Percent-encode the keyword so characters such as '&', '#' or spaces cannot
# corrupt the query string of the search URL.
search_company = ['https://www.104.com.tw/company/search/?keyword={}'.format(quote(i)) for i in high_company_list]

In [None]:
# difflib-based similarity score used to pick the closest company name.
def similar_diff_qk_ratio(str1, str2):
    """Return difflib's quick similarity estimate between two strings.

    The value is ``SequenceMatcher.quick_ratio()`` for ``str1`` against
    ``str2``: a float in [0, 1] that is an upper bound on the real ratio.
    """
    matcher = difflib.SequenceMatcher(a=str1, b=str2)
    return matcher.quick_ratio()

In [None]:
# Scroll automatically so lazily loaded content gets rendered.
def scroll(driver, timeout, max_scrolls=None):
    """Scroll the current page to the bottom until its height stops growing.

    Args:
        driver: object exposing ``execute_script(js)`` (a Selenium WebDriver).
        timeout: seconds to sleep after each scroll so new content can load.
        max_scrolls: optional upper bound on the number of scroll steps.
            ``None`` (the default) keeps scrolling until the height is stable,
            which never terminates on a page that grows forever.

    Returns:
        None.
    """
    # Height of the document before the first scroll.
    last_height = driver.execute_script('return document.body.scrollHeight')

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Jump to the bottom of the page.
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        scrolls_done += 1

        # Give the page time to load whatever the scroll triggered.
        time.sleep(timeout)

        # An unchanged height means nothing new was loaded: we are done.
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height

In [None]:
# Collect the 104 company-page URL of every high-salary company.
# Result: company_dict maps the company name shown on 104 -> href of its page.
company_dict = {}

# CSS selector of the company-name links on a 104 company search result page.
COMPANY_LINK_SELECTOR = 'div > a.d-none.d-md-inline.jb-link.jb-link-blue.jb-link-blue--visited.h2.mb-1'

for original_company_name, url in zip(high_company_list_original, search_company):
    try:
        # A timeout keeps one stalled request from hanging the whole notebook.
        res = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print('找不到' + '\t' + original_company_name + '\t(' + str(exc) + ')')
        continue
    time.sleep(0.5)
    soup = bs(res.text, 'lxml')

    # Only the first 6 hits are compared: looking further down the list risks
    # matching some other, more similar-looking company. The ranking of e.g.
    # "日月光投資控股股份有限公司" changes between searches, so the first hit
    # alone is not reliable either.
    candidates = soup.select(COMPANY_LINK_SELECTOR)[:6]
    if not candidates:
        # No search hit at all; previously max() on the empty list raised
        # ValueError here and aborted the loop.
        print('找不到' + '\t' + original_company_name)
        continue

    # Most similar candidate; on ties max() keeps the first (highest ranked) one.
    best_match = max(
        candidates,
        key=lambda tag: similar_diff_qk_ratio(tag.text, original_company_name),
    )

    correct_name = best_match.text
    correct_page = best_match.get('href')
    if not correct_page:
        # Link without href: report the company we were actually looking for.
        print('找不到' + '\t' + original_company_name)
        continue

    # NOTE(review): the href is stored exactly as found in the page; confirm it
    # is an absolute URL that the browser driver can open directly.
    company_dict[correct_name] = correct_page
    print('爬完' + '\t\t' + correct_name)
print('共' + str(len(company_dict)) + '家公司')

In [None]:
# Headless Chrome configuration.
chrome_options = Options()
for chrome_flag in (
    '--headless',             # run without a visible browser window
    '--disable-gpu',          # GPU is disabled together with headless mode
    'window-size=1920,1080',  # fixed browser size, in case the page cannot be scrolled
):
    chrome_options.add_argument(chrome_flag)

# ChromeDriverManager().install() supplies the chromedriver path given to Service.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)
# NOTE(review): no visible cell calls driver.quit(); the browser process stays
# alive until the kernel stops.

driver.get("https://www.104.com.tw/jobs/main/")
# driver.maximize_window()

In [None]:
# Column buffers filled by the scraping loop, one entry per scraped job posting.
# Eight independent empty lists are created in one unpacking assignment.
(
    company,
    job,
    location,
    salary,
    requirement,
    education,
    descirption,
    link,
) = ([] for _ in range(8))

In [None]:
# Pagination rule: the pages of a company's job list differ only in the `page=` query parameter.
# https://www.104.com.tw/company/12noppgo?job=&roleJobCat=0_0&area=0&page=1&pageSize=20&order=8&asc=0&jobsource=cs_custlist&tab=job
# https://www.104.com.tw/company/12noppgo?job=&roleJobCat=0_0&area=0&page=2&pageSize=20&order=8&asc=0&jobsource=cs_custlist&tab=job

In [None]:
# Scrape every job posting from each company page collected in company_dict and
# append its fields to the column buffers (job, link, location, ...).

# Query string appended to a company URL to request one page of its job list.
JOB_LIST_QUERY = '?job=&roleJobCat=0_0&area=0&page={}&pageSize=20&order=8&asc=0&jobsource=cs_custlist&tab=job'
# CSS path shared by all fields of the job card at a given nth-child position.
JOB_CARD = 'div:nth-child({}) > div > div.info.col > div > '

for name, url in company_dict.items():
    driver.get(url)
    time.sleep(1)   # wait for the page to update

    # 20 jobs are shown per page by default; read the total number of pages from
    # the pager <select> (the last number in its text).
    try:
        text = driver.find_element(By.CSS_SELECTOR, 'div.col.main > div.joblist.rounded > div.joblist__footer > div > div:nth-child(2) > label > select').text
        total_page = int(re.findall(r'\d+', text)[-1])
    except (NoSuchElementException, IndexError):
        # No pager found (or no number in it): the job list has a single page.
        total_page = 1

    print(name + '\t' + '總頁數' + str(total_page))

    # Visit every page of the job list, starting with page 1.
    # NOTE(review): assumes url carries no query string of its own - confirm
    # against the hrefs stored in company_dict.
    for page in range(1, total_page + 1):
        driver.get(url + JOB_LIST_QUERY.format(page))
        # Scroll down so the whole list is rendered.
        scroll(driver, 1)
        # Ads in the middle and urgent postings pinned to the top occupy extra
        # child positions, so probe more positions than there are jobs (> 100).
        for row in range(1, 105):
            card = JOB_CARD.format(row)
            tags = card + 'div.info-tags.gray-deep-dark > '
            try:
                # Title and link come from the same anchor: locate it once.
                job_anchor = driver.find_element(By.CSS_SELECTOR, card + 'div.info-job.text-break.mb-2 > a')
                job_tag = job_anchor.text
                link_tag = job_anchor.get_attribute("href")
                location_tag = driver.find_element(By.CSS_SELECTOR, tags + 'span:nth-child(1)').text
                requirement_tag = driver.find_element(By.CSS_SELECTOR, tags + 'span:nth-child(2)').text
                education_tag = driver.find_element(By.CSS_SELECTOR, tags + 'span:nth-child(3)').text
                descirption_tag = driver.find_element(By.CSS_SELECTOR, card + 'div.info-description.text-gray-darker.t4.text-break.mt-2.position-relative.info-description__line2').text
                salary_tag = driver.find_element(By.CSS_SELECTOR, card + 'div.info-othertags > span').text
            except (NoSuchElementException, StaleElementReferenceException):
                # This position is not a complete job card (ad, gap, end of the
                # list): skip it. Other errors, including a kernel interrupt,
                # are no longer swallowed.
                continue

            # All fields were found: record the posting in every column at once
            # so the buffers always stay the same length.
            job.append(job_tag)
            link.append(link_tag)
            location.append(location_tag)
            requirement.append(requirement_tag)
            education.append(education_tag)
            descirption.append(descirption_tag)
            salary.append(salary_tag)
            company.append(name)

    print(name + '\t' + '爬完')

In [None]:
# Assemble the scraped columns into one table, de-duplicate it and export it.
data = dict(
    company=company,
    job_name=job,
    location=location,
    salary=salary,
    requirement=requirement,
    education=education,
    descirption=descirption,
    link=link,
)

# Rows with the same link have the same content, so dropping duplicates on
# `link` removes the repeated pinned urgent postings.
df = pd.DataFrame(data).drop_duplicates(subset='link')
df.to_excel('104_上市櫃高薪工作.xlsx', sheet_name='上市櫃高薪工作')
df

In [None]:
# Sanity check: print the number of entries in each column buffer, one per line.
for column_values in (company, job, location, salary, requirement, education, descirption, link):
    print(len(column_values))

In [None]:
# Report the total runtime in minutes, rounded to two decimals.
end = time.time()
elapsed_minutes = round((end - start) / 60, 2)
print('完成共花費', elapsed_minutes, '分')

In [None]:
# Audible "finished" signal: three beeps of rising frequency (Hz), 500 ms each,
# followed by the 'alert' sound played asynchronously.
# NOTE: winsound is part of the standard library on Windows only.
for beep_frequency in (300, 360, 500):
    winsound.Beep(beep_frequency, 500)
winsound.PlaySound('alert', winsound.SND_ASYNC)