<b>You're working as a data scientist for a contracting firm that's rapidly expanding. Now that they have their most valuable employee (you!), they need to leverage data to win more contracts. Your firm offers technology and scientific solutions and wants to be competitive in the hiring market. Your principal has two main objectives:

   1. Determine the industry factors that are most important in predicting the salary amounts for these data.
   2. Determine the factors that distinguish job categories and titles from each other. For example, can required skills accurately predict job title?

To limit the scope, your principal has suggested that you *focus on data-related job postings*, e.g. data scientist, data analyst, research scientist, business intelligence, and any others you might think of. You may also want to decrease the scope by *limiting your search to a single region*.</b>

In [1]:
from bs4 import BeautifulSoup
import urllib

import pandas as pd
import numpy as np

from tqdm import tqdm


In [None]:
# `import urllib` on its own does not guarantee that the `request` submodule
# is loaded, so import it explicitly before using urllib.request.urlopen.
import urllib.request

# Set the URL we want to visit.
url = "https://www.mycareersfuture.sg/"

# Visit the URL and grab the raw HTML of the page; the context manager
# closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read()

In [None]:
# Display the length of the page content currently held in `html`.
len(html)

In [None]:
# Preview the first 1000 items of the downloaded page content.
html[:1000]

In [None]:
import os
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys

# Absolute, machine-specific path to the chromedriver binary.
# NOTE(review): the cell that actually creates the driver passes its own
# hard-coded executable_path, so this value looks unused — confirm before
# relying on it.
chromedriver = "/Users/edoardo/github_dsi4/classes/week-06/labs/python-webscraping_opentable-lab-master/chromedriver/chromedriver"
# Publish the path in this process's environment under the
# "webdriver.chrome.driver" key.
os.environ["webdriver.chrome.driver"] = chromedriver
# driver = webdriver.Chrome(chromedriver)

In [None]:
# Extract all the relevant job details from a single posting page and return
# them as a dictionary (one dictionary per posting).

def _element_text(by, locator):
    """Return the text of the first element matching *locator*, or NaN.

    ``by`` picks the finder on the global ``driver``: "id", "name" or
    "xpath" map to ``driver.find_element_by_<by>``.  Any ordinary failure
    while locating or reading the element yields ``np.nan`` so that one
    missing field never aborts the scrape of a posting.  ``Exception`` is
    caught rather than using a bare ``except`` so Ctrl-C still interrupts.
    """
    try:
        return getattr(driver, "find_element_by_" + by)(locator).text
    except Exception:
        return np.nan


def _split_salary_range(text):
    """Split a salary string on 'to' into ``(lower, upper)``.

    Returns ``(np.nan, np.nan)`` when *text* is not a string (e.g. the NaN
    placeholder for a missing element) or contains no 'to' separator.
    """
    try:
        parts = text.split('to')
        return parts[0], parts[1]
    except (AttributeError, IndexError):
        return np.nan, np.nan


def info_extract(link, location):
    """Scrape one job posting and return its fields as a dictionary.

    Loads *link* in the module-level Selenium ``driver``, waits six seconds
    (presumably to let the page finish rendering) and reads each field by
    element id, name or XPath.  Fields that cannot be read are stored as
    ``np.nan``.

    Parameters
    ----------
    link : str
        URL of the job posting to visit.
    location : object
        Location value collected from the search-results page; stored
        unchanged under the ``'Location'`` key.

    Returns
    -------
    dict
        Keys, in insertion order: ``Title``, ``Company``, ``EmployType``,
        ``Seniority``, ``Categories``, ``Applications``, ``SalaryLow``,
        ``SalaryHigh``, ``PostedDate``, ``ExpiryDate``, ``Requirements``,
        ``Location``, ``SalaryRate``.
    """
    # Go to the webpage indicated by the link.
    driver.get(link)
    sleep(6)

    title = _element_text("id", "job_title")
    company = _element_text("name", "company")
    employtype = _element_text("id", "employment_type")
    senior = _element_text("id", "seniority")
    category = _element_text("id", "job-categories")

    # Reduce e.g. "12 applications" / "1 application" to the bare number text.
    applications = _element_text("id", "num_of_applications")
    if isinstance(applications, str):
        applications = applications.replace(' application', '').replace('s', '')

    # The income range is a single string with the bounds separated by 'to'.
    income_path = '//*[@id="job_details"]/div[1]/div[2]/div[1]/div/section[2]/div/span[2]/div'
    income_lower, income_upper = _split_salary_range(
        _element_text("xpath", income_path)
    )

    posted = _element_text("id", "last_posted_date")
    expiry = _element_text("id", "expiry_date")

    # Full text of the requirements section.
    reqts = _element_text("id", "requirements-content")

    rate_path = '//*[@id="job_details"]/div[1]/div[2]/div[1]/div/section[2]/div/span[3]'
    income_rate = _element_text("xpath", rate_path)

    # Key order is kept stable because it determines the column order when
    # the dictionaries are turned into a DataFrame.
    job_dict = {
        'Title': title,
        'Company': company,
        'EmployType': employtype,
        'Seniority': senior,
        'Categories': category,
        'Applications': applications,
        'SalaryLow': income_lower,
        'SalaryHigh': income_upper,
        'PostedDate': posted,
        'ExpiryDate': expiry,
        'Requirements': reqts,
        'Location': location,
        'SalaryRate': income_rate,
    }

    return job_dict

In [None]:
# Collect the posting link and the location text of every job card on the
# search-results page currently loaded in the driver.

def get_links(cards_per_page=20):
    """Return the links and locations of the job cards on the current page.

    Job cards are looked up by XPath using the ids ``job-card-0`` up to
    ``job-card-<cards_per_page - 1>`` on the module-level Selenium
    ``driver``.

    Parameters
    ----------
    cards_per_page : int, optional
        Number of job cards to read from the page (default 20).

    Returns
    -------
    tuple of (list, list)
        ``(list_links, loc_list)``: the ``href`` of each card and the text
        of each card's location element, in card order.  A location whose
        text cannot be read is stored as ``np.nan``.

    Raises
    ------
    Exception
        Whatever the driver raises when a card or its location element
        cannot be found; these lookups are deliberately not guarded, so a
        page with fewer cards than requested makes the call fail.
    """
    jobcards = []
    location_elems = []

    # Locate every card anchor and its location element first.
    for i in range(cards_per_page):
        jobcard_xpath_iter = '//*[@id="job-card-%d"]/div/a' % i
        jobcards.append(driver.find_element_by_xpath(jobcard_xpath_iter))

        loc_xpath = '//*[@id="job-card-%d"]/div/a/div[1]/div[1]/section/div[2]/div[2]/section/p[1]' % i
        location_elems.append(driver.find_element_by_xpath(loc_xpath))

    # Read the location text, falling back to NaN if it cannot be read.
    loc_list = []
    for location in location_elems:
        try:
            loc_list.append(location.text)
        except Exception:
            loc_list.append(np.nan)

    # Read the href link of every card.
    list_links = [job.get_attribute('href') for job in jobcards]

    return list_links, loc_list
    
 

In [None]:
# Create a driver called "driver" and visit the landing page.
driver = webdriver.Chrome(executable_path="C:/Users/Kai Hee/materials/projects/project-4/chromedriver/chromedriver")
driver.get("https://www.mycareersfuture.sg/")

sleep(5)
# Grab the page source.
html = driver.page_source

# Find the search bar, enter the search term and submit it.
searchbar = driver.find_element_by_name("search-text")
searchbar.send_keys("data")
searchbar.send_keys(Keys.RETURN)

# Wait five seconds for the results page.
sleep(5)

urls = []
job_links = []
location_list = []
dict_list = []

# We take the first 220 pages of search results as they are likely the more
# relevant ones.
runs = 220

pbar = tqdm(total=runs)

# ---- Phase 1: collect the link and location of every listing ----
for i in range(runs):
    try:
        # Retrieve the next page of search results.
        driver.get("https://www.mycareersfuture.sg/search?search=data&sortBy=new_posting_date&page=" + str(i))
        sleep(5)

        # Get the links of all the job listings on the page.
        new_links, locations = get_links()
    except Exception as exc:
        # Stop paging at the first page that cannot be read, but report
        # why instead of discarding the error.
        print("Oops:", repr(exc))
        break

    # Append to the full job list.
    job_links.extend(new_links)
    location_list.extend(locations)
    pbar.update(1)

    # For each page, report how many links have been retrieved so far.
    print("Job links retrieved:", len(job_links))

pbar.close()

# ---- Phase 2: visit each posting and extract the job info ----

# Listings written per CSV file: 45 result pages x 20 listings per page.
BATCH_SIZE = 900

loc_index = 0
batch = 0

for loc_index, (job, location) in enumerate(zip(job_links, location_list), start=1):

    # Extract the information and tag the location onto the dictionary.
    job_dict = info_extract(job, location)
    dict_list.append(job_dict)

    # After every full batch, export what has been collected so far and
    # start a fresh batch. A trailing partial batch stays in dict_list.
    if loc_index % BATCH_SIZE == 0:
        batch = batch + 1

        job_df = pd.DataFrame(dict_list)

        # Export to csv file.
        job_df.to_csv("jobs" + str(batch) + ".csv")

        # Reset the dict_list in place.
        dict_list.clear()

# To check that the job info was saved correctly, print selected keys of the
# dictionaries in dict_list (e.g. job_info['Title']) or the entries of
# location_list.

sleep(5)

# Optional: parse the page source saved earlier with Beautiful Soup.
# html = BeautifulSoup(html, 'lxml')




In [None]:
# Shut the browser down. quit() ends the whole WebDriver session (all
# windows and the chromedriver process), whereas close() only closes the
# current window and can leave the driver process running.
driver.quit()


In [None]:
# The last few listings may form a smaller batch that the scraping loop did
# not write out. Export whatever is left in dict_list to the next numbered
# file (jobs5.csv after a complete 220-page run, since 4 full batches of 900
# are written first).

job_df = pd.DataFrame(dict_list)

# Export to csv file, numbering it after the last full batch.
job_df.to_csv("jobs" + str(batch + 1) + ".csv")