## Scrape Skills from Dice.com Data Scientist Jobs

Steps:
- Run a search on Dice.com for Data Scientist jobs
- Loop through the results page(s) and scrape the individual job listing URLs
- Loop through the individual job listing pages and scrape each job's skill(s)
- Save the skills to a text file: comma-separated skills, one row per job listing

In [88]:
%matplotlib inline

# ------ Web Scraping --------------
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

from lxml import html
import requests

import re


### Load Dice.com Search Results and Scrape Job Listing URLs

In [89]:
# Accumulates the job listing URLs gathered from every results page
links_to_scrape = []

def get_links(browser):
    '''
    Scrape job listing URLs from the page currently loaded in the browser.

    Parameters:
        browser: object exposing a ``page_source`` attribute that holds the
                 HTML of the current page (here, the Selenium webdriver).

    Returns:
        A list with the ``href`` value, in document order, of every element
        carrying the ``card-title-link`` class whose href contains
        ``jobs/detail``.
    '''
    soup = BeautifulSoup(browser.page_source, "html.parser")

    links_to_return = list()
    for link in soup.find_all(class_='card-title-link'):
        # .get() instead of link['href']: a matching element that has no href
        # attribute is skipped rather than raising KeyError mid-scrape
        page_url = link.get('href')

        # Make sure we are only keeping actual job listing URLs
        if page_url and 'jobs/detail' in page_url:
            links_to_return.append(page_url)

    return links_to_return

# time.sleep() is used below; imported here so the cell is self-contained
import time

# XPath of the "next page" item in the search results pagination bar
NEXT_PAGE_XPATH = '//*[@id="pagination_2"]/pagination/ul/li[7]'

# Start up the selenium instance for scraping
browser = webdriver.Chrome()

try:
    browser.get('https://www.dice.com/')

    # Search for the quoted phrase "Data Scientist"; the location
    # parameter (l=) is left empty
    start_url = 'https://www.dice.com/jobs?q=%22Data+Scientist%22&l='
    browser.get(start_url)

    # The Dice.com search results are paged ... loop through each page to
    # collect the job listing links
    while True:
        # Scrape links from current page
        links_to_scrape.extend(get_links(browser))

        # We need to see if we have another page or are at the last page.
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath(),
        # which no longer exists in Selenium 4
        next_page_items = browser.find_elements(By.XPATH, NEXT_PAGE_XPATH)
        if not next_page_items:
            break

        next_page_class = next_page_items[0].get_attribute('class')
        print(next_page_class)

        # When on the last page, there is a 'disabled' class attribute
        if 'disabled' in next_page_class:
            print("No more pages left")
            break

        # Click the next page link; stop if the anchor is missing instead of
        # failing with IndexError
        next_page_anchors = browser.find_elements(By.XPATH, NEXT_PAGE_XPATH + '/a')
        if not next_page_anchors:
            break
        next_page_anchors[0].click()

        # Allow 1 second for the next page to load and render
        time.sleep(1)
finally:
    # Time to close up the show: quit() ends the whole driver session (not
    # just the window), and the finally clause runs it even when a step
    # above raised
    browser.quit()

# note: we are using set() to remove duplicate entries
links_to_scrape = set(links_to_scrape)

pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted
pagination-next page-item ng-star-inserted disabled
No more pages left


### Loop thru Job Listings and Scrape Skills

In [110]:
def scrape_skills(url, timeout=10):
    '''
    Download a Dice.com job listing and extract its skills string.

    The skills are pulled with a regular expression from the
    ``"skills" : ["..."]`` fragment found in the raw page text.

    Parameters:
        url: address of the job listing page.
        timeout: seconds to wait for the server before giving up, so one
                 unresponsive listing cannot stall the whole run.

    Returns:
        The lower-cased skills text, or False when the page could not be
        downloaded or contains no skills fragment.
    '''
    try:
        page = requests.get(url, timeout=timeout).text
    except requests.RequestException:
        # Best effort: a listing that cannot be fetched is simply skipped.
        # Only request failures are swallowed; other errors still surface.
        return False

    # Non-greedy capture so the match stops at the first closing "] rather
    # than running on to the last one on the same line
    match = re.search(r'\"skills\" : \[\"(.*?)\"\]', page)
    if match is None:
        return False

    return match.group(1).lower()

# os is needed to create the output directory; imported here so the cell is
# self-contained
import os

# master list: one skills string per job listing
all_jobs_list = list()

# Loop through each job URL and scrape the skills of that listing; listings
# for which scrape_skills() returned a falsy value are left out
for url in links_to_scrape:
    skills = scrape_skills(url)

    if skills:
        all_jobs_list.append(skills)

# Create a text file with comma separated skills, each row is a separate job
output_path = './data/dice_ds_skills.txt'

# Make sure the target directory exists, otherwise open() raises
# FileNotFoundError after all the scraping work is done
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Explicit utf-8 so non-ASCII characters in a skill do not depend on the
# platform's default encoding
with open(output_path, 'w', encoding='utf-8') as out:
    out.writelines(job_skills + '\n' for job_skills in all_jobs_list)
        

### Spot check the list to make sure things look good

In [111]:
# A bare name as the last expression of the cell makes the notebook display it
all_jobs_list

['PhD, Machine Learning (ML), Deep Learning (DL), SQL, Python, Stats Models, Artificial Intelligence (AI)',
 'data scientist, python, pandas, r, sas, stata, sql, matlab, data analysis, programming, financial',
 'Manager, Management, Systems, Engineering, Accounting, Computer, Database, Oracle, SQL, Python, SAS',
 'Python, JavaScript, SQL',
 'Data Scientist, Data Analyst, Java, Mining, Extract, Statistical programming language, datasets',
 'IT, Computer, Engineering, SQL, Recruiter',
 'Strong Communication Skills, Strong Orginazational and Presentational Skills, Data Science, Problem Solving, Python, Machine Learning, SQL',
 'NLP, Network, Analysis, SNA',
 'artificial intelligence, machine learning, deep learning, computer vision',
 'predictive models, data models, algorithms, bi, r',
 'Python, frameworks',
 'Deep Learning, Machine Learning, Recommender System, Speech Recognition, Artificial Intelligence, Computer Vision, SPARK, Databricks, Hadoop - Deep Learning, Machine Learning, Reco