## Project 4 : Job Market Analysis

## Notebook 01 : Data Extraction from Seek - (Web Scrape)

The goal of this project is to answer the following questions:

Identify the factors that have the most affect on salary? Identify the key skills and buzzwords across job category / title?

For the analysis, I have gathered data from the job search engine site SEEK Limited AU. I have limited the study to data related fields like data scientist, data analyst for Sydney and Melbourne.

In [3]:
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import seaborn as sns
import pdb

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

The following code will scrape the data from the job site SEEK.com.au by using Xpath and a combination of Beautiful Soup.

In [4]:
def page(url):
    request = requests.get(url)
  
    if request.status_code == 200:
        select_data = Selector(text=request.text)
        # Extract Title of the job
        pdb.set_trace()
        job_title = select_data.xpath('//*/*[@*="job-detail-title"]/span/h1/text()').extract()[0] 
        # Extract Advertiser / company name
        advertiser = select_data.xpath('//*/h2/span/span/text()').extract()            
        
        if len(advertiser) > 0:
            advertiser = advertiser[0]
        else:
            advertiser = None
        # Extract Rating of the advertise if any
        rating = select_data.xpath('//*/h2/span/span/span/span/text()').extract()         
        if len(rating) > 0:
            rating = rating[0]
        else:
            rating = None
        # Extract job Posted Date
        posted_date = select_data.xpath('//*/*[@*="job-detail-date"]/span/span/text()').extract()[0] 
        # Extract Salary
        salary = select_data.xpath('//*/*[@*="jobInfoHeader"]/dl/div/dd/span/span/text()').extract() 
        
        if len(salary) > 0:
            salary = salary[0]
        else:
            salary = None
      
        # Extract Contract Type
        type_of_work = select_data.xpath('//*/*[@*="job-detail-work-type"]/span/span/text()').extract()[0] 
        # Extract Job Category
        category_of_work = select_data.xpath('//*/section[@*="jobInfoHeader"]/dl/div/dd/span/span/strong/text()').extract()[0] 
        # Extract if there's any sub Category
        sub_category_of_work = select_data.xpath('//*/section[@*="jobInfoHeader"]/dl/div/dd/span/span/span/text()').extract()[0] 
    
    # Extracting the Job description text:
        parsed_results = BeautifulSoup(request.text, 'lxml')
       
        body_text = parsed_results.find('div', {'data-automation': 'mobileTemplate'}).text
    
        return job_title, advertiser, rating, posted_date, salary, type_of_work, category_of_work, sub_category_of_work, body_text
  
    else:
        print('Failed')

In [5]:
def jobs_scraped(job, location, res_page=20):
    scraped_results = {'job_title' : [],
               'job_location': [],
                'advertiser' : [],
                'rating' : [],
                'posted_date' : [],
                'salary' : [],
                'type_of_work' : [],
                'job_category' : [],
                'job_subcategory' : [],
                'job_description' : [],
                'job_searched' : [],
                'url' : []
       }

    url = 'https://www.seek.com.au/' + job + '-jobs/in-' + location
    request = requests.get(url)
    select_data = Selector(text=request.text)
    total_results = np.ceil(int(select_data.xpath('//*/*[@*="totalJobsCount"]/text()').extract()[0].replace(',','')) / 20)
    total_results = int(total_results)
  
    for i in range(1,total_results + 1):
        url = 'https://www.seek.com.au/' + job + '-jobs/in-' + location + '?page=' + str(i)
        request = requests.get(url)
        select_data = Selector(text=request.text)
        h_refs = select_data.xpath('//*/*[@*="searchResults"]/div/div/div/article/span/span/h1/a/@href').extract()
   
        for h_ref in h_refs:
            url = 'https://www.seek.com.au' + h_ref
            job_title, advertiser, rating, posted_date, salary, type_of_work, category_of_work, sub_category_of_work, body_of_text = page(url)
            scraped_results['job_title'].append(job_title)
            scraped_results['job_searched'].append(job)
            scraped_results['job_location'].append(location)
            scraped_results['advertiser'].append(advertiser)
            scraped_results['rating'].append(rating)
            scraped_results['posted_date'].append(posted_date)
            scraped_results['salary'].append(salary)
            scraped_results['type_of_work'].append(type_of_work)
            scraped_results['job_category'].append(category_of_work)
            scraped_results['job_subcategory'].append(sub_category_of_work)
            scraped_results['job_description'].append(body_of_text)
            scraped_results['url'].append(url)
      
    return pd.DataFrame(scraped_results)

## Testing Individual Page

In [None]:
url = 'https://www.seek.com.au/job/38952572?type=standout&searchrequesttoken=ea2e6044-6c97-4e94-84c7-54a6b29c449f'
job_title, advertiser, rating, posted_date, salary, type_of_work, category_of_work, sub_category_of_work, body_of_text = page(url)

> <ipython-input-4-9f8336b6204e>(8)page()
-> job_title = select_data.xpath('//*/*[@*="job-detail-title"]/span/h1/text()').extract()[0]
(Pdb) select_data
<Selector xpath=None data='<html lang="en" prefix="og: http://ogp.m'>
(Pdb) select_data.xpath('//*/*[@*="job-detail-title"]/span/h1/text()').extract()[0]
'Trainee Data Analyst'
(Pdb) select_data.xpath('//*/h2/span/span/text()').extract() 
['Roy Morgan Research']


In [19]:
job_title, advertiser, rating, posted_date, salary, type_of_work, category_of_work, sub_category_of_work, body_of_text;

## Testing Search Results

In [74]:
ds_df = jobs_scraped('data-scientist', 'All-Melbourne-VIC')

In [11]:
ds_sydney = jobs_scraped('data-scientist', 'All-Sydney-NSW')

In [15]:
ds_sydney.to_csv('seek_ds_Sydney.csv')

In [16]:
len(ds_sydney);

In [26]:
url = 'https://www.seek.com.au/job/38866940?searchrequesttoken=1627f2be-df19-4f82-9a3b-02d0b8c87149&type=promoted'
job_title, advertiser, rating, posted_date, salary, type_of_work, category_of_work, sub_category_of_work, body_of_text = page(url)

In [21]:
job_title, advertiser, rating, posted_date, salary, type_of_work, category_of_work, sub_category_of_work, body_of_text;

## Saving original scraped data to csv file

In [77]:
ds_df.to_csv('seek_datascientist.csv')

In [31]:
da_sydney = jobs_scraped('data-analyst', 'All-Sydney-NSW')

In [33]:
da_sydney.to_csv('seek_da_Sydney.csv')

In [78]:
d_df = jobs_scraped('data-analyst', 'All-Melbourne-VIC')

In [80]:
ds_df.to_csv('seek_data-analyst.csv')

In [7]:
data = pd.read_csv('seek_data-analyst.csv')

In [6]:
data_ds = pd.read_csv('seek_datascientist.csv')

In [8]:
len(data_ds)

100

In [4]:
Sydney_DA= pd.read_csv('seek_da_Sydney.csv')

In [14]:
print(Sydney_DA.shape, Sydney_DS.shape, Mel_DA.shape, Mel_DS.shape)

(4004, 13) (180, 13) (3158, 13) (100, 13)


In [5]:
Sydney_DS = pd.read_csv('seek_ds_Sydney.csv')

In [6]:
Mel_DS = pd.read_csv('seek_datascientist.csv')

In [7]:
Mel_DA = pd.read_csv('seek_data-analyst.csv')

## Concatinating 4 csv's to a DF:

In [8]:
All_jobs = pd.concat([Sydney_DA,Sydney_DS,Mel_DS,Mel_DA], ignore_index= True)