In [1]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Country code -> country name recorded in the 'country' column of scraped rows.
Countries = {
    'AU': 'Australia',
}

# Country code -> cities used as the search location for that country.
Target_cities = {
    'AU': [
        'Sydney', 'Melbourne', 'Brisbane', 'Perth',
        'Adelaide', 'Canberra', 'Darwin', 'Hobart',
    ],
}

# Country code -> base URL of the job-search page for that country.
URL = {
    'AU': 'https://au.indeed.com/jobs',
}

In [3]:
# Exclusive upper bound on the 'start' offset requested for each
# (city, job) search; offsets are stepped through in increments of 10.
max_results_per_city = 1000

# Base query-string parameters sent with every request. 'q' and 'start' are
# placeholders that are overridden for each page that is scraped.
parameters = dict(q='data scientist', radius='100', start=1)

# Search terms that are run against every target city.
jobs = [
    'data scientist',
    'data analyst',
    'data engineer',
    'big data',
    'cloud',
    'business intelligence',
]

In [4]:
def _stripped_text(node):
    """Return the whitespace-stripped text of ``node``, or None if ``node`` is None."""
    if node is None:
        return None
    return node.get_text().strip()


def _extract_location(result):
    """Extract the job location, or None when the element is absent."""
    return _stripped_text(result.find('span', class_='location'))


def _extract_company(result):
    """Extract the name of the company, or None when the element is absent."""
    return _stripped_text(result.find('span', class_='company'))


def _extract_title(result):
    """Extract the job title from the title link's ``title`` attribute, or None."""
    link = result.find('a', attrs={'data-tn-element': "jobTitle"})
    if link is None:
        return None
    return link.get('title')


def _extract_salary(result):
    """Extract the salary text, or None when no salary snippet is present."""
    snippet = result.find(
        'div', class_='salarySnippet salarySnippetDemphasizeholisticSalary')
    if snippet is None:
        return None
    return _stripped_text(snippet.find('span', class_='no-wrap'))


def _extract_description(result):
    """Extract the job description snippet, or None when it is absent."""
    return _stripped_text(result.find('div', class_='summary'))


def _extract_review(result):
    """Extract the company's review count as a digit string, or None."""
    link = result.find('a', attrs={'data-tn-element': "reviewStars"})
    if link is None:
        return None
    text = _stripped_text(link.find('span', class_="slNoUnderline"))
    if text is None:
        return None
    # keep only the number: drop thousands separators and the trailing label
    return text.replace(',', '').replace(' reviews', '')


def _extract_star(result):
    """Extract the rating width (proportional to the stars shown), or None."""
    rating = result.find('span', class_='rating')
    if rating is None:
        return None
    # the 'style' attribute dictates how many stars are filled with color
    style = rating.get('style')
    if style is None:
        return None
    # keep only the number
    return style.replace('width:', '').replace('px', '')


def scrape_page_to_df(url, url_params, country, timeout=30):
    """Download one search-result page and return its job cards as a DataFrame.

    Parameters
    ----------
    url : str
        Base URL of the search page.
    url_params : dict
        Query-string parameters sent with the GET request.
    country : str
        Value written to the 'country' column of every returned row.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        scrape forever. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per ``<div class="result">`` element, with the columns
        'location', 'company', 'title', 'salary', 'description', 'review',
        'star' and 'country'. A field whose element is missing from a card
        is None. A page without results yields an empty frame with the same
        columns.

    Raises
    ------
    AssertionError
        If the response status is not ``requests.codes.ok``.
    """
    response = requests.get(url, params=url_params, timeout=timeout)

    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`; the exception type is kept for existing callers.
    if response.status_code != requests.codes.ok:
        raise AssertionError(
            'Request to {} with params {} returned HTTP status {}'.format(
                url, url_params, response.status_code))

    soup = BeautifulSoup(response.text, 'lxml')

    # every job card on the search page
    results = soup.find_all('div', class_='result')

    # Missing elements are handled with explicit None checks in the helpers
    # rather than a bare `except:`, so KeyboardInterrupt and genuine errors
    # are no longer swallowed.
    scraped_data = {
        'location': [_extract_location(result) for result in results],
        'company': [_extract_company(result) for result in results],
        'title': [_extract_title(result) for result in results],
        'salary': [_extract_salary(result) for result in results],
        'description': [_extract_description(result) for result in results],
        'review': [_extract_review(result) for result in results],
        'star': [_extract_star(result) for result in results],
        'country': [country for _ in results],
    }

    # convert the dictionary to a pandas dataframe and return it
    return pd.DataFrame(scraped_data)

In [5]:
def remove_duplicates(df):
    """Return a de-duplicated copy of ``df`` and print a short summary.

    Rows are considered duplicates when they agree on 'company', 'country',
    'description', 'location', 'salary' and 'title'; the last occurrence of
    each duplicate group is kept. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the six columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame without the duplicate rows (original index labels kept).
    """
    nrows_before = df.shape[0]
    # No `inplace=True`: the caller's frame must not change behind its back.
    deduped = df.drop_duplicates(subset=['company', 'country', 'description',
                                         'location', 'salary', 'title'],
                                 keep='last')
    nrows_after = deduped.shape[0]

    has_salary = deduped['salary'].notnull()
    # Only look at non-null salaries and cast them to str, so the `.str`
    # accessor also works when the column is entirely null / non-string.
    has_yearly_salary = (
        deduped.loc[has_salary, 'salary'].astype(str).str.contains('year'))

    print('{} rows remain after removing duplicates from {} rows.'.format(
        nrows_after, nrows_before))
    print('{} rows have salary info; {} rows have yearly salary info.'.format(
        has_salary.sum(), has_yearly_salary.sum()))
    return deduped

In [6]:
def scrapper(CountryCode):
    """Scrape every (city, job, page) combination for one country.

    Looks up the base URL, target cities and country name for
    ``CountryCode``, requests each result page via ``scrape_page_to_df`` and
    returns the combined, de-duplicated rows. Progress and timing are printed.

    Parameters
    ----------
    CountryCode : str
        Key into the module-level ``URL``, ``Target_cities`` and
        ``Countries`` dictionaries (e.g. 'AU').

    Returns
    -------
    pandas.DataFrame
        All scraped rows after ``remove_duplicates``.

    Raises
    ------
    KeyError
        If ``CountryCode`` is missing from one of the lookup dictionaries.
    """
    print('Current system time: {}'.format(time.ctime()))

    # scrape data and save to dataframe
    start_time = time.time()

    # Retrieve parameters to scrape with based on the country code
    url = URL[CountryCode]
    locations = Target_cities[CountryCode]
    country = Countries[CountryCode]

    # The first request uses the placeholder parameters unchanged; its rows
    # are kept in the result, exactly as before.
    pages = [scrape_page_to_df(url, parameters, country)]

    for loc in locations:
        for work in jobs:
            for start in range(0, max_results_per_city, 10):
                url_params = parameters.copy()
                # 'l' is the target city, 'q' the job being searched and
                # 'start' the result offset of the page being scraped
                url_params.update({'l': loc, 'q': work, 'start': start})
                pages.append(scrape_page_to_df(url, url_params, country))

        print('Finished scraping {}'.format(loc))

    # Concatenate once at the end: calling pd.concat inside the loop re-copies
    # the whole accumulated frame for every page (quadratic in page count).
    df = pd.concat(pages, axis=0)

    total_time = (time.time() - start_time) / 60
    print('Scraping run time: {:.1f} minutes'.format(total_time))

    # remove duplicates
    df = remove_duplicates(df)
    print('Script finished at {}\n'.format(time.ctime()))

    # returns the final df
    return df

In [7]:
# Run the full scrape for Australia (every target city x job x result page).
# This is a long, network-bound cell: the recorded run below took ~54.5 minutes.
AU = scrapper('AU')

Current system time: Fri Feb 21 21:50:23 2020
Finished scraping Sydney
Finished scraping Melbourne
Finished scraping Brisbane
Finished scraping Perth
Finished scraping Adelaide
Finished scraping Canberra
Finished scraping Darwin
Finished scraping Hobart
Scraping run time: 54.5 minutes
10688 rows remain after removing duplicates from 56601 rows.
2034 rows have salary info; 1545 rows have yearly salary info.
Script finished at Fri Feb 21 22:44:51 2020



In [8]:
# Display the de-duplicated scrape result returned by scrapper('AU').
AU

Unnamed: 0,location,company,title,salary,description,review,star,country
3,Sydney NSW,International Institute of Data & Analytics,Junior Data Analyst/Scientist,,"In data science and big data analytics, the ID...",,,Australia
5,Sydney NSW,AWS Australia Pty Ltd,Sr. Data Scientist,,Strong communication and data presentation ski...,,,Australia
7,Sydney NSW,Atlassian,"Senior Data Scientist, DevOps Product Analytics",,Play with our seriously large volume of analyt...,,,Australia
10,North Sydney NSW,Harwood Environmental Consultants,Graduate Environmental Scientist,,To develop the skills required to achieve fiel...,,,Australia
11,Sydney NSW,Freelancer.com,Data Analyst / Junior Data Scientist,,We are a data-driven company - data trumps opi...,,,Australia
...,...,...,...,...,...,...,...,...
4,Cambridge TAS,Trouw Nutrition Canada Inc.,Technical Officer,,Take appropriate action as agreed with the Key...,,,Australia
5,Hobart TAS,Weir Minerals,Area Manager,,To support the Weir Minerals business by provi...,,,Australia
6,Otago TAS,APM,APM - Team Co-ordinator - Dunedin,,Working with our regional manager to explore n...,,,Australia
7,Otago TAS,APM,APM - Team Co-ordinator - Central Otago,,Working with our regional manager to explore n...,,,Australia


In [10]:
# Persist the scrape to CSV so it can be reloaded without re-running the scraper.
# The frame's row index is written too, as the first column of the file.
df_job = pd.DataFrame(AU)
df_job.to_csv('Job_final_output_v6.csv')

In [2]:
# Reload the saved scrape, using the first CSV column as the row index.
# NOTE(review): the execution counters restart at this cell, so it looks like it
# was run in a fresh kernel -- `pd` must be imported before running it; confirm.
store_job = pd.read_csv('./Job_final_output_v6.csv',index_col=0)

In [3]:
# Display the frame reloaded from 'Job_final_output_v6.csv'.
store_job

Unnamed: 0,location,company,title,salary,description,review,star,country
3,Sydney NSW,International Institute of Data & Analytics,Junior Data Analyst/Scientist,,"In data science and big data analytics, the ID...",,,Australia
5,Sydney NSW,AWS Australia Pty Ltd,Sr. Data Scientist,,Strong communication and data presentation ski...,,,Australia
7,Sydney NSW,Atlassian,"Senior Data Scientist, DevOps Product Analytics",,Play with our seriously large volume of analyt...,,,Australia
10,North Sydney NSW,Harwood Environmental Consultants,Graduate Environmental Scientist,,To develop the skills required to achieve fiel...,,,Australia
11,Sydney NSW,Freelancer.com,Data Analyst / Junior Data Scientist,,We are a data-driven company - data trumps opi...,,,Australia
...,...,...,...,...,...,...,...,...
4,Cambridge TAS,Trouw Nutrition Canada Inc.,Technical Officer,,Take appropriate action as agreed with the Key...,,,Australia
5,Hobart TAS,Weir Minerals,Area Manager,,To support the Weir Minerals business by provi...,,,Australia
6,Otago TAS,APM,APM - Team Co-ordinator - Dunedin,,Working with our regional manager to explore n...,,,Australia
7,Otago TAS,APM,APM - Team Co-ordinator - Central Otago,,Working with our regional manager to explore n...,,,Australia


In [4]:
# Select five of the scraped columns, reordered so the title comes first;
# 'review', 'star' and 'country' are left out.
scrape = store_job[['title', 'company', 'location', 'salary', 'description']]

In [6]:
# Rename the columns positionally: the list must stay in the same order as the
# column selection that created `scrape` (title, company, location, salary, description).
scrape.columns = ['job_title', 'company_name', 'location', 'salary', 'job_desc']

In [9]:
# Display the trimmed and renamed table.
scrape

Unnamed: 0,job_title,company_name,location,salary,job_desc
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,Sydney NSW,,"In data science and big data analytics, the ID..."
5,Sr. Data Scientist,AWS Australia Pty Ltd,Sydney NSW,,Strong communication and data presentation ski...
7,"Senior Data Scientist, DevOps Product Analytics",Atlassian,Sydney NSW,,Play with our seriously large volume of analyt...
10,Graduate Environmental Scientist,Harwood Environmental Consultants,North Sydney NSW,,To develop the skills required to achieve fiel...
11,Data Analyst / Junior Data Scientist,Freelancer.com,Sydney NSW,,We are a data-driven company - data trumps opi...
...,...,...,...,...,...
4,Technical Officer,Trouw Nutrition Canada Inc.,Cambridge TAS,,Take appropriate action as agreed with the Key...
5,Area Manager,Weir Minerals,Hobart TAS,,To support the Weir Minerals business by provi...
6,APM - Team Co-ordinator - Dunedin,APM,Otago TAS,,Working with our regional manager to explore n...
7,APM - Team Co-ordinator - Central Otago,APM,Otago TAS,,Working with our regional manager to explore n...


In [10]:
# Save the trimmed, renamed table (row index included) as a new CSV file.
scrape.to_csv('Job_final_output_v7.csv')