# Project 4 GA-DSI

In [1009]:
import numpy as np
import pandas as pd
import seaborn as sns

import requests
import time
from bs4 import BeautifulSoup
from wordcloud import (WordCloud, get_single_color_func)
import matplotlib.pyplot as plt
import pickle as pk
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

%matplotlib inline
sns.set_palette("husl")

In [117]:
# Display name for every country code accepted by scrapper().
# NOTE(review): 'CN' is the key used for Canada here (the ISO 3166 code for
# Canada is 'CA'; 'CN' is China). The key is kept as-is because other cells
# look it up via scrapper('CN') and URL['CN'].
Countries = {
    "SG": "Singapore",
    "US": "United States",
    "MY": "Malaysia",
    "UK": "United Kingdom",
    "AU": "Australia",
    "CN": "Canada",
}

# Cities searched for each country code; each entry is sent to Indeed as the
# 'l' (location) query parameter by scrapper().
# NOTE(review): a later cell calls scrapper('HK'), but no 'HK' entry exists in
# Countries, Target_cities or URL as defined in this file.
Target_cities = {
    "US": [
        "New York", "Chicago", "San Francisco", "Austin", "Seattle",
        "Los Angeles", "Philadelphia", "Atlanta", "Dallas", "Houston",
    ],
    "SG": ["Singapore"],
    "MY": ["Kuala Lumpur"],
    "UK": [
        "London", "Newcastle", "Sheffield", "Bristol", "Manchester",
        "Glasgow", "Edinburgh", "Birmingham", "Liverpool", "Aberdeen",
        "Nottingham", "Belfast", "Cardiff", "Cambridge", "Oxford",
    ],
    # Fixed spelling: was "Bisbane", which made that city search wrong.
    "AU": ["Sydney", "Melbourne", "Brisbane", "Perth"],
    "CN": ["Toronto", "Montreal", "Vancouver", "Quebec"],
}


In [404]:
# Indeed job-search endpoint for each country code; scrapper() looks the
# endpoint up by the same code used in Countries and Target_cities.
URL = dict(
    SG="https://www.indeed.com.sg/jobs",
    US="https://www.indeed.com/jobs",
    MY="https://www.indeed.com.my/jobs",
    UK="https://www.indeed.co.uk/jobs",
    AU="https://au.indeed.com/jobs",
    CN="https://ca.indeed.com/jobs",
)

In [21]:
# Job titles to search for; scrapper() runs one series of queries per entry.
jobs = ['data scientist', 'data analyst']

# Exclusive upper bound for the 'start' offset; scrapper() steps it by 10
# from 0 for every (city, job) pair.
max_results_per_city = 300

# Baseline query-string parameters. scrapper() copies this dict and overrides
# 'q' and 'start' (and adds 'l') per request, so the 'q' and 'start' values
# here act only as placeholders.
parameters = dict(q='data scientist', radius='100', start=1)

In [23]:
def _tag_text(tag):
    """Return the whitespace-stripped text of ``tag``, or None if ``tag`` is None."""
    if tag is None:
        return None
    return tag.get_text().strip()


def _extract_location(result):
    """Return the text of the result's ``span.location``, or None if absent."""
    return _tag_text(result.find('span', class_='location'))


def _extract_company(result):
    """Return the text of the result's ``span.company``, or None if absent."""
    return _tag_text(result.find('span', class_='company'))


def _extract_title(result):
    """Return the ``title`` attribute of the job-title link, or None if absent."""
    link = result.find('a', attrs={'data-tn-element': "jobTitle"})
    if link is None:
        return None
    return link.get('title')


def _extract_salary(result):
    """Return the text of ``td.snip > span.no-wrap``, or None if either is absent."""
    cell = result.find('td', class_='snip')
    if cell is None:
        return None
    return _tag_text(cell.find('span', class_='no-wrap'))


def _extract_description(result):
    """Return the text of the result's ``span.summary``, or None if absent."""
    return _tag_text(result.find('span', class_='summary'))


def _extract_review(result):
    """Return the review-count text with ',' and ' reviews' removed, or None."""
    link = result.find('a', attrs={'data-tn-element': "reviewStars"})
    if link is None:
        return None
    text = _tag_text(link.find('span', class_="slNoUnderline"))
    if text is None:
        return None
    # keep only the number, e.g. "1,234 reviews" -> "1234"
    return text.replace(',', '').replace(' reviews', '')


def _extract_star(result):
    """Return the rating span's style value with 'width:' and 'px' removed, or None.

    The 'style' attribute dictates how many stars are filled with color, so
    the returned string is a width proportional to the star rating.
    """
    rating = result.find('span', class_='rating')
    if rating is None:
        return None
    style = rating.get('style')
    if style is None:
        return None
    return style.replace('width:', '').replace('px', '')


def scrape_page_to_df(url, url_params, country):
    """
    fetch one results page and extract one row per job result
    :param url: url of the search endpoint
    :param url_params: a dictionary to feed to the params argument in requests.get
    :param country: value stored in the 'country' column of every extracted row
    :return: a pandas dataframe with the columns location, company, title,
        salary, description, review, star and country; a field that cannot
        be found in a result is stored as None
    :raises AssertionError: if the response status is not 200 OK
    """
    # A timeout keeps one stalled connection from hanging the whole scraping
    # run; requests waits indefinitely when none is given.
    html = requests.get(url, params=url_params, timeout=30)

    # Raised explicitly (rather than with an assert statement) so the check is
    # not stripped under `python -O`; the exception type is unchanged.
    if html.status_code != requests.codes.ok:
        raise AssertionError('Request to {} returned HTTP status {}'.format(
            html.url, html.status_code))

    soup = BeautifulSoup(html.text, 'lxml')

    # every job posting on the page is a <div class="result">
    results = soup.find_all('div', class_='result')

    # one list per column, all of the same length as `results`
    scraped_data = {
        'location': [_extract_location(result) for result in results],
        'company': [_extract_company(result) for result in results],
        'title': [_extract_title(result) for result in results],
        'salary': [_extract_salary(result) for result in results],
        'description': [_extract_description(result) for result in results],
        'review': [_extract_review(result) for result in results],
        'star': [_extract_star(result) for result in results],
        'country': [country] * len(results),
    }

    # convert the dictionary to a pandas dataframe and return it
    return pd.DataFrame(scraped_data)


In [115]:
def remove_duplicates(df):
    """Drop duplicated job rows and return the result as a new dataframe.

    Two rows are duplicates when company, country, description, location,
    salary and title all match; the last occurrence is kept. The input
    dataframe is left unmodified. A short summary (row counts before/after
    and how many remaining rows carry salary information) is printed.

    :param df: dataframe with at least the six columns listed above
    :return: a new dataframe without the duplicated rows
    """
    nrows_before = df.shape[0]
    # No inplace=True: the caller's frame must not change, as the contract
    # of this function is to return a new dataframe.
    deduped = df.drop_duplicates(subset=['company', 'country', 'description',
                                         'location', 'salary', 'title'],
                                 keep='last')
    nrows_after = deduped.shape[0]

    # Work on the non-null salaries only and coerce them to str, so the .str
    # accessor also works when the column holds nothing but NaN (float dtype).
    known_salary = deduped.salary.dropna().astype(str)

    print('{} rows remain after removing duplicates from {} rows.'.format(
        nrows_after, nrows_before))
    print('{} rows have salary info; {} rows have yearly salary info.'.format(
      deduped.salary.notnull().sum(), known_salary.str.contains('year').sum()))
    return deduped

In [1010]:
def scrapper(CountryCode):
    """Scrape every target city and job title for one country.

    For each city in ``Target_cities[CountryCode]`` and each title in
    ``jobs``, result pages are requested with the 'start' offset running from
    0 up to (excluding) ``max_results_per_city`` in steps of 10. All pages
    are stacked into one dataframe, which is then passed through
    ``remove_duplicates``. Progress and timing information is printed.

    Only the city-specific queries contribute rows; no location-less
    "placeholder" query is issued.

    :param CountryCode: key into ``URL``, ``Target_cities`` and ``Countries``
    :return: the deduplicated dataframe of scraped results
    :raises KeyError: if ``CountryCode`` is missing from one of the lookups
    :raises ValueError: if the configuration yields no page to scrape
    """
    print('Current system time: {}'.format(time.ctime()))

    # Fail with a message that names the incomplete lookup table instead of a
    # bare KeyError('XX').
    for table_name, table in (('URL', URL),
                              ('Target_cities', Target_cities),
                              ('Countries', Countries)):
        if CountryCode not in table:
            raise KeyError('Country code {!r} has no entry in {}'.format(
                CountryCode, table_name))

    # scrape data and save to dataframe
    start_time = time.time()

    # Retrieve the parameters to scrape with, based on the country code
    url = URL[CountryCode]
    locations = Target_cities[CountryCode]
    country = Countries[CountryCode]

    # Collect one frame per page and concatenate once at the end; growing a
    # dataframe with pd.concat inside the loop re-copies all rows every time.
    pages = []
    for loc in locations:
        for j in jobs:
            for start in range(0, max_results_per_city, 10):
                # job title, target city and result offset for this request
                url_params = parameters.copy()
                url_params.update({'l': loc, 'q': j, 'start': start})
                pages.append(scrape_page_to_df(url, url_params, country))

        print('Finished scraping {}'.format(loc))

    if not pages:
        raise ValueError(
            'Nothing to scrape for {!r}: check Target_cities, jobs and '
            'max_results_per_city'.format(CountryCode))

    df = pd.concat(pages, axis=0)

    total_time = (time.time() - start_time) / 60
    print('Scraping run time: {:.1f} minutes'.format(total_time))

    # remove duplicates
    df = remove_duplicates(df)
    print('Script finished at {}\n'.format(time.ctime()))

    # returns the final df
    return df

In [17]:
# Scrape the Singapore listings; SG holds the deduplicated dataframe
# returned by scrapper().
SG = scrapper('SG')

Current system time: Tue Oct 24 10:01:59 2017
Finished scraping Singapore
Scraping run time: 0.8 minutes
738 rows remain after removing duplicates from 1506 rows.
26 rows have salary info; 0 rows have yearly salary info.
Script finished at Tue Oct 24 10:02:47 2017



In [118]:
# Scrape the Australian listings; AU holds the deduplicated dataframe.
# NOTE(review): the recorded output below reports only Sydney and Melbourne,
# while Target_cities['AU'] in this file lists four cities -- presumably the
# dict was edited after this run; re-run to confirm.
AU = scrapper('AU')

Current system time: Tue Oct 24 13:28:27 2017
Finished scraping Sydney
Finished scraping Melbourne
Scraping run time: 4.6 minutes
780 rows remain after removing duplicates from 1208 rows.
117 rows have salary info; 89 rows have yearly salary info.
Script finished at Tue Oct 24 13:33:01 2017



In [119]:
# Scrape the United Kingdom listings; UK holds the deduplicated dataframe.
# NOTE(review): the recorded output below reports only London, while
# Target_cities['UK'] in this file lists fifteen cities -- presumably the
# dict was edited after this run; re-run to confirm.
UK = scrapper('UK')

Current system time: Tue Oct 24 13:33:31 2017
Finished scraping London
Scraping run time: 2.6 minutes
520 rows remain after removing duplicates from 610 rows.
122 rows have salary info; 107 rows have yearly salary info.
Script finished at Tue Oct 24 13:36:05 2017



In [25]:
# Scrape the Hong Kong listings; HK holds the deduplicated dataframe.
# NOTE(review): 'HK' has no entry in URL, Target_cities or Countries as
# defined in this file, so this call raises KeyError unless those entries
# are restored first (the recorded output below comes from a run where they
# evidently existed).
HK= scrapper('HK')

Current system time: Tue Oct 24 10:17:10 2017
Finished scraping Hong Kong
Scraping run time: 0.4 minutes
361 rows remain after removing duplicates from 607 rows.
9 rows have salary info; 0 rows have yearly salary info.
Script finished at Tue Oct 24 10:17:35 2017



In [95]:
# Scrape the United States listings; US holds the deduplicated dataframe.
# NOTE(review): the recorded output below reports Seattle, Boston and Austin,
# but Target_cities['US'] in this file has ten cities and does not include
# Boston -- presumably the dict was edited after this run; re-run to confirm.
US = scrapper('US')

Current system time: Tue Oct 24 12:02:38 2017
Finished scraping Seattle
Finished scraping Boston
Finished scraping Austin
Scraping run time: 4.3 minutes
1533 rows remain after removing duplicates from 1831 rows.
113 rows have salary info; 58 rows have yearly salary info.
Script finished at Tue Oct 24 12:06:58 2017



In [None]:
# Scrape the Malaysian and Canadian listings.
# 'CN' is the key this file uses for Canada (see Countries and URL).
# NOTE(review): this cell carries no execution count and no recorded output,
# so it may not have been run -- MY and CN are nevertheless used by later
# cells.
MY = scrapper('MY')
CN = scrapper('CN')

In [127]:
# Persist each per-country dataframe as a pickle under ./data/.
# The statements run in order, so if one name is undefined the files listed
# before it have already been written.
AU.to_pickle('./data/AU.pkl')
SG.to_pickle('./data/SG.pkl')
US.to_pickle('./data/US.pkl')
UK.to_pickle('./data/UK.pkl')
HK.to_pickle('./data/HK.pkl')
MY.to_pickle('./data/MY.pkl')
CN.to_pickle('./data/CN.pkl')

In [51]:
# Also export each per-country dataframe as CSV under ./data/.
# to_csv is called with its defaults, so the row index is written as the
# first column of each file.
AU.to_csv('./data/AU.csv')
SG.to_csv('./data/SG.csv')
US.to_csv('./data/US.csv')
UK.to_csv('./data/UK.csv')
HK.to_csv('./data/HK.csv')
MY.to_csv('./data/MY.csv')
CN.to_csv('./data/CN.csv')

In [120]:
# Stack all per-country dataframes row-wise into one combined dataframe.
# ignore_index is not passed, so each frame keeps its own row labels and
# labels can repeat in dft.
dft = pd.concat([SG, HK, US, AU, UK, MY, CN], axis=0)

In [121]:
# Persist the combined dataframe. Note the '.pickle' extension here, unlike
# the '.pkl' extension used for the per-country files.
dft.to_pickle('./data/total.pickle')