# Scrap Indeed

In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time


def extract_job_title_from_result(soup):
    jobs = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
            jobs.append(a["title"])
    return(jobs)


def extract_location_from_result(soup):
    locations = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        location = div.find_all(name="div", attrs={"class": "location"})
        if len(location) > 0:
            for b in location:
                locations.append(b.text.strip())
        else:
            sec_try = div.find_all(name="span", attrs={"class": "location"})
            for span in sec_try:
                locations.append(span.text.strip())
    return(locations)


def extract_company_from_result(soup):
    companies = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        company = div.find_all(name="span", attrs={"class": "company"})
        if len(company) > 0:
            for b in company:
                companies.append(b.text.strip())
        else:
            sec_try = div.find_all(
                name="a", attrs={"data-tn-element": "companyName"})
            if len(sec_try) > 0:
                for span in sec_try:
                    companies.append(span.text.strip())
            else:
                companies.append("Nothing_found")
    return(companies)


def extract_salary_from_result(soup):
    salaries = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        salary = div.find_all(name="span", attrs={"class": "salary no-wrap"})
        if len(salary) > 0:
            for b in salary:
                salaries.append(b.text.strip())
        else:
            salaries.append("Nothing_found")
    return(salaries)


def extract_summary_from_result(soup):
    summaries = []
    spans = soup.findAll("div", attrs={"class": "summary"})
    for span in spans:
        summaries.append(span.text.strip())
    return(summaries)


def add_to_df(df, soup):
    # Run multiple extractions to add to df
    columns = ["job_title", "location", "company", "salary", "summary"]
    temp = pd.DataFrame(columns=columns)
    print(len(extract_job_title_from_result(soup)), len(extract_location_from_result(soup)), len(
        extract_company_from_result(soup)), len(extract_salary_from_result(soup)), len(extract_summary_from_result(soup)))
    funcs = [extract_job_title_from_result(soup), extract_location_from_result(soup), extract_company_from_result(
        soup), extract_salary_from_result(soup), extract_summary_from_result(soup)]
    for i, j in enumerate(funcs):
        temp[columns[i]] = j

    return pd.concat([df, temp])


max_results_per_city = 170
columns = ["job_title", "location", "company", "salary", "summary"]
sample_df = pd.DataFrame(columns=columns)

for start in range(0, max_results_per_city, 10):
    URL = "https://www.indeed.com/jobs?q=data+scientist&l=georgia&start=" + str(start)
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, "html.parser")
    soup.prettify()
    sample_df = add_to_df(sample_df, soup)
    time.sleep(10)

from datetime import date
today = str(date.today()) # '2018-12-26'

sample_df.to_csv("jobs_Georgia_"+today+".csv", index=False)

19 19 19 19 19
19 19 19 19 19
19 19 19 19 19
19 19 19 19 19
16 16 16 16 16
17 17 17 17 17
15 15 15 15 15
17 17 17 17 17
15 15 15 15 15
15 15 15 15 15
15 15 15 15 15
15 15 15 15 15
16 16 16 16 16
15 15 15 15 15
15 15 15 15 15
15 15 15 15 15
15 15 15 15 15


# Preview

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
df = pd.read_csv("jobs_Georgia_"+today+".csv")
df.head()


Unnamed: 0,job_title,location,company,salary,summary
0,Entry Level Environmental Scientist,"Roswell, GA 30075","Atlantic Coast Consulting, inc.",Nothing_found,Various field data collection and data reports...
1,Conjugation Scientist,"Norcross, GA",Caprico Biotechnologies,Nothing_found,"Caprico Biotechnologies, Inc. (CBI), an Atlant..."
2,"Scientist II, Discovery","Athens, GA 30601",Boehringer Ingelheim,Nothing_found,Contributes executing on a daily basis global ...
3,Bioinformatician,"Atlanta, GA 30346","IHRC, Inc.",Nothing_found,Work with scientists in the laboratory to addr...
4,Senior Data Scientist,"Alpharetta, GA 30022",Verizon,Nothing_found,"As a Senior Data Scientist, you’ll lead projec..."


## Companies - Job Listing Counts

In [3]:
df.company.value_counts()

Verizon                                       20
Double Edge Capital LLC                       17
Caprico Biotechnologies                       17
Atlantic Coast Consulting, inc.               17
Thinkful                                      17
ADP                                           10
Boehringer Ingelheim                           8
KPMG                                           8
Emory University                               8
SunTrust                                       7
UNITED PARCEL SERVICE                          6
The Home Depot                                 6
Intercontinental Exchange                      5
IHRC, Inc.                                     4
ICP Systems LLC                                4
BD                                             3
Cortland                                       3
Capgemini                                      3
Georgia Tech Research Institute                3
Wells Fargo                                    3
FraudScope, Inc.    

## Job Title Counts

In [4]:
df.job_title.value_counts(normalize=True)

Senior Data Scientist                                                   0.075812
Data Scientist                                                          0.075812
Data Scientist - Supply Chain Inventory Optimization                    0.061372
Conjugation Scientist                                                   0.061372
Entry Level Environmental Scientist                                     0.061372
Data Science Instructor                                                 0.061372
Scientist II, Discovery                                                 0.028881
Data Scientist (NextGen Payroll)                                        0.021661
Sr. Associate, Data Scientist                                           0.018051
Bioinformatician                                                        0.014440
Data Engineer                                                           0.010830
Statistical Quant Analyst-4                                             0.007220
Data Scientist, Data Science

## Location Counts

In [5]:
df.location.value_counts()

Atlanta, GA                                               100
Alpharetta, GA 30022                                       20
Norcross, GA                                               19
Roswell, GA 30075                                          17
Atlanta, GA 30301 (Buckhead area)                          11
Alpharetta, GA                                              9
Athens, GA 30601                                            9
Atlanta, GA 30338                                           8
Alpharetta, GA 30004                                        7
Alpharetta, GA 30005                                        6
Atlanta, GA 30328                                           5
Atlanta, GA 30308 (Old Fourth Ward area)                    5
Atlanta, GA 30337                                           4
Atlanta, GA 30346                                           4
Atlanta, GA 30326 (Buckhead area)                           3
Atlanta, GA 30341                                           3
Atlanta,

## Salary Counts

In [6]:
df.salary.value_counts()

Nothing_found                               210
$100,000 - $150,000 a year                   17
$20 - $25 an hour                            17
$85,000 - $126,000 a year (Indeed est.)       6
$78,446 - $101,983 a year                     2
$90,000 - $132,000 a year (Indeed est.)       2
$88,000 - $130,000 a year (Indeed est.)       2
$88,000 - $129,000 a year (Indeed est.)       2
$108,000 - $160,000 a year (Indeed est.)      2
$98,000 - $145,000 a year (Indeed est.)       2
$80,000 - $118,000 a year (Indeed est.)       2
$60 an hour                                   1
$70,000 - $80,000 a year                      1
$104,000 - $154,000 a year (Indeed est.)      1
$60 - $70 an hour                             1
$10.50 an hour                                1
$8 - $10 an hour                              1
$96,000 - $142,000 a year (Indeed est.)       1
$108,000 - $159,000 a year (Indeed est.)      1
$106,000 - $156,000 a year (Indeed est.)      1
$105,000 - $155,000 a year (Indeed est.)

# Groupby Job Titles & Companies

In [7]:
df.groupby(['company', 'job_title']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,location,salary,summary
company,job_title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ADP,Data Scientist,"Alpharetta, GA 30005Alpharetta, GA 30005",Nothing_foundNothing_found,ADP is hiring a Data Scientist - Senior Machin...
ADP,Data Scientist (NextGen Payroll),"Alpharetta, GA 30004Alpharetta, GA 30004Alphar...","$85,000 - $126,000 a year (Indeed est.)Nothing...",Theoretical and practical understanding of dat...
ADP,Lead Data Scientist - Risk Strategy,"Alpharetta, GA 30005",Nothing_found,Will exhibit expert level knowledge in data st...
ADP,UX Researcher II,"Alpharetta, GA 30004",Nothing_found,"At ADP, the world's largest B2B cloud company,..."
AMERICAN CANCER SOCIETY,"Assistant Scientist, Study Management","Atlanta, GA",Nothing_found,"Working in a team environment, perform various..."
"AUDIOEYE, INC.",Senior Data Scientist,"Atlanta, GA",Nothing_found,The Data Scientist identifies business trends ...
Abt Associates,Quantitative Analyst - DHE,"Atlanta, GA 30345",Nothing_found,"Implement and manage project schedules, plans,..."
Accenture,AI Data Science Sr. Manager - Financial Services (Conversational AI),"Atlanta, GA 30303 (Five Points area)",Nothing_found,We’re also the world’s largest team of data sc...
Accenture,Data Science Consulant,"Atlanta, GA 30303 (Five Points area)","$108,000 - $160,000 a year (Indeed est.)","Effectively utilize statistical, data mining, ..."
"Atlantic Coast Consulting, inc.",Entry Level Environmental Scientist,"Roswell, GA 30075Roswell, GA 30075Roswell, GA ...",Nothing_foundNothing_foundNothing_foundNothing...,Various field data collection and data reports...


# Groupby Salary & Companies

In [8]:
df.groupby(['salary','company']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title,location,summary
salary,company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
$10.50 an hour,Lowers Risk Group,Criminal Research Analyst,"Flowery Branch, GA",Enters findings and results into internal syst...
"$100,000 - $150,000 a year",Double Edge Capital LLC,Data Scientist - Supply Chain Inventory Optimi...,"Atlanta, GAAtlanta, GAAtlanta, GAAtlanta, GAAt...","Data visualization skills (Power BI, Tableau, ..."
"$104,000 - $154,000 a year (Indeed est.)",Grid Dynamics,Data Scientist,"Atlanta, GA",We are looking for an enthusiastic and technol...
"$105,000 - $154,000 a year (Indeed est.)",Norfolk Southern Corp,Data Scientist/Machine Learning Trainee,"Atlanta, GA 30309 (Midtown area)","As a Data Scientist/Machine Learning Trainee, ..."
"$105,000 - $155,000 a year (Indeed est.)",SoftVision - TAMS,"Data Scientist (PYTHON, HADOOP)","Johns Creek, GA 30097",5-7 years of Data Modeling and Machine Learnin...
"$106,000 - $156,000 a year (Indeed est.)",Regions Bank,Data Scientist,"Atlanta, GA","At Regions, the Data Scientist works with mult..."
"$107,000 - $158,000 a year (Indeed est.)",Verizon,Data Science - Predictive & Prescriptive Analy...,"Alpharetta, GA 30022",Be a subject matter expert on machine learning...
"$108,000 - $159,000 a year (Indeed est.)",McKinsey & Company,"Specialist, Data Science - Sales Analytics","Atlanta, GA 30318","As a seasoned data scientist, you will lead pr..."
"$108,000 - $160,000 a year (Indeed est.)",Accenture,Data Science Consulant,"Atlanta, GA 30303 (Five Points area)","Effectively utilize statistical, data mining, ..."
"$108,000 - $160,000 a year (Indeed est.)",Brillio,Data Scientist,Georgia,5+ years of minimum experience in data science...
