## Data Jobs Market Analysis for Top 10 US Tech(IT) Cities


** This module is for data collection. **

_Data Sources Used in this module:_
    1. Indeed API
    2. Web scraping


In [10]:
# importing dependencies 

import pandas as pd
from indeed import IndeedClient
from config import publisher_key
import requests
import bs4
import re
from html.parser import HTMLParser

### Extracting Data Related Jobs using API for top 10 US Tech(IT) Cities

In [2]:
# function to get the total count of all 'data' related jobs posted on Indeed.com in last 120 days (using Indeed API)

def get_job_count(key, start, location):

    client = IndeedClient(publisher = key)
    
    parameters = {'q' : "title:('data' OR ('business' AND 'intelligence') OR 'BI' OR \
                  ('machine' AND 'learning') OR 'AI' OR ('artificial' AND 'intelligence') \
                  OR 'tableau' OR ('power' AND 'BI') OR statistical' OR 'statistician)",
                  'l' : location,                  
                  'sort' : "date",
                  'fromage' : "119",
                  'toage' : "0",
                  'limit' : "25",
                  'radius': 25, 
                  'userip' : "192.168.0.58",
                  'useragent' : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6)",
                  'start': start
                 }
    
    search_results = client.search(**parameters)
    jobs_count = search_results['totalResults']
    return jobs_count


In [3]:
# function to get the details of all 'data' related jobs posted on Indeed.com in last 120 days (using Indeed API)

def get_job_data(key, start, location):
    
    client = IndeedClient(publisher = key)
    
    parameters = {'q' : "title:('data' OR ('business' AND 'intelligence') OR 'BI' OR \
                  ('machine' AND 'learning') OR 'AI' OR ('artificial' AND 'intelligence') \
                  OR 'tableau' OR ('power' AND 'BI') OR statistical' OR 'statistician)",
                  'l' : location,
                  'sort' : "date",
                  'fromage' : "119",
                  'toage' : "0",
                  'limit' : "25",
                  'radius': 25,
                  'userip' : "192.168.0.58",
                  'useragent' : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6)",
                  'start': start
                 }
    
    search_results = client.search(**parameters)
    jobs_df = pd.DataFrame(search_results['results'])
    return jobs_df


In [4]:
# defining a list of cities(locations) to be used for analysis

cities = ['Austin, TX', 'San Francisco, CA', 'Raleigh, NC', 
          'Denver, CO', 'Seattle, WA', 'Atlanta, GA', 
          'Boston, MA', 'New York, NY', 'Washington, DC', 'Columbus, OH']

In [5]:
# calling the function to find the total number of 'data' related jobs posted on Indeed.com in last 120 days for 10 cities

total_job_count = 0
city_job_count = 0

for city in cities:
    
    city_job_count = get_job_count(key=publisher_key, start=0, location=city)
    print(f'Total number of data jobs posted for {city} in last 120 days -', city_job_count)
    total_job_count +=  city_job_count

print('Total number of data job posted on Indeed.com in last 120 days for selected list of cities is: ', total_job_count)

Total number of data jobs posted for Austin, TX in last 120 days - 457
Total number of data jobs posted for San Francisco, CA in last 120 days - 1368
Total number of data jobs posted for Raleigh, NC in last 120 days - 424
Total number of data jobs posted for Denver, CO in last 120 days - 563
Total number of data jobs posted for Seattle, WA in last 120 days - 1065
Total number of data jobs posted for Atlanta, GA in last 120 days - 641
Total number of data jobs posted for Boston, MA in last 120 days - 1249
Total number of data jobs posted for New York, NY in last 120 days - 1777
Total number of data jobs posted for Washington, DC in last 120 days - 2378
Total number of data jobs posted for Columbus, OH in last 120 days - 299
Total number of data job posted on Indeed.com in last 120 days for selected list of cities is:  10221


In [6]:
# calling function to get the details of all 'data' related jobs posted on Indeed.com in last 120 days for 10 cities
# storing it in a dataframe

data_jobs_df = pd.DataFrame()

try:
    for city in cities:

        start = 0
        count = 0
        city_job_count = get_job_count(key=publisher_key, start=0, location=city)
        # below print statement is for the purpose of ensuring the loop is processing all pages  
        print(f'Total number of data jobs posted for {city} in last 120 days -', city_job_count)
        count = (city_job_count // 25) + 1  

        for i in range(0, count):
            print(i, '-', start)
            data_jobs = get_job_data(key=publisher_key, start=start, location=city)
            data_jobs_df = data_jobs_df.append(data_jobs, ignore_index=True)
            start += 25
            
except Exception:
    pass
        

Total number of data jobs posted for Austin, TX in last 120 days - 457
0 - 0
1 - 25
2 - 50
3 - 75
4 - 100
5 - 125
6 - 150
7 - 175
8 - 200
9 - 225
10 - 250
11 - 275
12 - 300
13 - 325
14 - 350
15 - 375
16 - 400
17 - 425
18 - 450
Total number of data jobs posted for San Francisco, CA in last 120 days - 1368
0 - 0
1 - 25
2 - 50
3 - 75
4 - 100
5 - 125
6 - 150
7 - 175
8 - 200
9 - 225
10 - 250
11 - 275
12 - 300
13 - 325
14 - 350
15 - 375
16 - 400
17 - 425
18 - 450
19 - 475
20 - 500
21 - 525
22 - 550
23 - 575
24 - 600
25 - 625
26 - 650
27 - 675
28 - 700
29 - 725
30 - 750
31 - 775
32 - 800
33 - 825
34 - 850
35 - 875
36 - 900
37 - 925
38 - 950
39 - 975
40 - 1000
41 - 1025
42 - 1050
43 - 1075
44 - 1100
45 - 1125
46 - 1150
47 - 1175
48 - 1200
49 - 1225
50 - 1250
51 - 1275
52 - 1300
53 - 1325
54 - 1350
Total number of data jobs posted for Raleigh, NC in last 120 days - 424
0 - 0
1 - 25
2 - 50
3 - 75
4 - 100
5 - 125
6 - 150
7 - 175
8 - 200
9 - 225
10 - 250
11 - 275
12 - 300
13 - 325
14 - 350
15 - 37

In [7]:
data_jobs_df.count()

jobtitle                 10284
company                  10284
city                     10284
state                    10284
country                  10284
language                 10284
formattedLocation        10284
source                   10284
date                     10284
snippet                  10284
url                      10284
onmousedown              10284
jobkey                   10284
sponsored                10284
expired                  10284
indeedApply              10284
formattedLocationFull    10284
formattedRelativeTime    10284
stations                 10284
dtype: int64

In [9]:
# displaying all job titles (to ensure the query returned all job titles included)

for _, row in data_jobs_df.iterrows():    
    
    print(row['jobtitle'])            


Data Center Security Engineer
Senior Engineer - Data Science
Senior Business Intelligence Data Analyst
Data Governance Program Manager
Operations Analyst - Symitar System and Episys Data Structure
R&D Data Analysis and Machine Learning Software Engineering Associate
Machine Learning Engineer
Data Scientist
Data Base Administrator IV
Lead Machine Learning Engineer
Temporary Customer Data Specialist - UTemps
Data Warehouse Architect
Data Manager
CPA - Data Analysis and Transparency Intern
Clinical Data Reporter
Data & Analytics - eCommerce
Backend Software Engineer - Data Platform
Data Entry Clerk
Junior Tableau Consultant (Entry Level)
Data Scientist
Data Science
Special Education Data Support Analyst (Data Analyst V)
Sanctions Data Analyst
Data Entry / Lab Assistant
Business Intelligence Analyst
Director, Data Science - Round Rock, TX
Data Engineer
Data Center Operations Analyst / Network Administrator - Austin, TX
Data Analyst / Integrations Analyst- Intern
Big Data Lead
AI Technology

Data Scientist
Statistical Modeling & Forecasting Scientist
Data Analytics Manager
Strategy & Ops Lead - Twitter Machine Learning Org
Tableau Report Developer
Staff EE Data Americas
Associate Data Scientist
Chief Data Scientist
Finance Data Analyst
Business Intelligence Analyst
Big Data Engineer
Data Engineer
Power BI Developer with T-SQL
Data Scientist
Data Scientist (Computer Vision)
Data Engineer
Sr Manager, Data Architecture
Data Wrangling Engineer
Data Engineer
Enterprise Data Management Specialist
Data Integration Engineer
Google Data Engineer
Data Engineer
Staff Data Scientist
Sr Analyst, GTM Data Operations & Analytics
Data Center Network Engineer
Data Consultant
SAP MDG Data Management Consultant
Team Coordinator EE Data & Rewards Services Americas
Data Scientist
Data Science Manager
Data Engineer
Engineering Manager - Data Engineering
Principal Data Engineer
Manager, Statistical Analyst
Senior-Lead Machine Learning Engineer (Perception)
NESAP for Data Postdoctoral Fellow
Vice

HCS IT DATA & INTEROPERABILITY DEVELOPER SENIOR
HCS IT DATA & INTEROPERABILITY DEVELOPER SENIOR
Manager, Engineering Software , Cloud Native Data
Sr Windchill Data / Migration Consultant
AI Educator Advocate
AI Educator Advocate
Data Scientist
HUMAN SERVICES COORDINATOR III (Provisional Triple P Coordinator/Data Evaluator)
Affirmative Action Data Analyst
Ultimate Hoops Statistician
Business Strategy & Data Analyst
Business Intelligence Developer
Associate Business Intelligence Developer
Data Engineer
Sr. Data Governance Specialist
Statistical Programmer
Data Governance Projects Lead
Principal Statistician
Tableau Administrator
Data Collector in Sampson County
Manager; Data and Analytics Consulting Solutions
Data Science Leader
Senior Statistical Programmer (Remote)
Associate Director of Clinical Data Management, FSP
Senior Data Collection Manager
Senior Statistical Programmer
Field Data Collector
Data Migration Project Manager
Data Project Manager
Senior Data Consultant
Cloud Enablemen

SAP Data Conversion - MDG, BODS, Information Steward, SDI, MDM, DQ, SDQ, SAP Master, Data Management
Data Center Technician Lead
Research Intern - AI Special Projects
Workday Certified Data Consultant
Principal PM for Computer Vision & Machine Learning, Amazon Photos
Internship - Data Engineer
Sr. Data Engineer
Software Engineer, Machine Learning Systems
Enterprise Data Management Associate Manager
Senior Developer - Data Systems
Sr. Data & Applied Scientist Manager
Product Manager, Data Services & Insights
Sr. Data Scientist, Molecular Product Development
Azure Technical Architect - Azure Databricks, Data Factory, Event Hub, Azure SQL DB
Sr. Product Manager - Technical, Alexa Artificial Intelligence
Senior Data Engineer
Software Development Engineer II, Machine Learning
Data Strategy Lead - Cloud, AWS, Azure, GCP, Consulting Experience, Team Lead
Sr Data Engineer - Alexa Shopping
Microsoft AI Enterprise Go-to-Market Lead
Workday- Certified Data Assoc Mgr
Cloud Data Services Sales Repr

Lead Strategic AI & Bot Architect
Programmer/Developer Lead - Tableau Developer - Atlanta, GA
Cloud Enablement, Manager – Data Delivery (Azure)
Lead Software Solutions Specialist, STT Data Management
Cloud Enablement, Senior Consultant – Data Delivery (Azure)
Data Scientist
Data Information Specialist
Data Clerk
Machine Learning Engineer - QuantumBlack
Data Integration Specialist
Data Analyst I
ATS Data Coordinator 2020 Season
Business Intelligence Analyst - Jemstep
Data Scientist
Data Scientist Job
Data Warehouse Developer/Integration
Data Scientist, Mid
Small Business Consultant -Norcross, GA. (Bi
Data Center Operations Technician
Data Analyst
Analyst, Data & Analysis
Machine Learning Engineer - SEAL
Director, Data Governance & Enablement
Sr. Data Scientist
Data Warehouse Architect
Data Scientist - Experienced Associate
Account Executive - Data Guidance
BI/DW Technical Lead
Oracle Data Engineer
Health Statistician, Senior
Azure Data Engineer
Product Owner - Data Services
Small Busine

Senior Data Scientist- Minnesota
Data Privacy Software Developer
Senior Data Analyst - Air
Machine Learning Engineer
Product Manager - Data Science
Data Science Project Manager
Data Systems Analyst
AI/ML Sr Principal Scientist
Sr Mgr Clinical Data Solutions
Senior Data Protection Analyst
Senior Data Engineer (CCDS)
Data Project Manager/ 40 Hours/ Clinical Neurology
Senior Data Protection Analyst
Senior data engineer
Senior IT Architect, Commercial Data Management
Statistical Science Director, Payer Evidence Biostatistics
Senior Data Engineer, Customer Information Management (CIM)
Internship: Machine learning for Contact-rich Robotic Manipulation
Sr. Big Data Solutions Engineer (Digital Health)
Senior Manager, Data Management
Senior Manager, Oncology Data Acquisition
Principal Clinical Data Manager
Senior Clinical Data Manager
Lead Consultant – Data & Analytics Solutions
Dental Receptionist - Bi-Lingual
Analyst I/II, Data Science
Assistant Director/Director I, Data Science
Data Analyst


Event Data and Systems Strategist
Data Architect
Data Intelligence Engineer
CIB - Data Scientist Associate
ESG Data Analytics
Data Analyst (Non-Profit)
Machine Learning Engineer
Machine Learning Solutions Engineer, Google Cloud, Professional Services
ODI Data Analyst
Data Engineer
Tableau Developer F2F GC USC.
Data Engineer
Staff Editor - Statistical Modeling
Senior Data Analyst
Full-Stack Software Engineer, Machine Learning
Data Engineer
Director, North America, LatAm and Global Customer Data Acquisition & Planning
VP of Data Science
Data Modeling Specialist Brooklyn 2312237
Machine Learning Intern
Sr. Data Scientist
Senior Data Scientist
Data Scientist (Senior Associate) - Industry Program
Master Data Coordinator
Data Analyst
Data Specialist
Data Analyst
Data Protection and Privacy Counsel
Associate, Trade & Customs - Data Analytics
Data Analyst
Learning and Development Lead - Data Innovation & Insights
Senior ETL Data Stage Developer
Data Center Director, WTC Environmental Health Ce

Coordinator, Clinical Data - RN / Per Diem Day Shift / Quality Management
Senior Product Manager - Advertising Data
Senior Sales Engineer - Cloud Data Integration Leader
Mechanical / Electrical Design Quality Project Manager & Program Coordinator (Data Centers)
Data Scientist – Pharmacy Analytics (Machine Learning)
Enterprise Data - Business Intelligence
Machine Learning/NLP Engineer
Machine Learning Engineer
Data Scientist – Disney+ Product, Commerce
Data Scientist
Argus Safety Data Manager
Data Scientist
Machine Learning Engineer
Data Modeler - Data Architecture
Data Scientist
Data Analyst (NCU)
Data Strategist
Data Scientist
Technical Product Manager, Data Infrastructure
Senior Software Engineer, Data/Systems
Lead Analyst Data Management Quantitative Analysis-Derivative Analyst
Data Engineer
Sr. Engineer, Enterprise Data Systems
Asset & Wealth Management – Intelligent Digital Solutions – Data Science Engineer- Associate/ Vice President
Senior Data Engineer
Coordinator, Clinical Data

Business Data Analyst (Cyber)
Jr. Data Analyst
Data Scientist
Data Scientist
Sr. Data Analyst (Omniture/Adobe Analytics Required)
Data Analyst
Intermediate Data Analyst
Machine Learning Engineer
Data Scientist
GIS Data/Support Specialist - CII
API Integration and Data Specialist ( Cleared)
Data Entry Operator
Statistical Analysis Performance Metrics Analyst
Business Intelligence Analyst
Data Scientist (Entry)
EFS/Data Specialist
Sr. Tableau Developer/ Sr. Data Analyst
Data Analytics Architect - Junior
Corporate Data Management and Analysis
Data Analyst Supporting the FBI
Data Modeler
Data Assistant
Data Coordinator, KGSP
Workspace Data Analyst
Admissions Data Analyst (15553)
Workspace Data Analyst
Data Scientist (Senior Associate) - Industry Program
DATA ARCHITECT
BI Data Analyst
Data Scientist
Tableau Developer
Principal Data Scientist
Sr. ETL Data Expert
Junior Data Analyst
Data Analytics Engineer – Junior
Data and demand planning scientist, GHSC-PSM
Statistical Analyst
Data Associat

Master Data Engineer - Card Technology
Senior Data Analyst
Senior Data Scientist
Data Modeler, Senior
Data Modeler - GEOINT Standards - Expert Level (1581)
Intelligence Data Analyst, Senior
Data Scientist TOP SECRET CLEARANCE REQUIRED
Security - Senior Data Engineer
Data Modeler - GEOINT Standards - Senior Level (1580)
Data Scientist Senior/Expert TS/SCI
Transplant Quality Data Specialist, Nurse Preferred
Mid-Level Data Analyst
Data Scientist- Senior Associate
Federal - Data Analytics Consultant
Data Analyst - Mid_ANG
Data Center Account Executive - Civilian
Big Data Architect
Senior Data Engineer
Data Scientist
Senior Financial Data Analyst
Data Analyst Mid
Data Architect -- Rosslyn, VA
Data Management Analyst / Digital Strategy / Research / Navy
Senior Data Scientist
Data Analytics Pipelines Section Manager
Master Data Engineer - Card Technology
Senior Data Analyst
Senior Data Scientist
Data Modeler, Senior
Data Modeler - GEOINT Standards - Expert Level (1581)
Intelligence Data Analy

In [8]:
# saving the data-jobs data in a csv file

data_jobs_df.to_csv('../data/all_data_jobs.csv', index=False)

### Scraping Experience Data for the Job Postings 

In [1129]:
# getting the list of URLs for data jobs extracted from the job portal to be used in all scraping functions

url_list = data_jobs_df['url'].tolist()

In [1114]:
# function that uses web scraping to get minimum experience requirement for data job postings

def get_years_of_experience(url_list):
    
    for i in url_list:
      
        try:
            res = requests.get(i)
            res.raise_for_status()
            Soup = bs4.BeautifulSoup(res.text, 'html.parser')
            item = Soup.select('#jobDescriptionText')
            item = str(item[0])
            item_clean = re.findall(r'(?:(\d+)(?=.*years?)(?=.*experience))|(?:experience.*(\d+).*\+years?)|(?:(\d+\+)(?=.*years?)(?=.*experience))', item)[0]
            item_clean = list(filter(None, item_clean))[0]   
            exp_url.append(i)
            exp_years.append(item_clean)

        except Exception:
            pass

In [1115]:
exp_url = []
exp_years = []

# calling the function to get experience data

get_years_of_experience(url_list)

In [1118]:
# creating a dataframe for minimum experience data

exp_years_df = pd.DataFrame({'url': exp_url,
                             'exp': exp_years
                        })

# further cleaning - excluding any invalid experience 'number of year' returned as a result of scraping if it is > 15
exp_years_df['exp'] = pd.to_numeric(exp_years_df['exp'])
exp_years_df = exp_years_df[exp_years_df['exp'] <= 15].reset_index()

exp_years_df.head(3)

Unnamed: 0,index,url,exp
0,0,http://www.indeed.com/viewjob?jk=389c5707bd0ad...,3
1,1,http://www.indeed.com/viewjob?jk=0df9dac32fbac...,3
2,2,http://www.indeed.com/viewjob?jk=385998e9d4244...,2


In [None]:
exp_years_df.head(3)

In [1121]:
# saving minimum experience data in a csv file

exp_years_df.to_csv('../data/experience_data.csv', index=False)

### Scraping Company Rating Data

In [1012]:
# function that uses web scraping to get company rating and review count 

def get_company_reviews(url_list):
    
    for i in url_list:

        try:
            res = requests.get(i)
            res.raise_for_status()
            Soup = bs4.BeautifulSoup(res.text, 'html.parser')
            rating = Soup.select('.icl-Ratings-starsCountWrapper')
            rating = str(rating[-1])    
            rating = re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", rating)

            cmp_rating_url.append(i)
            cmp_rating.append(float(rating[0]))
            cmp_review_count.append(float(rating[-1]))            
      
        except Exception:
            pass
        

In [1013]:
# calling the function to get company ratings 

cmp_rating_url = []
cmp_rating = []
cmp_review_count = []

get_company_reviews(url_list)

In [1014]:
# creating a dataframe for company ratings

comp_rating_df = pd.DataFrame({'url': cmp_rating_url,
                               'rating': cmp_rating
                        })
comp_rating_df.head(3)

Unnamed: 0,url,rating
0,http://www.indeed.com/viewjob?jk=389c5707bd0ad...,3.6
1,http://www.indeed.com/viewjob?jk=0df9dac32fbac...,3.8
2,http://www.indeed.com/viewjob?jk=385998e9d4244...,3.1


In [747]:
# saving company ratings data in a csv file

comp_rating_df.to_csv('../data/company_ratings_data.csv', index=False)

### Scraping the Salary Data

In [1130]:
# function that uses web scraping to get data job salaries  
 
def get_sal(url_list):
    
    for i in url_list:
         
        try:
            response = requests.get(i)
            response.raise_for_status()
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            salary = str(soup.select('div.jobsearch-JobMetadataHeader-item > span'))
            tag_re = re.compile(r'<[^>]+>|')
            salary = tag_re.sub('', salary)
            salary = salary[1:-1]
            salary = salary.replace('$', '')
            salary = salary.replace(',', '')
            if (not salary == '') and (contains_digits(salary)): 
                sal_url_list.append(i)
                sal_list.append(salary)
        except Exception:
            pass


In [1131]:
def contains_digits(s):
    return any(char.isdigit() for char in s)

In [1132]:
sal_list = []
sal_url_list = []

# calling the function to get salaries for the data jobs
get_sal(url_list)

In [1124]:
# creating a dataframe of data job salaries

salary_df = pd.DataFrame({'url': sal_url_list,
                          'sal': sal_list
                        })

In [1126]:
# function to clean the salary data further

def clean_sal_data(sal):
         
    if 'Part-time' in sal:
        sal = sal.replace('Part-time', '')
        
    if 'Contract' in sal:
        sal = sal.replace('Contract', '')  
        
    if 'Full-time Temporary' in sal:
        sal = sal.replace('Full-time Temporary', '') 
    
    if 'Full-time Temporary Contract' in sal:
        sal = sal.replace('Full-time Temporary Contract', '')
                          
    if 'Full-time Internship' in sal:
        sal = sal.replace('Full-time Internship', '')
                          
    if 'Temporary' in sal:
        sal = sal.replace('Temporary', '')
        
    if 'Full-time' in sal:
        sal = sal.replace('Full-time', '')
                            
    return sal


# Alternate way of writing this function is by using the below RegEx string   

#         tag_re = re.compile(r'\b\w*(?i)(part-time|Contract|Full-time Temporary|Full-time Temporary Contract|Full-time Internship|Temporary|Full-time)\w*\b')
#         salary = re.sub(tag_re, "", sal)



In [1128]:
# cleaning salary data

salary_df['sal'] = salary_df['sal'].apply(clean_sal_data).str.rstrip('-')

In [810]:
# function to calculate minimum salary

def get_min_sal(salary):
    
    salary = salary.split(" - ")
    
    # getting mins  
    sal_min = re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", salary[0])[-1]
   
    # cadence
    cadence = ''.join(filter(lambda x: x.isalpha(), salary[-1]))

    sal_dict.update({
        'min': float(sal_min),
        'cadence': cadence
    }) 

    # logic to take salaries to year digits
    
    if sal_dict['cadence'] == 'aday':
        sal_dict['min'] = float(sal_min) * 260

    elif sal_dict['cadence'] == 'amonth':
        sal_dict['min'] = float(sal_min) * 12
                  
    elif sal_dict['cadence'] == 'aweek':
        sal_dict['min'] = float(sal_min) * 52 
            
    elif sal_dict['cadence'] == 'anhour':
        sal_dict['min'] = float(sal_min) * 2080
        
    
        
    return sal_dict['min']    


In [811]:
# function to calculate maximum salary

def get_max_sal(salary):
    
    salary = salary.split(" - ")
    
    # getting maxs
    sal_max = re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", salary[-1])[-1]

    # cadence
    cadence = ''.join(filter(lambda x: x.isalpha(), salary[-1]))

    sal_dict.update({
        'max': float(sal_max),
        'cadence': cadence
    }) 

    # logic to take salaries to year digits
    
    if sal_dict['cadence'] == 'aday':
        sal_dict['max'] = float(sal_max) * 260

    elif sal_dict['cadence'] == 'amonth':
        sal_dict['max'] = float(sal_max) * 12       
        
    elif sal_dict['cadence'] == 'aweek':        
        sal_dict['max'] = float(sal_max) * 52 
                
    elif sal_dict['cadence'] == 'anhour':       
        sal_dict['max'] = float(sal_max) * 2080
        
    return  sal_dict['max']
       

In [1005]:
# splitting items into a dictionary

sal_dict = {'min': 0,
            'max': 0,
            'cadence': ''
}

salary_df['annual_min_salary'] =  salary_df['sal'].apply(get_min_sal)
salary_df['annual_max_salary'] =  salary_df['sal'].apply(get_max_sal)

In [1136]:
sal_df = salary_df[['url', 'annual_min_salary', 'annual_max_salary']]
sal_df.tail(2)

Unnamed: 0,url,annual_min_salary,annual_max_salary
547,http://www.indeed.com/viewjob?jk=ed09008b730b6...,80017.6,114379.2
548,http://www.indeed.com/viewjob?jk=1373bdf4a40b4...,80017.6,80017.6


In [819]:
# saving the salary data in a csv file

sal_df.to_csv('../data/sal_data.csv', index=False)