# Cleaning Scraped Indeed Data

## Load in Scraped Data

In [274]:
import pandas as pd

# file_name = '2019-09-02_indeed-ds-postings.csv'
file_name = '2019-09-03_indeed-ds-postings.csv'
# # df = pd.read_csv('2019-09-02indeed_ds_postings.csv')
# names = ['UID', 'job_title', 'company_name', 'city', 'state', \
#          'zipcode', 'neighborhood', 'description', 'salary', 'link']
# df = pd.read_csv(file_name, index_col='UID', skiprows=1, names=names)
# names = ['job_title', 'company_name', 'city', 'state', \
#          'zipcode', 'neighborhood', 'description', 'salary_range', 'link']
names = ['job_title', 'company_name', 'location', 'neighborhood', 'description', 'salary_range', 'link']
df = pd.read_csv(file_name, index_col=0, skiprows=1, names=names)

## 1. Getting Rid of Duplicates

In [275]:
print('Before: ', df.shape)
df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)
print('After: ', df.shape)

Before:  (515, 7)
After:  (305, 7)


## 2. Cleaning Up Salary

### Looking at Salary Values

In [276]:
df['salary_range'].value_counts()

                                              290
$110,000 - $150,000 a year                      1
$75,000 - $120,000 a year                       1
$75,460 - $111,158 a year                       1
$100,000 - $150,000 a year                      1
$150,000 - $220,000 a year                      1
$93,400 - $134,100 a year                       1
$50,000 - $60,000 a year                        1
Similar jobs pay $76,000 - $112,000 a year      1
$100,000 - $120,000 a year                      1
$75,125 a year                                  1
$132,300 - $173,200 a year                      1
$50,794 - $71,864 a year                        1
$85,000 - $105,000 a year                       1
$64,295 - $94,120 a year                        1
$80,000 - $120,000 a year                       1
Name: salary_range, dtype: int64

### Getting Rid of Excess Symbols

In [277]:
df['salary_range'] = df['salary_range'].str.replace(',', '')
df['salary_range'] = df['salary_range'].str.replace('$', '')
df['salary_range'] = df['salary_range'].str.replace('Similar jobs pay ', '')


###  Saving Pay Period

In [278]:
df.loc[df['salary_range'].str.endswith('a year'), 'period'] = 'yearly'
df.loc[df['salary_range'].str.endswith('a month'), 'period'] = 'monthly'
df.loc[df['salary_range'].str.endswith('a week'), 'period'] = 'weekly'
df.loc[df['salary_range'].str.endswith('a day'), 'period'] = 'daily'
df.loc[df['salary_range'].str.endswith('an hour'), 'period'] = 'hourly'

### Getting Rid of String in Salary

In [279]:
# df['salary'].apply(lambda s: s.rstrip('a year'))
df['salary_range'] = df['salary_range'].str.rstrip('a year')
df['salary_range'] = df['salary_range'].str.rstrip('a month')
df['salary_range'] = df['salary_range'].str.rstrip('a week')
df['salary_range'] = df['salary_range'].str.rstrip('a day')
df['salary_range'] = df['salary_range'].str.rstrip('an hour')

In [280]:
df.head()

Unnamed: 0,job_title,company_name,location,neighborhood,description,salary_range,link,period
0,Data Scientist,BD,"Boston, MA",,Job Description Summary Digital Health is a bu...,,4631c716fc96075a,
1,Data Scientist Analyst,Lincoln Financial,"Boston, MA 02109",Central area,"Alternate Locations: Boston, MA (Massachusetts...",,0873f4263546b21a,
2,Data Scientist,Indeed Prime,"Boston, MA",,Indeed Prime is a free service that connects q...,,8c7e78291c43f729,
3,Computational Biologist/Data Scientist,Goldfinch Bio,"Cambridge, MA",,Goldfinch Bio is a biotechnology company that ...,,f14bc6dec8b4f60f,
4,Healthcare Data Scientist,Vertex Pharmaceuticals,"Boston, MA 02210",South Boston area,Job Description: Healthcare Data Scientist Ar...,,cd849809b3b43018,


### Converting Salary Range String to Average Annual Float

In [281]:
periods_in_workyear = {'yearly':1, 'monthly':12, 'weekly':52, 'daily':261, 'hourly': 2088}

def get_salary_avg(row):
    sal_range = row['salary_range']
    sal = sal_range.split('-')
    try:
        avg_salary = (int(sal[0]) + int(sal[1])) // 2
        period = row['period']
        avg_salary_yearly = avg_salary * periods_in_workyear[period]
        return avg_salary_yearly
    except:
        try:
            return int(sal_range)
        except:
            return sal_range

df['annual_salary'] = df.apply(get_salary_avg, axis=1)

In [282]:
df.head(10)
# df

Unnamed: 0,job_title,company_name,location,neighborhood,description,salary_range,link,period,annual_salary
0,Data Scientist,BD,"Boston, MA",,Job Description Summary Digital Health is a bu...,,4631c716fc96075a,,
1,Data Scientist Analyst,Lincoln Financial,"Boston, MA 02109",Central area,"Alternate Locations: Boston, MA (Massachusetts...",,0873f4263546b21a,,
2,Data Scientist,Indeed Prime,"Boston, MA",,Indeed Prime is a free service that connects q...,,8c7e78291c43f729,,
3,Computational Biologist/Data Scientist,Goldfinch Bio,"Cambridge, MA",,Goldfinch Bio is a biotechnology company that ...,,f14bc6dec8b4f60f,,
4,Healthcare Data Scientist,Vertex Pharmaceuticals,"Boston, MA 02210",South Boston area,Job Description: Healthcare Data Scientist Ar...,,cd849809b3b43018,,
5,Data Scientist (Full-Time),proton.ai,"Boston, MA",,*Job Description Data Scientist (Full-Time)Tea...,75000 - 120000,8cacfaa3c21d0129,yearly,97500.0
6,"Data Scientist (Intern, Part-Time)",proton.ai,"Boston, MA",,"Data Scientist (Intern, Part-Time)Team: Data ...",,1a087273ecb3d9e3,,
7,Data Scientist,Park Jockey,"Boston, MA",,Who You’ll Work For REEF Technology is the eco...,,0cc7c0afb827e835,,
8,Data Scientist - University Students,McKinsey & Company,"Boston, MA",,Qualifications University student in their fin...,,d379c7bc7ecd652e,,
9,Data Scientist,Novartis,"Cambridge, MA",,750 million. That’s how many lives our product...,,22776da337e3bc5c,,


### Switch the Columns in Case We Use it Later

In [283]:
cols = df.columns.tolist()
cols

['job_title',
 'company_name',
 'location',
 'neighborhood',
 'description',
 'salary_range',
 'link',
 'period',
 'annual_salary']

In [284]:
cols[5], cols[-1] = cols[-1], cols[5]
cols

['job_title',
 'company_name',
 'location',
 'neighborhood',
 'description',
 'annual_salary',
 'link',
 'period',
 'salary_range']

In [285]:
df = df[cols]
df.head(10)

Unnamed: 0,job_title,company_name,location,neighborhood,description,annual_salary,link,period,salary_range
0,Data Scientist,BD,"Boston, MA",,Job Description Summary Digital Health is a bu...,,4631c716fc96075a,,
1,Data Scientist Analyst,Lincoln Financial,"Boston, MA 02109",Central area,"Alternate Locations: Boston, MA (Massachusetts...",,0873f4263546b21a,,
2,Data Scientist,Indeed Prime,"Boston, MA",,Indeed Prime is a free service that connects q...,,8c7e78291c43f729,,
3,Computational Biologist/Data Scientist,Goldfinch Bio,"Cambridge, MA",,Goldfinch Bio is a biotechnology company that ...,,f14bc6dec8b4f60f,,
4,Healthcare Data Scientist,Vertex Pharmaceuticals,"Boston, MA 02210",South Boston area,Job Description: Healthcare Data Scientist Ar...,,cd849809b3b43018,,
5,Data Scientist (Full-Time),proton.ai,"Boston, MA",,*Job Description Data Scientist (Full-Time)Tea...,97500.0,8cacfaa3c21d0129,yearly,75000 - 120000
6,"Data Scientist (Intern, Part-Time)",proton.ai,"Boston, MA",,"Data Scientist (Intern, Part-Time)Team: Data ...",,1a087273ecb3d9e3,,
7,Data Scientist,Park Jockey,"Boston, MA",,Who You’ll Work For REEF Technology is the eco...,,0cc7c0afb827e835,,
8,Data Scientist - University Students,McKinsey & Company,"Boston, MA",,Qualifications University student in their fin...,,d379c7bc7ecd652e,,
9,Data Scientist,Novartis,"Cambridge, MA",,750 million. That’s how many lives our product...,,22776da337e3bc5c,,


In [297]:
df['annual_salary'].value_counts()

                                   290
185000                               1
97500                                1
95000                                1
Similar jobs pay 76000 - 112000      1
55000                                1
113750                               1
75125                                1
61329                                1
130000                               1
152750                               1
100000                               1
125000                               1
110000                               1
79207                                1
93309                                1
Name: annual_salary, dtype: int64

## 3. Splitting Location into City, State, and Zipcode
Define functions to apply each row

In [None]:
def get_zipcode(location):
    zipcode = ' '
    temp = [ s for s in location.split() if s.isdigit() ]
    if temp:
        zipcode = temp.pop()
    return zipcode

def get_city_and_state(location):
    city_state = location.split(', ')
    state = city_state.pop()
    city = city_state.pop()
    return city, state

def parse_location_info(row):
    location = row['location']
    
    zipcode = get_zipcode(location)
    
    location = location.strip(zipcode)
    city, state = get_city_and_state(location)
    
    row['city'] = city
    row['state'] = state
    row['zipcode'] = zipcode
    return row

Apply the parsing functions

In [None]:
df = df.apply(parse_location_info, axis=1)
df.head()

## 4. Set Description to Lower Case
We're going to do some analysis on the description column. Let's make all of the words lower case so there's no difference between a word that starts a sentence to one that appears anywhere else.

In [292]:
df['description'] = df['description'].str.lower()
df.head()

Unnamed: 0,job_title,company_name,annual_salary,city,state,zipcode,neighborhood,description,link
0,Data Scientist,BD,,Boston,MA,,,job description summary digital health is a bu...,4631c716fc96075a
1,Data Scientist Analyst,Lincoln Financial,,Boston,MA,2109.0,Central area,"alternate locations: boston, ma (massachusetts...",0873f4263546b21a
2,Data Scientist,Indeed Prime,,Boston,MA,,,indeed prime is a free service that connects q...,8c7e78291c43f729
3,Computational Biologist/Data Scientist,Goldfinch Bio,,Cambridge,MA,,,goldfinch bio is a biotechnology company that ...,f14bc6dec8b4f60f
4,Healthcare Data Scientist,Vertex Pharmaceuticals,,Boston,MA,2210.0,South Boston area,job description: healthcare data scientist ar...,cd849809b3b43018


## Get Rid of the Columns We Don't Need and Rearrange

In [293]:
# df = df.drop(['salary_range', 'period'], axis=1)
# df = df
# df.head()
cols = df.columns.tolist()
cols

['job_title',
 'company_name',
 'annual_salary',
 'city',
 'state',
 'zipcode',
 'neighborhood',
 'description',
 'link']

In [294]:
cols = ['job_title',
        'company_name',
        'annual_salary',
        'city',
        'state',
        'zipcode',
        'neighborhood',
        'description',
        'link',
]
df = df[cols]

In [295]:
df.head(10)

Unnamed: 0,job_title,company_name,annual_salary,city,state,zipcode,neighborhood,description,link
0,Data Scientist,BD,,Boston,MA,,,job description summary digital health is a bu...,4631c716fc96075a
1,Data Scientist Analyst,Lincoln Financial,,Boston,MA,2109.0,Central area,"alternate locations: boston, ma (massachusetts...",0873f4263546b21a
2,Data Scientist,Indeed Prime,,Boston,MA,,,indeed prime is a free service that connects q...,8c7e78291c43f729
3,Computational Biologist/Data Scientist,Goldfinch Bio,,Cambridge,MA,,,goldfinch bio is a biotechnology company that ...,f14bc6dec8b4f60f
4,Healthcare Data Scientist,Vertex Pharmaceuticals,,Boston,MA,2210.0,South Boston area,job description: healthcare data scientist ar...,cd849809b3b43018
5,Data Scientist (Full-Time),proton.ai,97500.0,Boston,MA,,,*job description data scientist (full-time)tea...,8cacfaa3c21d0129
6,"Data Scientist (Intern, Part-Time)",proton.ai,,Boston,MA,,,"data scientist (intern, part-time)team: data ...",1a087273ecb3d9e3
7,Data Scientist,Park Jockey,,Boston,MA,,,who you’ll work for reef technology is the eco...,0cc7c0afb827e835
8,Data Scientist - University Students,McKinsey & Company,,Boston,MA,,,qualifications university student in their fin...,d379c7bc7ecd652e
9,Data Scientist,Novartis,,Cambridge,MA,,,750 million. that’s how many lives our product...,22776da337e3bc5c


## Save Cleaned Data

In [296]:
name, ext = file_name.split('.')
df.to_csv(name+'_cleaned.' + ext, encoding='utf-8')

In [86]:
# df['salary_period'] = df['salary'].apply( lambda s : s.endswith('a year') )
# df[df['salary_period'] == True] = 'year'
# df[ lambda s: df.salarys.endswith('a year') ] 
# df['hi'] = df.salary.apply(lambda s: s.endswith('a year'))


# df
# df.loc[0,'description']