In [1]:
import pandas as pd
import numpy as np
import re

df = pd.read_csv("glassdoor_jobs.csv")
df.drop(columns="Unnamed: 0", inplace=True)

## Cleaning steps
* Salary parsing
* Company name text only
* State field
* Age of company
* Parsing of job description (python, etc.)




## Salary parsing

In [2]:
# Filtering out "-1" in salary est field
df = df[df["Salary Estimate"] != "-1"]

salary = df["Salary Estimate"].apply(lambda x: x.split("(")[0])


In [3]:
# Creating a check column for 'per hour' and 'employer provided salary' in the salary column
df["hourly"] = df["Salary Estimate"].apply(lambda x: 1 if "per hour" in x.lower() else 0)
df["employer_provided"] = df["Salary Estimate"].apply(lambda x: 1 if "employer provided salary" in x.lower() else 0)

In [4]:
# Using regex to remove K, $, Per hour and employer provided salary at once
salary_num = salary.apply(lambda x: re.sub(r"[^-0-9\.]","", x))

In [5]:
# Creating a column for min, max ansd avg salary for the cleaned salary
df["min_salary"] = salary_num.apply(lambda x: int(x.split("-")[0]))
df["max_salary"] = salary_num.apply(lambda x: int(x.split("-")[1]))
df["avg_salary"] = (df["min_salary"] + df["max_salary"])/2

In [6]:
# df.head()

## Company name text only
* I removed the rating in the Company name

In [7]:
df["Company Name"] = df.apply(lambda x: x["Company Name"] if x["Rating"] <0  else x["Company Name"][:-4], axis=1)

## State field and Company's Age


In [8]:
df["job_state"] = df["Location"].apply(lambda x: x.split(",")[1] )

# After doing the above, I noticed there is Los Angeles in the job_state, below handled the error
df["job_state"] = df.job_state.apply(lambda x: x.strip() if x.strip().lower( ) != "los angeles" else "CA")

# Check if the job location is same as headquarter
df["is_headquarter"] = df.apply(lambda x: 1 if x["Headquarters"]  == x["Location"] else 0, axis=1 )

# calculating company's age
df["age"] = df["Founded"].apply(lambda x: x if x<1 else (2020-x) )

## Parsing of job description (python, etc.)

In [9]:
# Check what the job description looks like
print(df["Job Description"][1])

What You Will Do:

I. General Summary

The Healthcare Data Scientist position will join our Advanced Analytics group at the University of Maryland Medical System (UMMS) in support of its strategic priority to become a data-driven and outcomes-oriented organization. The successful candidate will have 3+ years of experience with Machine Learning, Predictive Modeling, Statistical Analysis, Mathematical Optimization, Algorithm Development and a passion for working with healthcare data. Previous experience with various computational approaches along with an ability to demonstrate a portfolio of relevant prior projects is essential. This position will report to the UMMS Vice President for Enterprise Data and Analytics (ED&A).

II. Principal Responsibilities and Tasks

• Develops predictive and prescriptive analytic models in support of the organization’s clinical, operations and business initiatives and priorities.
• Deploys solutions so that they provide actionable insights to the organizat

In [10]:
df["python"] = df["Job Description"].apply(lambda x: 1 if "python" in x.lower() else 0)
df["sql"] = df["Job Description"].apply(lambda x: 1 if "sql" in x.lower() else 0)
df["spark"] = df["Job Description"].apply(lambda x: 1 if "spark" in x.lower() else 0)
df["aws"] = df["Job Description"].apply(lambda x: 1 if "aws" in x.lower() else 0)
df["gcp"] = df["Job Description"].apply(lambda x: 1 if "gcp" in x.lower() else 0)
df["azure"] = df["Job Description"].apply(lambda x: 1 if "azure" in x.lower() else 0)
df["java"] = df["Job Description"].apply(lambda x: 1 if "java" in x.lower() else 0)
df["tableau"] = df["Job Description"].apply(lambda x: 1 if "tableau" in x.lower() else 0)
df["powerbi"] = df["Job Description"].apply(lambda x: 1 if "power bi" in x.lower() else 0)
df["excel"] = df["Job Description"].apply(lambda x: 1 if "excel" in x.lower() else 0)

In [11]:
# Getting job specifics using keyword
def job_spec(x):
    x = x.lower()
    job = ["data scientist", "data engineer", "data analyst", "ml", "machine learning", "data modeler", "manager", "director"]

    if job[3] in x:
        return job[4]
    if job[4] in x:
        return job[4]
    if job[0] in x:
        return job[0]
    if job[1]  in x:
        return job[1]  
    if job[2] in x:
        return job[2]
    if job[5] in x:
        return job[1]
    if job[6] in x:
        return job[6]
    if job[7] in x:
        return job[7]    

    return 'n'

# Getting the seniority in job title
def seniority(title):
    if 'sr' in title.lower() or "senior" in title.lower() or 'lead' in title.lower() or 'principle' in title.lower():
        return 'senior'
    elif 'jr' in title.lower() or "junior" in title.lower():
        return 'junior'
    return 'na'


In [12]:
df["job_spec"] = df["Job Title"].apply(job_spec)
df.job_spec.value_counts()

data scientist      269
n                   180
data engineer       124
data analyst         99
machine learning     34
manager              22
director             14
Name: job_spec, dtype: int64

In [13]:
df["seniority"] = df["Job Title"].apply(seniority)
df.seniority.value_counts()

na        544
senior    195
junior      3
Name: seniority, dtype: int64

In [14]:
df["min_salary"] = df.apply(lambda x: x.min_salary*2 if x.hourly == 1 else x.min_salary, axis=1)
df["max_salary"] = df.apply(lambda x: x.max_salary*2 if x.hourly == 1 else x.max_salary, axis=1)

In [15]:
df.to_csv("cleaned_glassdoor_jobs.csv", index=False)