In [1]:
import pandas as pd
import re

df = pd.read_csv("glassdoor_jobs.csv")
df.drop(columns="Unnamed: 0", inplace=True)

## Cleaning steps
* Salary parsing
* Company name text only
* State field
* Age of company
* Parsing of job description (python, etc.)




## Salary parsing

In [10]:
# Filtering out "-1" in salary est field
df = df[df["Salary Estimate"] != "-1"]

salary = df["Salary Estimate"].apply(lambda x: x.split("(")[0])


In [20]:
# Creating a check column for 'per hour' and 'employer provided salary' in the salary column
df["hourly"] = df["Salary Estimate"].apply(lambda x: 1 if "per hour" in x.lower() else 0)
df["employer_provided"] = df["Salary Estimate"].apply(lambda x: 1 if "employer provided salary" in x.lower() else 0)

In [41]:
# Using regex to remove K, $, Per hour and employer provided salary at once
salary_num = salary.apply(lambda x: re.sub(r"[^-0-9\.]","", x))

In [42]:
# Creating a column for min, max ansd avg salary for the cleaned salary
df["min_salary"] = salary_num.apply(lambda x: int(x.split("-")[0]))
df["max_salary"] = salary_num.apply(lambda x: int(x.split("-")[1]))
df["avg_salary"] = (df["min_salary"] + df["max_salary"])/2

In [62]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,Sector,Revenue,Competitors,hourly,employer_provided,min_salary,max_salary,avg_salary,job_state,is_headquarter
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,Aerospace & Defense,$50 to $100 million (USD),-1,0,0,53,91,72.0,NM,0
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,Health Care,$2 to $5 billion (USD),-1,0,0,63,112,87.5,MD,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,Business Services,$100 to $500 million (USD),-1,0,0,80,90,85.0,FL,1
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",0,0,56,97,76.5,WA,1
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",0,0,86,143,114.5,NY,1


## Company name text only
* I removed the rating in the Company name

In [48]:
df["Company Name"] = df.apply(lambda x: x["Company Name"] if x["Rating"] <0  else x["Company Name"][:-4], axis=1)

## State field and Company's Age


In [63]:
df["job_state"] = df["Location"].apply(lambda x: x.split(",")[1] )

# Check if the job location is same as headquarter
df["is_headquarter"] = df.apply(lambda x: 1 if x["Headquarters"]  == x["Location"] else 0, axis=1 )

# calculating company's age
df["age"] = df["Founded"].apply(lambda x: x if x<1 else (2020-x) )

## Parsing of job description (python, etc.)

In [66]:
print(df["Job Description"][1])

What You Will Do:

I. General Summary

The Healthcare Data Scientist position will join our Advanced Analytics group at the University of Maryland Medical System (UMMS) in support of its strategic priority to become a data-driven and outcomes-oriented organization. The successful candidate will have 3+ years of experience with Machine Learning, Predictive Modeling, Statistical Analysis, Mathematical Optimization, Algorithm Development and a passion for working with healthcare data. Previous experience with various computational approaches along with an ability to demonstrate a portfolio of relevant prior projects is essential. This position will report to the UMMS Vice President for Enterprise Data and Analytics (ED&A).

II. Principal Responsibilities and Tasks

• Develops predictive and prescriptive analytic models in support of the organization’s clinical, operations and business initiatives and priorities.
• Deploys solutions so that they provide actionable insights to the organizat

In [73]:
df["python_yn"] = df["Job Description"].apply(lambda x: 1 if "python" in x.lower() else 0)
df["spark_yn"] = df["Job Description"].apply(lambda x: 1 if "spark" in x.lower() else 0)
df["aws_yn"] = df["Job Description"].apply(lambda x: 1 if "aws" in x.lower() else 0)
df["gcp_yn"] = df["Job Description"].apply(lambda x: 1 if "gcp" or 'google cloud platform' in x.lower() else 0)
df["azure_yn"] = df["Job Description"].apply(lambda x: 1 if "azure" in x.lower() else 0)
df["java_yn"] = df["Job Description"].apply(lambda x: 1 if "java" in x.lower() else 0)
df["tableau_yn"] = df["Job Description"].apply(lambda x: 1 if "tableau" in x.lower() else 0)
df["powerbi_yn"] = df["Job Description"].apply(lambda x: 1 if "power bi" in x.lower() else 0)
df["excel_yn"] = df["Job Description"].apply(lambda x: 1 if "excel" in x.lower() else 0)

In [74]:
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'job_state', 'is_headquarter', 'age', 'python_yn', 'spark_yn', 'aws_yn',
       'gcp_yn', 'azure_yn', 'java_yn', 'tableau_yn', 'powerbi_yn',
       'excel_yn'],
      dtype='object')

In [83]:
df.drop(columns=['Salary Estimate', "Job Description", "Size",  'Competitors', "Revenue", "Industry"] )

Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Founded,Type of ownership,Industry,Sector,hourly,...,age,python_yn,spark_yn,aws_yn,gcp_yn,azure_yn,java_yn,tableau_yn,powerbi_yn,excel_yn
0,Data Scientist,3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",1973,Company - Private,Aerospace & Defense,Aerospace & Defense,0,...,47,1,0,0,1,0,0,1,1,1
1,Healthcare Data Scientist,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",1984,Other Organization,Health Care Services & Hospitals,Health Care,0,...,36,1,0,0,1,0,1,0,0,0
2,Data Scientist,4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",2010,Company - Private,Security Services,Business Services,0,...,10,1,1,0,1,0,0,0,0,1
3,Data Scientist,3.8,PNNL,"Richland, WA","Richland, WA",1965,Government,Energy,"Oil, Gas, Energy & Utilities",0,...,55,1,0,0,1,0,0,0,0,0
4,Data Scientist,2.9,Affinity Solutions,"New York, NY","New York, NY",1998,Company - Private,Advertising & Marketing,Business Services,0,...,22,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,"Sr Scientist, Immuno-Oncology - Oncology",3.9,GSK,"Cambridge, MA","Brentford, United Kingdom",1830,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,0,...,190,0,0,1,1,0,0,0,0,0
951,Senior Data Engineer,4.4,Eventbrite,"Nashville, TN","San Francisco, CA",2006,Company - Public,Internet,Information Technology,0,...,14,1,1,1,1,0,1,0,0,0
952,"Project Scientist - Auton Lab, Robotics Institute",2.6,Software Engineering Institute,"Pittsburgh, PA","Pittsburgh, PA",1984,College / University,Colleges & Universities,Education,0,...,36,0,0,0,1,0,0,0,0,1
953,Data Science Manager,3.2,"Numeric, LLC","Allentown, PA","Chadds Ford, PA",-1,Company - Private,Staffing & Outsourcing,Business Services,0,...,-1,0,0,0,1,0,0,0,0,1


In [84]:
df["Type of ownership"].value_counts()

Company - Private                 410
Company - Public                  193
Nonprofit Organization             55
Subsidiary or Business Segment     34
Government                         15
Hospital                           15
College / University               13
Other Organization                  3
School / School District            2
-1                                  1
Unknown                             1
Name: Type of ownership, dtype: int64

In [88]:
a = df["Job Title"].value_counts()

In [197]:
def job_spec(x):
    x = x.lower()
    job = ["data scientist", "data engineer", "data analyst", "science"]

    if job[0] in x:
        return job[0]
    elif job[1]  in x:
        return job[1]  
    elif job[2] in x:
        return job[2]
    
    return 'n'
   
    
    

df["job_spec"] = df["Job Title"].apply(job_spec)

In [198]:
a = df[["Job Title", "job_spec"]]
a["job_spec"].value_counts()

data scientist    279
n                 245
data engineer     119
data analyst       99
Name: job_spec, dtype: int64

In [180]:
df.drop(columns="job_spec", inplace=True)

In [133]:
df["Job Title"].value_counts()

Data Scientist                                     131
Data Engineer                                       53
Senior Data Scientist                               34
Data Analyst                                        15
Senior Data Engineer                                14
                                                  ... 
Ag Data Scientist                                    1
Program/Data Analyst                                 1
Data Engineer 4 - Contract                           1
Research Scientist, Machine Learning Department      1
Supply Chain Data Analyst                            1
Name: Job Title, Length: 264, dtype: int64

In [175]:
# import numpy as np

# j = ["data scientist", "data engineer", "data analyst"]

# df["job_spec"] = (np.select(
#     condlist=[ j[0] in df["Job Title"].apply(lambda x: x.lower()),
#                     j[1] in df["Job Title"].apply(lambda x: x.lower()),
#                     j[2] in df["Job Title"].apply(lambda x: x.lower()) ],
#     choicelist=[ j[0], j[1], j[2] ],
#     default= "daat"
# ))

# df["job_spec"] = df["Job Title"].apply(lambda x:  j[0] if j[0] in x.lower() j[1] elif j[1] in x.lower() j[2] elif j[2] in x.lower() else "Daat")

df["job_spec"] = df["Job Title"].apply(lambda x:  j[1] if j[1] in x.lower() else "n")

In [158]:
def flag_df(df):

    if (df['trigger1'] <= df['score'] < df['trigger2']) and (df['height'] < 8):
        return 'Red'
    elif (df['trigger2'] <= df['score'] < df['trigger3']) and (df['height'] < 8):
        return 'Yellow'
    elif (df['trigger3'] <= df['score']) and (df['height'] < 8):
        return 'Orange'
    elif (df['height'] > 8):
        return np.nan

df2['Flag'] = df2.apply(flag_df, axis = 1)


def job_spec(df):
    job = ["data scientist", "data engineer", "data analyst"]

    if job[0] in df["Job"]:
        return job[0]
    
    if job[2] in x:
        return job[2]
    if job[1] or "data engineering" in x:
        return job[1]
    else:  
        return "n"
    

df["job_spec"] = df["Job Title"].apply(job_spec)

SyntaxError: invalid syntax (<ipython-input-158-26ab3508f9a9>, line 1)